In [2]:
import h5py
import numpy as np
import os

# === USER CONFIGURATIONS ===
# HDF5 file that is opened read-only and split.
input_file = "COMPAS_Output.h5"
# Output files are named "<output_prefix>_<index>.h5".
output_prefix = "COMPAS_Output"

# Top-level group whose datasets are sliced row-wise across the output files.
split_group = "BSE_Double_Compact_Objects"
# Upper bound on the number of rows written per output file; adjust as needed.
rows_per_file = 100_000

# Dataset names inside `split_group` that are left out of the output.
# Replace with your list.
columns_to_remove = [
    "column1",
    "column2",
]

# === SCRIPT START ===

def get_num_splits(group, rows_per_file):
    """Return how many chunks of ``rows_per_file`` rows are needed to cover ``group``.

    The row count is taken from the first dataset yielded by ``group.keys()``
    (its ``shape[0]``); the other datasets are not inspected.

    Args:
        group: Mapping-like object (e.g. an ``h5py.Group``) whose values expose
            a ``shape`` tuple.
        rows_per_file: Maximum number of rows per chunk; must be positive.

    Returns:
        The ceiling of ``total_rows / rows_per_file``, or ``0`` when ``group``
        contains no datasets.

    Raises:
        ValueError: If ``rows_per_file`` is zero or negative.
    """
    if rows_per_file <= 0:
        raise ValueError(
            f"rows_per_file must be a positive number, got {rows_per_file!r}"
        )

    # Take just the first key instead of materialising the whole key list.
    first_key = next(iter(group.keys()), None)
    if first_key is None:
        # Empty group: nothing to split, so no output files are needed.
        return 0

    total_rows = group[first_key].shape[0]
    # Integer ceiling division.
    return (total_rows + rows_per_file - 1) // rows_per_file

# Split `split_group` row-wise into several smaller HDF5 files.
#
# Every output file receives:
#   * a full copy of each top-level group other than `split_group`, and
#   * a `split_group` group holding rows
#     [i * rows_per_file, (i + 1) * rows_per_file) of every dataset whose name
#     is not listed in `columns_to_remove`.
with h5py.File(input_file, "r") as infile:
    if split_group not in infile:
        raise ValueError(f"Group '{split_group}' not found in {input_file}")

    split_grp = infile[split_group]
    num_splits = get_num_splits(split_grp, rows_per_file)

    # Resolve the column selection once; it is the same for every output file.
    excluded = set(columns_to_remove)
    unknown = [name for name in columns_to_remove if name not in split_grp]
    if unknown:
        # A misspelled name would otherwise be ignored and the column kept
        # without any hint to the user.
        print(f"Warning: columns_to_remove entries not found in '{split_group}': {unknown}")
    selected = [name for name in split_grp if name not in excluded]

    print(f"Splitting group '{split_group}' into {num_splits} files...")

    for i in range(num_splits):
        output_file = f"{output_prefix}_{i}.h5"
        start = i * rows_per_file

        with h5py.File(output_file, "w") as outfile:
            # 1. Copy all groups EXCEPT the one we split
            for grp_name in infile:
                if grp_name != split_group:
                    infile.copy(grp_name, outfile)

            # 2. Create split group, carrying over the attributes of the
            #    source group (the groups copied above keep theirs already).
            out_split_grp = outfile.create_group(split_group)
            for attr_name, attr_value in split_grp.attrs.items():
                out_split_grp.attrs[attr_name] = attr_value

            for dataset_name in selected:
                data = split_grp[dataset_name]
                # Clamp to this dataset's own length so the last chunk is short
                # rather than out of range.
                end = min(start + rows_per_file, data.shape[0])

                # Slice and write
                sliced_data = data[start:end]
                out_ds = out_split_grp.create_dataset(
                    dataset_name, data=sliced_data, compression="gzip"
                )
                # create_dataset(data=...) only transfers the values; copy the
                # dataset-level attributes explicitly so they are not lost.
                for attr_name, attr_value in data.attrs.items():
                    out_ds.attrs[attr_name] = attr_value

        print(f"Written: {output_file}")

print("✅ Done.")


FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = 'COMPAS_Output.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

## Alternative: split the file while keeping only a listed set of columns

In [1]:
import h5py
import numpy as np
import os

# === USER CONFIGURATIONS ===
# HDF5 file that is opened read-only and split.
input_file = "COMPAS_Output.h5"
# Output files are named "<output_prefix>_<index>.h5".
output_prefix = "COMPAS_Output"

# Top-level group whose datasets are sliced row-wise across the output files.
split_group = "BSE_Double_Compact_Objects"
# Number of rows per output file.
rows_per_file = 100_000

# Dataset names inside `split_group` that are written to the output; every
# other dataset of that group is skipped.
# <<< Replace with your desired columns
columns_to_keep = [
    "SEED",
    "Mass_1",
    "Mass_2",
]

# === SCRIPT START ===

def get_num_splits(group, rows_per_file):
    """Return how many chunks of ``rows_per_file`` rows are needed to cover ``group``.

    The row count is taken from the first dataset yielded by ``group.keys()``
    (its ``shape[0]``); the other datasets are not inspected.

    Args:
        group: Mapping-like object (e.g. an ``h5py.Group``) whose values expose
            a ``shape`` tuple.
        rows_per_file: Maximum number of rows per chunk; must be positive.

    Returns:
        The ceiling of ``total_rows / rows_per_file``, or ``0`` when ``group``
        contains no datasets.

    Raises:
        ValueError: If ``rows_per_file`` is zero or negative.
    """
    if rows_per_file <= 0:
        raise ValueError(
            f"rows_per_file must be a positive number, got {rows_per_file!r}"
        )

    # Take just the first key instead of materialising the whole key list.
    first_key = next(iter(group.keys()), None)
    if first_key is None:
        # Empty group: nothing to split, so no output files are needed.
        return 0

    total_rows = group[first_key].shape[0]
    # Integer ceiling division.
    return (total_rows + rows_per_file - 1) // rows_per_file

# Split `split_group` row-wise into several smaller HDF5 files.
#
# Every output file receives:
#   * a full copy of each top-level group other than `split_group`, and
#   * a `split_group` group holding rows
#     [i * rows_per_file, (i + 1) * rows_per_file) of only those datasets whose
#     name is listed in `columns_to_keep`.
with h5py.File(input_file, "r") as infile:
    if split_group not in infile:
        raise ValueError(f"Group '{split_group}' not found in {input_file}")

    split_grp = infile[split_group]
    num_splits = get_num_splits(split_grp, rows_per_file)

    # Resolve the column selection once; it is the same for every output file.
    wanted = set(columns_to_keep)
    missing = [name for name in columns_to_keep if name not in split_grp]
    if missing:
        # A requested column that does not exist would otherwise just be absent
        # from the output without any hint to the user.
        print(f"Warning: columns_to_keep entries not found in '{split_group}': {missing}")
    selected = [name for name in split_grp if name in wanted]

    print(f"Splitting group '{split_group}' into {num_splits} files...")

    for i in range(num_splits):
        output_file = f"{output_prefix}_{i}.h5"
        start = i * rows_per_file

        with h5py.File(output_file, "w") as outfile:
            # 1. Copy all groups EXCEPT the one we split
            for grp_name in infile:
                if grp_name != split_group:
                    infile.copy(grp_name, outfile)

            # 2. Create split group, carrying over the attributes of the
            #    source group (the groups copied above keep theirs already).
            out_split_grp = outfile.create_group(split_group)
            for attr_name, attr_value in split_grp.attrs.items():
                out_split_grp.attrs[attr_name] = attr_value

            for dataset_name in selected:
                data = split_grp[dataset_name]
                # Clamp to this dataset's own length so the last chunk is short
                # rather than out of range.
                end = min(start + rows_per_file, data.shape[0])

                # Slice and write
                sliced_data = data[start:end]
                out_ds = out_split_grp.create_dataset(
                    dataset_name, data=sliced_data, compression="gzip"
                )
                # create_dataset(data=...) only transfers the values; copy the
                # dataset-level attributes explicitly so they are not lost.
                for attr_name, attr_value in data.attrs.items():
                    out_ds.attrs[attr_name] = attr_value

        print(f"Written: {output_file}")

print("✅ Done.")


FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = 'COMPAS_Output.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)