In [None]:
import h5py
from itertools import repeat
from tqdm import tqdm
from glob import glob
import os


def merge_h5_files(input_paths, output_path):
    """Merge several HDF5 files into a single new file at ``output_path``.

    Each input file is read through its top-level ``data`` group, which is
    walked as ``data/<chemical group>/<reaction group>/<dataset>``.  The
    output file mirrors that layout, except that every reaction group is
    renamed to ``<prefix>_<reaction group>``, where ``<prefix>`` is the name
    of the directory that directly contains the input file.  This keeps
    reactions with the same name but from different inputs apart.

    Only dataset values are written to the output; HDF5 attributes are not
    copied.  The children of a reaction group are assumed to be datasets.

    Args:
        input_paths: Iterable of paths to the HDF5 files to read.  A path
            without a directory component yields an empty prefix.
        output_path: Path of the HDF5 file to create.  It is opened in
            ``'w'`` mode, so an existing file at that path is truncated.

    Raises:
        ValueError: If ``output_path`` resolves to one of ``input_paths``;
            truncating the output would otherwise destroy that input.
        KeyError: If an input file has no ``data`` group.

    Note:
        If two inputs share both the parent directory name and a reaction
        group name, their datasets collide and h5py raises when the second
        one is created.
    """
    # Materialise once: the paths are scanned by the guard and by the loop.
    input_paths = list(input_paths)

    # Refuse to truncate a file that is also about to be read.
    if isinstance(output_path, (str, bytes, os.PathLike)):
        output_real = os.path.realpath(output_path)
        if any(os.path.realpath(path) == output_real for path in input_paths):
            raise ValueError(
                f"output_path {output_path!r} is also listed in input_paths; "
                "merging would overwrite it"
            )

    with h5py.File(output_path, 'w') as h5file_out:
        data_group_out = h5file_out.require_group('data')

        for input_path in input_paths:
            print(input_path)
            # The prefix is the name of the directory holding the input
            # file (e.g. '/root/a/wb97x.h5' -> 'a'), not the file name.
            prefix = os.path.basename(os.path.dirname(input_path))

            with h5py.File(input_path, 'r') as h5file_in:
                for chem_group_name, chem_group in tqdm(h5file_in['data'].items(), desc="Formulas"):
                    # require_group opens the group, creating it on first use.
                    chem_group_out = data_group_out.require_group(chem_group_name)

                    for rxn_group_name, rxn_group in tqdm(chem_group.items(), desc=f"Rxns in {chem_group_name}", leave=False):
                        rxn_group_out = chem_group_out.require_group(f"{prefix}_{rxn_group_name}")

                        for dset_name, dset in rxn_group.items():
                            # dset[()] reads the full contents for any shape,
                            # including scalar datasets, which dset[:] rejects.
                            rxn_group_out.create_dataset(dset_name, data=dset[()])

# Gather every 'wb97x.h5' found at any depth below the dataset root.
input_paths = glob(
    '/home/mhyeok/transition2x_v3/**/wb97x.h5',
    recursive=True,
)
# The merged file is written relative to the current working directory.
output_path = 'c.h5'

# Combine all collected inputs into the single output file.
merge_h5_files(input_paths=input_paths, output_path=output_path)