In [4]:
import os
import h5py
import multiprocessing
from tqdm import tqdm

# Directory whose immediate entries ending in ".h5" are checked.
directory = "/raven/ptmp/arego/temp/"
# Field name looked up in the dtype of each checked dataset.
column_to_check = "event_no"
# HDF5 paths that must exist in a file; a file missing any of them is deleted.
datasets_to_check = ["/hits", "/records"]

def check_and_delete_hdf5(file_path):
    """Delete an HDF5 file that lacks a required dataset or column.

    Every path in the module-level ``datasets_to_check`` must exist in the
    file. When such an entry is an ``h5py.Dataset``, its dtype must contain a
    field named ``column_to_check``. Entries that exist but are not datasets
    (e.g. groups) are skipped and do not count against the file.

    Args:
        file_path: Path of the HDF5 file to inspect.

    Returns:
        ``file_path`` if the file was deleted, otherwise ``None`` (the file
        passed the check, or an error occurred while processing it).
    """
    try:
        should_delete = False
        with h5py.File(file_path, "r") as f:
            for dataset in datasets_to_check:
                if dataset not in f:
                    # A required dataset is absent altogether.
                    should_delete = True
                    break

                data = f[dataset]
                if not isinstance(data, h5py.Dataset):
                    print('skip')
                    continue  # Groups have no dtype to inspect.

                field_names = data.dtype.names
                if field_names is None:
                    # Non-compound dataset: there are no named fields to
                    # check. Deletion is irreversible, so keep the file and
                    # report it instead of guessing.
                    print(
                        f"❌ Error processing {file_path}: "
                        f"dataset {dataset} has no named fields"
                    )
                    return None

                # dtype.names holds str values, so compare against the str
                # name directly; an encoded (bytes) name never matches and
                # would flag every file for deletion.
                if column_to_check not in field_names:
                    should_delete = True
                    break

        if not should_delete:
            return None  # File is valid

        # Remove only after the handle is closed: deleting a file that is
        # still open fails on some platforms.
        os.remove(file_path)
        return file_path
    except Exception as e:
        # Broad on purpose: this runs in a pool worker, and one unreadable
        # file must not abort the whole batch.
        print(f"❌ Error processing {file_path}: {e}")
        return None

def process_files_parallel(files, processes=8):
    """Check HDF5 files in a worker pool while showing a progress bar.

    Each path is passed to ``check_and_delete_hdf5``; results are consumed in
    completion order and every deleted file is reported.

    Args:
        files: Sequence of HDF5 file paths to check (``len()`` is used for
            the progress bar total).
        processes: Upper bound on the number of worker processes. Defaults
            to 8, the value previously hard-coded here.
    """
    # Never start more workers than there are files, but always at least one
    # because the pool rejects a size of zero.
    worker_count = max(1, min(processes, len(files)))

    with multiprocessing.Pool(processes=worker_count) as pool:
        with tqdm(total=len(files), desc="Checking HDF5 Files") as pbar:
            for deleted_file in pool.imap_unordered(check_and_delete_hdf5, files):
                if deleted_file:
                    # tqdm.write prints without corrupting the active bar.
                    tqdm.write(f"🗑️ Deleted: {deleted_file}")
                pbar.update(1)

if __name__ == "__main__":
    # Full paths of all entries directly inside `directory` that end in ".h5".
    hdf_files = [
        os.path.join(directory, entry_name)
        for entry_name in os.listdir(directory)
        if entry_name.endswith(".h5")
    ]

    if hdf_files:
        print(f"🔍 Checking {len(hdf_files)} HDF5 files in parallel...")
        process_files_parallel(hdf_files)
        print("✅ Processing complete!")
    else:
        # Nothing to do: report it and exit without starting a pool.
        print("No HDF5 files found.")


🔍 Checking 1 HDF5 files in parallel...


Checking HDF5 Files: 100%|██████████| 1/1 [00:00<00:00, 91.87it/s]

✅ Processing complete!



