In [None]:
import os
import h5py
import multiprocessing
from tqdm import tqdm

# Folder whose ".h5" entries are collected for checking.
directory = "/viper/ptmp/arego/LBC/"

# Field name that every inspected table must contain.
column_to_check = "event_no"

# HDF5 paths of the tables inspected in each file.
datasets_to_check = [
    "/hits/table",
    "/records/table",
]

def check_and_delete_hdf5(file_path, dry_run=True):
    """Check that one HDF5 file has the required column in every required table.

    Each path in ``datasets_to_check`` must exist in the file. When the object
    at that path is an ``h5py.Dataset`` its dtype must have a field named
    ``column_to_check``; objects that are not datasets are skipped. The first
    failed requirement flags the file and ends the check.

    Args:
        file_path: Path of the HDF5 file to inspect.
        dry_run: When true (the default) a flagged file is only reported and
            stays on disk. Pass ``False`` (e.g. through ``functools.partial``
            when mapping over a pool) to remove flagged files.

    Returns:
        ``(flagged_path, logs)``. ``flagged_path`` is ``file_path`` when the
        file failed a check, and ``None`` when it passed or could not be
        processed. ``logs`` is the list of messages produced for this file.
        Note that the messages say "deleting file" even in a dry run.
    """
    logs = []  # Collected here and returned, so worker output is not interleaved
    try:
        flagged = False
        with h5py.File(file_path, "r") as f:
            for dataset in datasets_to_check:
                if dataset not in f:
                    logs.append(f"❌ Dataset {dataset} missing in {file_path}, deleting file.")
                    flagged = True
                    break

                data = f[dataset]
                if not isinstance(data, h5py.Dataset):  # e.g. a group at this path
                    logs.append(f"Skipping {dataset} in {file_path}: Not a dataset")
                    continue

                logs.append(f"Checking {dataset} in {file_path}: {data.dtype.names}")

                # dtype.names is None for a non-structured dtype; treat that as
                # "no fields" so the file is flagged instead of raising TypeError.
                field_names = data.dtype.names or ()
                if column_to_check not in map(str, field_names):
                    logs.append(f"❌ Column '{column_to_check}' not found in {dataset}, deleting file.")
                    flagged = True
                    break

        if flagged:
            # The HDF5 handle is closed at this point, so removal is safe.
            if not dry_run:
                os.remove(file_path)
            return file_path, logs

        logs.append(f"✅ File {file_path} is good")
        return None, logs  # File is valid

    except Exception as e:
        # Best effort: an unreadable file is reported and left untouched.
        logs.append(f"❌ Error processing {file_path}: {e}")
        return None, logs

def process_files_parallel(files):
    """Run the file check over ``files`` with 8 worker processes.

    Messages returned by the workers are gathered in completion order and
    printed in one go after every file has been handled; a progress bar
    advances once per finished file.
    """
    collected = []

    with multiprocessing.Pool(processes=8) as pool, tqdm(
        total=len(files), desc="Checking HDF5 Files"
    ) as progress:
        results = pool.imap_unordered(check_and_delete_hdf5, files)
        for flagged_path, file_logs in results:
            collected += file_logs
            if flagged_path:
                collected.append(f"🗑️ Deleted: {flagged_path}")
            progress.update(1)

    # Emit everything at once so the progress bar is not broken up
    print("\n".join(collected))

if __name__ == "__main__":
    # Full paths of every entry in `directory` whose name ends in ".h5".
    hdf_files = [
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith(".h5")
    ]

    if hdf_files:
        print(f"🔍 Checking {len(hdf_files)} HDF5 files in parallel...")
        process_files_parallel(hdf_files)
        print("✅ Processing complete!")
    else:
        print("No HDF5 files found.")



🔍 Checking 5381 HDF5 files in parallel...


Checking HDF5 Files: 100%|██████████| 5381/5381 [01:10<00:00, 76.72it/s] 

Checking /hits/table in /viper/ptmp/arego/LBC/3635.h5: ('index', 'time', 'record_id', 'string_id', 'module_id', 'pmt_id', 'event_no')
Checking /records/table in /viper/ptmp/arego/LBC/3635.h5: ('index', 'record_id', 'time', 'duration', 'location_x', 'location_y', 'location_z', 'orientation_x', 'orientation_y', 'orientation_z', 'energy', 'particle_id', 'length', 'event_no', 'type')
✅ File /viper/ptmp/arego/LBC/3635.h5 is good
Checking /hits/table in /viper/ptmp/arego/LBC/5327.h5: ('index', 'time', 'record_id', 'string_id', 'module_id', 'pmt_id', 'event_no')
Checking /records/table in /viper/ptmp/arego/LBC/5327.h5: ('index', 'record_id', 'time', 'duration', 'location_x', 'location_y', 'location_z', 'orientation_x', 'orientation_y', 'orientation_z', 'energy', 'particle_id', 'length', 'event_no', 'type')
✅ File /viper/ptmp/arego/LBC/5327.h5 is good
Checking /hits/table in /viper/ptmp/arego/LBC/2295.h5: ('index', 'time', 'record_id', 'string_id', 'module_id', 'pmt_id', 'event_no')
Checking /




In [6]:
# Number of entries in `directory` whose name ends in ".h5".
sum(1 for name in os.listdir(directory) if name.endswith(".h5"))

2833

In [3]:
import os
import h5py
import multiprocessing
from tqdm import tqdm

# Folder whose ".h5" entries are collected for checking.
directory = "/viper/ptmp/arego/LBC/"

# Field name that every inspected table must contain.
column_to_check = "event_no"

# HDF5 paths of the tables inspected in each file.
datasets_to_check = [
    "/hits/table",
    "/records/table",
]

def check_and_delete_hdf5(file_path):
    """Validate one HDF5 file and delete it when a required table or column is missing.

    Each path in ``datasets_to_check`` must exist in the file. When the object
    at that path is an ``h5py.Dataset`` its dtype must have a field named
    ``column_to_check``; objects that are not datasets are skipped. The first
    failed requirement ends the check and the file is removed from disk.

    Args:
        file_path: Path of the HDF5 file to inspect.

    Returns:
        ``(deleted_path, logs, kept)``. ``deleted_path`` is ``file_path`` when
        the file was deleted and ``None`` otherwise. ``logs`` is the list of
        messages produced for this file. ``kept`` is ``False`` only when the
        file was deleted; files that raised an error are reported as kept.
    """
    logs = []  # Collected here and returned, so worker output is not interleaved
    try:
        invalid = False
        with h5py.File(file_path, "r") as f:
            for dataset in datasets_to_check:
                if dataset not in f:
                    logs.append(f"Dataset {dataset} missing in {file_path}, deleting file.")
                    invalid = True
                    break

                data = f[dataset]
                if not isinstance(data, h5py.Dataset):  # e.g. a group at this path
                    logs.append(f"Skipping {dataset} in {file_path}: Not a dataset")
                    continue

                # dtype.names is None for a non-structured dtype; treat that as
                # "no fields" so the file is flagged instead of raising TypeError.
                field_names = data.dtype.names or ()
                if column_to_check not in map(str, field_names):
                    logs.append(f"Column '{column_to_check}' not found in {dataset}, deleting file.")
                    invalid = True
                    break

        if invalid:
            # Remove only after the handle is closed: deleting an open file
            # fails on Windows and leaves placeholder files on NFS-like mounts.
            os.remove(file_path)
            return file_path, logs, False  # File deleted

        logs.append(f"File {file_path} is good")
        return None, logs, True  # File kept

    except Exception as e:
        # Best effort: a file that cannot be processed is reported and kept.
        logs.append(f"Error processing {file_path}: {e}")
        return None, logs, True  # Assume file is kept to avoid miscount

def process_files_parallel(files, processes=8):
    """Check HDF5 files in a worker pool, then print all logs and a summary.

    Args:
        files: Paths handed one at a time to ``check_and_delete_hdf5``.
        processes: Size of the worker pool. Defaults to 8, the value that was
            previously hard-coded; ``None`` lets ``multiprocessing.Pool``
            choose the size itself.

    Prints every collected log line followed by the number of files deleted
    and kept. A file counts as deleted when the worker returned its path;
    everything else, including files that raised an error, counts as kept.
    """
    logs_collection = []
    deleted_count = 0
    kept_count = 0

    with multiprocessing.Pool(processes=processes) as pool:
        with tqdm(total=len(files), desc="Checking HDF5 Files") as pbar:
            # Results arrive in completion order, not in the order of `files`.
            # The third tuple element duplicates what `deleted_file` already
            # tells us, so it is intentionally ignored.
            for deleted_file, logs, _kept in pool.imap_unordered(check_and_delete_hdf5, files):
                logs_collection.extend(logs)
                if deleted_file:
                    logs_collection.append(f"🗑️ Deleted: {deleted_file}")
                    deleted_count += 1
                else:
                    kept_count += 1
                pbar.update(1)

    # Print all logs at the end so they do not break up the progress bar
    print("\n".join(logs_collection))
    print(f"\n✅ Summary: {deleted_count} files deleted, {kept_count} files kept.")

if __name__ == "__main__":
    # Full paths of every entry in `directory` whose name ends in ".h5".
    hdf_files = [
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith(".h5")
    ]

    if hdf_files:
        print(f"🔍 Checking {len(hdf_files)} HDF5 files in parallel...")
        process_files_parallel(hdf_files)
        print("✅ Processing complete!")
    else:
        print("No HDF5 files found.")


🔍 Checking 5381 HDF5 files in parallel...


Checking HDF5 Files: 100%|██████████| 5381/5381 [00:55<00:00, 97.46it/s] 

File /viper/ptmp/arego/LBC/3635.h5 is good
File /viper/ptmp/arego/LBC/1151.h5 is good
File /viper/ptmp/arego/LBC/100.h5 is good
File /viper/ptmp/arego/LBC/822.h5 is good
File /viper/ptmp/arego/LBC/5190.h5 is good
File /viper/ptmp/arego/LBC/5327.h5 is good
File /viper/ptmp/arego/LBC/2295.h5 is good
File /viper/ptmp/arego/LBC/2584.h5 is good
File /viper/ptmp/arego/LBC/1654.h5 is good
File /viper/ptmp/arego/LBC/690.h5 is good
File /viper/ptmp/arego/LBC/1286.h5 is good
File /viper/ptmp/arego/LBC/4029.h5 is good
File /viper/ptmp/arego/LBC/1943.h5 is good
File /viper/ptmp/arego/LBC/632.h5 is good
File /viper/ptmp/arego/LBC/409.h5 is good
File /viper/ptmp/arego/LBC/3267.h5 is good
File /viper/ptmp/arego/LBC/467.h5 is good
File /viper/ptmp/arego/LBC/2202.h5 is good
File /viper/ptmp/arego/LBC/4318.h5 is good
File /viper/ptmp/arego/LBC/2337.h5 is good
File /viper/ptmp/arego/LBC/2992.h5 is good
File /viper/ptmp/arego/LBC/4181.h5 is good
File /viper/ptmp/arego/LBC/3251.h5 is good
File /viper/ptmp/


