In [None]:
import os
import sys
import time
import logging
import multiprocessing
import pandas as pd
import numpy as np
from tqdm import tqdm

# Set up logging: configure the root logger so that records at INFO level and
# above are written to "hdf5_processing_log.txt", one line per record in the
# form "<timestamp> - <level name> - <message>".
logging.basicConfig(
    filename="hdf5_processing_log.txt",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def process_and_save_hdf5(file_path, *, save=False):
    """Assign event numbers to the hits of one HDF5 file and attach them to its records.

    The file must contain the datasets ``hits`` and ``records``. Any existing
    ``event_no`` column is discarded from both. Each hit then gets a new
    ``event_no`` built from three parts:

    * the 100-unit time bin of the hit, counted from the earliest hit of the
      same ``record_id`` (first bin is 1);
    * ``record_id * 10**6``;
    * ``<numeric file stem> * 10**12`` (``data/0.h5`` -> 0).

    NOTE(review): the ids stay unique only while a record spans fewer than
    10**6 bins and ``record_id`` stays below 10**6 -- confirm against the data.

    ``records`` is left-merged on ``record_id`` with one row per event
    (``event_no`` plus the minimum ``type`` seen in that event), so a record
    with several events yields several rows.

    Args:
        file_path: Path to the HDF5 file; the file stem must parse as an integer.
        save: When ``False`` (default) the file is opened read-only and the
            computed frames are returned. When ``True`` the ``hits`` and
            ``records`` datasets inside the file are overwritten.

    Returns:
        * ``(hits, merged_records)`` when ``save`` is ``False`` and processing
          succeeded;
        * a ``"Processed ..."`` status string when ``save`` is ``True``;
        * a ``"Skipped ..."`` / ``"Error processing ..."`` string when a
          dataset is missing or any exception occurred. Exceptions are
          logged and reported this way rather than raised.
    """
    try:
        start_time = time.time()
        logging.info(f"Processing {file_path}")
        print(file_path)

        # splitext copes with stems that contain dots or have no extension,
        # where a bare name.split('.') would fail to unpack.
        stem, _ext = os.path.splitext(os.path.basename(file_path))
        dataset_offset = np.int64(stem) * 10**12  # Adds uniqueness across datasets

        # Only request write access when we are going to write: append mode
        # would silently create an empty file for a mistyped path.
        store_mode = 'a' if save else 'r'
        with pd.HDFStore(file_path, mode=store_mode) as store:
            if 'hits' not in store or 'records' not in store:
                logging.error(f"Skipping {file_path}: Missing 'hits' or 'records' dataset.")
                return f"Skipped {file_path}: Missing required datasets."

            logging.info('Successfull loading')
            # Read datasets, discarding any previously stored event numbers.
            hits = store['hits'].drop(columns=['event_no'], errors='ignore')
            records = store['records'].drop(columns=['event_no'], errors='ignore')

            # Time bin of each hit relative to the first hit of its record.
            first_hit_time = hits.groupby("record_id")["time"].transform('min')
            time_bin = ((hits["time"] - first_hit_time) // 100 + 1).astype(np.int64)
            hits["event_no"] = (
                time_bin
                + hits["record_id"].astype(np.int64) * 10**6  # Uniqueness within a dataset
                + dataset_offset
            )

            # One row per event: owning record and the smallest hit type in it.
            event_truth = hits[['record_id', 'event_no', 'type']].drop_duplicates()
            event_truth = event_truth.groupby('event_no', as_index=False).agg(
                {'record_id': 'min', 'type': 'min'}
            )

            # records no longer carries event_no, so the join key is record_id.
            merged_df = records.merge(event_truth, on='record_id', how='left', sort=False)

            if not save:
                return hits, merged_df

            logging.info('hit the save part of the function')
            # Overwrite the modified datasets inside the same HDF5 file.
            store.put('hits', hits, format='table', data_columns=True)
            store.put('records', merged_df, format='table', data_columns=True)

        duration = time.time() - start_time
        logging.info(f"Completed {file_path} in {duration:.2f} seconds")
        return f"Processed {file_path} in {duration:.2f} seconds"

    except Exception as e:
        # Best-effort by design: report the failure to the caller as a string
        # and keep the traceback in the log.
        logging.exception(f"Error processing {file_path}: {str(e)}")
        return f"Error processing {file_path}: {str(e)}"

def update_progress_bar(total_files, progress_queue):
    """Advance a tqdm bar by one step for each item taken from ``progress_queue``.

    Exactly ``total_files`` items are consumed; each ``get()`` blocks until a
    producer has put something on the queue. The item's value is ignored.
    """
    bar = tqdm(total=total_files, desc="Processing HDF5 Files")
    with bar:
        for _tick in range(total_files):
            # Block until some task reports completion, then step the bar.
            progress_queue.get()
            bar.update(1)

def process_files_parallel(files, num_workers=8):
    """Run ``process_and_save_hdf5`` over ``files`` in a worker pool with a progress bar.

    Each path is submitted as ``process_and_save_hdf5(file_path)``; the values
    the tasks return are discarded. A separate process runs
    ``update_progress_bar`` and is fed one queue item per finished task.

    Args:
        files: Sized iterable of HDF5 file paths.
        num_workers: Number of pool worker processes.
    """
    total_files = len(files)

    # The manager owns a server process; the context manager shuts it down
    # once the queue is no longer needed.
    with multiprocessing.Manager() as manager:
        progress_queue = manager.Queue()

        # Start progress bar in a separate process
        progress_process = multiprocessing.Process(
            target=update_progress_bar, args=(total_files, progress_queue)
        )
        progress_process.start()

        def update_progress(_):
            """Tick the progress bar once a task has finished."""
            progress_queue.put(1)

        try:
            with multiprocessing.Pool(processes=num_workers) as pool:
                for file_path in files:
                    pool.apply_async(
                        process_and_save_hdf5,
                        args=(file_path,),
                        callback=update_progress,
                        # A failed task must tick the bar too, otherwise the
                        # progress process waits for an item that never comes.
                        error_callback=update_progress,
                    )

                pool.close()
                pool.join()
        except BaseException:
            # The bar would block forever on tasks that will never report.
            progress_process.terminate()
            raise
        finally:
            progress_process.join()

    print("Processing complete!")



In [None]:
# process_and_save_hdf5 reports a skipped or failed file by returning a
# message string instead of raising; surface that message rather than letting
# the tuple unpacking fail with an unrelated error.
_result = process_and_save_hdf5('data/LargeTMerge/0.h5')
if isinstance(_result, str):
    raise RuntimeError(_result)
hits, records = _result

In [None]:
# Number of hits per event among hits whose type is not 2, ordered by event number.
hits.loc[hits['type'] != 2, 'event_no'].value_counts().sort_index()

In [None]:
def filter_events(hits, filter_value):
    """Return the rows of ``hits`` that belong to events passing a hit-count cut.

    For each ``event_no`` let ``total`` be its number of hits and ``other`` the
    number of those hits whose ``type`` is not 2. The event is kept when
    ``total <= filter_value * (other + 1)``.

    Args:
        hits: DataFrame with at least the columns ``event_no`` and ``type``.
        filter_value: Multiplier applied to ``other + 1``.

    Returns:
        The subset of ``hits`` (original index preserved) whose event is kept.
    """
    not_type2 = hits['type'] != 2

    # Per-event totals; sort=False skips the ordering by frequency.
    total_per_event = hits['event_no'].value_counts(sort=False)

    # Per-event counts of non-type-2 hits, aligned to the totals. Events with
    # no such hit get 0 instead of NaN.
    other_per_event = (
        hits.loc[not_type2, 'event_no']
        .value_counts(sort=False)
        .reindex(total_per_event.index, fill_value=0)
        .astype(int)
    )

    allowed_total = filter_value * (other_per_event.values + 1)
    keep_event = total_per_event.values <= allowed_total
    kept_event_ids = total_per_event.index[keep_event]

    row_selected = hits['event_no'].isin(kept_event_ids)
    return hits.loc[row_selected]



In [None]:
# Hits that survive the event filter at level 1.
a = filter_events(hits, filter_value=1)

In [None]:
# Distinct event numbers before (u) and after (v) filtering.
u = pd.unique(hits['event_no'])
v = pd.unique(a['event_no'])

In [None]:
# Event numbers present in `hits` but absent after filtering, in the order
# they appear in `u`. Membership is tested against a set: `i not in v` on an
# array scans the whole array for every element of `u`.
_kept_event_ids = set(v)
x = [i for i in u if i not in _kept_event_ids]

In [None]:
# Number of events present in `u` but not in `v`.
len(x)

In [None]:
# NOTE: `plot` is defined in the cell below; that cell has to be run first.
plot(hits)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot(hitss, filter_levels=range(5), bin_width=100, time_limits=(0, 5000)):
    """Show, per record, time histograms of type-2 hits at several filter levels.

    One figure is produced for every distinct ``record_id`` in ``hitss``. For
    each filter level the hits are reduced with ``filter_events(hitss, level)``
    and the ``time`` values of the remaining hits of that record with
    ``type == 2`` are histogrammed. Bin edges run from the smallest to the
    largest remaining hit time of the record in steps of ``bin_width``. A
    level that leaves the record without hits, or with too short a time span
    for a single bin, contributes no curve.

    Args:
        hitss: DataFrame with the columns ``record_id``, ``event_no``,
            ``type`` and ``time``.
        filter_levels: Values passed as ``filter_value`` to ``filter_events``.
        bin_width: Width of the histogram bins, in the units of ``time``.
        time_limits: ``(left, right)`` limits of the x axis.
    """
    # The filter result does not depend on the record, so compute each level
    # once instead of once per record.
    filtered_by_level = [(level, filter_events(hitss, level)) for level in filter_levels]

    for record in hitss['record_id'].unique():
        _fig, ax = plt.subplots(figsize=(8, 5))
        drew_curve = False

        for level, filtered in filtered_by_level:
            # Hits of this record that survived the filter.
            hit = filtered[filtered['record_id'] == record]
            if hit.empty:
                # min()/max() would be NaN and np.arange would raise.
                continue

            edges = np.arange(hit['time'].min(), hit['time'].max(), bin_width)
            if len(edges) < 2:
                # Fewer than two edges means there is no bin to fill.
                continue

            counts, edges = np.histogram(hit.loc[hit['type'] == 2, 'time'], bins=edges)
            centers = (edges[:-1] + edges[1:]) / 2  # Convert bin edges to bin centers
            # +1 keeps empty bins visible on the logarithmic y axis.
            ax.plot(centers, counts + 1, label=f'Type : hits filter level{level}')
            drew_curve = True

        # Labeling and legend
        ax.set_xlabel("Time")
        ax.set_ylabel("Frequency")
        ax.set_title(f"Hit histogram for Record ID {record}")
        if drew_curve:
            ax.legend()
        ax.set_xlim(*time_limits)
        ax.set_yscale('log')

        plt.show()

# Example usage:
#plot(hits)  # This will display the plot each time the function is called
