In [None]:
import sys

log = open("output_tml_dataset.log", "w")
sys.stdout = sys.stderr = log

In [32]:
import os
import pandas as pd
import glob
import re
from tqdm.notebook import tqdm
import numpy as np

# Add nhits and weight

In [33]:
# Function to process one event
def process_event(base_folder, event_id, output_folder):
    # Compose file paths
    digitization_folder = os.path.join(base_folder, 'digitization')
    
    measurements_xyz_file = os.path.join(digitization_folder, f"event{event_id}-measurements_global_xyz.csv")
    particles_file = os.path.join(base_folder, f"event{event_id}-particles.csv")
    fatras_hits_file = os.path.join(base_folder, f"event{event_id}-fatras_hits.csv")
    
    # Check files exist
    if not (os.path.isfile(measurements_xyz_file) and os.path.isfile(particles_file) and os.path.isfile(fatras_hits_file)):
        print(f"Skipping event {event_id}: required files missing.")
        return

    try:
        # === HITS ===
        measurements_xyz = pd.read_csv(measurements_xyz_file)
        hits = pd.DataFrame({
            'hit_id': range(1, len(measurements_xyz) + 1),
            'x': measurements_xyz['global_x'],
            'y': measurements_xyz['global_y'],
            'z': measurements_xyz['global_z'],
            'volume_id': measurements_xyz['volume'],
            'layer_id': measurements_xyz['layer'],
            'module_id': measurements_xyz['module']
        })
        hits.to_csv(os.path.join(output_folder, f"event{event_id}-hits.csv"), index=False)
    
        # === PARTICLES ===
        particles = pd.read_csv(particles_file)
        particles_output = pd.DataFrame({
            'particle_id': particles['particle_id'],
            'vx': particles['vx'],
            'vy': particles['vy'],
            'vz': particles['vz'],
            'px': particles['px'],
            'py': particles['py'],
            'pz': particles['pz'],
            'q': particles['q']
        })
    
        # Calculate pT
        particles_output["pT"] = np.sqrt(particles_output["px"]**2 + particles_output["py"]**2)
    
        # Define pT-dependent weight function
        def w_pT(pt):
            if pt < 0.5:
                return 0.2
            elif pt < 3:
                return 0.2 + 0.8 * (pt - 0.5) / (3 - 0.5)
            else:
                return 1.0
    
        particles_output["w_pT"] = particles_output["pT"].apply(w_pT)
    
        # === TRUTH ===
        fatras_hits = pd.read_csv(fatras_hits_file)
        truth_output = pd.DataFrame({
            'hit_id': range(1, len(measurements_xyz) + 1),
            'particle_id': fatras_hits['particle_id'],
            'tx': fatras_hits['tx'],
            'ty': fatras_hits['ty'],
            'tz': fatras_hits['tz'],
            'tpx': fatras_hits['tpx'],
            'tpy': fatras_hits['tpy'],
            'tpz': fatras_hits['tpz']
        })
    
        # Calculate nhits
        nhits_per_particle = (
            truth_output.groupby('particle_id').size().reset_index(name='nhits')
        )
    
        # Merge nhits into particles
        particles_output = pd.merge(
            particles_output,
            nhits_per_particle,
            on='particle_id',
            how='left'
        )
    
        particles_output['nhits'] = particles_output['nhits'].fillna(0).astype(int)
    
        # Calculate weights for truth
        merged = truth_output.merge(hits, on='hit_id')
        merged = merged.merge(particles_output[['particle_id', 'pT', 'w_pT']], on='particle_id', how='left')
        merged['r'] = np.sqrt(merged['x']**2 + merged['y']**2 + merged['z']**2)
        merged.sort_values(by=['particle_id', 'r'], inplace=True)
    
        weights_order = []
        for pid, group in merged.groupby('particle_id'):
            n_hits = len(group)
            for i, row in enumerate(group.itertuples()):
                if i == 0 or i == n_hits - 1:
                    weights_order.append(1.0)
                elif i in [1, n_hits - 2]:
                    weights_order.append(0.7)
                else:
                    weights_order.append(0.4)
    
        merged['w_order'] = weights_order
        merged['w_total'] = merged['w_order'] * merged['w_pT']
        normalization_factor = merged['w_total'].sum()
        merged['weight'] = merged['w_total'] / normalization_factor
    
        # Save updated files
        particles_output.to_csv(os.path.join(output_folder, f"event{event_id}-particles.csv"), index=False)
        merged[['hit_id', 'particle_id', 'tx', 'ty', 'tz', 'tpx', 'tpy', 'tpz', 'weight']].to_csv(
            os.path.join(output_folder, f"event{event_id}-truth.csv"), index=False
        )
    
        # print(f"✅ Processed event {event_id}")

    except Exception as e:
        print(f"❌ Error processing event {event_id}: {e}")

# Just merge files

In [31]:
# # Function to process one event
# def process_event(base_folder, event_id, output_folder):
#     # Compose file paths
#     digitization_folder = os.path.join(base_folder, 'digitization')
    
#     measurements_xyz_file = os.path.join(digitization_folder, f"event{event_id}-measurements_global_xyz.csv")
#     particles_file = os.path.join(base_folder, f"event{event_id}-particles.csv")
#     fatras_hits_file = os.path.join(base_folder, f"event{event_id}-fatras_hits.csv")
    
#     # Check files exist
#     if not (os.path.isfile(measurements_xyz_file) and os.path.isfile(particles_file) and os.path.isfile(fatras_hits_file)):
#         print(f"Skipping event {event_id}: required files missing.")
#         return

#     # Read data
#     measurements_xyz = pd.read_csv(measurements_xyz_file)
#     particles = pd.read_csv(particles_file)
#     fatras_hits = pd.read_csv(fatras_hits_file)

#     error_files = []
    
#     try:
#         # === HITS ===
#         hits = pd.DataFrame({
#             'hit_id': range(1, len(measurements_xyz) + 1),
#             'x': measurements_xyz['global_x'],
#             'y': measurements_xyz['global_y'],
#             'z': measurements_xyz['global_z'],
#             'volume_id': measurements_xyz['volume'],
#             'layer_id': measurements_xyz['layer'],
#             'module_id': measurements_xyz['module']
#         })
#         hits.to_csv(os.path.join(output_folder, f"event{event_id}-hits.csv"), index=False)
    
#         # === PARTICLES ===
#         particles_output = pd.DataFrame({
#             'particle_id': particles['particle_id'],
#             'vx': particles['vx'],
#             'vy': particles['vy'],
#             'vz': particles['vz'],
#             'px': particles['px'],
#             'py': particles['py'],
#             'pz': particles['pz'],
#             'q': particles['q']
#         })
#         particles_output.to_csv(os.path.join(output_folder, f"event{event_id}-particles.csv"), index=False)
    
#         # === TRUTH ===
#         truth_output = pd.DataFrame({
#             'hit_id': range(1, len(measurements_xyz) + 1),
#             'particle_id': fatras_hits['particle_id'],
#             'tx': fatras_hits['tx'],
#             'ty': fatras_hits['ty'],
#             'tz': fatras_hits['tz'],
#             'tpx': fatras_hits['tpx'],
#             'tpy': fatras_hits['tpy'],
#             'tpz': fatras_hits['tpz']
#         })
        
#         truth_output.to_csv(os.path.join(output_folder, f"event{event_id}-truth.csv"), index=False)

#     except:
#         print(event_id)

# Run processing ALL SIMPLE

In [27]:
# Base folders
base_folders = ['data/ttbar_H_production_p50/csv/background', 'data/ttbar_H_production_p50/csv/signal']

In [28]:
# Process all events in each base folder
for base_folder in base_folders:
    digitization_folder = os.path.join(base_folder, 'digitization')
    output_folder = os.path.join(base_folder, 'tml_dataset')
    os.makedirs(output_folder, exist_ok=True)

    # List all events by finding files ending with -measurements_global_xyz.csv
    measurements_files = glob.glob(os.path.join(digitization_folder, '*-measurements_global_xyz.csv'))

    for measurements_file in tqdm(measurements_files):
        # Extract event ID using regex
        match = re.search(r'event(\d+)-measurements_global_xyz.csv', os.path.basename(measurements_file))
        if match:
            event_id = match.group(1)
            process_event(base_folder, event_id, output_folder)
        else:
            print(f"Skipping file {measurements_file}: could not extract event ID")

print("Processing complete!")


  0%|          | 0/20000 [00:00<?, ?it/s]

✅ Processed event 000008690
✅ Processed event 000006952
✅ Processed event 000004561
✅ Processed event 000004270
✅ Processed event 000007111
✅ Processed event 000018490
✅ Processed event 000001438
✅ Processed event 000008342
✅ Processed event 000003781
✅ Processed event 000017480
✅ Processed event 000001711
✅ Processed event 000008895
✅ Processed event 000011863
✅ Processed event 000012041
✅ Processed event 000019976
✅ Processed event 000011106
✅ Processed event 000009363
✅ Processed event 000010814
✅ Processed event 000014125
✅ Processed event 000013905
✅ Processed event 000004104
✅ Processed event 000006264
✅ Processed event 000005976
✅ Processed event 000006572
✅ Processed event 000008783
✅ Processed event 000017442
✅ Processed event 000004895
✅ Processed event 000008115
✅ Processed event 000011525
✅ Processed event 000015968
✅ Processed event 000009956
✅ Processed event 000018172
✅ Processed event 000010411
✅ Processed event 000008220
✅ Processed event 000014054
✅ Processed event 00

KeyboardInterrupt: 

# Run Processing Start from Last file

In [36]:
# Base folders
base_folders = ['data/ttbar_H_production_p50/csv/background', 'data/ttbar_H_production_p50/csv/signal']

In [37]:
# Collect all events (for each base folder)
event_lists = {}
for base_folder in base_folders:
    digitization_folder = os.path.join(base_folder, 'digitization')
    output_folder = os.path.join(base_folder, 'tml_dataset')
    os.makedirs(output_folder, exist_ok=True)

    measurements_files = glob.glob(os.path.join(digitization_folder, '*-measurements_global_xyz.csv'))
    event_ids = []
    for measurements_file in measurements_files:
        match = re.search(r'event(\d+)-measurements_global_xyz.csv', os.path.basename(measurements_file))
        if match:
            event_id = match.group(1)
            output_file = os.path.join(output_folder, f"event{event_id}-hits.csv")
            if not os.path.exists(output_file):
                event_ids.append(event_id)
    event_ids.sort(key=lambda x: int(x))
    event_lists[base_folder] = event_ids

# Interleave signal and background events
bg_events = event_lists[base_folders[0]]
sig_events = event_lists[base_folders[1]]
max_len = max(len(bg_events), len(sig_events))

interleaved_events = []
for i in range(max_len):
    if i < len(bg_events):
        interleaved_events.append((base_folders[0], bg_events[i]))
    if i < len(sig_events):
        interleaved_events.append((base_folders[1], sig_events[i]))

# Process interleaved events
for base_folder, event_id in tqdm(interleaved_events):
    output_folder = os.path.join(base_folder, 'tml_dataset')
    process_event(base_folder, event_id, output_folder)

print("Processing complete!")

  0%|          | 0/40000 [00:00<?, ?it/s]

❌ Error processing event 000000005: array length 27632 does not match index length 27633
❌ Error processing event 000000020: array length 26170 does not match index length 26171
❌ Error processing event 000000027: array length 24411 does not match index length 24412
❌ Error processing event 000000030: array length 23275 does not match index length 23276
❌ Error processing event 000000030: array length 26757 does not match index length 26758
❌ Error processing event 000000031: array length 28018 does not match index length 28019
❌ Error processing event 000000034: array length 24135 does not match index length 24136
❌ Error processing event 000000054: array length 31664 does not match index length 31665
❌ Error processing event 000000078: array length 27183 does not match index length 27184
❌ Error processing event 000000093: array length 23388 does not match index length 23389
❌ Error processing event 000000103: array length 19584 does not match index length 19585
❌ Error processing ev

KeyboardInterrupt: 