In [None]:
!pip install hdf5plugin

In [2]:
import numpy as np
import pandas as pd 
import torch 
import torch.nn as nn
from torch.utils.data import Dataset , DataLoader , TensorDataset
from torch.utils.data import random_split
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import h5py
import os
import hdf5plugin
from sklearn.preprocessing import StandardScaler
import gc
from tqdm import tqdm

In [3]:
# Seed every random number generator the notebook draws from, so that sampling,
# shuffling and weight initialisation repeat across kernel restarts.
SEED = 50
torch.manual_seed(SEED)
np.random.seed(SEED)  # legacy global NumPy RNG, used by np.random.* calls
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


## **Inspecting Hierarchy Of Data**

In [None]:
file_path = '/kaggle/input/lhc-olympics-2020-ad-r-and-d/events_anomalydetection_v2.h5'
group_key = 'df'

print("--- Inspecting HDF5 File Structure ---")
with h5py.File(file_path, 'r') as h5_file:
    node = h5_file[group_key]
    # Only a group has members to enumerate; any other object is skipped silently.
    if isinstance(node, h5py.Group):
        member_names = list(node.keys())
        print(f"Keys found inside group '{group_key}': {member_names}")

        # Report the shape of every member stored directly under the group.
        for member_name in member_names:
            print(f"  -> Dataset '{member_name}' has shape: {node[member_name].shape}")

print("--- Inspection Complete ---")

## **Creating Jet Images**

In [4]:
# --- Configuration for the raw-events -> jet-image conversion ---
file_path = '/kaggle/input/lhc-olympics-2020-ad-r-and-d/events_anomalydetection_v2.h5'
dataset_path = 'df/block0_values'
output_path = 'processed_jet_images.h5'
chunk_size = 50000  # number of events read from disk per iteration
image_shape = (50, 50)
image_bins = [50, 50]  # histogram bins along (eta, phi)
image_range = [[-5, 5], [-np.pi, np.pi]]  # (eta, phi) extent covered by the histogram

with h5py.File(file_path, 'r') as f_in:
    dataset = f_in[dataset_path]
    num_events = dataset.shape[0]

    with h5py.File(output_path, 'w') as f_out:
        # Both output datasets start empty and are grown by one chunk per iteration,
        # so only a single chunk of events is ever held in memory.
        image_dataset = f_out.create_dataset(
            'jet_images',
            shape=(0, image_shape[0], image_shape[1]),
            maxshape=(None, image_shape[0], image_shape[1]),
            dtype='float32',
            chunks=True
        )
        label_dataset = f_out.create_dataset(
            'labels',
            shape=(0,),
            maxshape=(None,),
            dtype='int8',
            chunks=True
        )

        for i in range(0, num_events, chunk_size):
            start = i
            end = min(i + chunk_size, num_events)
            print(f"-> Processing events {start} to {end-1}...")

            raw_chunk_data = dataset[start:end]

            # The last column is taken as the label; the remaining columns are
            # viewed as 700 triplets per event, unpacked below as (pt, eta, phi).
            chunk_images = []
            X_chunk = raw_chunk_data[:, :-1]
            y_chunk = raw_chunk_data[:, -1]
            X_chunk_reshaped = X_chunk.reshape(-1, 700, 3)

            for event in X_chunk_reshaped:
                pt, eta, phi = event[:, 0], event[:, 1], event[:, 2]
                # pT-weighted 2D histogram over (eta, phi): each pixel is the summed pT.
                jet_image, _, _ = np.histogram2d(
                    eta, phi,
                    bins = image_bins,
                    range = image_range,
                    weights = pt
                )
                # histogram2d puts its first argument (eta) on axis 0; transpose so
                # rows follow phi and columns follow eta.
                chunk_images.append(jet_image.T)

            chunk_images_np = np.array(chunk_images, dtype='float32')
            # Fixed: this previously read the undefined name `image_dset`, which
            # raised NameError on the first chunk.
            current_size = image_dataset.shape[0]
            new_size = current_size + len(chunk_images_np)

            # Grow each dataset, then write the new chunk into the freshly added tail.
            image_dataset.resize(new_size, axis=0)
            image_dataset[current_size:] = chunk_images_np

            label_dataset.resize(new_size, axis=0)
            label_dataset[current_size:] = y_chunk

            # Drop the per-chunk arrays before the next read to keep peak memory low.
            del raw_chunk_data, chunk_images, chunk_images_np, X_chunk, y_chunk, X_chunk_reshaped
            gc.collect()

print(f"\n✅ Processing complete. All data saved to '{output_path}'.")

Processing data and saving progressively to 'processed_jet_images.h5'...
-> Processing events 0 to 49999...


KeyboardInterrupt: 

## Dataloader creation (with memory constraint --> slow)

In [4]:
class H5JetDataset(Dataset):
    """Map-style dataset that reads one jet image at a time from an HDF5 file.

    Args:
        h5_path: Path of the HDF5 file containing 'jet_images' (and 'labels').
        indices: Sequence of row numbers in the file that make up this dataset.
        max_pixel_value: Divisor applied to every image that is returned.
        is_train: If True, items are 1-tuples ``(image,)``; otherwise the
            label is read too and items are ``(image, label)``.
    """

    def __init__(self, h5_path, indices, max_pixel_value, is_train=True):
        self.h5_path, self.indices = h5_path, indices
        self.max_pixel_value, self.is_train = max_pixel_value, is_train

    def __len__(self):
        """Number of rows selected by ``indices``."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Return the scaled image (and, outside training mode, its label) at ``idx``."""
        # The file is opened and closed on every lookup; no handle is kept on the instance.
        with h5py.File(self.h5_path, 'r') as h5_file:
            row = self.indices[idx]
            pixels = h5_file['jet_images'][row]
            # Prepend a length-1 axis, then divide by the normalisation constant.
            scaled = np.expand_dims(pixels, axis=0) / self.max_pixel_value
            image_tensor = torch.from_numpy(scaled).float()
            if not self.is_train:
                raw_label = h5_file['labels'][row]
                return image_tensor, torch.tensor(raw_label).float()
            return (image_tensor,)

In [4]:
h5_path = '/kaggle/input/output/processed_jet_images.h5'

# Dedicated, seeded generator so the normalisation sample and the validation
# shuffle are identical on every run (same seed as the train/val split below).
rng = np.random.default_rng(42)

with h5py.File(h5_path, 'r') as f:
    labels = f['labels'][:]
    all_indices = np.arange(len(labels))
    background_indices = all_indices[labels == 0]
    signal_indices = all_indices[labels == 1]
    # Estimate the normalisation constant from a random background subset rather
    # than the full image dataset; never request more rows than exist.
    sample_size = min(50000, len(background_indices))
    sample_indices = rng.choice(background_indices, sample_size, replace=False)
    # h5py point selection expects the index list in increasing order.
    sorted_sample_indices = np.sort(sample_indices)
    max_pixel_value = f['jet_images'][sorted_sample_indices].max()

print(f"Found {len(background_indices)} background and {len(signal_indices)} signal events.")
print(f"Data will be normalized by max pixel value: {max_pixel_value:.2f}")

# Training uses background only; 20% of the background is held out for validation.
bg_train_indices, bg_val_indices = train_test_split(
    background_indices,
    train_size=0.8,
    random_state=42
)

# Validation set = held-out background + every signal event, in shuffled order.
val_indices = np.concatenate([bg_val_indices, signal_indices])
rng.shuffle(val_indices)

train_dataset = H5JetDataset(h5_path,
                             indices=bg_train_indices,
                             max_pixel_value=max_pixel_value,
                             is_train=True)

val_dataset = H5JetDataset(h5_path,
                           indices=val_indices,
                           max_pixel_value=max_pixel_value,
                           is_train=False)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)

print("\n--- Memory-Efficient DataLoaders Ready ---")
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(val_dataset)}")
print(f"Number of batches in train_loader: {len(train_loader)}")

Scanning HDF5 file to get indices and stats...
Found 1000000 background and 100000 signal events.
Data will be normalized by max pixel value: 3672.09


'# --- 3. Create Datasets and DataLoaders ---\n# Create the training dataset (background only)\ntrain_dataset = H5JetDataset(h5_path, \n                             indices=bg_train_indices, \n                             max_pixel_value=max_pixel_value, \n                             is_train=True)\n\n# Create the validation dataset (mixed)\nval_dataset = H5JetDataset(h5_path, \n                           indices=val_indices, \n                           max_pixel_value=max_pixel_value, \n                           is_train=False)\n\n# Create the final DataLoaders\ntrain_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)\nval_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)\n\nprint("\n--- Memory-Efficient DataLoaders Ready ---")\nprint(f"Number of training samples: {len(train_dataset)}")\nprint(f"Number of validation samples: {len(val_dataset)}")\nprint(f"Number of batches in train_loader: {len(train_loader)}")'

## DataLoader creation (small sample --> loaded directly into RAM --> fast)

In [None]:
def FastLoader(samples_used = 300000, dset ="val",file_path= None, train_indices=None, valid_indices=None, norm_value=None, batch_size=256, shuffle_set=False):
    """Load a subset of jet images into RAM and wrap it in a DataLoader.

    The selected indices are sorted and the first ``samples_used`` of them are
    read from the HDF5 file in chunks, divided by ``norm_value`` and given a
    length-1 channel axis.

    Args:
        samples_used: Maximum number of rows to load; clamped to the number of
            indices supplied.
        dset: ``"train"`` (images only, uses ``train_indices``) or ``"val"``
            (images and labels, uses ``valid_indices``).
        file_path: Path of the HDF5 file holding 'jet_images' (and 'labels').
        train_indices: Row indices to draw from when ``dset="train"``.
        valid_indices: Row indices to draw from when ``dset="val"``.
        norm_value: Divisor applied to every image.
        batch_size: Batch size of the returned DataLoader.
        shuffle_set: Whether the returned DataLoader shuffles each epoch.

    Returns:
        A DataLoader over a TensorDataset of ``(images,)`` for ``"train"`` or
        ``(images, labels)`` for ``"val"``.

    Raises:
        ValueError: On an unknown ``dset``, a missing index array, a missing
            ``file_path`` / ``norm_value``, or when no sample is selected.
        KeyError: If a required dataset is absent from the HDF5 file.
    """
    if dset == "train":
        if train_indices is None:
            raise ValueError("train_indices must be provided for dset='train'")
        sorted_indices = np.sort(train_indices)
        load_labels = False
    elif dset == "val":
        if valid_indices is None:
            raise ValueError("valid_indices must be provided for dset='val'")
        sorted_indices = np.sort(valid_indices)
        load_labels = True
    else:
        raise ValueError("dset must be 'train' or 'val'")
    if file_path is None:
        raise ValueError("file_path must be provided")
    if norm_value is None:
        raise ValueError("norm_value must be provided")

    # Never request more rows than there are indices; otherwise the trailing
    # chunks would be empty selections.
    samples_used = min(samples_used, len(sorted_indices))
    if samples_used <= 0:
        raise ValueError("No samples selected: check samples_used and the index array")
    chunk_size = 20000

    image_chunks = []
    label_chunks = []
    with h5py.File(file_path, 'r') as f:
        if 'jet_images' not in f:
            raise KeyError("Dataset 'jet_images' not found in HDF5 file.")
        if load_labels and 'labels' not in f:
            raise KeyError("Dataset 'labels' not found in HDF5 file (needed for validation).")
        img_dataset = f['jet_images']
        label_dataset = f['labels'] if load_labels else None
        for start in tqdm(range(0, samples_used, chunk_size), desc = f"Loading {dset} data"):
            end = min(start + chunk_size, samples_used)
            # Indices are sorted, so each chunk is an increasing point selection.
            chunk_indices = sorted_indices[start:end]
            image_chunks.append(img_dataset[chunk_indices])
            if load_labels:
                # Read labels with the same indices so they stay aligned with the images.
                label_chunks.append(label_dataset[chunk_indices])

    print("Concatenating chunks...")
    images_ram = np.concatenate(image_chunks)
    # Insert the channel axis at position 1 and normalise.
    images_ram = np.expand_dims(images_ram, axis=1) / norm_value
    images_tensor = torch.from_numpy(images_ram).float()
    if load_labels:
        labels_ram = np.concatenate(label_chunks)
        labels_tensor = torch.from_numpy(labels_ram).float()
        dataset_fast = TensorDataset(images_tensor, labels_tensor)
    else:
        dataset_fast = TensorDataset(images_tensor)
    loader = DataLoader(dataset_fast, batch_size=batch_size, shuffle=shuffle_set)
    return loader

In [None]:
# Load the background-only training subset and the mixed validation set into RAM.
# The processed-image file is `h5_path` (defined with the index split above);
# the previously used name `jet_imgs_path` is not defined anywhere in this notebook.
train_loader = FastLoader(samples_used = 400000, dset="train", file_path = h5_path, train_indices = bg_train_indices, norm_value = max_pixel_value, shuffle_set=True)
val_loader = FastLoader(samples_used = len(val_indices), dset = "val", file_path = h5_path, valid_indices = val_indices, norm_value = max_pixel_value, shuffle_set=False)

## **Visualizing Jet Images (normalized)**

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch

def plot_jet_images(loader, title = "Sample Jet Images"):
    """Show up to 16 images from the first batch of ``loader`` in a 4x4 grid.

    Args:
        loader: Iterable of batches. A batch of length 1 is treated as
            ``(images,)``; otherwise it is unpacked as ``(images, labels)``.
        title: Figure title.

    All panels share one colour scale (the min/max over the displayed images),
    so the single colorbar applies to every panel. Panels are titled
    "Signal" when the label equals 1 and "Background" otherwise.
    """
    batch_data = next(iter(loader))
    if len(batch_data) == 1:
        images, = batch_data
        labels = None
    else:
        images, labels = batch_data

    # Move to host memory once so the conversion to NumPy also works for GPU tensors.
    shown = images[:16].detach().cpu()
    n_shown = len(shown)
    if n_shown:
        vmin, vmax = shown.min().item(), shown.max().item()
    else:
        vmin = vmax = None

    fig, axes = plt.subplots(4, 4, figsize=(12, 11), constrained_layout=True)
    fig.suptitle(title, fontsize=16)
    mappable = None
    for i in range(16):
        ax = axes.flat[i]
        if i >= n_shown:
            # Fewer than 16 images in the batch: hide the unused panel.
            ax.set_visible(False)
            continue
        img = shown[i].squeeze().numpy()
        im = ax.imshow(img, cmap='viridis', origin='lower', vmin=vmin, vmax=vmax)
        if mappable is None:
            mappable = im
        # Axis labels only on the bottom row / left column to avoid clutter.
        if i // 4 == 3:
            ax.set_xlabel("Pseudorapidity (η)")
        if i % 4 == 0:
            ax.set_ylabel("Azimuthal Angle (φ)")
        ax.set_xticks([])
        ax.set_yticks([])
        if labels is not None:
            label_text = "Signal" if labels[i].item() == 1 else "Background"
            ax.set_title(f"Type: {label_text}")
    if mappable is not None:
        cbar = fig.colorbar(mappable, ax=axes.ravel().tolist(), shrink=0.8, pad=0.02)
        cbar.set_label("Normalized Sum of $p_T$")
    plt.show()

# --- Plot one batch from each loader ---
for announcement, jet_loader, figure_title in (
    ("Displaying sample background jets from the training set...",
     train_loader, "Background Jets (for Training)"),
    ("\nDisplaying sample validation jets (mixed background and signal)...",
     val_loader, "Validation Jets"),
):
    print(announcement)
    plot_jet_images(jet_loader, title=figure_title)