In [1]:
! pip install gdown
! gdown --id 1WO2K-SfU2dntGU4Bb3IYBp9Rh7rtTYEr -O filename
! pip install h5p
! pip install torch_geometric
! pip install torch_sparse torch_scatter torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-$(python -c "import torch; print(torch.__version__.split('+')[0])")+cpu.html

Downloading...
From (original): https://drive.google.com/uc?id=1WO2K-SfU2dntGU4Bb3IYBp9Rh7rtTYEr
From (redirected): https://drive.google.com/uc?id=1WO2K-SfU2dntGU4Bb3IYBp9Rh7rtTYEr&confirm=t&uuid=32071a75-a3b8-4da3-8181-851ec9428fcb
To: /kaggle/working/filename
100%|█████████████████████████████████████████| 701M/701M [00:04<00:00, 172MB/s]
[31mERROR: Could not find a version that satisfies the requirement h5p (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for h5p[0m[31m
[0mCollecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_

In [2]:
import h5py
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import MessagePassing, global_mean_pool
from sklearn.neighbors import radius_neighbors_graph
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau
import gc

# ======================
# 1. Memory-Efficient Graph Creation
# ======================
def process_in_chunks(h5_path, chunk_size=30000, threshold=0.01):
    """Process HDF5 file in chunks to save memory"""
    all_graphs = []
    with h5py.File(h5_path, 'r') as f:
        total_jets = f['X_jets'].shape[0]
        
        for start_idx in tqdm(range(0, total_jets, chunk_size), 
                          desc="Processing chunks"):
            end_idx = min(start_idx + chunk_size, total_jets)
            
            # Load chunk
            X_chunk = f['X_jets'][start_idx:end_idx]
            m0_chunk = f['m0'][start_idx:end_idx]
            pt_chunk = f['pt'][start_idx:end_idx]
            y_chunk = f['y'][start_idx:end_idx]
            
            # Process chunk
            chunk_graphs = []
            for i in range(X_chunk.shape[0]):
                data = multi_channel_image_to_graph(
                    X_chunk[i,0], X_chunk[i,1], X_chunk[i,2], threshold)
                data.m0 = torch.tensor([m0_chunk[i]], dtype=torch.float)
                data.pt = torch.tensor([pt_chunk[i]], dtype=torch.float)
                data.y = torch.tensor([int(y_chunk[i])], dtype=torch.long)
                chunk_graphs.append(data)
            
            all_graphs.extend(chunk_graphs)
            
            # Clean up memory
            del X_chunk, m0_chunk, pt_chunk, y_chunk, chunk_graphs
            gc.collect()
    
    return all_graphs
# ======================
# 3. Graph Construction 
# ======================
def multi_channel_image_to_graph(ecal, hcal, track, threshold=0.01):
    """Convert 3-channel jet image to graph"""
    nodes = []
    height, width = ecal.shape
    
    for i in range(height):
        for j in range(width):
            total_energy = ecal[i,j] + hcal[i,j] + track[i,j]
            if total_energy > threshold:
                nodes.append([
                    i/float(height),   # norm x
                    j/float(width),   # norm y
                    ecal[i,j],        # ECAL
                    hcal[i,j],        # HCAL
                    track[i,j]        # Track
                ])
    
    if len(nodes) == 0:  # Fallback
        combined = ecal + hcal + track
        max_idx = np.unravel_index(np.argmax(combined), combined.shape)
        nodes.append([
            max_idx[0]/float(height), max_idx[1]/float(width),
            ecal[max_idx], hcal[max_idx], track[max_idx]
        ])
    
    nodes = np.array(nodes, dtype=np.float32)
    pos = nodes[:, :2]
    
    if len(nodes) > 1:
        edges = radius_neighbors_graph(pos, radius=0.15, mode='connectivity')
        edge_index = torch.tensor(edges.nonzero(), dtype=torch.long)
    else:
        edge_index = torch.tensor([[0], [0]], dtype=torch.long)
    
    return Data(x=torch.tensor(nodes, dtype=torch.float),
                edge_index=edge_index)
    # ======================

# 5. Data Loading (First 30,000 jets only)
# ======================
def load_data(filename, num_jets=30000):
    with h5py.File(filename, 'r') as f:
        X_jets = f['X_jets'][:num_jets]  # Only load first 30,000 jets
        m0 = f['m0'][:num_jets]
        pt = f['pt'][:num_jets]
        y = f['y'][:num_jets]
    return X_jets, m0, pt, y

def create_graph_dataset(X_jets, m0, pt, y, threshold=0.01):
    graphs = []
    num_jets = X_jets.shape[0]
    
    for i in tqdm(range(num_jets), desc="Creating graphs"):
        ecal = X_jets[i, 0, :, :]  # ECAL channel
        hcal = X_jets[i, 1, :, :]  # HCAL channel
        track = X_jets[i, 2, :, :]  # Track channel
        
        data = multi_channel_image_to_graph(ecal, hcal, track, threshold)
        data.m0 = torch.tensor([m0[i]], dtype=torch.float)
        data.pt = torch.tensor([pt[i]], dtype=torch.float)
        data.y = torch.tensor([int(y[i])], dtype=torch.long)
        graphs.append(data)
    
    return graphs

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
threshold = 0.01
patience = 5
lr = 0.001
batch_size = 32
hidden_dim = 128
chunk_size = 30000  # Process 30,000 jets at a time

# Process data in chunks
print("Processing entire dataset in chunks...")
graphs = process_in_chunks('/kaggle/working/filename', chunk_size, threshold)

# Split data
train_graphs, val_graphs = train_test_split(graphs, test_size=0.2, random_state=42)
del graphs  # Free memory
gc.collect()

torch.save(train_graphs, "train_graphs.pt")
torch.save(val_graphs, "val_graphs.pt")

Processing entire dataset in chunks...


Processing chunks:   0%|          | 0/5 [00:00<?, ?it/s]

  edge_index = torch.tensor(edges.nonzero(), dtype=torch.long)


In [3]:
! rm /kaggle/working/filename