# Data preparation


In [1]:
import uproot
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from collections import namedtuple, defaultdict
import open3d as o3d
import random
import h5py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
uproot.__version__

'5.0.9'

In [3]:
!ls ..

CaloGNN.png                          MyxAODAnalysis_withNB_1.outputs.root
[34mGNNCaloClustering[m[m                    Untitled.ipynb
[34mGNNCaloClustering_1[m[m                  ZacClub.ipynb
GNNKeras.ipynb                       fixed_length.py
LLP_for_Calo.ipynb                   input.txt
LLP_time-Copy1.ipynb                 karate.edgelist
LLP_time.ipynb                       mlp-0000.params
LLP_time_1.ipynb                     mlp-symbol.json
[34mMLBasedCaloClustering[m[m                mxHybrid.ipynb
[34mMLBasedCaloClusteringPipeLine[m[m        mxnet_exported_mlp.onnx
ML_01.ipynb                          [34mmy_env[m[m
[34mML_Cell_data[m[m                         [34mnew_env[m[m
ML_for_Calo.ipynb                    [34msaved_model[m[m
ML_for_Calo_students.ipynb           trackML.ipynb
MyxAODAnalysis.outputs_100.root


In [4]:
file = uproot.open("../MyxAODAnalysis.outputs_100.root")

In [5]:
file.keys()

['analysis;1']

In [6]:
tree = file['analysis']

In [7]:
branches = tree.arrays()

In [8]:
print(tree.keys()) # names of the branches (variables) available in the tree

['RunNumber', 'EventNumber', 'cell_eta', 'cell_phi', 'cell_x', 'cell_y', 'cell_z', 'cell_subCalo', 'cell_sampling', 'cell_size', 'cell_hashID', 'neighbor', 'seedCell_id', 'cell_e', 'cell_noiseSigma', 'cell_SNR', 'cell_time', 'cell_weight', 'cell_truth', 'cell_truth_indices', 'cell_shared_indices', 'cell_cluster_index', 'cluster_to_cell_indices', 'cluster_to_cell_weights', 'cell_to_cluster_e', 'cell_to_cluster_eta', 'cell_to_cluster_phi', 'cluster_eta', 'cluster_phi', 'cluster_e', 'cellsNo_cluster', 'clustersNo_event', 'jetEnergyWtdTimeAve', 'jetEta', 'jetPhi', 'jetE', 'jetPt', 'jetNumberPerEvent', 'cellIndices_per_jet']


In [9]:
len(branches['cluster_to_cell_indices'][0])

839

# Preparing data with required features for 100 events

In [10]:
# Pull the branches used below out of the loaded record, grouped by theme.

# Cell position and direction.
cell_coordinate_x, cell_coordinate_y, cell_coordinate_z = (
    branches[name] for name in ('cell_x', 'cell_y', 'cell_z')
)
cell_eta, cell_phi = branches['cell_eta'], branches['cell_phi']

# Detector bookkeeping.
cell_subCalo, cell_sampling = branches['cell_subCalo'], branches['cell_sampling']

# Per-cell measurements.
cell_e = branches['cell_e']
cell_time = branches['cell_time']
cell_noiseSigma, cell_SNR = branches['cell_noiseSigma'], branches['cell_SNR']

# Cell <-> cluster association.
cell_weight, cell_truth = branches['cell_weight'], branches['cell_truth']
cell_to_cluster_e = branches['cell_to_cluster_e']
cell_cluster_index = branches['cell_cluster_index']
cluster_to_cell_indices = branches['cluster_to_cell_indices']
cellsNo_cluster, clustersNo_event = branches['cellsNo_cluster'], branches['clustersNo_event']

# Neighbor lists (all events; a later cell rebinds this name to event 0 only).
neighbor = branches['neighbor']

In [11]:
# Convert the extracted branches to NumPy arrays, a few related names at a time.
cell_coordinate_x, cell_coordinate_y, cell_coordinate_z = map(
    np.array, (cell_coordinate_x, cell_coordinate_y, cell_coordinate_z)
)
cell_eta, cell_phi = map(np.array, (cell_eta, cell_phi))
cell_subCalo, cell_sampling = map(np.array, (cell_subCalo, cell_sampling))
cell_weight, cell_truth = map(np.array, (cell_weight, cell_truth))
cell_to_cluster_e = np.array(cell_to_cluster_e)
cell_noiseSigma, cell_SNR, cell_time = map(
    np.array, (cell_noiseSigma, cell_SNR, cell_time)
)
# NOTE(review): cellsNo_cluster is deliberately left unconverted — presumably
# because its per-event length varies; confirm before converting it.
clustersNo_event = np.array(clustersNo_event)
# The dense copy is stored under a new name; cell_cluster_index keeps the raw branch.
cell_to_cluster_index = np.array(cell_cluster_index)
cell_e = np.array(cell_e)

In [12]:
cell_to_cluster_index.shape

(100, 187652)

In [13]:
len(cellsNo_cluster[1])

757

In [15]:
# Build one (n_cells, 8) feature matrix per event, keyed "data_<event index>".
# Column order: x, y, z, eta, phi, sampling, noise sigma, energy.
# The event count is taken from the data instead of being hard-coded, so the
# cell keeps working if a file with a different number of events is loaded.
n_events = len(cell_e)

data = {}
for i in range(n_events):
    # column_stack turns each 1-D per-cell array into one column of the matrix.
    data[f"data_{i}"] = np.column_stack((
        cell_coordinate_x[i],
        cell_coordinate_y[i],
        cell_coordinate_z[i],
        cell_eta[i],
        cell_phi[i],
        cell_sampling[i],
        cell_noiseSigma[i],
        cell_e[i],
    ))

In [16]:
data['data_2'].shape

(187652, 8)

In [18]:
def save_dict_to_hdf5(dic, filename):
    """
    Write the (possibly nested) dictionary `dic` to the HDF5 file `filename`.

    The file is opened in 'w' mode and closed when the write finishes.
    """
    with h5py.File(filename, 'w') as h5_file:
        _save_dict_to_hdf5(h5_file, dic)

def _save_dict_to_hdf5(group, dic):
    """
    Recursively store the entries of `dic` under the HDF5 group `group`.

    Nested dictionaries become sub-groups; lists are converted to NumPy
    arrays first; every other value is assigned to the group as-is.
    """
    for name, payload in dic.items():
        if isinstance(payload, dict):
            # Descend into a fresh sub-group named after the key.
            _save_dict_to_hdf5(group.create_group(name), payload)
            continue
        group[name] = np.array(payload) if isinstance(payload, list) else payload

In [None]:
save_dict_to_hdf5(data,'./cellFeatures_100evs.hdf5')

## Preparing neighbor pairs

In [24]:
# Rebinds `neighbor` to entry 0 of the 'neighbor' branch only; from here on
# the name no longer refers to the full branch. The displayed value is the
# number of elements in that entry.
neighbor = branches['neighbor'][0]
len(neighbor)

187652

In [25]:
neighbor[0]

In [35]:
# Indices at which the first row of cell_noiseSigma is exactly zero.
arr = np.argwhere(cell_noiseSigma[0]==0)
print(arr)

[[186986]
 [187352]]


In [40]:
# Cells whose noise sigma is exactly zero in event 0 are treated as not
# working and are excluded from the neighbor graph, both as the source cell
# and as a neighbor. Their indices are derived from the data rather than
# hard-coded, so the cell stays correct for a different input file.
dead_cells = set(np.where(cell_noiseSigma[0] == 0)[0].tolist())

# Collect every directed (cell, neighbor) pair between working cells.
# Despite its name this is a list; duplicates in either direction are
# removed in a later step.
neibor_pairs_set = []
for i in range(len(neighbor)):
    if i in dead_cells:
        continue
    for cell in neighbor[i]:
        if cell in dead_cells:
            continue
        neibor_pairs_set.append((i, cell))

In [42]:
len(neibor_pairs_set)

2500484

In [43]:
# A code to remove permutation variant
def canonical_form(t):
    """Return a canonical, order-independent representation of a tuple.

    The elements of `t` are sorted in ascending order, so every permutation
    of the same elements maps to the same result; for example both (3, 1)
    and (1, 3) become (1, 3).
    """
    return tuple(sorted(t))

def remove_permutation_variants(tuple_list):
    """Remove permutation variants from a list of tuples.

    Each tuple is reduced to its canonical (sorted) form and duplicates are
    dropped. Returns a list of the unique canonical tuples; the list order
    is the iteration order of the intermediate set, i.e. not sorted.
    """
    # The set already holds canonical tuples, so they are returned directly
    # instead of being sorted a second time.
    return list({canonical_form(t) for t in tuple_list})

In [44]:
#remove list_of_tuples = [(3, 1), (1, 3), (2, 4), (4, 2), (5, 6)]
neighbor_pairs_unique = remove_permutation_variants(neibor_pairs_set)

In [45]:
len(neighbor_pairs_unique)

1250242

In [47]:
# Sorts by the first element of each pair only. sorted() is stable, so pairs
# sharing a first element keep the relative order they had in
# neighbor_pairs_unique (their second elements are not ordered).
neighbor_pairs_unique_sorted = sorted(neighbor_pairs_unique, key=lambda x: x[0])

In [48]:
neighbor_pairs_unique_sorted

[(0, 26751),
 (0, 1),
 (0, 127),
 (0, 26497),
 (0, 63),
 (0, 26500),
 (0, 64),
 (0, 448),
 (0, 26496),
 (0, 26498),
 (0, 26499),
 (0, 65),
 (1, 26500),
 (1, 64),
 (1, 26501),
 (1, 65),
 (1, 449),
 (1, 26504),
 (1, 26499),
 (1, 26502),
 (1, 66),
 (1, 26503),
 (1, 2),
 (2, 26503),
 (2, 450),
 (2, 26505),
 (2, 26506),
 (2, 67),
 (2, 26504),
 (2, 3),
 (2, 26507),
 (2, 65),
 (2, 26508),
 (2, 66),
 (3, 26509),
 (3, 67),
 (3, 26510),
 (3, 26512),
 (3, 68),
 (3, 451),
 (3, 4),
 (3, 26507),
 (3, 26508),
 (3, 66),
 (3, 26511),
 (4, 67),
 (4, 26512),
 (4, 26513),
 (4, 26515),
 (4, 452),
 (4, 26516),
 (4, 68),
 (4, 26511),
 (4, 69),
 (4, 26514),
 (4, 5),
 (5, 26518),
 (5, 68),
 (5, 26516),
 (5, 26517),
 (5, 26519),
 (5, 69),
 (5, 26520),
 (5, 70),
 (5, 453),
 (5, 26515),
 (5, 6),
 (6, 71),
 (6, 26522),
 (6, 7),
 (6, 69),
 (6, 26520),
 (6, 70),
 (6, 26523),
 (6, 454),
 (6, 26524),
 (6, 26519),
 (6, 26521),
 (7, 71),
 (7, 26524),
 (7, 455),
 (7, 26525),
 (7, 26528),
 (7, 72),
 (7, 26523),
 (7, 8),
 

In [49]:
# Persist the sorted pair list as dataset "neighbor_pair" in a file opened in
# 'w' mode; the file is closed when the with-block exits.
with h5py.File('./neighbor_pairs_unique_sorted.hdf5', 'w') as f: 
    dset = f.create_dataset("neighbor_pair", data = neighbor_pairs_unique_sorted)

## Loading neighbor cell pairs

In [50]:
# Read the neighbor pairs back from disk. The context manager closes the file
# even if the read fails, and indexing with [...] raises a clear KeyError when
# the dataset is missing (``.get`` would return None and fail later).
with h5py.File("./neighbor_pairs_unique_sorted.hdf5", 'r') as hf_neighbor_pairs_unique_sorted:
    # [:] reads the whole dataset into an in-memory array, so the data stays
    # usable after the file is closed.
    neighbor_pairs_unique_sorted = hf_neighbor_pairs_unique_sorted["neighbor_pair"][:]

In [51]:
len(neighbor_pairs_unique_sorted)

1250242

In [52]:
neighbor_pairs_unique_sorted.shape

(1250242, 2)

## Creating Labels for neighbor_pairs

In [53]:
cell_to_cluster_index.shape

(100, 187652)

In [54]:
# Label every neighbor pair in every event by comparing the cluster index of
# its two cells (an index of 0 is treated as "not participating"):
#    1 -> both cells carry the same non-zero cluster index
#    9 -> both cells carry index 0 (non-participating cells)
#   10 -> the cells carry two different non-zero cluster indices
#    0 -> exactly one of the two cells carries index 0
# The comparison is vectorised per event with NumPy instead of looping over
# every pair in Python; the resulting labels are identical.
pair_index = np.asarray(neighbor_pairs_unique_sorted)
first_cell = pair_index[:, 0]
second_cell = pair_index[:, 1]

true_neighbor_cluster = []
for event_cluster_index in cell_to_cluster_index:  # one row per event
    first_cluster = event_cluster_index[first_cell]
    second_cluster = event_cluster_index[second_cell]
    same_cluster = first_cluster == second_cluster
    both_clustered = (first_cluster != 0) & (second_cluster != 0)
    true_neighbor_cluster_event = np.where(
        same_cluster,
        # When the indices match, checking the first cell is enough.
        np.where(first_cluster != 0, 1, 9),
        np.where(both_clustered, 10, 0),
    )
    true_neighbor_cluster.append(true_neighbor_cluster_event)

In [56]:
true_neighbor_cluster = np.array(true_neighbor_cluster)
true_neighbor_cluster.shape

(100, 1250242)

In [60]:
true_neighbor_cluster[4]

array([1, 1, 1, ..., 9, 9, 9])

In [61]:
# Persist the per-event pair labels as dataset "neigbor_truth_100evs" in a file
# opened in 'w' mode; the file is closed when the with-block exits.
# NOTE(review): file and dataset names are spelled 'neigbor'; left unchanged
# because other code may read them under exactly these names — confirm.
with h5py.File('./neigbor_truth_100evs.hdf5', 'w') as f: 
    dset = f.create_dataset("neigbor_truth_100evs", data = true_neighbor_cluster)