# Data preparation


In [1]:
import uproot
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from collections import namedtuple, defaultdict
import open3d as o3d
import random
import h5py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# Show which uproot release is installed in this environment.
uproot.__version__

'5.0.9'

In [4]:
!ls ..

CaloGNN.png                          Untitled.ipynb
GNNKeras.ipynb                       ZacClub.ipynb
LLP_for_Calo.ipynb                   fixed_length.py
LLP_time-Copy1.ipynb                 input.txt
LLP_time.ipynb                       karate.edgelist
LLP_time_1.ipynb                     mlp-0000.params
MLBasedCaloClustering                mlp-symbol.json
MLBasedCaloClusteringPipeLine        mxHybrid.ipynb
ML_01.ipynb                          mxnet_exported_mlp.onnx
ML_Cell_data                         my_env
ML_for_Calo.ipynb                    new_env
ML_for_Calo_students.ipynb           saved_model
MyxAODAnalysis.outputs_600evs.root   trackML.ipynb
MyxAODAnalysis_withNB_1.outputs.root


In [5]:
# Open the input ROOT file (path is relative to the working directory).
# NOTE(review): the handle is never closed in the cells shown here.
file = uproot.open("../MyxAODAnalysis.outputs_600evs.root")

In [6]:
# List the keys (top-level objects) stored in the file.
file.keys()

['analysis;1']

In [7]:
# Select the 'analysis' object, the only key reported by file.keys().
tree = file['analysis']

In [8]:
# Read the tree's branches into memory in a single call; no branch filter is
# passed, so nothing is excluded here.
branches = tree.arrays()

In [9]:
print(tree.keys()) # names of the branches (variables) available in the tree

['RunNumber', 'EventNumber', 'cell_eta', 'cell_phi', 'cell_x', 'cell_y', 'cell_z', 'cell_subCalo', 'cell_sampling', 'cell_size', 'cell_hashID', 'neighbor', 'seedCell_id', 'cell_e', 'cell_noiseSigma', 'cell_SNR', 'cell_time', 'cell_weight', 'cell_truth', 'cell_truth_indices', 'cell_shared_indices', 'cell_cluster_index', 'cluster_to_cell_indices', 'cluster_to_cell_weights', 'cell_to_cluster_e', 'cell_to_cluster_eta', 'cell_to_cluster_phi', 'cluster_eta', 'cluster_phi', 'cluster_e', 'cellsNo_cluster', 'clustersNo_event', 'jetEnergyWtdTimeAve', 'jetEta', 'jetPhi', 'jetE', 'jetPt', 'jetNumberPerEvent', 'cellIndices_per_jet']


In [10]:
# Number of entries in 'cluster_to_cell_indices' for the first event.
len(branches['cluster_to_cell_indices'][0])

839

# Preparing data with required features for 600 events

In [10]:
# Pull the individual branches used by this notebook out of `branches`,
# grouped by the prefix/theme of their branch names.

# Cell position branches.
cell_coordinate_x, cell_coordinate_y, cell_coordinate_z = (
    branches['cell_x'], branches['cell_y'], branches['cell_z'],
)
cell_eta, cell_phi = branches['cell_eta'], branches['cell_phi']

# Calorimeter location branches.
cell_subCalo, cell_sampling = branches['cell_subCalo'], branches['cell_sampling']

# Per-cell measurement branches.
cell_e, cell_time = branches['cell_e'], branches['cell_time']
cell_noiseSigma, cell_SNR = branches['cell_noiseSigma'], branches['cell_SNR']

# Weight / truth branches.
cell_weight, cell_truth = branches['cell_weight'], branches['cell_truth']

# Cluster-related branches.
cell_to_cluster_e = branches['cell_to_cluster_e']
cell_cluster_index = branches['cell_cluster_index']
cellsNo_cluster, clustersNo_event = (
    branches['cellsNo_cluster'], branches['clustersNo_event'],
)
cluster_to_cell_indices = branches['cluster_to_cell_indices']

# Neighbor branch.
neighbor = branches['neighbor']

In [12]:
# Convert the extracted branches to NumPy arrays, rebinding the same names.
cell_coordinate_x, cell_coordinate_y, cell_coordinate_z = map(
    np.array, (cell_coordinate_x, cell_coordinate_y, cell_coordinate_z)
)
cell_eta, cell_phi = map(np.array, (cell_eta, cell_phi))
cell_subCalo, cell_sampling = map(np.array, (cell_subCalo, cell_sampling))
cell_weight, cell_truth = map(np.array, (cell_weight, cell_truth))
cell_noiseSigma, cell_SNR, cell_time, cell_e = map(
    np.array, (cell_noiseSigma, cell_SNR, cell_time, cell_e)
)
cell_to_cluster_e = np.array(cell_to_cluster_e)
clustersNo_event = np.array(clustersNo_event)

# The converted copy of `cell_cluster_index` is stored under a NEW name,
# `cell_to_cluster_index`; `cell_cluster_index` itself keeps its original form.
cell_to_cluster_index = np.array(cell_cluster_index)

# `cellsNo_cluster` is deliberately left unconverted (its conversion was
# disabled in this cell) -- presumably because its per-event lengths differ;
# TODO confirm.

In [13]:
# Shape of the converted cluster-index array; it is indexed later as [event][cell].
cell_to_cluster_index.shape

(600, 187652)

In [14]:
# Number of entries in 'cellsNo_cluster' for the event at index 1.
len(cellsNo_cluster[1])

757

In [15]:
# Build one feature matrix per event, keyed "data_<event index>".
# Each matrix has one row per cell and one column per feature, in this order:
#   x, y, z, eta, phi, sampling, noiseSigma, e
feature_arrays = (
    cell_coordinate_x, cell_coordinate_y, cell_coordinate_z,
    cell_eta, cell_phi,
    cell_sampling,
    cell_noiseSigma,
    cell_e,
)

# Take the event count from the data rather than hard-coding 600, so the cell
# neither raises IndexError on a smaller input nor silently drops events on a
# larger one.
n_events = len(cell_e)

# np.column_stack turns each 1-D per-event array into a column, which is what
# the expand_dims + concatenate(axis=1) combination did by hand.
data = {
    f"data_{i}": np.column_stack([feature[i] for feature in feature_arrays])
    for i in range(n_events)
}

In [16]:
# Shape check on one event's feature matrix: rows are cells, columns are the 8 stacked features.
data['data_2'].shape

(187652, 8)

In [17]:
def save_dict_to_hdf5(dic, filename):
    """
    Write the contents of `dic` into a new HDF5 file at `filename`.

    The file is opened in 'w' mode and handed to `_save_dict_to_hdf5` as the
    root group; it is closed again when the `with` block exits.
    """
    with h5py.File(filename, 'w') as h5_root:
        _save_dict_to_hdf5(h5_root, dic)

def _save_dict_to_hdf5(group, dic):
    """
    Recursively store the entries of `dic` under the HDF5 `group`.

    A nested dictionary becomes a sub-group named after its key. Any other
    value is assigned to `group[key]`; plain lists are converted to NumPy
    arrays first.
    """
    for name, payload in dic.items():
        if isinstance(payload, dict):
            # Mirror the nesting: descend into a freshly created sub-group.
            _save_dict_to_hdf5(group.create_group(name), payload)
            continue
        group[name] = np.array(payload) if isinstance(payload, list) else payload

In [18]:
# Persist the per-event feature matrices; every "data_<i>" entry of `data` is
# written under its own key in the HDF5 file.
save_dict_to_hdf5(data,'./cellFeatures_600evs.hdf5')

## Loading neighbor cell pairs

In [21]:
# Load the precomputed neighbor-pair table into memory.
# The context manager guarantees the file is closed even if the read fails
# (the manual open/close left the handle open on any exception), and matches
# the `with h5py.File(...)` style used elsewhere in this notebook.
with h5py.File("./neighbor_pairs_unique_sorted.hdf5", 'r') as hf_neighbor_pairs_unique_sorted:
    # Direct indexing raises a KeyError naming the missing dataset, instead of
    # the opaque "'NoneType' object is not subscriptable" from `.get(...)[:]`.
    # `[:]` copies the dataset into a NumPy array so it outlives the file.
    neighbor_pairs_unique_sorted = hf_neighbor_pairs_unique_sorted["neighbor_pair"][:]

In [22]:
# Number of neighbor pairs loaded.
len(neighbor_pairs_unique_sorted)

1250242

In [23]:
# One row per pair, two columns: the two cell indices that form the pair.
neighbor_pairs_unique_sorted.shape

(1250242, 2)

## Creating Labels for neighbor_pairs

In [24]:
# Re-check the cluster-index array shape before building the pair labels.
cell_to_cluster_index.shape

(600, 187652)

In [25]:
# Label every neighbor pair in every event from the cluster index of its two cells.
#
# Label values (unchanged from the original loop):
#    1 -> both cells carry the same non-zero cluster index
#    9 -> both cells carry cluster index 0 (non-participating cells)
#   10 -> the cells carry two different non-zero cluster indices
#    0 -> exactly one of the two cells carries a non-zero cluster index
#
# NOTE(review): cluster index 0 is treated as "not in any cluster" -- confirm
# this matches the convention used when the ntuple was produced.
#
# The per-pair Python loop (events x pairs iterations) is replaced by NumPy
# fancy indexing, processed one event at a time to keep temporaries small.
# Iterating over the rows of `cell_to_cluster_index` also removes the
# hard-coded event count of 600.
pair_first = neighbor_pairs_unique_sorted[:, 0]
pair_second = neighbor_pairs_unique_sorted[:, 1]

true_neighbor_cluster = []
for event_cluster_index in cell_to_cluster_index:
    first_cluster = event_cluster_index[pair_first]
    second_cluster = event_cluster_index[pair_second]

    same_cluster = first_cluster == second_cluster
    first_in_cluster = first_cluster != 0
    both_in_cluster = first_in_cluster & (second_cluster != 0)

    event_labels = np.where(
        same_cluster,
        np.where(first_in_cluster, 1, 9),   # same index: real cluster vs. both unclustered
        np.where(both_in_cluster, 10, 0),   # different index: two clusters vs. one unclustered
    )
    # Convert back to a plain list so the result keeps its original
    # list-of-lists-of-int form.
    true_neighbor_cluster.append(event_labels.tolist())

In [26]:
# Store the pair labels of all events as a single dataset in a new HDF5 file.
# NOTE(review): "neigbor" (sic) is part of both the file name and the dataset
# name; it is left untouched because anything reading this file must use the
# same spelling -- confirm with consumers before renaming.
with h5py.File('./neigbor_truth_600evs.hdf5', 'w') as f: 
    dset = f.create_dataset("neigbor_truth_600evs", data = true_neighbor_cluster)