In [1]:
import uproot
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from collections import namedtuple, defaultdict
import open3d as o3d
import random
random.seed(42)
import h5py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
class SiameseNetwork(nn.Module):
    def __init__(self, input_size):
        super(SiameseNetwork, self).__init__()

        # Define the architecture for one branch of the Siamese network
        self.branch = nn.Sequential(
            nn.Linear(input_size, 128),
            #nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 128),
            #nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 128),
            #nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 64),
            #nn.BatchNorm1d(64),
            nn.ReLU(inplace=True),
            nn.Linear(64, 32),
            #nn.BatchNorm1d(32),
            nn.ReLU(inplace=True),
            nn.Linear(32, 16),
        )

    def forward_one(self, x):
        # Forward pass for one branch of the Siamese network
        return self.branch(x)

    def forward(self, input1, input2):
        # Forward pass for both branches of the Siamese network
        output1 = self.forward_one(input1)
        output2 = self.forward_one(input2)
        return output1, output2

In [3]:
input_size = 7
path = "/Users/bakshiguptad/ML/MLBasedCaloClusteringPipeLine/siamese_net.pth"
model = SiameseNetwork(input_size)
#model.load(torch.load(path))
model = torch.load(path)

In [4]:
print(model)

SiameseNetwork(
  (branch): Sequential(
    (0): Linear(in_features=6, out_features=128, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU(inplace=True)
    (4): Linear(in_features=128, out_features=128, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=128, out_features=64, bias=True)
    (7): ReLU(inplace=True)
    (8): Linear(in_features=64, out_features=32, bias=True)
    (9): ReLU(inplace=True)
    (10): Linear(in_features=32, out_features=16, bias=True)
  )
)


In [11]:
model.forward_one()

TypeError: forward_one() missing 1 required positional argument: 'x'

## Loading Cell data

In [5]:
hf_cellFeaturesScaled = h5py.File("./cellFeaturesScaled.hdf5", 'r')
hf_test_index_list = h5py.File("./test_index_list.hdf5", 'r')
hf_test_pair = h5py.File("./test_pair.hdf5", 'r')
hf_test_truth = h5py.File("./test_truth.hdf5", 'r')
#hf_pair_ev1 = h5py.File("./pair_index_ev1.hdf5", 'r')
#hf_nopair_ev1 = h5py.File("./nopair_index_ev1.hdf5", 'r')
#hf_cellFeaturesScaled_ev1 = h5py.File("./cellFeaturesScaled_ev1.hdf5", 'r')

In [6]:
cellFeaturesScaled= hf_cellFeaturesScaled.get('cellFeaturesScaled')[:]
#cellFeaturesScaled_ev1= hf_cellFeaturesScaled.get('cellFeaturesScaled')[:]
test_index_list = hf_test_index_list.get('test_index_list')[:]
test_pair = hf_test_pair.get('test_pair')[:]
test_truth = hf_test_truth.get('test_truth_label')[:]
#pair_ev1 = hf_pair_ev1.get("pair")[:]
#nopair_ev1 = hf_nopair_ev1.get("nopair")[:]

In [7]:
test_cellFeaturesScaled = cellFeaturesScaled[test_index_list]

In [8]:
cellFeaturesScaled.shape

(187652, 6)

In [12]:
len(test_cellFeaturesScaled)

71763

In [17]:
# Convert data to PyTorch tensors
testData = [torch.Tensor(data) for data in test_cellFeaturesScaled]

In [18]:
testDataloader = DataLoader(testData, batch_size=71763, shuffle=False)

In [19]:
testDataloader

<torch.utils.data.dataloader.DataLoader at 0x155949f70>

In [20]:
#testData

In [21]:
for batch in testDataloader:
        inputs, target = batch
        input1, input2 = inputs

        # Forward pass
        output1, output2 = model(input1, input2)
        distances = torch.pairwise_distance(output1, output2)
        dist.append(distances)
        label.append(target)

ValueError: too many values to unpack (expected 2)

In [22]:
for batch in testDataloader:
    testOutput = model.forward_one(batch)

In [23]:
len(testOutput)

71763

In [24]:
featuresIn16D = testOutput.detach().numpy()

In [25]:
featuresIn16D.shape

(71763, 16)

In [69]:
len(trial_indices)

206

In [17]:
featuresIn16D_60k = featuresIn16D[:60000]

In [70]:
featuresIn16D_trial = featuresIn16D[trial_indices]

In [316]:
from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.model_selection import GridSearchCV


db = DBSCAN(eps=0.01, min_samples=8).fit(featuresIn16D)

In [317]:
labels = db.labels_

In [318]:
len(labels)

71763

In [319]:
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

In [320]:
n_clusters_

277

In [321]:
n_noise_

36205

In [322]:
labelSet = set(labels)

In [323]:
min(labelSet)

-1

In [324]:
max(labelSet)

276

In [325]:
labelList = list(labelSet)

In [326]:
len(labelList)

278

In [327]:
labelList.pop(277)

-1

In [328]:
badLabels = [i for i in range(len(labels)) if labels[i]==-1]

In [329]:
test_index_list_bad = test_index_list[badLabels]

In [332]:
clusters = []
for item in labelList:
    cellIndices = [i for i in range(len(labels)) if labels[i] == item]
    clusters.append(cellIndices)

In [333]:
len(clusters)

277

In [336]:
len(clusters[2])

1128

In [199]:
test_index_list.shape

(71763,)

In [337]:
clustersCell_indices = []

for item in clusters:
    clustersCell_indices.append(test_index_list[item])


In [62]:
len(clustersCell_indices[2])

38

In [530]:
bad_cell = clustersCell_indices[0]

In [37]:
bad_cell_ev1 = clusters[0]

In [41]:
len(bad_cell_ev1)

50674

In [204]:
hf_nonParticipatingCell = h5py.File("./nonParticipatingCell.hdf5", 'r')
nonParticipatingCell = hf_nonParticipatingCell.get('nonParticipatingCell')[:]

In [205]:
hf_nonParticipatingCell.close()

In [40]:
len(nonParticipatingCell)

164437

In [330]:
sum_cell =0 
for cell in test_index_list_bad:
    if cell in nonParticipatingCell:
        sum_cell = sum_cell+1

In [331]:
sum_cell

27643

In [536]:
len(clustersCell_indices)

422

In [44]:
good_clusters = clusters[1:]

In [45]:
len(good_clusters)

293

In [338]:
predictedPairs = []
for cluster in clusters:
    for i in range(len(cluster)):
        for j in range(i + 1, len(cluster)):
            predictedPairs.append((cluster[i], cluster[j]))

In [339]:
len(predictedPairs)

13533081

In [340]:
len(test_pair)

300000

In [264]:
#predictedPairs

In [50]:
pair_ev1

<HDF5 dataset "pair": shape (2199950, 2), type "<i4">

In [540]:
truePair_indices = [i for i in range(len(test_truth)) if test_truth[i] == 1]

In [541]:
truePair = test_pair[truePair_indices]

In [542]:
len(truePair)

149641

In [543]:
len(predictedPairs)

100043

In [53]:
len(pair_ev1)

2199950

In [62]:
pair_ev1_1 = pair_ev1[:219995]
pair_ev1_2 = pair_ev1[219995:2*219995]
pair_ev1_3 = pair_ev1[2*219995:3*219995]
pair_ev1_4 = pair_ev1[3*219995:4*219995]
pair_ev1_5 = pair_ev1[4*219995:5*219995]

In [58]:
predictedTruth = 0
badPair = []
for pair in predictedPairs:
    if pair in pair_ev1_1:
        predictedTruth = predictedTruth+1
    else:
        badPair.append(pair)

In [64]:
predictedTruth

55668

In [341]:
predictedTruth = 0
for pair in predictedPairs:
    if pair in test_pair:
        predictedTruth = predictedTruth+1

KeyboardInterrupt: 

In [546]:
#badPair

In [547]:
hf_sharedCellsIndices = h5py.File("./sharedCellsIndices.hdf5", 'r')
sharedCells = hf_sharedCellsIndices.get('sharedCellsIndices')[:]
hf_sharedCellsIndices.close()

In [548]:
len(badPair)

15902

In [549]:
sharedPair = []
for pair in badPair:
    if pair[0] in sharedCells or pair[1] in sharedCells:
        sharedPair.append(pair)

In [550]:
len(sharedPair)

228