In [None]:
import awkward as ak
import numpy as np
import uproot as uproot
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
# import tensorflow as tf
import glob
from numba import jit
import pickle
import os, errno
import time 

In [None]:
def mkdir_p(mypath):
    '''Create a directory (and any missing parents) if it does not already exist.
        - mypath : directory path

    Calling it on an existing directory is a no-op. An OSError is still raised
    when the path cannot be created, e.g. when it exists but is not a directory.
    '''
    # exist_ok=True replaces the hand-written "ignore EEXIST if it is a directory" logic.
    os.makedirs(mypath, exist_ok=True)



In [None]:
def computeEdgeAndLabels(trk_data, ass_data, gra_data, nodes, edges, edges_labels, best_sim_ts):
    '''Compute the truth graph of one event, filling the output lists in place.

    The function is intentionally NOT decorated with numba ``@jit``: the body
    calls ``ak.argmin`` and appends to plain (initially empty) Python lists,
    which numba cannot compile in nopython mode. It only ever ran through the
    object-mode fallback, which newer numba releases no longer provide.

        - trk_data     : per-event record; only ``NTracksters`` is read
        - ass_data     : per-event record providing ``tsCLUE3D_recoToSim_CP`` and
                         ``tsCLUE3D_recoToSim_CP_score``
        - gra_data     : per-event record providing ``linked_inners``
        - nodes        : list, receives the trackster indices 0 .. NTracksters-1
        - edges        : list, receives one ``[j, i]`` pair for every ``j`` in ``linked_inners[i]``
        - edges_labels : list, receives 1 when both ends of the edge share the same
                         best sim trackster, 0 otherwise (same order as ``edges``)
        - best_sim_ts  : list, receives for every trackster the entry of
                         ``tsCLUE3D_recoToSim_CP`` that has the smallest score

    Nothing is returned. No threshold is applied on the score.
    '''
    sim_ids = ass_data.tsCLUE3D_recoToSim_CP
    scores = ass_data.tsCLUE3D_recoToSim_CP_score

    # A trackster can be the end point of many edges: look its best sim trackster up once.
    best_cache = {}

    def _best_sim(idx):
        if idx not in best_cache:
            best_cache[idx] = sim_ids[idx][ak.argmin(scores[idx])]
        return best_cache[idx]

    for i in range(trk_data.NTracksters):
        nodes.append(i)
        best_sts_i = _best_sim(i)
        best_sim_ts.append(best_sts_i)
        for j in gra_data.linked_inners[i]:
            edges.append([j, i])
            edges_labels.append(1 if best_sts_i == _best_sim(j) else 0)

In [None]:
# Build the graph dataset: for every event of every input ntuple collect
#   - X            : per-trackster node features (one awkward record array per event)
#   - Edges        : 2 x n_edges array of [inner, outer] trackster indices
#   - Edges_labels : 1/0 label per edge from computeEdgeAndLabels
#   - enOnLayers   : per-trackster vector with the layer-cluster energy summed per layer id
# and pickle the accumulated lists to outputPath every SAVE_EVERY events.
input_folder = "/eos/cms/store/group/dpg_hgcal/comm_hgcal/hackathon/samples/wredjeb/close_by_two_pions/ntuples_10_600/"
files = glob.glob(f"{input_folder}/*ntuples_*.root")


X = [ ]
Edges = [ ]
Edges_labels = [ ]
# Must exist before the event loop appends to it (it was never initialised,
# so a fresh kernel failed with NameError on the first event).
enOnLayers = [ ]
outputPath  = './dataset_closeByDoublePion/'
mkdir_p(outputPath)

cum_events = 0

N = 10000000
offset = 0
N_LAYER_SLOTS = 51   # length of the per-trackster energy-per-layer vector, indexed by cluster_layer_id
SAVE_EVERY = 500     # number of processed events between two pickle dumps

for i_file, file in enumerate(files[offset:1]):
    i_file = i_file + offset
    if i_file >= N: break
    try:
        with uproot.open(file) as f:
            t =  f["ticlNtuplizer/tracksters"]
            calo = f["ticlNtuplizer/simtrackstersCP"]  # looked up but not read below
            ass = f["ticlNtuplizer/associations"]
            gra = f["ticlNtuplizer/graph"]
            l = f['ticlNtuplizer/clusters']

            trk_data = t.arrays(["NTracksters", "raw_energy","raw_em_energy","barycenter_x","barycenter_y","barycenter_z","eVector0_x", "eVector0_y","eVector0_z","EV1","EV2","EV3","vertices_indexes", "sigmaPCA1", "sigmaPCA2", "sigmaPCA3"])
            gra_data = gra.arrays(['linked_inners'])
            ass_data = ass.arrays([ "tsCLUE3D_recoToSim_CP", "tsCLUE3D_recoToSim_CP_score"])
            lcs = l.arrays(['energy', 'cluster_layer_id'])

            # Start each file with empty accumulators (all four, so they stay aligned).
            # NOTE(review): events of the previous file that were not yet pickled are dropped here.
            X = [ ]
            Edges = [ ]
            Edges_labels = [ ]
            enOnLayers = [ ]

    except Exception as exc:
        # Best effort: report the unreadable file together with the reason and move on.
        # (A bare except would also swallow KeyboardInterrupt.)
        print("error ", file, repr(exc))
        continue
    print('\nProcessing file {} '.format(file), end="")
    if(cum_events%1000 == 0):
        print(f"\nEvents {cum_events}")

    start = time.time()
    for ev in range(len(gra_data)):
        cum_events += 1
        print(".", end="")

        trk_ev = trk_data[ev]
        lcs_ev = lcs[ev]

        trackster_sizes = []
        tot_en_lay = []
        for vertices in trk_ev.vertices_indexes:
            trackster_sizes.append(ak.size(vertices))
            # Layer clusters belonging to this trackster, summed in energy per layer id.
            t_hgcal_layers = np.zeros(N_LAYER_SLOTS)
            lcs_trk = lcs_ev[vertices]
            for l_id in np.unique(ak.to_numpy(lcs_trk.cluster_layer_id)):
                t_hgcal_layers[l_id] = ak.sum(lcs_trk[lcs_trk.cluster_layer_id == l_id].energy)
            tot_en_lay.append(t_hgcal_layers)

        best_sim_ts = []
        nodes = []
        edges = []
        edges_labels = []
        computeEdgeAndLabels(trk_ev, ass_data[ev], gra_data[ev], nodes, edges, edges_labels, best_sim_ts)

        # Save the input variables
        x_ev = ak.zip({  "barycenter_x": trk_ev.barycenter_x,
                         "barycenter_y": trk_ev.barycenter_y,
                         "barycenter_z": trk_ev.barycenter_z,
                         "eVector0_x": trk_ev.eVector0_x,
                         "eVector0_y": trk_ev.eVector0_y,
                         "eVector0_z": trk_ev.eVector0_z,
                         "EV1": trk_ev.EV1,
                         "EV2": trk_ev.EV2,
                         "EV3": trk_ev.EV3,
                         "sigmaPCA1": trk_ev.sigmaPCA1,
                         "sigmaPCA2": trk_ev.sigmaPCA2,
                         "sigmaPCA3": trk_ev.sigmaPCA3,
                         "size": trackster_sizes,
                         "raw_en": trk_ev.raw_energy,
                         'raw_em_energy': trk_ev.raw_em_energy,
                         "best_sim_ts" : best_sim_ts
                      })

        X.append(x_ev)
        Edges.append(np.array(edges).T)
        Edges_labels.append(edges_labels)
        enOnLayers.append(tot_en_lay)

        # Save to disk (cum_events is already >= 1 here, so no extra non-zero check is needed)
        if cum_events % SAVE_EVERY == 0:
            stop = time.time()
            print(f"t = {stop-start}")
            print("Saving now the pickle data")

            pickle_dir = outputPath
            payloads = {
                "node_features": X,
                "edges": Edges,
                "edges_labels": Edges_labels,
                "energies_on_layers": enOnLayers,
            }
            for tag, payload in payloads.items():
                with open(os.path.join(pickle_dir, f"{i_file}_{cum_events}_{tag}.pkl"), "wb") as fp:   #Pickling
                    pickle.dump(payload, fp)

            # Emptying arrays
            X = [ ]
            Edges = [ ]
            Edges_labels = [ ]
            enOnLayers = [ ]
            start = time.time()

# NOTE(review): events processed after the last multiple of SAVE_EVERY stay in
# X / Edges / Edges_labels / enOnLayers and are not written to disk.

### Test

In [None]:
# Quick look at the first event still held in X (X is emptied after every pickle dump in the loop above).
x = X[0]

In [None]:
# NOTE(review): the records appended to X in this notebook are zipped without an
# "energy_on_layer" field (the per-layer energies are collected in enOnLayers instead)
# — confirm which container this plot is meant to read.
plt.plot(np.arange(0,51), x.energy_on_layer[4])