In [1]:
import awkward as ak
import numpy as np
import uproot as uproot
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
# import tensorflow as tf
import glob
from numba import jit
import pickle
import os, errno
import time 

In [2]:
def mkdir_p(mypath):
    '''Function to create a new directory, if it not already exist
        - mypath : directory path
    '''
    from errno import EEXIST
    from os import makedirs,path
    try:
        makedirs(mypath)
    except OSError as exc:
        if exc.errno == EEXIST and path.isdir(mypath):
            pass
        else: raise



In [3]:
@jit
def computeEdgeAndLabels(trk_data, ass_data, gra_data, nodes, edges, edges_labels):
    '''Compute the truth graph'''
    for i in range(trk_data.NTracksters):
        nodes.append(i)
        qualities = ass_data.tsCLUE3D_recoToSim_CP_score[i]
        best_sts_i = ass_data.tsCLUE3D_recoToSim_CP[i][ak.argmin(qualities)]
        #best_sts_i = best_sts_i if qualities[best_sts_i]<0.1 else -1
        for j in gra_data.linked_inners[i]:
            edges.append([j,i])
            qualities = ass_data.tsCLUE3D_recoToSim_CP_score[j]
            best_sts_j = ass_data.tsCLUE3D_recoToSim_CP[j][ak.argmin(qualities)]
            #best_sts_j = best_sts_j if qualities[best_sts_j]<0.1 else -1
            if best_sts_i == best_sts_j:
                edges_labels.append(1)
            else:
                edges_labels.append(0)

In [4]:
# f = uproot.open("/eos/cms/store/group/dpg_hgcal/comm_hgcal/hackathon/samples/close_by_double_pion/production/new_new_ntuples/ntuples_3933206_0.root")
# t = f["ntuplizer/tracksters"]
# t.keys()

# #print(t.arrays(["vertices_indexes"])[0])
# for i in t.arrays(["raw_energy"])[0].raw_energy:
#     print(i)
# for i in t.arrays(["vertices_indexes"])[0].vertices_indexes:
#     print(i)
# print(ak.size(t.arrays(["vertices_indexes"])[0].vertices_indexes))

In [None]:
input_folder = "/eos/cms/store/group/dpg_hgcal/comm_hgcal/hackathon/samples/close_by_double_pion/production/new_new_ntuples/"
files = glob.glob(f"{input_folder}/*ntuples_*.root")


X = [ ]
Edges = [ ]
Edges_labels = [ ] 
outputPath  = './dataset_closeByDoublePion/'
mkdir_p(outputPath)

cum_events = 0

N = 10000000
offset = 20
for i_file, file in enumerate(files[offset:]):
    i_file = i_file + offset
    if i_file >= N: break
    try:
        with uproot.open(file) as f:
        #f = uproot.open(file)
            t =  f["ntuplizer/tracksters"]
            calo = f["ntuplizer/simtrackstersCP"]
            ass = f["ntuplizer/associations"]
            gra = f["ntuplizer/graph"]

            trk_data = t.arrays(["NTracksters", "raw_energy","raw_em_energy","barycenter_x","barycenter_y","barycenter_z","eVector0_x", "eVector0_y","eVector0_z","EV1","EV2","EV3","vertices_indexes", "sigmaPCA1", "sigmaPCA2", "sigmaPCA3"])
            gra_data = gra.arrays(['linked_inners'])
            ass_data = ass.arrays([ "tsCLUE3D_recoToSim_CP", "tsCLUE3D_recoToSim_CP_score"])

            X = [ ]
            Edges = [ ]
            Edges_labels = [ ] 
    
    except:
        print("error ", file)
        continue
    print('\nProcessing file {} '.format(file), end="")
    cum_events += len(gra_data)
    if(cum_events%1000 == 0):
        print(f"\nEvents {cum_events}")
    
    start = time.time()
    for ev in range(len(gra_data)):
        print(".", end="")
        
        trackster_sizes = []
        for vertices in trk_data[ev].vertices_indexes:
            trackster_sizes.append(ak.size(vertices))


        # Save the input variables

        x_ev = ak.zip({  "barycenter_x": trk_data[ev].barycenter_x,
                         "barycenter_y": trk_data[ev].barycenter_y,
                         "barycenter_z": trk_data[ev].barycenter_z,
                       "eVector0_x": trk_data[ev].eVector0_x,
                       "eVector0_y": trk_data[ev].eVector0_y,
                       "eVector0_z": trk_data[ev].eVector0_z,
                       "EV1": trk_data[ev].EV1,
                       "EV2": trk_data[ev].EV2,
                       "EV3": trk_data[ev].EV3,
                       "sigmaPCA1": trk_data[ev].sigmaPCA1,
                       "sigmaPCA2": trk_data[ev].sigmaPCA2,
                       "sigmaPCA3": trk_data[ev].sigmaPCA3,
                       "size": trackster_sizes,
                       "raw_en": trk_data[ev].raw_energy, 
                       'raw_em_energy': trk_data[ev].raw_em_energy
                      })

        X.append(x_ev)
        nodes = []
        edges = []
        edges_labels = []
        
        computeEdgeAndLabels(trk_data[ev], ass_data[ev], gra_data[ev], nodes, edges, edges_labels)
        ed_np = np.array(edges).T
        Edges.append(ed_np)
        Edges_labels.append(edges_labels)
        
        
        # Save to disk
        if((ev % 500 == 0 and ev != 0)  or (ev == len(gra_data))):
            stop = time.time()
            print(f"t = {stop-start}")
            print("Saving now the pickle data")

            pickle_dir = outputPath
            with open(pickle_dir+"{}_{}_node_features.pkl".format(str(i_file), str(ev)), "wb") as fp:   #Pickling
                pickle.dump(X, fp)
            with open(pickle_dir+"{}_{}_edges.pkl".format(str(i_file),str(ev)), "wb") as fp:   #Pickling
                pickle.dump(Edges, fp)
            with open(pickle_dir+"{}_{}_edges_labels.pkl".format(str(i_file),str(ev)), "wb") as fp:   #Pickling
                pickle.dump(Edges_labels, fp)
            #Emptying arrays
            ed_np = []
            X = []
            Edges = []
            Edges_labels = []
            start = time.time()


Processing file /eos/cms/store/group/dpg_hgcal/comm_hgcal/hackathon/samples/close_by_double_pion/production/new_new_ntuples/ntuples_3933206_130.root 
Events 5000
.

Compilation is falling back to object mode WITH looplifting enabled because Function "computeEdgeAndLabels" failed type inference due to: non-precise type pyobject
During: typing of argument at /tmp/ipykernel_1225/556552253.py (4)

File "../../../../../../../tmp/ipykernel_1225/556552253.py", line 4:
<source missing, REPL/exec in use?>

  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "computeEdgeAndLabels" failed type inference due to: cannot determine Numba type of <class 'numba.core.dispatcher.LiftedLoop'>

File "../../../../../../../tmp/ipykernel_1225/556552253.py", line 4:
<source missing, REPL/exec in use?>

  @jit

File "../../../../../../../tmp/ipykernel_1225/556552253.py", line 4:
<source missing, REPL/exec in use?>

Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.pydata.org/numba-doc/latest/reference/deprecati

....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................t = 62.64076542854309
Saving now the pickle data
...................................................................................................................................................................................................................................................................................................................................................................................................................................................................

....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................t = 70.00877642631531
Saving now the pickle data
...................................................................................................................................................................................................................................................................................................................................................................................................................................................................

....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................t = 62.522770404815674
Saving now the pickle data
..................................................................................................................................................................................................................................................................................................................................................................................................................................................................