In [1]:
########## import some useful package ##########

import time
t1 = time.time()

import uproot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

import ripser
import persim

In [2]:
########## read the data ##########

# Process names; each one is also the prefix of its sample directory (see event_type).
processes = ["hhvv", "hhmumu", "jjBG_ptcut", "ttBG_ptcut", "bbBG_ptcut", "wwBG", "wwvvBG", "zzBG", "zzvvBG"]
event_type = [str(i)+"_500K" for i in processes]

# One cross section per entry of `processes`, in the same order.
Xsection = [4.0360050e-02, +3.3945676e-03, 1.345150e+01, 4.423600e+00, 2.325600e+00, 1.506969e+02, 1.224052e+01, 9.618000e+00, 6.473005e+00]     ### unit: fb
Luminosity = 1000   ### unit: fb^-1
simulation_num = 500000

# Branch names read from every file; the position in this list is the column index of `events[k]`.
features = ["VLCjetR10N2", "VLCjetR10N2.PT", "VLCjetR10N2.Mass", "VLCjetR10N2.Constituents", "EFlowTrack.fUniqueID", "EFlowTrack.PT", "EFlowTrack.Eta", "EFlowTrack.Phi", "EFlowTrack.PID", "EFlowPhoton.fUniqueID", "EFlowPhoton.ET", "EFlowPhoton.Eta", "EFlowPhoton.Phi", "EFlowNeutralHadron.fUniqueID", "EFlowNeutralHadron.ET", "EFlowNeutralHadron.Eta", "EFlowNeutralHadron.Phi"]

##### set data path #####

DATA_DIR = "/data/mucollider/two_boosted/"
# A comprehension keeps its loop variable local, so the builtin `type` is no
# longer overwritten at notebook scope (the old `for type in ...` loop did that).
event_path = [DATA_DIR + sample + "/delphes_output.root" for sample in event_type]

##### get the data file #####

# The files are intentionally left open: `data_file` stays available to later cells.
data_file = [uproot.open(sample_path) for sample_path in event_path]

##### read data with features #####
events = []     ### total events, one array per process, same order as `processes`
for root_file in data_file:
    tree = root_file["Delphes;1"]   # fetch the tree once instead of once per feature
    columns = [tree[feature].array() for feature in features]
    # Stack the per-feature columns and swap the first two axes so that each
    # row is one event and each column is one feature.
    table = np.expand_dims(columns, axis=-1)
    table = table.transpose((1,0,2))
    table = np.squeeze(table, axis=(2,))
    events.append(table)
    print("Time:{:^8.4f}(s)".format(time.time()-t1))
del root_file, tree, columns, table

  return cls.numpy.array(value, copy=False)


Time:39.2027 (s)
Time:100.1255(s)
Time:169.0124(s)
Time:248.3700(s)
Time:321.1157(s)
Time:368.3673(s)
Time:410.9279(s)
Time:460.9003(s)
Time:516.2707(s)


In [3]:
########## define useful function ##########

##### calculate significance #####

def significance(s,b):
    """Return sqrt(2 * ((s + b) * ln(1 + s/b) - s)) for signal `s` and background `b`.

    Parameters
    ----------
    s : float or array-like
        Signal yield.
    b : float or array-like
        Background yield; it appears as a divisor, so it must be non-zero.

    Returns
    -------
    float or ndarray
        The significance, element-wise for array inputs.

    Notes
    -----
    `np.log1p(s/b)` is used instead of `np.log(1 + s/b)`: when s << b the sum
    `1 + s/b` rounds away most of `s/b`, and the subsequent subtraction of `s`
    can then turn the radicand into rounding noise (even slightly negative,
    which yields NaN).
    """
    return np.sqrt(2*((s+b)*np.log1p(s/b)-s))

##### count event number #####

def count(events):
    """Print the number of events left in every process and the matching cross section.

    `events` is indexed in the same order as the notebook-level `processes`
    list. For each process the printed cross section is its `Xsection` entry
    multiplied by the number of events and divided by `simulation_num`.
    Nothing is returned.
    """
    for position, name in enumerate(processes):
        n_selected = len(events[position])
        scaled_xsection = Xsection[position]*n_selected/simulation_num
        print("There are", n_selected, name, "events. Corresponding cross section:", scaled_xsection, "(fb)")
        
##### select if Fat Jet >= 2 #####
        
def Fat_Jet_selection(events):
    """Return the rows of `events` whose "VLCjetR10N2" column is at least 2.

    `events` is a 2-D array with one row per event and one column per entry of
    the notebook-level `features` list.
    """
    jet_multiplicity = events[:, features.index("VLCjetR10N2")]
    passing_rows = np.flatnonzero(jet_multiplicity >= 2)
    return events[passing_rows]

##### select if low_mass_cut <= M_jet_leading <= high_mass_cut (GeV) #####

def mass_leading_selection(events, low_mass_cut, high_mass_cut):
    """Return the events whose leading-jet mass is inside [low_mass_cut, high_mass_cut].

    The leading jet is entry 0 of the "VLCjetR10N2.Mass" column. Both bounds
    are inclusive: an event is dropped only when its mass is strictly below
    the lower cut or strictly above the upper cut.
    """
    mass_column = features.index("VLCjetR10N2.Mass")
    kept_rows = []
    for row, event in enumerate(events):
        leading_mass = event[mass_column][0]
        if not (leading_mass < low_mass_cut or leading_mass > high_mass_cut):
            kept_rows.append(row)
    return events[kept_rows]

##### select if low_mass_cut <= M_jet_subleading <= high_mass_cut (GeV) #####

def mass_subleading_selection(events, low_mass_cut, high_mass_cut):
    """Return the events whose subleading-jet mass is inside [low_mass_cut, high_mass_cut].

    The subleading jet is entry 1 of the "VLCjetR10N2.Mass" column. Both
    bounds are inclusive: an event is dropped only when its mass is strictly
    below the lower cut or strictly above the upper cut.
    """
    mass_column = features.index("VLCjetR10N2.Mass")
    kept_rows = []
    for row, event in enumerate(events):
        subleading_mass = event[mass_column][1]
        if not (subleading_mass < low_mass_cut or subleading_mass > high_mass_cut):
            kept_rows.append(row)
    return events[kept_rows]

##### select if low_pT_cut <= pT_J_leading <= high_pT_cut (GeV) #####

def pT_leading_selection(events, low_pT_cut, high_pT_cut):
    """Return the events whose leading-jet pT is inside [low_pT_cut, high_pT_cut].

    The leading jet is entry 0 of the "VLCjetR10N2.PT" column. Both bounds are
    inclusive: an event is dropped only when its pT is strictly below the
    lower cut or strictly above the upper cut.
    """
    pT_column = features.index("VLCjetR10N2.PT")
    kept_rows = []
    for row, event in enumerate(events):
        leading_pT = event[pT_column][0]
        if not (leading_pT < low_pT_cut or leading_pT > high_pT_cut):
            kept_rows.append(row)
    return events[kept_rows]

##### select if low_pT_cut <= pT_J_subleading <= high_pT_cut (GeV) #####

def pT_subleading_selection(events, low_pT_cut, high_pT_cut):
    """Return the events whose subleading-jet pT is inside [low_pT_cut, high_pT_cut].

    The subleading jet is entry 1 of the "VLCjetR10N2.PT" column. Both bounds
    are inclusive: an event is dropped only when its pT is strictly below the
    lower cut or strictly above the upper cut.
    """
    pT_column = features.index("VLCjetR10N2.PT")
    kept_rows = []
    for row, event in enumerate(events):
        subleading_pT = event[pT_column][1]
        if not (subleading_pT < low_pT_cut or subleading_pT > high_pT_cut):
            kept_rows.append(row)
    return events[kept_rows]


In [4]:
def get_particle_info(event, max_constituents=20):
    """Flatten the highest-pT constituents of the two jets into one zero-padded vector.

    Parameters
    ----------
    event : sequence
        One event, indexed by position in the notebook-level `features` list.
    max_constituents : int, optional
        Maximum number of constituents kept per jet (default 20, the value
        that was previously hard-coded).

    Returns
    -------
    ndarray of shape (2 * max_constituents * 4,)
        The first half holds the leading jet and the second half the
        subleading jet. Each constituent contributes four consecutive values
        (PT, Eta, Phi, PID category), ordered from high to low PT; unused
        slots are zero.
    """
    def column(name):
        # Look a feature up by name in this event.
        return event[features.index(name)]

    ##### set jet constituents ID #####

    # The last entry of each constituent list is dropped.
    # NOTE(review): the reason for skipping it is not visible here - confirm.
    jet_constituents = column("VLCjetR10N2.Constituents")
    leadingJ_constituents = jet_constituents[0][:-1]
    subleadingJ_constituents = jet_constituents[1][:-1]

    ##### construct information for all EFlow particles in this event and sort by PT #####

    track_ID = column("EFlowTrack.fUniqueID")
    photon_ID = column("EFlowPhoton.fUniqueID")
    hadron_ID = column("EFlowNeutralHadron.fUniqueID")

    all_ID = np.concatenate((track_ID, photon_ID, hadron_ID))
    all_PT = np.concatenate((column("EFlowTrack.PT"), column("EFlowPhoton.ET"), column("EFlowNeutralHadron.ET")))
    all_Eta = np.concatenate((column("EFlowTrack.Eta"), column("EFlowPhoton.Eta"), column("EFlowNeutralHadron.Eta")))
    all_Phi = np.concatenate((column("EFlowTrack.Phi"), column("EFlowPhoton.Phi"), column("EFlowNeutralHadron.Phi")))
    # Only tracks carry a PID branch; photons are tagged 22 and neutral hadrons
    # get the placeholder value 4 so they can be categorised below.
    all_PID = np.concatenate((column("EFlowTrack.PID"), np.full(len(photon_ID), 22), np.full(len(hadron_ID), 4)))

    #####                          categorize PID                      #####
    ##### 1:electron 2:muon 3:photon 4:neutral hadron 5:charged hadron #####

    condlist = [abs(all_PID)==11, abs(all_PID)==13, abs(all_PID)==22, abs(all_PID)==4]
    choicelist = [1, 2, 3, 4]
    all_PID = np.select(condlist, choicelist, 5)   # everything else falls into category 5

    sort_order = np.argsort(all_PT)[::-1]   ### from large PT to low PT

    all_ID = all_ID[sort_order]
    all_PT = all_PT[sort_order]
    all_Eta = all_Eta[sort_order]
    all_Phi = all_Phi[sort_order]
    all_PID = all_PID[sort_order]

    ##### find jet particles among the EFlow particles and do zero padding #####

    values_per_particle = 4
    jet_width = max_constituents*values_per_particle
    particle_info = np.zeros(2*jet_width)

    for slot, constituent_IDs in enumerate((leadingJ_constituents, subleadingJ_constituents)):
        # The cap is applied unconditionally. Previously it was skipped when the
        # constituent list itself had <= 20 entries, so repeated IDs in all_ID
        # could produce more matches than the padded buffer has room for.
        where_jet = np.where(np.isin(all_ID, constituent_IDs))[0][:max_constituents]
        jet_info = np.stack((all_PT[where_jet], all_Eta[where_jet], all_Phi[where_jet], all_PID[where_jet]), axis=1).ravel()
        start = slot*jet_width
        particle_info[start:start+len(jet_info)] = jet_info

    return particle_info

In [5]:
########## preselection ##########

##### cut windows #####

leading_low_pT_cut = 200   ### GeV
leading_high_pT_cut = 800   ### GeV

subleading_low_pT_cut = 200   ### GeV
subleading_high_pT_cut = 600   ### GeV

leading_low_mass_cut = 65
leading_high_mass_cut = 150

subleading_low_mass_cut = 65
subleading_high_mass_cut = 150

##### selection stages: (header printed afterwards, selector, extra arguments), applied in order #####

selection_stages = [
    ("\nAfter 2 fat jet selection:", Fat_Jet_selection, ()),
    ("\nAfter leading jet pT selection:", pT_leading_selection, (leading_low_pT_cut, leading_high_pT_cut)),
    ("\nAfter subleading jet pT selection:", pT_subleading_selection, (subleading_low_pT_cut, subleading_high_pT_cut)),
    ("\nAfter leading jet mass selection:", mass_leading_selection, (leading_low_mass_cut, leading_high_mass_cut)),
    ("\nAfter subleading jet mass selection:", mass_subleading_selection, (subleading_low_mass_cut, subleading_high_mass_cut)),
]

print("Before any selection:")
count(events)

# Each stage filters every process in place, then reports the surviving yields.
for stage_header, selector, cut_window in selection_stages:
    for slot in range(len(processes)):
        events[slot] = selector(events[slot], *cut_window)
    print(stage_header)
    count(events)

Before any selection:
There are 500000 hhvv events. Corresponding cross section: 0.04036005 (fb)
There are 500000 hhmumu events. Corresponding cross section: 0.0033945676 (fb)
There are 500000 jjBG_ptcut events. Corresponding cross section: 13.4515 (fb)
There are 500000 ttBG_ptcut events. Corresponding cross section: 4.4236 (fb)
There are 500000 bbBG_ptcut events. Corresponding cross section: 2.3256 (fb)
There are 500000 wwBG events. Corresponding cross section: 150.6969 (fb)
There are 500000 wwvvBG events. Corresponding cross section: 12.24052 (fb)
There are 500000 zzBG events. Corresponding cross section: 9.618 (fb)
There are 500000 zzvvBG events. Corresponding cross section: 6.473005 (fb)

After 2 fat jet selection:
There are 500000 hhvv events. Corresponding cross section: 0.04036005 (fb)
There are 500000 hhmumu events. Corresponding cross section: 0.0033945676 (fb)
There are 500000 jjBG_ptcut events. Corresponding cross section: 13.4515 (fb)
There are 500000 ttBG_ptcut events. Cor

In [6]:
##### generate particle information #####

particle_info = []

# Processes are handled in `processes` order, so the flattened vectors of all
# processes end up concatenated in that order in one list.
for slot in range(len(processes)):
    particle_info.extend(get_particle_info(event) for event in events[slot])
    print("Time:{:^8.4f}(s)".format(time.time()-t1))

Time:629.2680(s)
Time:659.2483(s)
Time:667.3630(s)
Time:675.6273(s)
Time:684.3562(s)
Time:694.9510(s)
Time:715.2433(s)
Time:727.8392(s)
Time:752.5693(s)


In [8]:
# Display the first vector produced by get_particle_info (first entry appended to the list).
particle_info[0]

array([ 4.30954247e+01, -7.89546728e-01,  1.97008216e+00,  5.00000000e+00,
        3.17038307e+01, -7.59684265e-01,  1.96242058e+00,  3.00000000e+00,
        2.53575516e+01, -7.86899090e-01,  1.96201515e+00,  5.00000000e+00,
        2.52210560e+01, -8.08560967e-01,  1.93457544e+00,  5.00000000e+00,
        2.35498238e+01, -7.83316016e-01,  1.97401130e+00,  5.00000000e+00,
        2.14236622e+01, -7.78688490e-01,  1.96669447e+00,  5.00000000e+00,
        1.92623024e+01, -7.56675959e-01,  1.97009254e+00,  3.00000000e+00,
        1.58884048e+01, -7.79752731e-01,  1.97663045e+00,  5.00000000e+00,
        1.36894751e+01, -7.86555409e-01,  1.96184230e+00,  4.00000000e+00,
        1.30705929e+01, -7.64987528e-01,  1.90379560e+00,  4.00000000e+00,
        1.26837025e+01, -2.33904347e-01,  2.01430774e+00,  5.00000000e+00,
        1.17850733e+01, -7.79733002e-01,  1.95872068e+00,  4.00000000e+00,
        8.99926281e+00, -7.88574159e-01,  1.97822928e+00,  3.00000000e+00,
        7.98797560e+00, -

In [7]:
##### output test data #####

# Single-column table: every row holds the vector built for one event.
d = dict(particle_info=particle_info)
df = pd.DataFrame(d)
df

Unnamed: 0,particle_info
0,"[43.09542465209961, -0.7895467281341553, 1.970..."
1,"[40.64804458618164, 0.2635952830314636, -0.742..."
2,"[66.96754455566406, 1.329986572265625, 1.01288..."
3,"[58.80675506591797, 0.6966723799705505, 2.0086..."
4,"[20.046756744384766, 0.19954101741313934, -1.1..."
...,...
733354,"[121.3589859008789, -0.029672032222151756, -1...."
733355,"[75.09783935546875, -0.13999530673027039, 0.64..."
733356,"[80.9485092163086, 0.5361918807029724, 1.57844..."
733357,"[196.5435333251953, 1.0892689228057861, -1.657..."


In [10]:
# NOTE(review): consider moving this import to the notebook's first cell with the others.
import h5py

# Write every entry of `d` as one dataset. The context manager closes the file
# even when create_dataset raises, which the explicit close() call did not.
with h5py.File('/data/mucollider/two_boosted/data_LLfeature.h5', 'w') as h5f:
    for key, value in d.items():
        h5f.create_dataset(key, data=value)
        print(f"已寫入 Dataset: {key}")

已寫入 Dataset: particle_info
