In [1]:
import time
t1 = time.time()

import math
import os
import pandas as pd
import uproot
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

import ripser
import persim

In [2]:
##### setting file path #####

# Locations of the four input ROOT files (one signal sample, three background samples).
sig_rootpath = "/data/Chen-Wang/gghh_highPT_250_200K/Events/run_01_decayed_1/tag_1_delphes_events.root"
ttbarBG_rootpath = "/data/Chen-Wang/ttbarBG_200K/Events/run_01/tag_1_delphes_events.root"
jjBG_rootpath = "/data/Chen-Wang/jjBG_200PT_200K/Events/run_01/tag_1_delphes_events.root"
jjBG_2_rootpath = "/data/Chen-Wang/jjBG_200PT_additional/Events/run_01/tag_1_delphes_events.root"

# Open every file with uproot; the handles are read by the data-loading cell.
sig_file = uproot.open(sig_rootpath)
ttbarBG_file = uproot.open(ttbarBG_rootpath)
jjBG_file = uproot.open(jjBG_rootpath)
jjBG_2_file = uproot.open(jjBG_2_rootpath)

In [3]:
##### branches read for every sample #####

# The position in this list is the index used by the selection / feature
# functions (events[i][k] is branch k of event i).
EVENT_BRANCHES = [
    "FatJet",              ### 0: number of fat jet
    "FatJet.Eta",          ### 1: eta of fat jet
    "FatJet.Phi",          ### 2: phi of fat jet
    "FatJet.PT",           ### 3: pT of fat jet
    "FatJet.Mass",         ### 4: mass of fat jet
    "FatJet.Tau[5]",       ### 5: tau values of fat jet
    "FatJet.Particles",    ### 6: fat jet particles
    "Particle.PT",         ### 7: particle pT
    "Particle.Eta",        ### 8: particle eta
    "Particle.Phi",        ### 9: particle phi
    "Jet.BTag",            ### 10: Jet.BTag
]


def load_events(root_file):
    """Read EVENT_BRANCHES from the "Delphes;1" tree and return them event-major.

    Parameters
    ----------
    root_file : object returned by uproot.open
        Must contain a "Delphes;1" tree with every branch in EVENT_BRANCHES.

    Returns
    -------
    numpy.ndarray
        The branch arrays stacked and transposed so that the first axis runs
        over events and the second over branches (same expand_dims /
        transpose / squeeze sequence for every sample).
    """
    tree = root_file["Delphes;1"]
    branches = [tree[branch_name].array() for branch_name in EVENT_BRANCHES]

    ##### reshape the data: (branch, event) -> (event, branch) #####
    events = np.expand_dims(branches, axis=-1)
    events = events.transpose((1,0,2))
    events = np.squeeze(events,axis=(2,))
    return events


##### include signal data #####

sig_events = load_events(sig_file)
num_sig = len(sig_events)

##### include ttbarBG data #####

ttbarBG_events = load_events(ttbarBG_file)
num_ttbarBG = len(ttbarBG_events)

##### include jjBG data #####

jjBG_events = load_events(jjBG_file)
num_jjBG = len(jjBG_events)

##### include jjBG2 data #####

jjBG_2_events = load_events(jjBG_2_file)
num_jjBG_2 = len(jjBG_2_events)

  return cls.numpy.array(value, copy=False)


In [4]:
##### useful functions #####

##### select if Fat Jet >= 2 #####

def Fat_Jet_selection(events):
    """Return the rows of `events` whose column 0 (number of fat jets) is >= 2."""
    has_two_fat_jets = events[:,0] >= 2    ### 0: number of fat jet
    kept_rows = np.nonzero(has_two_fat_jets)
    return events[kept_rows]

##### select if M_jet >= 50 GeV for both the leading and sub-leading fat jet #####

def mass_selection(events):
    """Keep events in which neither of the first two fat-jet masses is below 50.

    An event is dropped as soon as one of events[i][4][0] / events[i][4][1]
    compares as `< 50`; everything else is kept.
    """
    kept = []
    for index in range(len(events)):
        masses = events[index][4]    ### 4: mass of fat jet
        leading_too_light = masses[0] < 50
        subleading_too_light = masses[1] < 50
        if not (leading_too_light or subleading_too_light):
            kept.append(index)
    return events[kept]

##### calculate X_HH #####

def cal_X_HH(jet_mass1, jet_mass2):
    """Return X_HH for a pair of jet masses.

    The mass with the smaller (signed) difference to 124 is used as m1 and
    compared with 124; the other one is used as m2 and compared with 115.
    Each deviation is scaled by 0.1*m (+0.00001 to avoid dividing by zero)
    and the two terms are added in quadrature.
    """
    # NOTE(review): the differences are compared with their sign, so this
    # ordering is effectively "m1 = lighter jet". If "closest to 124" was
    # intended an abs() is missing -- confirm before changing, since the
    # X_HH < 10 selection depends on it.
    if (jet_mass1 - 124) < (jet_mass2 - 124):
        m1, m2 = jet_mass1, jet_mass2
    else:
        m1, m2 = jet_mass2, jet_mass1
    term1 = (m1-124)/(0.1*m1+0.00001)
    term2 = (m2-115)/(0.1*m2+0.00001)
    return np.sqrt(term1**2 + term2**2)

##### calculate iht #####

def cal_iht(events):
    """Return the sum of the first two entries of events[3] (fat-jet pT) for one event."""
    fat_jet_pt = events[3]    ### 3: pT of fat jet
    return fat_jet_pt[0]+fat_jet_pt[1]

##### calculate M_JJ #####

def cal_m_JJ(events):
    """Return the invariant mass of the leading + sub-leading fat jet of one event.

    Each jet's four-momentum is built from (pT, eta, phi, mass), the two are
    summed component by component, and sqrt(E^2 - px^2 - py^2 - pz^2) of the
    sum is returned.
    """
    energy = px = py = pz = 0    ### summed four momentum
    for jet in (0, 1):    ### leading jet and sub-leading jet
        pt = events[3][jet]    ### 3: pT of fat jet
        eta = events[1][jet]    ### 1: eta of fat jet
        phi = events[2][jet]    ### 2: phi of fat jet
        mass = events[4][jet]    ### 4: mass of fat jet
        jet_px = pt*np.cos(phi)
        jet_py = pt*np.sin(phi)
        jet_pz = pt*np.sinh(eta)
        px = px + jet_px
        py = py + jet_py
        pz = pz + jet_pz
        energy = energy + np.sqrt( mass**2 + jet_px**2 + jet_py**2 + jet_pz**2 )
    return np.sqrt(energy**2 - px**2 - py**2 - pz**2)

##### calculate deta_JJ #####

def cal_deta_JJ(events):
    """Return |eta_1 - eta_2| of the first two fat jets of one event."""
    fat_jet_eta = events[1]    ### 1: eta of fat jet
    return abs(fat_jet_eta[0]-fat_jet_eta[1])

##### calculate J1_tau21 #####

def cal_J1_tau21(events):
    """Return tau2/tau1 of the leading fat jet (entries 1 and 0 of events[5][0])."""
    leading_tau = events[5][0]    ### 5: tau values of fat jet, 0: leading jet
    return leading_tau[1]/leading_tau[0]

##### calculate J2_tau21 #####

def cal_J2_tau21(events):
    """Return tau2/tau1 of the sub-leading fat jet (entries 1 and 0 of events[5][1])."""
    subleading_tau = events[5][1]    ### 5: tau values of fat jet, 1: sub-leading jet
    return subleading_tau[1]/subleading_tau[0]

##### calculate TDA #####

def _wrap_phi(phi):
    """Map an angle (scalar or array, in radians) onto the interval [-pi, pi)."""
    return (phi + np.pi) % (2*np.pi) - np.pi


def cal_TDA(event):
    """Return the ripser persistence diagrams of the two leading fat jets' particles.

    The particles referenced by the leading and sub-leading fat jet are
    placed in the (eta, phi) plane relative to the midpoint of the two jet
    axes, and that point cloud is passed to ripser.

    Parameters
    ----------
    event : sequence
        One event; uses 1/2 (fat-jet eta/phi), 6 (fat-jet particle
        references) and 8/9 (particle eta/phi, indexable by an integer array).

    Returns
    -------
    list
        The 'dgms' entry of ripser.ripser's result.
    """
    ### 6: fat jet particles. list(...) so that the two reference sequences
    ### are concatenated even if they are arrays (array + array would add).
    where_jet_particle = list(event[6][0]) + list(event[6][1])
    ### shift references by one to get positions in the particle arrays
    ### (presumably the stored references are 1-based -- TODO confirm)
    where_jet_particle = np.asarray(where_jet_particle, dtype=int) - 1

    center_eta = (event[1][0]+event[1][1])/2    ### 1: fat jet eta
    ### 2: fat jet phi. phi is periodic, so the midpoint is taken along the
    ### shorter arc between the two jets; a plain arithmetic mean puts the
    ### centre on the wrong side when the jets straddle the +-pi boundary.
    center_phi = event[2][0] + _wrap_phi(event[2][1]-event[2][0])/2

    particle_eta = event[8][where_jet_particle]-center_eta    ### 8: particle eta, and centerize
    ### 9: particle phi, centerize and fold back into [-pi, pi) so a jet is not split
    particle_phi = _wrap_phi(event[9][where_jet_particle]-center_phi)

    P = np.array([particle_eta, particle_phi]).T
    diagrams = ripser.ripser(P)['dgms']
    return diagrams

##### calculate TDA sum of lifetime #####

def cal_TDA_sum(diagrams):
    """Return (sum of H0 lifetimes, sum of H1 lifetimes) for one set of diagrams.

    A lifetime is column 1 minus column 0 of a diagram. The last row of
    diagrams[0] is left out (presumably the never-dying component -- confirm
    against ripser's output ordering).
    """
    h0_rows = diagrams[0][:-1]
    h1_rows = diagrams[1]
    lifetimes_0 = h0_rows[:,1] - h0_rows[:,0]
    lifetimes_1 = h1_rows[:,1] - h1_rows[:,0]
    return np.sum(lifetimes_0), np.sum(lifetimes_1)

##### calculate TDA mean of lifetime #####

def cal_TDA_mean(diagrams):
    """Return (mean H0 lifetime, mean H1 lifetime) for one set of diagrams.

    The last row of diagrams[0] is left out. When the H1 lifetimes sum to
    zero (e.g. there are none) a single 0 is appended first, so the H1 mean
    is 0 instead of the mean of an empty array.
    """
    h0_rows = diagrams[0][:-1]
    h1_rows = diagrams[1]
    lifetimes_0 = h0_rows[:,1] - h0_rows[:,0]
    lifetimes_1 = h1_rows[:,1] - h1_rows[:,0]
    if np.sum(lifetimes_1) == 0:
        lifetimes_1 = np.append(lifetimes_1, 0)
    return np.mean(lifetimes_0), np.mean(lifetimes_1)

##### calculate TDA entropy #####

def _lifetime_entropy(lifetimes):
    """Shannon entropy (base 2) of lifetimes normalised to unit sum.

    Lifetimes that are not strictly positive are skipped: a zero lifetime
    would contribute 0*log2(0), which evaluates to NaN and would turn the
    whole entropy into NaN, whereas its contribution should be zero.
    """
    positive = lifetimes[lifetimes > 0]
    weights = positive/np.sum(positive)
    return -np.sum(weights*np.log2(weights))


def cal_TDA_entropy(diagrams):
    """Return (H0 entropy, H1 entropy) of the lifetimes in one set of diagrams.

    Parameters
    ----------
    diagrams : sequence of 2-column numpy arrays
        diagrams[0] and diagrams[1] hold (column 0, column 1) pairs whose
        difference is the lifetime; the last row of diagrams[0] is left out.

    Returns
    -------
    tuple
        (-sum p*log2(p) over H0, -sum p*log2(p) over H1) with p = lifetime /
        total lifetime. A diagram without positive lifetimes gives zero.
    """
    l0 = diagrams[0][:-1,1]-diagrams[0][:-1,0]
    l1 = diagrams[1][:,1]-diagrams[1][:,0]
    return _lifetime_entropy(l0), _lifetime_entropy(l1)

##### calculate TDA Lifetime*Birthtime #####

def cal_TDA_LB(diagrams):
    """Return the sum over diagrams[1] of column 0 times (column 1 - column 0).

    With column 0 as birth and column 1 as death this is the sum of
    birth * lifetime of the H1 features.
    """
    h1_rows = diagrams[1]
    births = h1_rows[:,0]
    deaths = h1_rows[:,1]
    return np.sum(births*(deaths-births))

##### select if Btag (small jet) >= 2 #####

def Btag_selection(events):
    """Keep events whose Jet.BTag entries (index 10) sum to at least 2."""
    kept = [
        index
        for index in range(len(events))
        if np.sum(events[index][10]) >= 2    ### 10: Jet.BTag
    ]
    return events[kept]

##### select X_HH < 10 #####

def X_HH_selection(events):
    """Keep events whose X_HH, computed from the first two fat-jet masses, is below 10."""
    kept = []
    for index, event in enumerate(events):
        fat_jet_mass = event[4]    ### 4: mass of fat jet
        if cal_X_HH(fat_jet_mass[0], fat_jet_mass[1]) < 10:
            kept.append(index)
    return events[kept]

In [5]:
##### basic selection #####

def basic_selection(events):
    """Apply the fat-jet, mass, b-tag and X_HH selections, in that order."""
    for selection in (Fat_Jet_selection, mass_selection, Btag_selection, X_HH_selection):
        events = selection(events)
    return events


def report_selection(label, events, n_original):
    """Print how many `label` events survive, next to the number before the selection."""
    print("There are", len(events), label, "events left after the selection with originally event number", n_original)


# NOTE: each num_* is overwritten with the post-selection count because the
# label cell further down sizes its arrays from them. Re-running this cell on
# its own therefore reports the already-reduced count as the "original" one.

sig_events = basic_selection(sig_events)
report_selection("signal", sig_events, num_sig)
num_sig = len(sig_events)

ttbarBG_events = basic_selection(ttbarBG_events)
report_selection("ttbarBG", ttbarBG_events, num_ttbarBG)
num_ttbarBG = len(ttbarBG_events)

jjBG_events = basic_selection(jjBG_events)
report_selection("jjBG", jjBG_events, num_jjBG)
num_jjBG = len(jjBG_events)

jjBG_2_events = basic_selection(jjBG_2_events)
report_selection("jjBG_2", jjBG_2_events, num_jjBG_2)
num_jjBG_2 = len(jjBG_2_events)

There are 102879 signal events left after the selection with originally event number 200000
There are 50065 ttbarBG events left after the selection with originally event number 200000
There are 1033 jjBG events left after the selection with originally event number 200000
There are 5288 jjBG_2 events left after the selection with originally event number 1000000


In [6]:
##### calculate high level features #####

# Samples in the order in which their events are concatenated in every
# feature list (signal first, then the three backgrounds); the label arrays
# built later assume exactly this order.
all_samples = (sig_events, ttbarBG_events, jjBG_events, jjBG_2_events)


def collect_feature(feature_fn, samples):
    """Return [feature_fn(event) for every event of every sample], sample by sample."""
    return [feature_fn(event) for events in samples for event in events]


X_HH = collect_feature(lambda event: cal_X_HH(event[4][0], event[4][1]), all_samples)
print("X_HH finished after time:", time.time()-t1)

iht = collect_feature(cal_iht, all_samples)
print("iht finished after time:", time.time()-t1)

m_JJ = collect_feature(cal_m_JJ, all_samples)
print("m_JJ finished after time:", time.time()-t1)

m_J1 = collect_feature(lambda event: event[4][0], all_samples)    ### 4: mass of fat jet, leading
print("m_J1 finished after time:", time.time()-t1)

m_J2 = collect_feature(lambda event: event[4][1], all_samples)    ### 4: mass of fat jet, sub-leading
print("m_J2 finished after time:", time.time()-t1)

deta_JJ = collect_feature(cal_deta_JJ, all_samples)
print("deta_JJ finished after time:", time.time()-t1)

J1_tau21 = collect_feature(cal_J1_tau21, all_samples)
print("J1_tau21 finished after time:", time.time()-t1)

J2_tau21 = collect_feature(cal_J2_tau21, all_samples)
print("J2_tau21 finished after time:", time.time()-t1)

X_HH finished after time: 289.99944162368774
iht finished after time: 290.1119043827057
m_JJ finished after time: 297.01312232017517
m_J1 finished after time: 297.0812871456146
m_J2 finished after time: 297.1437301635742
deta_JJ finished after time: 297.2611267566681
J1_tau21 finished after time: 297.4261796474457
J2_tau21 finished after time: 297.5780303478241


In [7]:
##### calculate TDA features #####

H_0 = []     ### H is the sum of lifetime
H_1 = []
SS_0 = []    ### SS is the entropy of lifetime
SS_1 = []
LB_1 = []    ### LB is the lifetime*birthtime

# Same sample order as the other feature lists: signal, ttbarBG, jjBG, jjBG_2.
tda_samples = [
    ("signal", sig_events),
    ("ttbarBG", ttbarBG_events),
    ("jjBG", jjBG_events),
    ("jjBG_2", jjBG_2_events),
]

for label, events in tda_samples:
    for event in events:
        diagrams = cal_TDA(event)
        # each summary is evaluated once per event and then unpacked
        sum_l0, sum_l1 = cal_TDA_sum(diagrams)
        entropy_0, entropy_1 = cal_TDA_entropy(diagrams)
        H_0.append(sum_l0)
        H_1.append(sum_l1)
        SS_0.append(entropy_0)
        SS_1.append(entropy_1)
        LB_1.append(cal_TDA_LB(diagrams))
    print(label, "TDA features finished after time:", time.time()-t1)

signal TDA features finished after time: 854.9578542709351
ttbarBG TDA features finished after time: 1118.392730474472
jjBG TDA features finished after time: 1125.1959760189056
jjBG_2 TDA features finished after time: 1160.1312658786774


In [29]:
##### create true label #####

# Events are ordered signal, ttbarBG, jjBG, jjBG_2 in every feature list;
# the num_* values are the per-sample event counts after the selection.
num_all_BG = num_ttbarBG + num_jjBG + num_jjBG_2
num_all_jjBG = num_jjBG + num_jjBG_2

### two-class label: signal as 0, every background as 1
target2 = np.concatenate([np.zeros(num_sig), np.ones(num_all_BG)])

### three-class label: signal as 0, ttbarBG as 1, jjBG (both samples) as 2
target3 = np.concatenate([np.zeros(num_sig), np.ones(num_ttbarBG), np.full(num_all_jjBG, 2.0)])

In [30]:
# Names of the feature columns: 8 kinematic/substructure features followed by 5 TDA features.
features_TDA = [
    'X_HH', 'iht', 'm_JJ', 'm_J1', 'm_J2', 'deta_JJ', 'J1_tau21', 'J2_tau21',
    'H_0', 'H_1', 'SS_0', 'SS_1', 'LB_1',
]

# The feature lists and the labels must have the same length; show both shapes.
for column in (LB_1, target2):
    print(np.shape(column))

(159265,)
(159265,)


In [31]:
# One column per feature plus the two label columns, in the order shown in the table.
d = dict(
    X_HH=X_HH,
    iht=iht,
    m_JJ=m_JJ,
    m_J1=m_J1,
    m_J2=m_J2,
    deta_JJ=deta_JJ,
    J1_tau21=J1_tau21,
    J2_tau21=J2_tau21,
    H_0=H_0,
    H_1=H_1,
    SS_0=SS_0,
    SS_1=SS_1,
    LB_1=LB_1,
    target2=target2,
    target3=target3,
)
df = pd.DataFrame(data=d)
df

Unnamed: 0,X_HH,iht,m_JJ,m_J1,m_J2,deta_JJ,J1_tau21,J2_tau21,H_0,H_1,SS_0,SS_1,LB_1,target2,target3
0,2.169814,498.863251,552.356994,102.008881,112.236855,0.482302,0.221553,0.385671,13.233662,0.314678,5.709411,1.573460,0.355070,0.0,0.0
1,9.475008,593.795044,597.207660,67.299019,80.223694,0.341132,0.742909,0.179675,10.209827,0.090669,4.825404,2.549878,0.018734,0.0,0.0
2,3.305570,639.797974,709.878242,136.601318,168.479492,0.386040,0.395503,0.343482,14.189290,0.422620,6.027356,2.983404,0.161782,0.0,0.0
3,6.146391,496.671631,524.943021,88.915337,80.513718,0.158647,0.406887,0.329383,8.284440,0.056204,5.248461,2.160549,0.004379,0.0,0.0
4,7.433843,1078.768555,1137.874294,358.927338,95.291557,0.214824,0.313913,0.576201,10.056411,0.340978,5.419955,1.501408,0.130146,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159260,5.089304,1573.573853,1599.042715,88.412392,167.013168,0.173529,0.855478,0.528775,10.469873,0.190693,5.289431,1.903203,0.081365,1.0,2.0
159261,9.340556,455.242462,499.149634,68.203171,79.269127,0.582812,0.608380,0.446284,10.620731,0.318192,5.604804,2.640760,0.095618,1.0,2.0
159262,7.773527,632.782104,942.910524,70.630043,97.251419,1.810463,0.646579,0.607893,12.022771,0.135812,5.574103,3.047439,0.022354,1.0,2.0
159263,6.504410,1074.433838,1118.811352,83.542770,203.263428,0.063253,0.594353,0.144684,12.091694,0.199152,5.600882,3.292116,0.059199,1.0,2.0


In [32]:
##### output test data #####
import h5py

# One HDF5 dataset per feature / label, written under exactly these names.
h5_datasets = {
    'X_HH': X_HH,
    'iht': iht,
    'm_JJ': m_JJ,
    'm_J1': m_J1,
    'm_J2': m_J2,
    'deta_JJ': deta_JJ,
    'J1_tau21': J1_tau21,
    'J2_tau21': J2_tau21,
    'H_0': H_0,
    'H_1': H_1,
    'SS_0': SS_0,
    'SS_1': SS_1,
    'LB_1': LB_1,
    'target2': target2,
    'target3': target3,
}

# 'w' truncates an existing file. The with-block closes the file even when
# one of the writes raises, so no half-written handle is left open.
with h5py.File('/data/Chen-Wang/test_data_HLfeature.h5', 'w') as h5f:
    for dataset_name, values in h5_datasets.items():
        h5f.create_dataset(dataset_name, data=values)