# Notebook for creation of the tabular dataset (env: Scikit-HEP)

### Load the packages we are going to use:

In [None]:
import numpy as np 
from pyjet import cluster,DTYPE_PTEPM
import pandas as pd
from tqdm import tqdm
from itertools import combinations 

### Load the data we want to use to create the tabular dataset:

#### Instead of loading the full, large h5 file at once, we are going to make a generator that loads the file in more manageable batches:

In [None]:
def generator(filename, chunksize=512,total_size=1100000):
    """Endlessly yield consecutive row chunks of an HDF5 file.

    Parameters
    ----------
    filename : path passed straight to ``pd.read_hdf``.
    chunksize : number of rows requested per chunk.
    total_size : row count used to decide when to wrap around.

    Yields
    ------
    Whatever ``pd.read_hdf(filename, start=..., stop=...)`` returns for the
    current chunk.

    Notes
    -----
    The generator never terminates: once the chunk after the next one would
    run past ``total_size`` the chunk counter goes back to zero, so a trailing
    partial chunk is never requested and callers must stop iterating
    themselves.
    """
    chunk_no = 0

    while True:
        first_row = chunk_no * chunksize
        last_row = (chunk_no + 1) * chunksize
        yield pd.read_hdf(filename, start=first_row, stop=last_row)

        chunk_no += 1
        # Restart from the top of the file when the upcoming chunk would
        # extend beyond the declared number of rows.
        if (chunk_no + 1) * chunksize > total_size:
            chunk_no = 0

### I'm using the generator to load the events in batches of 512 * 100 rows at a time. We can increase this number to process things a bit faster; however, pyjet and matplotlib will always process everything at the same speed, so processing all the data will take some time.

In [None]:
# fn: path to the input .h5 events file.
fn = '/home/felipe/LHCOlympics_2020/data/events_anomalydetection.h5'
# fb: the "file batch" generator; every item it yields covers 100 * 512 rows.
fb = generator(fn, chunksize=100 * 512)

- For the tabular data I'm using pyjet to cluster the pseudojets and get the exclusive 4 jets. I'm also extracting the jet-substructure information with dcut = 0.5.

This loop gets the $signal$ and $background$ events from the resized_events_anomalydetection.h5 file; it can also be used on the full-size file.

### functions for the calculation of deltaphi and deltaR elementwise in the subjets found:

In [None]:
def comp_delta_phi(phi1,phi2):
    """Return the absolute azimuthal separation folded into [0, pi].

    Parameters
    ----------
    phi1, phi2 : scalars or array-likes of angles in radians. Array inputs
        are handled element-wise with normal NumPy broadcasting.

    Returns
    -------
    A NumPy float for scalar inputs, or an ndarray for array inputs, holding
    ``|phi1 - phi2|`` wrapped so that it never exceeds ``pi``.
    """
    two_pi = 2. * np.pi
    # The modulo keeps the result non-negative even when the raw difference
    # exceeds one full turn; for differences below 2*pi it is a no-op.
    delta_phi = np.mod(np.abs(np.subtract(phi1, phi2)), two_pi)
    # Take the shorter way around the circle.
    return np.minimum(delta_phi, two_pi - delta_phi)

In [None]:
def delta_R_sub(subjets, n_subjets):
    """Compute the pairwise delta-R between the first ``n_subjets`` subjets.

    Parameters
    ----------
    subjets : indexable sequence of objects exposing ``pt``, ``eta`` and
        ``phi`` attributes.
    n_subjets : number of slots to consider.

    Returns
    -------
    list with one ``sqrt(d_eta**2 + d_phi**2)`` value per unordered pair of
    slots, in ``itertools.combinations`` order. When fewer than
    ``n_subjets`` subjets are supplied the remaining slots stay zero-filled
    and still take part in the pairing.
    """
    slots = np.zeros([n_subjets], dtype=DTYPE_PTEPM)

    # Copy at most n_subjets entries; anything missing is left at zero.
    n_filled = min(len(subjets), n_subjets)
    for slot in range(n_filled):
        source = subjets[slot]
        slots[slot]['pT'] = source.pt
        slots[slot]['eta'] = source.eta
        slots[slot]['phi'] = source.phi

    separations = []
    for first, second in combinations(slots, 2):
        d_eta = first['eta'] - second['eta']
        d_phi = comp_delta_phi(first['phi'], second['phi'])
        separations.append(np.sqrt((d_eta)**2 + (d_phi)**2))
    return separations

## Signal first:

In [None]:
# Build the per-event feature table for the SIGNAL events and write one CSV
# per file batch.
#
# Changes with respect to the straightforward row-by-row version:
#   * particle data are taken from columns [0, 2100) only, so the label in
#     column 2100 (which sits on the stride-3 pT positions and equals 1 for
#     signal) can no longer be counted as an extra, all-zero pseudojet;
#   * rows are collected in a list and turned into a DataFrame once per batch
#     (DataFrame.append is quadratic and no longer exists in pandas >= 2.0);
#   * only the batches that are actually processed are read from disk.
feature_columns = ['pt_j1', 'm_j1', 'eta_j1', 'phi_j1', 'E_j1',
                   'pt_j2', 'm_j2', 'eta_j2', 'phi_j2', 'E_j2',
                   'deltaeta', 'deltaphi',
                   'mEratio1', 'mEratio2',
                   'm_jj', 'pt_asym',
                   'deltaR_sj12', 'deltaR_sj13',
                   'deltaR_sj14', 'deltaR_sj23',
                   'deltaR_sj24', 'deltaR_sj34',
                   'n_subjets', 'event_idx']
n_particles = 700                # (pT, eta, phi) triplets per event row, zero padded
label_column = 3 * n_particles   # column 2100 holds the signal flag
n_batches = 3                    # how many file batches to convert

# zip(range(...), fb) ends the loop before a further chunk is pulled from fb.
for batch_idx, batch in zip(range(n_batches), fb):
    events = batch.values
    rows = []

    for i, event in tqdm(zip(batch.index, events), total=len(batch)):
        # keep signal events only
        if event[label_column] != 1:
            continue

        # Select the non-padded particles and hand them to pyjet.
        particles = event[:label_column].reshape(n_particles, 3)
        particles = particles[particles[:, 0] > 0]
        pseudojets_input = np.zeros(len(particles), dtype=DTYPE_PTEPM)
        pseudojets_input['pT'] = particles[:, 0]
        pseudojets_input['eta'] = particles[:, 1]
        pseudojets_input['phi'] = particles[:, 2]

        sequence = cluster(pseudojets_input, R=1.0, algo='antikt')
        jets = sequence.inclusive_jets(500)
        if len(jets) == 0:
            # Nothing passed the threshold given to inclusive_jets; there is
            # no leading jet to describe, so the event is skipped.
            continue

        pt_j1 = jets[0].pt
        m_j1 = np.abs(jets[0].mass)
        eta_j1 = jets[0].eta
        phi_j1 = jets[0].phi
        E_j1 = jets[0].e

        if len(jets) > 1:
            pt_j2 = jets[1].pt
            m_j2 = np.abs(jets[1].mass)
            eta_j2 = jets[1].eta
            phi_j2 = jets[1].phi
            E_j2 = jets[1].e
        else:
            # single-jet event: the second-jet features default to zero
            pt_j2 = 0.
            m_j2 = 0.
            eta_j2 = 0.
            phi_j2 = 0.
            E_j2 = 0.

        deltaeta = np.abs(eta_j1 - eta_j2)
        deltaphi = comp_delta_phi(phi_j1,phi_j2)

        mEratio1 = m_j1/E_j1
        mEratio2 = m_j2/E_j2 if E_j2 != 0 else 0.

        # NOTE(review): this is the sum of the two jet masses, not the
        # invariant mass of the dijet system - confirm that is intended.
        m_jj = m_j1 + m_j2

        pt_asym = (pt_j1 - pt_j2)/(pt_j1 + pt_j2)

        # Re-cluster the leading jet's constituents to look at its subjets.
        cluster_sj1 = cluster(jets[0].constituents_array(), R=0.4,p=1)
        sj1 = cluster_sj1.inclusive_jets(20)
        n_subjets = len(sj1)

        # 4 slots -> 6 pairwise separations, in combinations() order
        (deltaR_sj12, deltaR_sj13, deltaR_sj14,
         deltaR_sj23, deltaR_sj24, deltaR_sj34) = delta_R_sub(sj1,4)

        rows.append([pt_j1, m_j1, eta_j1, phi_j1, E_j1,
                     pt_j2, m_j2, eta_j2, phi_j2, E_j2,
                     deltaeta, deltaphi,
                     mEratio1, mEratio2,
                     m_jj, pt_asym,
                     deltaR_sj12, deltaR_sj13,
                     deltaR_sj14, deltaR_sj23,
                     deltaR_sj24, deltaR_sj34,
                     n_subjets, i])

    #end the loop and save the batch
    df = pd.DataFrame(rows, columns=feature_columns)
    df.to_csv('../csv_data/signal/signal_batch_{}.csv'.format(batch_idx), sep=',', index=False)

### Because the generator has already advanced past the first batches in the loop above, we have to instantiate it again so that the background pass starts from the beginning of the file:

In [None]:
# Fresh generator over the same file, again in chunks of 100 * 512 rows.
fb = generator(fn, chunksize=100 * 512)

## Now the background:

In [None]:
# Build the per-event feature table for the BACKGROUND events (label == 0)
# and write one CSV per file batch.
#
# Rows are collected in a list and turned into a DataFrame once per batch
# (DataFrame.append is quadratic and no longer exists in pandas >= 2.0), and
# only the batches that are actually processed are read from disk.
feature_columns = ['pt_j1', 'm_j1', 'eta_j1', 'phi_j1', 'E_j1',
                   'pt_j2', 'm_j2', 'eta_j2', 'phi_j2', 'E_j2',
                   'deltaeta', 'deltaphi',
                   'mEratio1', 'mEratio2',
                   'm_jj', 'pt_asym',
                   'deltaR_sj12', 'deltaR_sj13',
                   'deltaR_sj14', 'deltaR_sj23',
                   'deltaR_sj24', 'deltaR_sj34',
                   'n_subjets', 'event_idx']
n_particles = 700                # (pT, eta, phi) triplets per event row, zero padded
label_column = 3 * n_particles   # column 2100 holds the signal flag
n_batches = 3                    # how many file batches to convert

# zip(range(...), fb) ends the loop before a further chunk is pulled from fb.
for batch_idx, batch in zip(range(n_batches), fb):
    events = batch.values
    rows = []

    for i, event in tqdm(zip(batch.index, events), total=len(batch)):
        # keep background events only
        if event[label_column] != 0:
            continue

        # Select the non-padded particles (label column excluded) for pyjet.
        particles = event[:label_column].reshape(n_particles, 3)
        particles = particles[particles[:, 0] > 0]
        pseudojets_input = np.zeros(len(particles), dtype=DTYPE_PTEPM)
        pseudojets_input['pT'] = particles[:, 0]
        pseudojets_input['eta'] = particles[:, 1]
        pseudojets_input['phi'] = particles[:, 2]

        sequence = cluster(pseudojets_input, R=1.0, algo='antikt')
        jets = sequence.inclusive_jets(500)
        if len(jets) == 0:
            # Nothing passed the threshold given to inclusive_jets; there is
            # no leading jet to describe, so the event is skipped.
            continue

        pt_j1 = jets[0].pt
        m_j1 = np.abs(jets[0].mass)
        eta_j1 = jets[0].eta
        phi_j1 = jets[0].phi
        E_j1 = jets[0].e

        if len(jets) > 1:
            pt_j2 = jets[1].pt
            m_j2 = np.abs(jets[1].mass)
            eta_j2 = jets[1].eta
            phi_j2 = jets[1].phi
            E_j2 = jets[1].e
        else:
            # single-jet event: the second-jet features default to zero
            pt_j2 = 0.
            m_j2 = 0.
            eta_j2 = 0.
            phi_j2 = 0.
            E_j2 = 0.

        deltaeta = np.abs(eta_j1 - eta_j2)
        deltaphi = comp_delta_phi(phi_j1,phi_j2)

        mEratio1 = m_j1/E_j1
        mEratio2 = m_j2/E_j2 if E_j2 != 0 else 0.

        # NOTE(review): this is the sum of the two jet masses, not the
        # invariant mass of the dijet system - confirm that is intended.
        m_jj = m_j1 + m_j2

        pt_asym = (pt_j1 - pt_j2)/(pt_j1 + pt_j2)

        # Re-cluster the leading jet's constituents to look at its subjets.
        cluster_sj1 = cluster(jets[0].constituents_array(), R=0.4,p=1)
        sj1 = cluster_sj1.inclusive_jets(20)
        n_subjets = len(sj1)

        # 4 slots -> 6 pairwise separations, in combinations() order
        (deltaR_sj12, deltaR_sj13, deltaR_sj14,
         deltaR_sj23, deltaR_sj24, deltaR_sj34) = delta_R_sub(sj1,4)

        rows.append([pt_j1, m_j1, eta_j1, phi_j1, E_j1,
                     pt_j2, m_j2, eta_j2, phi_j2, E_j2,
                     deltaeta, deltaphi,
                     mEratio1, mEratio2,
                     m_jj, pt_asym,
                     deltaR_sj12, deltaR_sj13,
                     deltaR_sj14, deltaR_sj23,
                     deltaR_sj24, deltaR_sj34,
                     n_subjets, i])

    #end the loop and save the batch
    df = pd.DataFrame(rows, columns=feature_columns)
    df.to_csv('../csv_data/background/background_batch_{}.csv'.format(batch_idx), sep=',', index=False)

In [None]:
# NOTE(review): evaluating 0./0. raises ZeroDivisionError. Presumably this is a
# deliberate stop so that running the whole notebook halts here, before the
# BlackBox / Pythia cells below - confirm.
0./0.

# To create the same tabular data using the blackbox and pythia background:

### Blackbox data:

In [None]:
# Point fn at the BlackBox1 events file and rebuild the batch generator
# (chunks of 100 * 512 rows).
fn = '/home/felipe/LHCOlympics_2020/data/events_LHCO2020_BlackBox1.h5'
fb = generator(fn, chunksize=100 * 512)

In [None]:
# Build the per-event feature table for every event of the BlackBox file and
# write one CSV per file batch (no label filtering here).
#
# Rows are collected in a list and turned into a DataFrame once per batch
# (DataFrame.append is quadratic and no longer exists in pandas >= 2.0), and
# only the batches that are actually processed are read from disk.
feature_columns = ['pt_j1', 'm_j1', 'eta_j1', 'phi_j1', 'E_j1',
                   'pt_j2', 'm_j2', 'eta_j2', 'phi_j2', 'E_j2',
                   'deltaeta', 'deltaphi',
                   'mEratio1', 'mEratio2',
                   'm_jj', 'pt_asym',
                   'deltaR_sj12', 'deltaR_sj13',
                   'deltaR_sj14', 'deltaR_sj23',
                   'deltaR_sj24', 'deltaR_sj34',
                   'n_subjets', 'event_idx']
n_particles = 700   # (pT, eta, phi) triplets per event row, zero padded
n_batches = 3       # how many file batches to convert

# zip(range(...), fb) ends the loop before a further chunk is pulled from fb.
for batch_idx, batch in zip(range(n_batches), fb):
    events = batch.values
    rows = []

    for i, event in tqdm(zip(batch.index, events), total=len(batch)):
        # Only the first 3 * 700 columns are read as particle data.
        particles = event[:3 * n_particles].reshape(n_particles, 3)
        particles = particles[particles[:, 0] > 0]
        pseudojets_input = np.zeros(len(particles), dtype=DTYPE_PTEPM)
        pseudojets_input['pT'] = particles[:, 0]
        pseudojets_input['eta'] = particles[:, 1]
        pseudojets_input['phi'] = particles[:, 2]

        sequence = cluster(pseudojets_input, R=1.0, algo='antikt')
        jets = sequence.inclusive_jets(500)
        if len(jets) == 0:
            # Nothing passed the threshold given to inclusive_jets; there is
            # no leading jet to describe, so the event is skipped.
            continue

        pt_j1 = jets[0].pt
        m_j1 = np.abs(jets[0].mass)
        eta_j1 = jets[0].eta
        phi_j1 = jets[0].phi
        E_j1 = jets[0].e

        if len(jets) > 1:
            pt_j2 = jets[1].pt
            m_j2 = np.abs(jets[1].mass)
            eta_j2 = jets[1].eta
            phi_j2 = jets[1].phi
            E_j2 = jets[1].e
        else:
            # single-jet event: the second-jet features default to zero
            pt_j2 = 0.
            m_j2 = 0.
            eta_j2 = 0.
            phi_j2 = 0.
            E_j2 = 0.

        deltaeta = np.abs(eta_j1 - eta_j2)
        deltaphi = comp_delta_phi(phi_j1,phi_j2)

        mEratio1 = m_j1/E_j1
        mEratio2 = m_j2/E_j2 if E_j2 != 0 else 0.

        # NOTE(review): this is the sum of the two jet masses, not the
        # invariant mass of the dijet system - confirm that is intended.
        m_jj = m_j1 + m_j2

        pt_asym = (pt_j1 - pt_j2)/(pt_j1 + pt_j2)

        # Re-cluster the leading jet's constituents to look at its subjets.
        cluster_sj1 = cluster(jets[0].constituents_array(), R=0.4,p=1)
        sj1 = cluster_sj1.inclusive_jets(20)
        n_subjets = len(sj1)

        # 4 slots -> 6 pairwise separations, in combinations() order
        (deltaR_sj12, deltaR_sj13, deltaR_sj14,
         deltaR_sj23, deltaR_sj24, deltaR_sj34) = delta_R_sub(sj1,4)

        rows.append([pt_j1, m_j1, eta_j1, phi_j1, E_j1,
                     pt_j2, m_j2, eta_j2, phi_j2, E_j2,
                     deltaeta, deltaphi,
                     mEratio1, mEratio2,
                     m_jj, pt_asym,
                     deltaR_sj12, deltaR_sj13,
                     deltaR_sj14, deltaR_sj23,
                     deltaR_sj24, deltaR_sj34,
                     n_subjets, i])

    #end the loop and save the batch
    df = pd.DataFrame(rows, columns=feature_columns)
    df.to_csv('../csv_data/blackbox/blackbox_batch_{}.csv'.format(batch_idx), sep=',', index=False)

### Background Pythia:

In [None]:
# Point fn at the Pythia background MC events file and rebuild the batch
# generator (chunks of 100 * 512 rows).
fn = '/home/felipe/LHCOlympics_2020/data/events_LHCO2020_backgroundMC_Pythia.h5'
fb = generator(fn, chunksize=100 * 512)

In [None]:
# Build the per-event feature table for every event of the Pythia background
# file and write one CSV per file batch (no label filtering here).
#
# Rows are collected in a list and turned into a DataFrame once per batch
# (DataFrame.append is quadratic and no longer exists in pandas >= 2.0), and
# only the batches that are actually processed are read from disk.
feature_columns = ['pt_j1', 'm_j1', 'eta_j1', 'phi_j1', 'E_j1',
                   'pt_j2', 'm_j2', 'eta_j2', 'phi_j2', 'E_j2',
                   'deltaeta', 'deltaphi',
                   'mEratio1', 'mEratio2',
                   'm_jj', 'pt_asym',
                   'deltaR_sj12', 'deltaR_sj13',
                   'deltaR_sj14', 'deltaR_sj23',
                   'deltaR_sj24', 'deltaR_sj34',
                   'n_subjets', 'event_idx']
n_particles = 700   # (pT, eta, phi) triplets per event row, zero padded
n_batches = 3       # how many file batches to convert

# zip(range(...), fb) ends the loop before a further chunk is pulled from fb.
for batch_idx, batch in zip(range(n_batches), fb):
    events = batch.values
    rows = []

    for i, event in tqdm(zip(batch.index, events), total=len(batch)):
        # Only the first 3 * 700 columns are read as particle data.
        particles = event[:3 * n_particles].reshape(n_particles, 3)
        particles = particles[particles[:, 0] > 0]
        pseudojets_input = np.zeros(len(particles), dtype=DTYPE_PTEPM)
        pseudojets_input['pT'] = particles[:, 0]
        pseudojets_input['eta'] = particles[:, 1]
        pseudojets_input['phi'] = particles[:, 2]

        sequence = cluster(pseudojets_input, R=1.0, algo='antikt')
        jets = sequence.inclusive_jets(500)
        if len(jets) == 0:
            # Nothing passed the threshold given to inclusive_jets; there is
            # no leading jet to describe, so the event is skipped.
            continue

        pt_j1 = jets[0].pt
        m_j1 = np.abs(jets[0].mass)
        eta_j1 = jets[0].eta
        phi_j1 = jets[0].phi
        E_j1 = jets[0].e

        if len(jets) > 1:
            pt_j2 = jets[1].pt
            m_j2 = np.abs(jets[1].mass)
            eta_j2 = jets[1].eta
            phi_j2 = jets[1].phi
            E_j2 = jets[1].e
        else:
            # single-jet event: the second-jet features default to zero
            pt_j2 = 0.
            m_j2 = 0.
            eta_j2 = 0.
            phi_j2 = 0.
            E_j2 = 0.

        deltaeta = np.abs(eta_j1 - eta_j2)
        deltaphi = comp_delta_phi(phi_j1,phi_j2)

        mEratio1 = m_j1/E_j1
        mEratio2 = m_j2/E_j2 if E_j2 != 0 else 0.

        # NOTE(review): this is the sum of the two jet masses, not the
        # invariant mass of the dijet system - confirm that is intended.
        m_jj = m_j1 + m_j2

        pt_asym = (pt_j1 - pt_j2)/(pt_j1 + pt_j2)

        # Re-cluster the leading jet's constituents to look at its subjets.
        cluster_sj1 = cluster(jets[0].constituents_array(), R=0.4,p=1)
        sj1 = cluster_sj1.inclusive_jets(20)
        n_subjets = len(sj1)

        # 4 slots -> 6 pairwise separations, in combinations() order
        (deltaR_sj12, deltaR_sj13, deltaR_sj14,
         deltaR_sj23, deltaR_sj24, deltaR_sj34) = delta_R_sub(sj1,4)

        rows.append([pt_j1, m_j1, eta_j1, phi_j1, E_j1,
                     pt_j2, m_j2, eta_j2, phi_j2, E_j2,
                     deltaeta, deltaphi,
                     mEratio1, mEratio2,
                     m_jj, pt_asym,
                     deltaR_sj12, deltaR_sj13,
                     deltaR_sj14, deltaR_sj23,
                     deltaR_sj24, deltaR_sj34,
                     n_subjets, i])

    #end the loop and save the batch
    df = pd.DataFrame(rows, columns=feature_columns)
    df.to_csv('../csv_data/background_pythia/background_pythia_batch_{}.csv'.format(batch_idx), sep=',', index=False)