# Select a subset of events for the analysis

Read the fastjet output and select a given number of signal and background events for the analysis.

In [2]:
import pandas as pd
import numpy as np

In [2]:
# Number of events to draw from each class in the sampling step.
# Each count is written as 2 * (half size).
# NOTE(review): presumably two equal halves for a later split — confirm downstream.
Nsig = 2 * 500      # signal events (label == 1)
Nbkg = 2 * 500_000  # background events (label == 0)

In [3]:
# Load the per-event jet features (the fastjet output) into a DataFrame.
data = pd.read_hdf('events_anomalydetection_v2.features.h5')

# Jet energies from mass and three-momentum: E = sqrt(m^2 + |p|^2).
# They are only needed to build mjj, so they are kept as local Series
# instead of being added to the frame and dropped again.
energy_j1 = np.sqrt(data.mj1**2
                    + (data.pxj1**2 + data.pyj1**2 + data.pzj1**2))
energy_j2 = np.sqrt(data.mj2**2
                    + (data.pxj2**2 + data.pyj2**2 + data.pzj2**2))

# Build the derived features and discard the raw inputs in one non-mutating
# chain (no inplace=True), so the cell produces a fresh frame each time it runs.
data = (
    data
    .assign(
        # tau2/tau1 ratio for each of the two jets.
        # NOTE(review): rows with tau1 == 0 would yield NaN/inf here —
        # confirm whether such events occur and need handling.
        tau21j1=data.tau2j1/data.tau1j1,
        tau21j2=data.tau2j2/data.tau1j2,
        # Invariant mass of the two-jet system:
        # mjj^2 = m1^2 + m2^2 + 2*(E1*E2 - p1.p2)
        mjj=np.sqrt(data.mj1**2 + data.mj2**2
                    + 2*(energy_j1*energy_j2
                         - data.pxj1*data.pxj2
                         - data.pyj1*data.pyj2
                         - data.pzj1*data.pzj2)),
    )
    # drop unused columns (raw momenta and individual tau values)
    .drop(columns=['pxj1', 'pyj1', 'pzj1', 'tau1j1', 'tau2j1', 'tau3j1',
                   'pxj2', 'pyj2', 'pzj2', 'tau1j2', 'tau2j2', 'tau3j2'])
)
data

Unnamed: 0,mj1,mj2,label,tau21j1,tau21j2,mjj
0,38.896000,237.893997,0.0,0.583317,0.263237,3307.219387
1,389.532013,22.999201,0.0,0.519086,0.787732,3107.620603
2,72.155502,78.230698,0.0,0.789584,0.911809,3004.895272
3,55.797798,359.113007,0.0,0.277956,0.590729,3233.075105
4,84.891502,77.506500,0.0,0.577303,0.588890,2919.346337
...,...,...,...,...,...,...
1099995,126.183998,108.889999,1.0,0.225403,0.499705,3105.457308
1099996,115.719002,489.053009,1.0,0.271544,0.203001,3622.836928
1099997,508.045013,91.104897,1.0,0.166132,0.588186,3546.808986
1099998,114.938004,553.737000,1.0,0.153972,0.524699,3607.571044


In [4]:
# Split the events by label, then draw a fixed-size random subset from each
# class. The fixed seeds make the selection reproducible.
is_signal = data['label'] == 1
is_background = data['label'] == 0
signal = data.loc[is_signal].sample(n=Nsig, random_state=23)
background = data.loc[is_background].sample(n=Nbkg, random_state=54)

In [5]:
# Stack the selected signal and background events, shuffle the rows with a
# fixed seed, and renumber the index from 0.
combined = (
    pd.concat([signal, background])
    .sample(frac=1, random_state=30)
    .reset_index(drop=True)
)

In [6]:
# Write the shuffled selection to an HDF5 file under the key 'data'.
# mode='w' replaces any existing file of the same name.
output_file = 'rnd_dataset.hd5'
combined.to_hdf(output_file, key='data', mode='w')