# Load data samples


In [1]:
import uproot
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

index = ['B_sig_K_dr', 'B_sig_K_dz', 'B_sig_CleoConeCS_3_ROE',
       'thrustAxisCosTheta', 'aplanarity', 'sphericity',
       'harmonicMomentThrust0', 'harmonicMomentThrust1',
       'harmonicMomentThrust2', 'harmonicMomentThrust3',
       'harmonicMomentThrust4', 'foxWolframR1', 'foxWolframR2', 'foxWolframR3',
       'foxWolframR4','B_sig_isSignalAcceptMissingNeutrino']

data0 = uproot.open("/ceph/aavocone/Data/processed_simulation_B_K_a_nunu_ma_0_005_GeV_100000_events_nobdtcut.root:tree_Bsig;1").arrays(index, library ="pd")
data3 = uproot.open("/ceph/aavocone/Data/processed_simulation_B_K_a_nunu_ma_3_GeV_100000_events_nobdtcut.root:tree_Bsig;1").arrays(index, library ="pd")
data4 = uproot.open("/ceph/aavocone/Data/processed_simulation_B_K_a_nunu_ma_4_6_GeV_100000_events_nobdtcut.root:tree_Bsig;1").arrays(index, library ="pd")
background = pq.read_table(f"/ceph/aavocone/Datasets/background.parquet")
background = background.to_pandas()


data0["signal"]  = np.ones(len(data0))
data3["signal"]  = np.ones(len(data3))
data4["signal"]  = np.ones(len(data4))

data0.drop(data0[data0["B_sig_isSignalAcceptMissingNeutrino"]==0.0].index, inplace = True)
data3.drop(data3[data3["B_sig_isSignalAcceptMissingNeutrino"]==0.0].index, inplace = True)
data4.drop(data4[data4["B_sig_isSignalAcceptMissingNeutrino"]==0.0].index, inplace = True)



## 5 MeV + 3 GeV

In [2]:

sets =[data0,data3,background]

df = pd.concat(sets)
df.drop("B_sig_isSignalAcceptMissingNeutrino", axis=1, inplace= True)
table = pa.Table.from_pandas(df,nthreads = 1)
pq.write_table(table, f"/ceph/aavocone/Datasets/03_large.parquet")


## 5 MeV + 4.6 GeV

In [3]:

sets =[data0,data4,background]

df = pd.concat(sets)
df.drop("B_sig_isSignalAcceptMissingNeutrino", axis=1, inplace= True)
table = pa.Table.from_pandas(df,nthreads = 1)
pq.write_table(table, f"/ceph/aavocone/Datasets/04_large.parquet")


## 3 GeV + 4.6 GeV

In [4]:

sets =[data3,data4,background]

df = pd.concat(sets)
df.drop("B_sig_isSignalAcceptMissingNeutrino", axis=1, inplace= True)
table = pa.Table.from_pandas(df,nthreads = 1)
pq.write_table(table, f"/ceph/aavocone/Datasets/34_large.parquet")


## All samples

In [5]:

sets =[data0,data3,data4,background]

df = pd.concat(sets)
df.drop("B_sig_isSignalAcceptMissingNeutrino", axis=1, inplace= True)
table = pa.Table.from_pandas(df,nthreads = 1)
pq.write_table(table, f"/ceph/aavocone/Datasets/all_large.parquet")


## Test if everything went fine

In [6]:
data = ["0_large","3_large","03_large","4_large","04_large","34_large","all_large"]

for index in data:
    test = pq.read_table(f"/ceph/aavocone/Datasets/{index}.parquet")
    test = test.to_pandas()
    print(sum(test["signal"]))


8553.0
20067.0
28620.0
26090.0
34643.0
46157.0
54710.0
