# Datasets
Author: Javier Duarte


## Load datasets from `ROOT` files using `uproot`
Here we load the `ROOT` datasets in Python using `uproot`.

In [6]:
import uproot

## Load `ROOT` files
Here we load the `ROOT` datasets into `NumPy` arrays (possibly with jagged structure). See: https://github.com/scikit-hep/uproot

In [17]:
import numpy as np
import h5py

# key of the object that is read out of every opened file
treename = 'HZZ4LeptonsAnalysisReduced'

# path of the input file for each sample
filename = {
    'bkg': 'data/ntuple_4mu_bkg.root',
    'VV': 'data/ntuple_4mu_VV.root',
}

# one opened file per sample, keyed by the sample name
upfile = {sample: uproot.open(path) for sample, path in filename.items()}

# arrays() returns a dictionary of arrays for each sample
params = {sample: handle[treename].arrays() for sample, handle in upfile.items()}

# names of all the variables that were read
print(params['bkg'].keys())

# shape of a single NumPy array
print(params['bkg']['f_mass4l'].shape)

# f_mass4l value of the first entry
print(params['bkg']['f_mass4l'][0])

# f_massjj value of the first entry
print(params['bkg']['f_massjj'][0])

['f_event', 'f_pu_weight', 'f_lept3_sip', 'f_weight', 'f_lept4_phi', 'f_D_gg', 'f_angle_costheta2', 'f_angle_costheta1', 'f_jet2_e', 'f_lept4_eta', 'f_lept2_pt', 'f_lept1_pt', 'f_massjj', 'f_D_jet', 'f_lept2_pfx', 'f_lept3_phi', 'f_lept4_charge', 'f_angle_costhetastar', 'f_deltajj', 'f_jet1_e', 'f_lept1_charge', 'f_D_g4', 'f_mass4lErr', 'f_njets_pass', 'f_eta4l', 'f_lept2_phi', 'f_jet2_phi', 'f_lept1_sip', 'f_lumi', 'f_pt4l', 'f_D_bkg', 'f_D_bkg_kin', 'f_jet2_eta', 'f_jet1_phi', 'f_angle_phi', 'f_jet1_pt', 'f_eff_weight', 'f_lept4_sip', 'f_angle_phistar1', 'f_lept3_pfx', 'f_run', 'f_mass4l', 'f_jet1_eta', 'f_lept1_pfx', 'f_int_weight', 'f_lept2_eta', 'f_sip_max', 'f_jet2_pt', 'f_iso_max', 'f_lept2_sip', 'f_Z2mass', 'f_Djet_VAJHU', 'f_Z1mass', 'f_lept1_eta', 'f_lept3_charge', 'f_lept4_pfx', 'f_lept4_pt', 'f_lept2_charge', 'f_lept3_pt', 'f_lept3_eta', 'f_pfmet', 'f_lept1_phi']
(58107,)
91.09813
-999.0


## Convert `NumPy` arrays to `pandas` DataFrames
In my opinion, `pandas` DataFrames are a more convenient/flexible data container in python: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html. 
So we'll use this instead of structured `NumPy` arrays.

In [18]:
import pandas as pd

# one DataFrame per sample, built from that sample's dictionary of arrays
df = {sample: pd.DataFrame(params[sample]) for sample in ('bkg', 'VV')}

# first entry of the 'bkg' DataFrame
print(df['bkg'].head(1))

# shape of the DataFrame
print(df['bkg'].shape)

# first entry, restricted to f_mass4l and f_massjj
print(df['bkg'].loc[:, ['f_mass4l', 'f_massjj']].head(1))

# convert back into an unstructured NumPy array
print(df['bkg'].values)
print(df['bkg'].values.shape)

# boolean array: True where f_mass4l is greater than 125
print(df['bkg']['f_mass4l'] > 125)

# cut using this boolean array
print(df['bkg'].loc[df['bkg']['f_mass4l'] > 125, 'f_mass4l'])

    f_D_bkg  f_D_bkg_kin    f_D_g4    f_D_gg  f_D_jet  f_Djet_VAJHU  \
0  0.363088     0.363088  0.827116 -0.000022   -999.0          -1.0   

    f_Z1mass   f_Z2mass  f_angle_costheta1  f_angle_costheta2    ...     \
0  51.681366  12.933985           0.224607           0.762976    ...      

    f_mass4l  f_mass4lErr  f_massjj  f_njets_pass    f_pfmet     f_pt4l  \
0  91.098129          0.0    -999.0           1.0  18.884806  45.872066   

   f_pu_weight  f_run  f_sip_max  f_weight  
0     1.229054      1        0.0  0.000648  

[1 rows x 62 columns]
(58107, 62)
    f_mass4l  f_massjj
0  91.098129    -999.0
[[3.63087595e-01 3.63087595e-01 8.27115893e-01 ... 1.00000000e+00
  0.00000000e+00 6.48105284e-04]
 [3.67416233e-01 3.67416233e-01 4.15622257e-02 ... 1.00000000e+00
  0.00000000e+00 3.70107766e-04]
 [3.10998470e-01 3.10998470e-01 8.38690639e-01 ... 1.00000000e+00
  0.00000000e+00 4.69463557e-04]
 ...
 [5.05778231e-02 5.05778231e-02 8.02676916e-01 ... 1.00000000e+00
  0.00000000e+00