# Datasets
Author: Javier Duarte


## Load datasets from `ROOT` files using `uproot`
Here we import `uproot`, which we use below to load the `ROOT` datasets in Python.

In [1]:
import uproot

## Load `ROOT` files
Here we open the `ROOT` files with `uproot` (see: https://github.com/scikit-hep/uproot) and list the branches of the background tree.

In [2]:
import numpy as np
import h5py

# Name of the TTree stored in each ntuple file
treename = 'HZZ4LeptonsAnalysisReduced'

# Path of the ROOT ntuple for each sample, keyed by sample name
filename = {
    'bkg': 'data/ntuple_4mu_bkg.root',
    'VV': 'data/ntuple_4mu_VV.root',
}

# Open every file once, keyed by the same sample name as `filename`
upfile = {sample: uproot.open(path) for sample, path in filename.items()}

# List the branches of the background tree.
# show() writes the listing to stdout itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line to the output).
upfile['bkg'][treename].show()

f_run                      (no streamer)              asdtype('>i4')
f_lumi                     (no streamer)              asdtype('>i4')
f_event                    (no streamer)              asdtype('>i4')
f_weight                   (no streamer)              asdtype('>f4')
f_int_weight               (no streamer)              asdtype('>f4')
f_pu_weight                (no streamer)              asdtype('>f4')
f_eff_weight               (no streamer)              asdtype('>f4')
f_lept1_pt                 (no streamer)              asdtype('>f4')
f_lept1_eta                (no streamer)              asdtype('>f4')
f_lept1_phi                (no streamer)              asdtype('>f4')
f_lept1_charge             (no streamer)              asdtype('>f4')
f_lept1_pfx                (no streamer)              asdtype('>f4')
f_lept1_sip                (no streamer)              asdtype('>f4')
f_lept2_pt                 (no streamer)              asdtype('>f4')
f_lept2_eta                (no str

## Convert tree to `pandas` DataFrames
In my opinion, `pandas` DataFrames are a more convenient and flexible data container in Python (see: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html).

In [3]:
import pandas as pd

# Convert the tree of each sample into a DataFrame, keyed by sample name
df = {sample: upfile[sample][treename].pandas.df() for sample in ('bkg', 'VV')}

# First entry (all columns) of the background sample
print(df['bkg'].head(1))

# (rows, columns) of the background DataFrame
print(df['bkg'].shape)

# First entry, restricted to the f_mass4l and f_massjj columns
print(df['bkg'].loc[:, ['f_mass4l', 'f_massjj']].head(1))

# Back to a plain (unstructured) NumPy array, and its shape
print(df['bkg'].values)
print(df['bkg'].values.shape)

# Boolean mask selecting entries with f_mass4l above 125
mask = df['bkg']['f_mass4l'].gt(125)
print(mask)

# Apply the mask as a cut on f_mass4l
print(df['bkg'].loc[mask, 'f_mass4l'])

       f_run  f_lumi  f_event  f_weight  f_int_weight  f_pu_weight  \
entry                                                                
0          1       4      630  0.000648           0.0     1.229054   

       f_eff_weight  f_lept1_pt  f_lept1_eta  f_lept1_phi  ...  f_jet2_pt  \
entry                                                      ...              
0               1.0    32.80312     0.354339     -1.41646  ...        0.0   

       f_jet2_eta  f_jet2_phi  f_jet2_e  f_D_bkg_kin   f_D_bkg    f_D_gg  \
entry                                                                      
0             0.0         0.0       0.0     0.363088  0.363088 -0.000022   

         f_D_g4  f_Djet_VAJHU    f_pfmet  
entry                                     
0      0.827116          -1.0  18.884806  

[1 rows x 62 columns]
(58107, 62)
        f_mass4l  f_massjj
entry                     
0      91.098129    -999.0
[[ 1.00000000e+00  4.00000000e+00  6.30000000e+02 ...  8.27115893e-01
  -1.00000000