In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import defaultdict
import matplotlib as mpl

# Notebook-wide matplotlib defaults: square 6x6 figures at 100 dpi, and
# images drawn with the array origin in the lower-left corner.
mpl.rcParams.update({
    'figure.figsize': (6,6),
    'figure.dpi': 100,
    "image.origin": 'lower',
})

In [2]:
# Location of the input samples. These four values are joined, together with
# a fixed "samples/" component, to build the directory the notebook reads from.
base_dir = "/eos/home-b/bpinolin/ML_output/"   # root of the output area
plot_config = "VBSOS"                          # configuration sub-directory
cut = "sr"                                     # cut name; also a column-name prefix
version = "DNN_v7/top_WW/highZ/train"          # sub-path under "samples/"

In [3]:
# Directory of this sample version; the per-year inputs live below it and the
# training-ready pickles are written below it as well.
output_dir = os.path.join(
    base_dir,
    plot_config,
    cut,
    "samples/",
    version,
)

In [4]:
# One input directory per dataset tag, each located under the version folder.
datasets = ["2016", "2017", "2018"]
samples_dirs = [
    os.path.join(base_dir, plot_config, cut, "samples/", version, dataset_tag)
    for dataset_tag in datasets
]

In [5]:
# Per-year luminosity used to scale the event weights when loading samples.
# NOTE(review): values are presumably in fb^-1 -- confirm.
lumi = {
    2016: 35.867,
    2017: 41.5,
    2018: 59.74,
}

In [6]:
# Integer label stored in the "class" column of each sample.
classes = {"WWewk": 0, "top": 1, "WW": 2}
signal_name = "WWewk"
sample_names = ["WWewk", "top", "WW"]

# Column names carry the cut as a prefix ("<cut>_<variable>"); it is stripped
# on load so the frames expose plain variable names.
cut_prefix = cut + "_"

# Frames are collected per sample name and concatenated once after the loop,
# instead of re-concatenating the growing frame for every file.
frames_by_sample = {}

for samples_dir in samples_dirs:
    # Match the year against the last path component only, so a year string
    # appearing elsewhere in the path cannot apply the luminosity twice.
    dataset_tag = os.path.basename(os.path.normpath(samples_dir))
    year = next((y for y in lumi if str(y) in dataset_tag), None)

    for file in os.listdir(samples_dir):
        file_path = os.path.join(samples_dir, file)
        if os.path.isdir(file_path):
            continue

        # File names look like "<sample>_part<N>..."; keep the sample name.
        sname = file.split("_part")[0]
        print(sname)

        # NOTE(security): pickle.load can execute arbitrary code; only load
        # files produced by a trusted step of this workflow.
        with open(file_path, "rb") as pkl_file:
            s = pickle.load(pkl_file)

        # Strip the cut prefix. Splitting once on the full prefix avoids an
        # IndexError for names that merely contain the cut string, and keeps
        # the whole remainder if the prefix occurs more than once.
        s = s.rename(
            columns=lambda c: c.split(cut_prefix, 1)[1] if cut_prefix in c else c
        )

        s["sample_name"] = sname
        s["class"] = classes[sname]  # KeyError here means an unexpected sample file
        s["signal"] = int(sname == signal_name)

        if year is not None:
            s["year"] = year
            s["weight_"] = s["weight_"] * lumi[year]  # lumi is included

        frames_by_sample.setdefault(sname, []).append(s)

# A sample read from a single file keeps its frame untouched; several files
# are merged with a fresh index.
samples = {
    name: frames[0] if len(frames) == 1 else pd.concat(frames, ignore_index=True)
    for name, frames in frames_by_sample.items()
}

WW
WWewk
top
WW
WWewk
top
WW
WWewk
top


In [7]:
# Per-sample summary: number of rows and the sum of the event weights.
for s, df in samples.items():
    n_events = len(df)
    weight_sum = df["weight_"].sum()
    print(f"Sample name {s:10}, nsamples: {n_events:10},   XS total: {weight_sum:15}")

Sample name WW        , nsamples:       7449,   XS total: 960.0418934983124
Sample name WWewk     , nsamples:       2630,   XS total: 87.65963632493688
Sample name top       , nsamples:      51376,   XS total: 13342.675052833958


In [8]:
from sklearn.utils.class_weight import compute_sample_weight, compute_class_weight
from sklearn.preprocessing import LabelEncoder 

In [9]:
# Every sample that is not the signal is merged into one background frame.
bkg_names = [name for name in sample_names if name != signal_name]
bkg_list = [samples[bkg_name] for bkg_name in bkg_names]

background = pd.concat(bkg_list, ignore_index=True)

# Look the signal up through signal_name rather than a second hard-coded key,
# and copy it so that columns added to `signal` later do not silently modify
# the frame stored in `samples`.
signal = samples[signal_name].copy()

# Balancing

### Normalization by background events

### Normalization by signal events

### What I was doing: normalizing signal and background weights to unit mean

In [10]:
# Unweighted event counts of the two categories.
n_sig_events = len(signal)
n_bkg_events = len(background)
print("len_sig = ", n_sig_events)
print("len_bkg = ", n_bkg_events)

ratio_neve_bkgsignal = n_bkg_events / n_sig_events
print("\nlenB/lenS = ", round(ratio_neve_bkgsignal, 2))

# Weighted yields (sum of event weights) of the two categories.
nS = signal["weight_"].sum()
nB = background["weight_"].sum()
print("\nnS = ", int(nS))
print("nB = ", int(nB))

# Rescale each category so that its weights average to one:
# signal divides by its mean weight, background multiplies by N / sum(w).
signal["weight_norm"] = signal["weight_"] / signal["weight_"].mean()
background["weight_norm"] = background["weight_"] * n_bkg_events / nB
print("\nMean of signal weights: ", signal["weight_norm"].mean())
print("Mean of bkg weights: ", background["weight_norm"].mean())

len_sig =  2630
len_bkg =  58825

lenB/lenS =  22.37

nS =  87
nB =  14302

Mean of signal weights:  0.9999999999999998
Mean of bkg weights:  0.9999999999999999


# Save signal and bkg samples

In [11]:
import os

# Persist the balanced frames under <output_dir>/for_training.
training_dir = os.path.join(output_dir, "for_training")
os.makedirs(training_dir, exist_ok=True)

# Context managers make sure each file is flushed and closed once written,
# instead of leaving the handle open until garbage collection.
with open(os.path.join(training_dir, "background_balanced.pkl"), "wb") as bkg_file:
    pickle.dump(background, bkg_file)
with open(os.path.join(training_dir, "signal_balanced.pkl"), "wb") as sig_file:
    pickle.dump(signal, sig_file)

In [12]:
#v7 top+WW
# Hard-coded signal / background numbers, in total and per year, for a "low"
# and a "high" category.
# NOTE(review): where these numbers come from is not visible in this
# notebook -- presumably yields of the two regions; confirm.

# "low" category: total, then 2016 / 2017 / 2018
low_sig, low_bkg = 225, 7155
low_sig_2016, low_bkg_2016 = 46, 1929
low_sig_2017, low_bkg_2017 = 67, 2058
low_sig_2018, low_bkg_2018 = 113, 3168

# "high" category: total, then 2016 / 2017 / 2018
high_sig, high_bkg = 93, 6063
high_sig_2016, high_bkg_2016 = 18, 1582
high_sig_2017, high_bkg_2017 = 28, 1808
high_sig_2018, high_bkg_2018 = 47, 2672

## v6
# Reference totals that the loaded signal / background row counts are
# compared against.
sig_tot = 51435
bkg_tot = 1733357

# The message below is Italian for "These percentages are used for the DNN ROC".
# The printed values are fractions rounded to three decimals.
print("Queste percentuali vengono utilizzate per la ROC della DNN")
sig_fraction = round(len(signal) / sig_tot, 3)
print("% signal = ", sig_fraction)
bkg_fraction = round(len(background) / bkg_tot, 3)
print("% bkg = ", bkg_fraction)