In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import defaultdict
import matplotlib as mpl

mpl.rcParams['figure.figsize'] = (6,6)
mpl.rcParams['figure.dpi'] = 100
mpl.rcParams["image.origin"] = 'lower'

In [2]:
# Samples handled by this notebook, in class-label order: the position of a
# name in this list is the integer label stored in the "class" column.
sample_names = ["WWewk", "top"]
# The first sample is the signal; every other sample is background.
signal_name = sample_names[0]
# Map each sample name to its integer class label.
classes = {name: label for label, name in enumerate(sample_names)}

In [3]:
# Container for the loaded data: sample name -> DataFrame of its events.
samples = dict()

In [4]:
# Location of the pickled samples: <base_dir>/<plot_config>/<cut>/samples/<version>
base_dir = "/eos/home-b/bpinolin/ML_classification/output/"
plot_config, cut, version = "VBSOS", "em_tight", "v3"

In [5]:
# Directory holding the pickled sample chunks for the selected
# plot configuration, cut and version.
versioned_subdir = "samples/" + version
samples_dir = os.path.join(base_dir, plot_config, cut, versioned_subdir)
#output_dir = os.path.join(base_dir, plot_config, cut, "samples/" + version)

In [6]:
# Load every pickled chunk found in `samples_dir`, tag its events with the
# sample name / class label / signal flag, and merge the chunks that belong to
# the same sample into one DataFrame stored in `samples`.
#
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only point `samples_dir` at trusted locations.
cut_prefix = cut + "_"
# Chunks are gathered per sample first and concatenated once afterwards, so
# re-running this cell replaces the entries in `samples` instead of appending
# the same events a second time.
parts_by_sample = defaultdict(list)
for file in os.listdir(samples_dir):
    file_path = os.path.join(samples_dir, file)
    # Sub-directories (e.g. the "for_training" output folder) are not chunks.
    if os.path.isdir(file_path): continue
    # Everything before "_part" in the file name is the sample name.
    sname = file.split("_part")[0]
    print(sname)
    print(file_path)
    # Context manager guarantees the file handle is closed after loading.
    with open(file_path, "rb") as chunk_file:
        s = pickle.load(chunk_file)
    # Strip the "<cut>_" prefix from column names. The test uses the same
    # string as the split, so a column that contains the cut name without the
    # trailing underscore is left untouched instead of raising IndexError.
    s.rename(columns=lambda c: c.split(cut_prefix)[1] if cut_prefix in c else c, inplace=True)
    s["sample_name"] = sname
    # Raises KeyError if the sample name is not listed in `classes`.
    s["class"] = classes[sname]
    s["signal"] = 1 if sname == signal_name else 0
    parts_by_sample[sname].append(s)

for sname, parts in parts_by_sample.items():
    # A single chunk is stored as loaded (index untouched); several chunks are
    # concatenated in listing order with a fresh index.
    samples[sname] = parts[0] if len(parts) == 1 else pd.concat(parts, ignore_index=True)

WWewk
/eos/home-b/bpinolin/ML_classification/output/VBSOS/em_tight/samples/v3/WWewk_part1.pkl
top
/eos/home-b/bpinolin/ML_classification/output/VBSOS/em_tight/samples/v3/top_part1.pkl
top
/eos/home-b/bpinolin/ML_classification/output/VBSOS/em_tight/samples/v3/top_part2.pkl


In [7]:
# Per-sample summary: number of events and the summed event weight scaled by
# 41.5 (presumably the integrated luminosity -- TODO confirm).
for s in samples:
    df = samples[s]
    print(f"Sample name {s:10}, nsamples: {len(df):10},   XS total: {(df.weight_).sum()*41.5 :15}")

Sample name WWewk     , nsamples:      11171,   XS total: 114.85178009853344
Sample name top       , nsamples:     108446,   XS total: 7500.626491892971


In [8]:
from sklearn.utils.class_weight import compute_sample_weight, compute_class_weight
from sklearn.preprocessing import LabelEncoder 

In [9]:
# _ = plt.hist(samples["WWewk"]["weight_"], bins=100, range=(0,0.001))

# Balancing

In [10]:
# Every configured sample that is not the signal counts as background.
bkg_names = [name for name in sample_names if name != signal_name]
# DataFrames of the background samples, in the same order as `bkg_names`.
bkg_list = [samples[bkg_name] for bkg_name in bkg_names]

In [11]:
# Stack all background samples into one DataFrame with a fresh 0..N-1 index.
background = pd.concat(objs=bkg_list, ignore_index=True)

In [12]:
# Select the signal sample through the configured `signal_name` rather than a
# hard-coded key, so this stays consistent with the loading and background
# selection cells if the signal sample is changed.
# Note: this is a reference to the DataFrame stored in `samples`, not a copy;
# columns added to `signal` later also appear in `samples[signal_name]`.
signal = samples[signal_name]

In [13]:
# Ratio of raw (unweighted) event counts, background over signal.
n_bkg_events, n_sig_events = len(background), len(signal)
ratio_neve_bkgsignal = n_bkg_events / n_sig_events
print(ratio_neve_bkgsignal)

9.707814877808612


In [14]:
# Sum of the signal weights after dividing each weight by the mean weight.
sig_weight = signal.weight_
tot_ev_weighted_sig = (sig_weight / sig_weight.mean()).sum()
print("TOT signal weighted events", tot_ev_weighted_sig)

TOT signal weighted events 11171.0


In [15]:
# Factor that scales the background weights so that their sum equals the
# weighted signal total multiplied by the background/signal event-count ratio.
target_bkg_weight_sum = tot_ev_weighted_sig * ratio_neve_bkgsignal
rescale_factor_bkg = target_bkg_weight_sum / background.weight_.sum()
print(rescale_factor_bkg)

600.0177458328797


In [16]:
# Store the normalised training weights in a new "weight_norm" column:
# signal weights are divided by their mean, background weights are multiplied
# by the rescale factor computed above.
signal["weight_norm"] = signal["weight_"].div(signal["weight_"].mean())
background["weight_norm"] = background["weight_"].mul(rescale_factor_bkg)

# Save signal and bkg samples

In [17]:
import os

# Write the balanced background and signal DataFrames as pickles into
# <samples_dir>/for_training, creating the directory if needed.
training_dir = os.path.join(samples_dir, "for_training")
os.makedirs(training_dir, exist_ok=True)
# Context managers make sure each file is flushed and closed once written,
# instead of relying on garbage collection of an anonymous file handle.
with open(os.path.join(training_dir, "background_balanced.pkl"), "wb") as bkg_file:
    pickle.dump(background, bkg_file)
with open(os.path.join(training_dir, "signal_balanced.pkl"), "wb") as sig_file:
    pickle.dump(signal, sig_file)

In [18]:
#plt.hist(background["weight_norm"], bins=100, range=(0, 50))
#plt.yscale("log")

In [19]:
#plt.hist(signal["weight_norm"], bins=100, range=(0, 100))
#plt.yscale("log")