# BDT - Background and Signal distribution

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pandas as pd
import math

signal_all = pd.DataFrame()
signal_mass = [300, 420, 440, 460, 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 2000]
for each in signal_mass:
    df_temp = pd.read_csv(str(each) + ".csv", index_col=0)
    df_temp["mass"] = each
    df_temp.drop(columns=["nTags", "MCChannelNumber", "mVHres"], inplace=True)
    signal_all = pd.concat([df_temp, signal_all], ignore_index=True)
    
signal_one = pd.read_csv('500.csv', index_col=0)
signal_one.drop(["nTags", "MCChannelNumber", "mVHres"], axis=1, inplace=True)
    
background = pd.read_csv("background.csv", index_col=0)
background.drop(["nTags", "MCChannelNumber", "mVHres"], axis=1, inplace=True)

background = background[background["weight"]>0 ] #Only positive Bkg Weight

#### train_test_split on signals and bkg separately:

In [2]:
train_bkg, test_bkg = train_test_split(background, test_size=0.4, random_state=2) #splitting bkg into train and test
train_signal, test_signal = train_test_split(signal_all, test_size=0.4, random_state=2) #splitting signal into train and test

#### Reweighting signals so they are all equal

In [3]:
val_bkg, test_bkg = train_test_split(test_bkg, test_size=0.5, random_state=2)
val_signal, test_signal = train_test_split(test_signal, test_size=0.5, random_state=2)

train_bkg.loc[:, "weight"] = np.abs(train_bkg["weight"])
val_bkg.loc[:, "weight"] = np.abs(val_bkg["weight"])
train_signal.loc[:, "weight"] = np.sum(train_bkg["weight"])/np.sum(train_signal["weight"])
for each in signal_mass:
        val_signal.loc[val_signal["mass"]==each].loc[:, "weight"] = np.sum(val_bkg["weight"])/np.sum(val_signal.loc[val_signal["mass"]==each]["weight"])/len(signal_mass)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


#### Change the weights of training signal so the sum is the same as that of bkg:

In [4]:
train_signal_weight = train_signal["weight"] * np.sum(train_bkg["weight"])/np.sum(train_signal["weight"])

test_signal_weight = signal_one["weight"] * np.sum(test_bkg["weight"])/np.sum(signal_one["weight"])
    
test_bkg_weight = test_bkg["weight"].to_numpy()

train_bkg_weight = train_bkg["weight"].to_numpy()

#### Combine train_signal and train_bkg:

In [5]:
train_y = len(train_bkg) * [0] + len(train_signal) * [1]
test_y = len(test_bkg) * [0] + len(signal_one) * [1] 

train_x = pd.concat([train_bkg, train_signal], ignore_index=True) 
test_x = pd.concat([test_bkg, signal_one], ignore_index=True) 

#### Assign the weights to a different variable and drop weights from train_x 

In [6]:
train_weight = train_x["weight"].to_numpy()
test_weight = test_x["weight"].to_numpy()

train_x.drop(["weight", "mass"], axis=1, inplace=True)
test_x.drop(["weight"], axis=1, inplace=True)

#### Using GridSearchCV to optimise the BDT

In [7]:
def test_gridsearch():
    BDT_clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), algorithm="SAMME")
    parameters = {'n_estimators': [300,500,700],  
                  'base_estimator__max_depth': [2, 3, 4], 
                  'learning_rate':[0.5, 1, 1.5]} 
    
    grid = GridSearchCV(BDT_clf, parameters, n_jobs=8)
    grid.fit(train_x, train_y, train_weight)
    best_parameters = grid.best_params_
    print(best_parameters) 
    
    with open("Sig_GridSearch_Weighted_500.txt", "a") as output:
        output.write(str(best_parameters))    
    
test_gridsearch()

BlockingIOError: [Errno 11] Resource temporarily unavailable