# BDT - Background and Signal distribution

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pandas as pd
import math

# Columns that are dropped from every sample before training.
unused_columns = ["nTags", "MCChannelNumber", "mVHres"]

# Signal mass points; each one is read from "<mass>.csv".
signal_mass = [300, 420, 440, 460, 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 2000]

# Draw the same number of rows from every mass point so that all mass points
# contribute equally to the combined signal sample.  The draw is seeded so
# that re-running the notebook selects the same rows.
signal_subsets = []
for mass in signal_mass:
    mass_df = pd.read_csv(f"{mass}.csv", index_col=0)
    signal_subsets.append(
        mass_df.sample(n=3020, random_state=2).drop(columns=unused_columns)
    )

# Concatenate once instead of growing a DataFrame inside the loop.  The list
# is reversed so the row order matches a loop that prepends each new subset
# (last mass point first).
signal_all = pd.concat(signal_subsets[::-1], ignore_index=True)

# Single mass point (all rows, not a subset) used as the test signal.
# To evaluate a different mass point, read e.g. '300.csv' here instead.
signal_one = pd.read_csv("500.csv", index_col=0).drop(columns=unused_columns)

background = pd.read_csv("background.csv", index_col=0).drop(columns=unused_columns)

# Keep only background events with a strictly positive weight.
background = background[background["weight"] > 0]

#### train_test_split on signals and bkg separately:

In [2]:
# Hold out 40% of each sample for testing.  Background and signal are split
# independently, both with the same fixed seed so the split is reproducible.
split_options = {"test_size": 0.4, "random_state": 2}
train_bkg, test_bkg = train_test_split(background, **split_options)
train_signal, test_signal = train_test_split(signal_all, **split_options)

#### Rescale the signal weights so that their sum equals the background weight sum (training and test separately):

In [3]:
# Summed background weight in each split; the signal weights are rescaled to it.
train_bkg_total = train_bkg["weight"].sum()
test_bkg_total = test_bkg["weight"].sum()

# Training signal: rescaled so its summed weight equals the training background's.
train_signal_weight = train_signal["weight"] * train_bkg_total / train_signal["weight"].sum()

# Test signal: taken from signal_one and normalised to the test background.
test_signal_weight = signal_one["weight"] * test_bkg_total / signal_one["weight"].sum()

# Background weights are kept unchanged, as plain NumPy arrays.
train_bkg_weight = train_bkg["weight"].to_numpy()
test_bkg_weight = test_bkg["weight"].to_numpy()

#### Combine train_signal and train_bkg:

In [4]:
# Features: background rows first, then signal rows, re-indexed from 0.
train_x = pd.concat([train_bkg, train_signal], ignore_index=True)
# NOTE(review): the test signal is signal_one (the full single-mass file), not
# test_signal from the split; rows of that file may also have been sampled
# into the training signal -- confirm this overlap is intended.
test_x = pd.concat([test_bkg, signal_one], ignore_index=True)

# Labels in the same row order as the features: 0 = background, 1 = signal.
train_y = [0] * len(train_bkg) + [1] * len(train_signal)
test_y = [0] * len(test_bkg) + [1] * len(signal_one)

#### Store the event weights in separate arrays and drop the weight column from train_x and test_x

In [5]:
# Per-event training weights, in the same row order as train_x (background
# first, then signal).  The signal part uses the rescaled train_signal_weight
# so that signal and background carry the same total weight in the fit;
# reading train_x["weight"] instead would pick up the unscaled signal weights.
train_weight = np.concatenate([train_bkg_weight, np.asarray(train_signal_weight)])

# NOTE(review): test_weight still holds the raw weights stored in test_x; the
# normalised test_signal_weight is not applied here -- confirm which of the
# two the downstream code expects.
test_weight = test_x["weight"].to_numpy()

# The weight column must not be passed to the classifier as an input feature.
train_x = train_x.drop(columns=["weight"])
test_x = test_x.drop(columns=["weight"])

#### Using GridSearchCV to optimise the BDT

In [None]:
#parameters = {'n_estimators': [50,100,300,500,700,900,1000,1500,2000,2500,3000],  
#              'base_estimator__max_depth': [1, 2, 3, 4, 5], 
#              'learning_rate':[0.1, 0.3, 0.5, 1, 1.5,2]} 

In [None]:
def test_gridsearch():
    """Run a small grid search over the AdaBoost BDT and print the best parameters.

    Reads the module-level ``train_x``, ``train_y`` and ``train_weight``.
    The grid is deliberately tiny (2 x 2 x 2 combinations) so that the search
    finishes quickly. Returns nothing; the best parameter combination is
    printed.
    """
    # NOTE(review): scikit-learn 1.2 renamed `base_estimator` to `estimator`
    # (the old name was removed in 1.4).  When upgrading, change the keyword
    # here and the 'base_estimator__max_depth' grid key together.
    BDT_clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), algorithm="SAMME")
    parameters = {
        'n_estimators': [50, 100],
        'base_estimator__max_depth': [1, 2],
        'learning_rate': [0.1, 0.3],
    }
    grid = GridSearchCV(BDT_clf, parameters)
    # The weights must be passed by keyword.  A third positional argument to
    # GridSearchCV.fit is either taken as `groups` (older scikit-learn, so the
    # weights are silently ignored) or rejected with a TypeError (newer
    # versions).  As `sample_weight` they are forwarded to the classifier's
    # fit for every cross-validation fold.
    grid.fit(train_x, train_y, sample_weight=train_weight)
    print(grid.best_params_)


test_gridsearch()