## BDT

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import pandas as pd
import math

In [2]:
signal_all = pd.DataFrame()
signal_mass = [300, 420, 440, 460, 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 2000] #All signals
for each in signal_mass:
    df_temp = pd.read_csv(str(each) + ".csv", index_col=0)
    df_temp.drop(columns=["nTags", "MCChannelNumber", "mVHres"], inplace=True)
    signal_all = pd.concat([df_temp, signal_all], ignore_index=True)
    
signal_500 = pd.read_csv('500.csv', index_col=0) #Want to test 500
signal_500.drop(["nTags", "MCChannelNumber", "mVHres"], axis=1, inplace=True)
    
background = pd.read_csv("background.csv", index_col=0)
background.drop(["nTags", "MCChannelNumber", "mVHres"], axis=1, inplace=True)

background = background[background["weight"]>0 ] #Only positive Bkg Weight

#### Weights of signal and background:

In [3]:
signal_weight = signal_all["weight"] * np.sum(background["weight"])/np.sum(signal_all["weight"])

bkg_weight = background["weight"].to_numpy()

#### Combining background and signal:

In [4]:
bkg_signal = pd.concat([signal_all, background]) 

#### Splitting the data into training and testing:

In [5]:
train_bkg_sig, test_bkg_sig = train_test_split(bkg_signal, test_size=0.4, random_state=2)

In [6]:
train_500, test_500 = train_test_split(signal_500, test_size=0.4, random_state=2)

In [7]:
train_y = len(train_bkg_sig) * [0] + len(train_500) * [1]
test_y = len(test_bkg_sig) * [0] + len(test_500) * [1]

train_x = pd.concat([train_bkg_sig, train_500], ignore_index=True)
test_x = pd.concat([test_bkg_sig, test_500], ignore_index=True)

In [8]:
train_weight = train_x["weight"].to_numpy()
train_x.drop(columns=["weight"], inplace=True)
test_weight = test_x["weight"].to_numpy()
test_x.drop(columns=["weight"], inplace=True)

In [9]:
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.fit_transform(test_x)

In [10]:
BDT_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", 
                                 n_estimators=50, learning_rate=1)

#### This is where I get an error:

In [11]:
bkg_train = BDT_clf.fit(train_x,train_y,bkg_weight)
signal_train = BDT_clf.fit(train_x,train_y,signal_weight)
bkg_test = BDT_clf.fit(test_x,test_y,bkg_weight)
signal_test = BDT_clf.fit(test_x,test_y,signal_weight)

ValueError: sample_weight.shape == (559821,), expected (587558,)!

#### Plots and significance:

In [None]:
c_max = 1
c_min = -1

bins = np.linspace(-1,1,20)

plt.hist(bkg_test,bins=20,range=(c_min,c_max), histtype='step', fill=False, density=True, color ='dodgerblue', 
            label="Testing Bkg & Sig")
plt.hist(signal_test,bins=20,range=(c_min,c_max), histtype='step', fill=False, density=True, color ='orange', 
         label="Testing Signal (500)")
plt.hist(bkg_train,bins=20,range=(c_min,c_max), histtype='step', fill=False, density=True, color ='limegreen', 
            label="Training Bkg & Sig")
plt.hist(signal_train,bins=20,range=(c_min,c_max), histtype='step', fill=False, density=True, color ='m', 
         label="Training Signal (500)")
        
plt.xlabel('BDT output')
plt.ylabel('Density')
plt.legend(loc="upper left")

sighist, _ = np.histogram(signal_test, bins=bins, weights=signal_weight)
bkghist, _ = np.histogram(bkg_test, bins=bins, weights=bkg_weight)
sighist_train, _ = np.histogram(signal_train, bins=bins, weights=signal_weight)
    bkghist_train, _ = np.histogram(bkg_train, bins=bins, weights=bkg_weight)

In [None]:
portion = 0.4
logsig = True
    
backgrounds_content = np.array(bkghist)/portion
signal_content = np.array(sighist)/portion
backgrounds_content_train = np.array(bkghist_train)/portion
signal_content_train = np.array(sighist_train)/portion
    
total = 0

for each_b, each_s in zip(backgrounds_content, signal_content):
    if each_b > 0 and each_s > 0:
        total += 2 * ((each_s + each_b) * math.log(1 + each_s/each_b) - each_s)
test_signif = "Testing significance is:",math.sqrt(total)

for each_b, each_s in zip(backgrounds_content_train, signal_content_train):
    if each_b > 0 and each_s > 0:
        total += 2 * ((each_s + each_b) * math.log(1 + each_s/each_b) - each_s)
train_signif = "Training significance is:", math.sqrt(total)   
    
print(test_signif)
print(train_signif)