# BDT - Background and Signal distribution

In [4]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
import pandas as pd
import math

signal = pd.read_csv('.//data//500.csv', index_col=0)
background = pd.read_csv('.//data//background.csv', index_col=0)

signal.drop(["nTags", "MCChannelNumber", "mVHres"], axis=1, inplace=True)
background.drop(["nTags", "MCChannelNumber", "mVHres"], axis=1, inplace=True)

train_bkg, test_bkg = train_test_split(background, test_size=0.4, random_state=2) #splitting bkg into train and test
train_signal, test_signal = train_test_split(signal, test_size=0.4, random_state=2) #splitting signal into train and test

train_y = len(train_bkg) * [0] + len(train_signal) * [1] #training set
test_y = len(test_bkg) * [0] + len(test_signal) * [1] #test set

train_x = pd.concat([train_bkg, train_signal], ignore_index=True) #training data
test_x = pd.concat([test_bkg, test_signal], ignore_index=True) #testing data

train_weight = train_x["weight"].to_numpy()
test_weight = test_x["weight"].to_numpy()
train_x.drop(["weight"], axis=1, inplace=True)
test_x.drop(["weight"], axis=1, inplace=True)
test_bkg_weight = test_bkg["weight"].to_numpy()
test_signal_weight = test_signal["weight"].to_numpy()
test_bkg.drop(["weight"], axis=1, inplace=True)
test_signal.drop(["weight"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [5]:
def test_BDT(n, d, l):
    BDT_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=d), algorithm="SAMME", 
                                 n_estimators=n, learning_rate=l).fit(train_x, train_y)
    
    bkg = BDT_clf.decision_function(test_bkg)
    signal = BDT_clf.decision_function(test_signal)
    
    c_max = 1
    c_min = -1
    
    plt.hist(bkg,bins=10,range=(c_min,c_max), histtype='step', fill=False, density=True)
    plt.hist(signal,bins=10,range=(c_min,c_max), histtype='step', fill=False, density=True)
    
    filename = "n={} d={} l={}".format(n,d,l)
    plt.xlabel('BDT output')
    plt.ylabel('Density')
    plt.title(filename)
    plt.savefig("Figures/{}.png".format(filename))
    
    sighist, _ = np.histogram(signal, bins=bins, weights=test_signal_weight)
    bkghist, _ = np.histogram(bkg, bins=bins, weights=test_bkg_weight)
    portion = 0.4
    logsig = True
    
    backgrounds_content = np.array(bkghist)/portion
    signal_content = np.array(sighist)/portion
    total = 0
    
    if not logsig:
        return sum(signal_content)/math.sqrt(sum(backgrounds_content))

    for each_b, each_s in zip(backgrounds_content, signal_content):
        if each_b > 0 and each_s > 0:
            total += 2 * ((each_s + each_b) * math.log(1 + each_s/each_b) - each_s)
    return math.sqrt(total)

In [20]:
#def significance_binned(backgrounds, signal, logsig=True, portion=1):
#    backgrounds_content = np.array(backgrounds)/portion
#    signal_content = np.array(signal)/portion
#    total = 0

#    if not logsig:
#        return sum(signal_content)/math.sqrt(sum(backgrounds_content))

#    for each_b, each_s in zip(backgrounds_content, signal_content):
 #       if each_b > 0 and each_s > 0:
#            total += 2 * ((each_s + each_b) * math.log(1 + each_s/each_b) - each_s)
#    return math.sqrt(total)

In [None]:
n_testing = [600, 900]
d_testing = 5
l_testing = 1 # A learning_rate above 2 does not work. Error: ValueError: Input contains NaN,
              # infinity or a value too large for dtype('float64').
    
bins = np.linspace(-1,1,20)

for i in n_testing:
    plt.figure()
   # test_BDT(i, d_testing, l_testing)
   # print("Significance for n={}, d=3, l=1 is:".format(i), significance_binned(bkghist, sighist, portion=0.4))
    signif = "Significance for n={}, d=5, l=1 is:".format(i), test_BDT(i, d_testing, l_testing) 
    
    with open("Sig.txt", "a") as output:
        output.write(str(signif) + '\n')