In [1]:
%matplotlib inline
import numpy as np
import glob
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import random
import os
from sklearn.preprocessing import label_binarize
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from evolutionary_search import EvolutionaryAlgorithmSearchCV
from sklearn.model_selection import StratifiedKFold
import time



In [2]:
# Module-level scaler objects created once for the notebook.
from sklearn import preprocessing
# Standardising scaler; not referenced by any other cell visible in this section.
scale = StandardScaler()
# Min-max scaler instance that scaleColumns() fits and applies column by column.
min_max_scaler = preprocessing.MinMaxScaler()
def scaleColumns(df, cols_to_scale, scaler=None):
    """Rescale the selected columns of ``df`` in place, one column at a time.

    Every column is fitted and transformed independently, so each one ends up
    on the scaler's own output range regardless of the other columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated and also returned.
    cols_to_scale : iterable
        Labels of the columns to rescale.
    scaler : object, optional
        Anything exposing ``fit_transform(frame)`` that returns one value per
        row. Defaults to the module-level ``min_max_scaler``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the requested columns rescaled.
    """
    if scaler is None:
        scaler = min_max_scaler
    for col in cols_to_scale:
        # Write back a plain 1-D array so the values are placed by position.
        # Wrapping the result in a new DataFrame gives it a 0..n-1 index, and
        # pandas would then align on labels, producing NaN or misplaced values
        # whenever ``df`` has any other index (e.g. a shuffled split).
        df[col] = np.asarray(scaler.fit_transform(df[[col]])).ravel()
    return df

In [3]:
# Read the signal and background prediction tables (comma-separated) in one pass.
signal, background = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        'process_data/signal_pred.csv',
        'process_data/background_pred.csv',
    )
)

In [4]:
# Add the binary target column: 1 for rows loaded from the signal file,
# 0 for rows loaded from the background file.
signal['signal'] = 1
background['signal'] = 0

In [5]:
# Stack signal rows on top of background rows; ignore_index=True replaces the
# per-file row labels with a fresh 0..n-1 index.
analysis = pd.concat([signal, background], ignore_index=True)

In [6]:
# Preview the first rows of the combined frame.
analysis.head()

Unnamed: 0.1,Unnamed: 0,pt_j1,m_j1,eta_j1,phi_j1,E_j1,pt_j2,m_j2,eta_j2,phi_j2,...,deltaR2_sj23,deltaR2_sj24,deltaR2_sj34,n_subjets1,n_subjets2,event_idx,img_name,P_BG,P_SIG,signal
0,0,1210.415787,129.499352,-0.744836,-2.883347,1567.3453,1091.785816,155.362262,1.060534,0.264977,...,0.0,0.0,0.0,1,1,4,4.png,0.378957,0.621043,1
1,1,1787.625573,99.168898,-0.934612,1.185407,2628.753873,1717.64994,515.054419,-1.044915,-1.955165,...,2.512072,2.512072,0.0,1,2,8,8.png,0.027786,0.972214,1
2,2,1368.776255,104.840764,-1.13636,1.353935,2354.162865,1283.549433,452.024576,0.053019,-1.781479,...,1.015464,1.926491,2.05649,1,3,19,19.png,0.288431,0.711569,1
3,3,1659.355146,110.390275,-0.348487,2.47501,1764.594154,1634.435917,487.949083,0.102619,-0.649581,...,0.452664,0.452664,0.0,1,2,24,24.png,0.09352,0.90648,1
4,4,1827.270115,551.510487,0.208524,-0.979991,1946.889764,1799.302155,108.942106,-0.452693,2.177153,...,0.0,0.0,0.0,2,1,33,33.png,0.027147,0.972853,1


In [7]:
# List the column labels of the combined frame.
analysis.keys()

Index(['Unnamed: 0', 'pt_j1', 'm_j1', 'eta_j1', 'phi_j1', 'E_j1', 'pt_j2',
       'm_j2', 'eta_j2', 'phi_j2', 'E_j2', 'deltaeta', 'deltaphi', 'mEratio1',
       'mEratio2', 'm_jj', 'pt_asym', 'deltaR1_sj12', 'deltaR1_sj13',
       'deltaR1_sj14', 'deltaR1_sj23', 'deltaR1_sj24', 'deltaR1_sj34',
       'deltaR2_sj12', 'deltaR2_sj13', 'deltaR2_sj14', 'deltaR2_sj23',
       'deltaR2_sj24', 'deltaR2_sj34', 'n_subjets1', 'n_subjets2', 'event_idx',
       'img_name', 'P_BG', 'P_SIG', 'signal'],
      dtype='object')

In [8]:
# Remove bookkeeping columns that must not reach the classifier: the event id,
# the image file name, and every 'Unnamed: ...' column left behind by saving a
# CSV together with its index (the head() preview shows both 'Unnamed: 0.1'
# and 'Unnamed: 0'). Rebinding instead of inplace=True, together with
# errors='ignore', keeps the cell safe to re-run.
analysis = analysis.drop(
    columns=['event_idx', 'img_name']
    + [col for col in analysis.columns if str(col).startswith('Unnamed:')],
    errors='ignore',
)

In [9]:
# Preview the frame again after the column drop.
analysis.head()

Unnamed: 0,pt_j1,m_j1,eta_j1,phi_j1,E_j1,pt_j2,m_j2,eta_j2,phi_j2,E_j2,...,deltaR2_sj13,deltaR2_sj14,deltaR2_sj23,deltaR2_sj24,deltaR2_sj34,n_subjets1,n_subjets2,P_BG,P_SIG,signal
0,1210.415787,129.499352,-0.744836,-2.883347,1567.3453,1091.785816,155.362262,1.060534,0.264977,1772.340209,...,1.096969,1.096969,0.0,0.0,0.0,1,1,0.378957,0.621043,1
1,1787.625573,99.168898,-0.934612,1.185407,2628.753873,1717.64994,515.054419,-1.044915,-1.955165,2791.763302,...,2.013673,2.013673,2.512072,2.512072,0.0,1,2,0.027786,0.972214,1
2,1368.776255,104.840764,-1.13636,1.353935,2354.162865,1283.549433,452.024576,0.053019,-1.781479,1362.520063,...,0.517361,1.698,1.015464,1.926491,2.05649,1,3,0.288431,0.711569,1
3,1659.355146,110.390275,-0.348487,2.47501,1764.594154,1634.435917,487.949083,0.102619,-0.649581,1713.973621,...,0.778878,0.778878,0.452664,0.452664,0.0,1,2,0.09352,0.90648,1
4,1827.270115,551.510487,0.208524,-0.979991,1946.889764,1799.302155,108.942106,-0.452693,2.177153,1989.823176,...,2.22321,2.22321,0.0,0.0,0.0,2,1,0.027147,0.972853,1


In [13]:
# Separate the target vector from the feature matrix.
y = analysis['signal']
X = analysis.drop(columns=['signal'])

In [15]:
# Shape of the label vector.
y.shape

(93383,)

In [None]:
# 80/20 hold-out split. random_state pins the shuffle so a kernel restart gives
# the same partition (42 matches the seed used for the estimators), and
# stratify=y keeps the signal/background ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Hyper-parameter search space for the boosted decision tree.
# Plain keys are AdaBoost settings; keys prefixed with `base_estimator__`
# name settings of the wrapped tree.
param_grid_BDT = dict(
    n_estimators=[50, 150, 300, 700],
    learning_rate=[1.0, 0.1, 0.01],
    algorithm=['SAMME', 'SAMME.R'],
    base_estimator__max_depth=[1, 3, 5],
    base_estimator__criterion=['gini', 'entropy'],
    base_estimator__splitter=['random'],
)

In [None]:
# Tree used as the boosting base learner; seeded with 42, min_samples_leaf
# passed explicitly as 1.
dt = DecisionTreeClassifier(random_state=42,min_samples_leaf=1)
# AdaBoost ensemble built around `dt`, seeded with the same value. This is the
# estimator handed to the hyper-parameter search.
clf = AdaBoostClassifier(dt, random_state=42)

In [None]:
# Evolutionary hyper-parameter search over param_grid_BDT, timed end to end.

# Seed the stdlib generator so repeated runs explore the same individuals
# (the sklearn-deap README seeds `random` for the same purpose).
random.seed(42)

# time.clock() was removed in Python 3.8 and, where it existed, reported CPU
# time of this process only, which ignores the n_jobs worker processes.
# perf_counter() measures elapsed wall-clock time.
time_start = time.perf_counter()
cv = EvolutionaryAlgorithmSearchCV(estimator=clf,
                               params=param_grid_BDT,
                               scoring="accuracy",
                               cv=StratifiedKFold(n_splits=4),
                               verbose=1,
                               population_size=50,
                               gene_mutation_prob=0.3,
                               gene_crossover_prob=0.5,
                               tournament_size=3,
                               generations_number=3,
                               n_jobs=4)

cv.fit(X_train,y_train)
time_elapsed = time.perf_counter() - time_start

# Break the elapsed seconds into hours / minutes / seconds.
hours, rem = divmod(time_elapsed, 3600)
minutes, seconds = divmod(rem, 60)
total_time = {'hours': hours,
             'minutes':minutes,
             'seconds':seconds}

print('****Results****')
acc = cv.best_score_
best_par = cv.best_params_
print("Accuracy: {:.4%}".format(acc))

In [None]:
# Show the best hyper-parameter combination found by the search.
cv.best_params_

Best individual is: {'n_estimators': 700, 'learning_rate': 0.1, 'algorithm': 'SAMME', 'base_estimator__max_depth': 5, 'base_estimator__criterion': 'gini', 'base_estimator__splitter': 'random'}
with fitness: 0.9478890584424277
****Results****
Accuracy: 94.7889%

Best individual is: {'n_estimators': 700, 'learning_rate': 0.1, 'algorithm': 'SAMME.R', 'base_estimator__max_depth': 3, 'base_estimator__criterion': 'entropy', 'base_estimator__splitter': 'random'}
with fitness: 0.9538591277809011
****Results****
Accuracy: 95.3859%