#### This notebook presents an implementation of XGBoost Classifier in Imbalanced dataset.After building the XGBoost classifier it will use HYperOpt Library,to tune various model parameters with the goal of achieving the maximum f1-score for the  classification .As part of model evaluation ,the f1-score metric will be computed.

In [82]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, make_scorer, confusion_matrix, classification_report, precision_recall_curve, plot_precision_recall_curve, average_precision_score, auc
from sklearn.model_selection import train_test_split
import seaborn as sns
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import xgboost as xgb

In [83]:
df=pd.read_csv("./data_transformed.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,...,T21,T22,T23,T24,T25,T26,T27,T28,value,Class
0,169876,-0.611712,-0.769705,-0.149759,-0.224877,2.028577,-2.019887,0.292491,-0.52302,0.358468,...,-0.075208,0.045536,0.380739,0.02344,-2.220686,-0.201146,0.066501,0.22118,1.79,0
1,127467,-0.814682,1.319219,1.329415,0.027273,-0.284871,-0.653985,0.321552,0.435975,-0.704298,...,-0.128619,-0.368565,0.09066,0.401147,-0.261034,0.080621,0.162427,0.059456,1.98,0
2,137900,-0.318193,1.118618,0.969864,-0.127052,0.569563,-0.532484,0.706252,-0.064966,-0.463271,...,-0.305402,-0.774704,-0.123884,-0.495687,-0.018148,0.121679,0.24905,0.092516,0.89,0
3,21513,-1.328271,1.018378,1.775426,-1.574193,-0.117696,-0.457733,0.681867,-0.031641,0.383872,...,-0.220815,-0.419013,-0.239197,0.009967,0.232829,0.814177,0.098797,-0.004273,15.98,0
4,134700,1.276712,0.61712,-0.578014,0.879173,0.061706,-1.472002,0.373692,-0.287204,-0.084482,...,-0.160161,-0.430404,-0.076738,0.258708,0.55217,0.370701,-0.034255,0.041709,0.76,0


In [84]:
df.describe()

Unnamed: 0.1,Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,...,T21,T22,T23,T24,T25,T26,T27,T28,value,Class
count,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,...,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0
mean,142415.251643,-5.4e-05,4e-05,-0.000128,-0.00031,-0.000222,-6.9e-05,-0.000197,5.9e-05,7e-06,...,-0.000216,-1.6e-05,-0.000141,-9.2e-05,2.9e-05,5.9e-05,-6.2e-05,-0.00011,88.33534,0.001738
std,82208.247997,1.959208,1.651774,1.516882,1.415975,1.381363,1.332312,1.23887,1.19567,1.09889,...,0.733862,0.725659,0.625375,0.60557,0.521293,0.482182,0.404187,0.329206,250.290644,0.041651
min,0.0,-56.40751,-72.715728,-48.325589,-5.683171,-113.743307,-26.160506,-43.557242,-73.216718,-13.434066,...,-34.830382,-10.933144,-44.807735,-2.836627,-10.295397,-2.604551,-22.565679,-15.430084,0.0,0.0
25%,71237.5,-0.9206,-0.598572,-0.890549,-0.848853,-0.691687,-0.767719,-0.554134,-0.208488,-0.643141,...,-0.228425,-0.542624,-0.161856,-0.354748,-0.317061,-0.326992,-0.07084,-0.052969,5.6,0.0
50%,142410.0,0.018145,0.065234,0.17985,-0.020404,-0.054246,-0.274187,0.03992,0.022395,-0.051416,...,-0.029499,0.007105,-0.01121,0.041014,0.016554,-0.051901,0.001321,0.011226,22.0,0.0
75%,213601.5,1.315615,0.803611,1.027013,0.742886,0.611863,0.398574,0.57023,0.327472,0.59698,...,0.186207,0.528501,0.147697,0.439415,0.350762,0.240924,0.091077,0.078256,77.21,0.0
max,284806.0,2.45493,22.057729,9.382558,16.875344,34.801666,73.301626,120.589494,20.007208,15.594995,...,27.202839,10.50309,22.528412,4.584549,7.519589,3.517346,31.612198,33.847808,25691.16,1.0


In [85]:
X=df.drop(columns='Class')
Y=df['Class'].values

In [86]:
#splitting a testing set from the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, stratify = Y, random_state = 42)
#splitting a validation set from the training set to tune parameters
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size = 0.20, stratify = Y_train, random_state = 42)


In [87]:
f1_scrore=make_scorer(f1_score)

In [88]:
space = {'eta': hp.uniform("eta", 0.1, 1),
        'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40, 180, 1),
        'reg_lambda' : hp.uniform('reg_lambda', 0, 1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5, 1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': hp.quniform('n_estimators', 100, 200, 10),
        'scale_pos_weight':1
         #'seed':123
        }
       

    

In [89]:
 #defining function to optimize
def objective(space):
    clf = xgb.XGBClassifier(n_estimators = int(space['n_estimators']),       #number of trees to use
                            eta = space['eta'],                              #learning rate
                            max_depth = int(space['max_depth']),             #depth of trees
                            gamma = space['gamma'],                          #loss reduction required to further partition tree
                            reg_alpha = int(space['reg_alpha']),             #L1 regularization for weights
                            reg_lambda = space['reg_lambda'],                #L2 regularization for weights
                            min_child_weight =  space['min_child_weight'], #minimum sum of instance weight needed in child
                            colsample_bytree = space['colsample_bytree'],    #ratio of column sampling for each tree
                            nthread = -1,                                    #number of parallel threads 
                            scale_pos_weight=space['scale_pos_weight'])
    evaluation = [(X_train, Y_train), (X_test, Y_test)]
    
    clf.fit(X_train, Y_train,
            eval_set = evaluation,
            eval_metric="auc",
            early_stopping_rounds = 10,
            verbose = False)

    pred = clf.predict(X_val)
    pred = [1 if i>= 0.5 else 0 for i in pred]
    f1 = f1_score(Y_val, pred)
    print ("SCORE:", f1)
    return {'loss': -f1, 'status': STATUS_OK }

In [91]:
trials = Trials()
seed=123
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 100,
            rstate=np.random.RandomState(seed),
            trials = trials)



SCORE:                                                 
0.0                                                    
SCORE:                                                             
0.7361111111111112                                                 
SCORE:                                                                            
0.0                                                                               
SCORE:                                                                            
0.0                                                                               
SCORE:                                                                            
0.7375886524822695                                                                
SCORE:                                                                            
0.6865671641791046                                                                
SCORE:                                                                            
0.651

In [68]:
print (best)

{'colsample_bytree': 0.8909014503472679, 'eta': 0.8811458404528819, 'gamma': 8.023432926320394, 'max_depth': 17.0, 'min_child_weight': 6.0, 'n_estimators': 170.0, 'reg_alpha': 48.0, 'reg_lambda': 0.23155214850574146}


In [69]:
#initializing XGBoost Classifier with best model parameters
best_clf = xgb.XGBClassifier(n_estimators = int(best['n_estimators']), 
                            eta = best['eta'], 
                            max_depth = int(best['max_depth']), 
                            gamma = best['gamma'], 
                            reg_alpha = int(best['reg_alpha']), 
                            min_child_weight = best['min_child_weight'], 
                            colsample_bytree = best['colsample_bytree'], 
                            nthread = -1)


In [None]:
#fitting XGBoost Classifier with best model parameters to training data
best_clf.fit(X_train, Y_train)


In [None]:
#using the model to predict on the test set
Y_pred = best_clf.predict(X_test)

In [54]:
print('The f1-score on the test data is: {0:.2f}'.format(f1_score(Y_test, Y_pred)))

The f1-score on the test data is: 0.76
