In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

X = np.genfromtxt('../new_data/X_word2vec.csv', delimiter=',')
Y = np.genfromtxt('../new_data/Y.csv', delimiter=',')

In [2]:
X.shape

(100001, 100)

In [4]:
X = X[1:]
X.shape

(100000, 100)

In [5]:
Y.shape

(100001,)

In [6]:
Y[0]

0.0

In [7]:
Y = Y[1:]
Y.shape

(100000,)

In [9]:
X[0]

array([ 0.13919652,  0.34522787,  0.25230184, -0.0769003 ,  0.16374893,
        0.20409723,  0.23620039, -0.22331832,  0.5665915 , -0.00426637,
        0.17732595, -0.23525636, -0.11313281, -0.10409188,  0.45694837,
       -0.10380501, -0.03387303,  0.04110739,  0.34642172,  0.0021829 ,
        0.56319404, -0.36089084, -0.10861673,  0.44213843,  0.04636109,
       -0.094379  , -0.1529754 , -0.33557013, -0.24949755, -0.03063167,
        0.19823845,  0.26017118,  0.46327618, -0.04559801, -0.5678514 ,
       -0.58161813, -0.19105203, -0.07561374, -0.20636106, -0.04571686,
       -0.02558777, -0.35042024, -0.51668215, -0.13987066,  0.01888408,
       -0.32288697,  0.15072297,  0.03304933, -0.22576384,  0.10634459,
       -0.29640242, -0.11779431,  0.32316992, -0.02969029,  0.08544739,
        0.49210298, -0.07357363, -0.3178645 ,  0.5132642 ,  0.14494598,
        0.02008446, -0.38770854,  0.14228489,  0.26731256,  0.26943806,
       -0.04936242,  0.3703746 ,  0.25799286,  0.25105253,  0.57

In [10]:
X_comp, X_test, Y_comp, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
Xtr, Xva, Ytr, Yva = train_test_split(X_comp, Y_comp, test_size=0.2)

## Run a baseline model without tuning

In [11]:
import time

gb_base = GradientBoostingClassifier()

print('training start')
starting_time = time.time()
gb_base.fit(Xtr, Ytr)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

roc = roc_auc_score(Yva, gb_base.predict_proba(Xva)[:,1])
print("ROC AUC score:\t", roc)

print("training error:\t", gb_base.score(Xtr, Ytr))
print("validation error:\t ", gb_base.score(Xva, Yva))

training start
training finished, took 79.82819199562073 seconds
ROC AUC score:	 0.8215001112911763
training error:	 0.757671875
validation error:	  0.7400625


In [12]:
roc = roc_auc_score(Y_test, gb_base.predict_proba(X_test)[:,1])
print("ROC AUC score on Test data:\t", roc)

ROC AUC score on Test data:	 0.8255387284271801


## Grid Search

In [13]:
from sklearn.model_selection import GridSearchCV

parameters1 = {"learning_rate": [0.01, 0.05, 0.075, 0.1,0.2],
               "n_estimators":[10, 100, 250, 500, 1000, 1250, 1500, 1750]}

parameters2 = {"max_depth":[3,5,8]}

parameters3 = {"min_samples_split": [2,6,10,20,40,60],
               "min_samples_leaf": [1,3,5,7,9]}

parameters4 = {"max_features":["log2","sqrt"]}

parameters5 = {"subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0]}

### 1. Tuning learning_rate and n_estimators

In [14]:
clf = GridSearchCV(GradientBoostingClassifier(loss="deviance"), parameters1, cv=5)

print('training start')
starting_time = time.time()
clf.fit(Xtr[:5000], Ytr[:5000])
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

training start
training finished, took 4162.911555767059 seconds


In [15]:
print('best_estimator:\t', clf.best_estimator_)
print('best_score:\t', clf.best_score_)
print('best_params:\t', clf.best_params_)
print('best_index:\t', clf.best_index_)
print('scorer:\t', clf.scorer_)
print('cv_results:\t', clf.cv_results_)
print('refit_time:\t', clf.refit_time_)

best_estimator:	 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=250,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
best_score:	 0.7426
best_params:	 {'learning_rate': 0.1, 'n_estimators': 250}
best_index:	 26
scorer:	 <function _passthrough_scorer at 0x1a1ce9c710>
cv_results:	 {'mean_fit_time': array([ 0.30441604,  2.80691099,  6.76570477, 13.35367575, 25.79758997,
       32.42334356, 39.51852546, 40.57986403,  0.26379

In [16]:
gradient_boosting_classifier_roc = roc_auc_score(
   Yva, clf.predict_proba(Xva)[:,1])
print("AUC: ", gradient_boosting_classifier_roc)

print("training error:", 1 - clf.score(Xtr, Ytr))
print("validation error:", 1 - clf.score(Xva, Yva))

AUC:  0.8148767587158697
training error: 0.2518125
validation error: 0.2681875


### 2. Tuning max_depth

In [18]:
clf = GridSearchCV(GradientBoostingClassifier(loss="deviance", learning_rate=0.1, 
                                              n_estimators=250), parameters2, cv=5)

print('training start')
starting_time = time.time()
clf.fit(Xtr[:5000], Ytr[:5000])
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

training start
training finished, took 230.16340517997742 seconds


In [19]:
print('best_estimator:\t', clf.best_estimator_)
print('best_score:\t', clf.best_score_)
print('best_params:\t', clf.best_params_)
print('best_index:\t', clf.best_index_)
print('scorer:\t', clf.scorer_)
print('cv_results:\t', clf.cv_results_)
print('refit_time:\t', clf.refit_time_)

best_estimator:	 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=250,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
best_score:	 0.7426
best_params:	 {'max_depth': 3}
best_index:	 0
scorer:	 <function _passthrough_scorer at 0x1a1ce9c710>
cv_results:	 {'mean_fit_time': array([ 7.89431405, 12.6886364 , 23.94322276]), 'std_fit_time': array([0.22272457, 1.32708582, 0.95811688]), 'mean_score_time': array([0.00571175, 0.00680

In [20]:
gradient_boosting_classifier_roc = roc_auc_score(
   Yva, clf.predict_proba(Xva)[:,1])
print("AUC: ", gradient_boosting_classifier_roc)

print("training error:", 1 - clf.score(Xtr, Ytr))
print("validation error:", 1 - clf.score(Xva, Yva))

AUC:  0.8148851819334516
training error: 0.2518125
validation error: 0.26831249999999995


### 3. Tuning min_samples_split and min_samples_leaf

In [21]:
clf = GridSearchCV(GradientBoostingClassifier(loss="deviance", learning_rate=0.1, 
                                              n_estimators=250, max_depth=3), parameters3, cv=5)

print('training start')
starting_time = time.time()
clf.fit(Xtr[:5000], Ytr[:5000])
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

training start
training finished, took 895.4347591400146 seconds


In [22]:
print('best_estimator:\t', clf.best_estimator_)
print('best_score:\t', clf.best_score_)
print('best_params:\t', clf.best_params_)
print('best_index:\t', clf.best_index_)
print('scorer:\t', clf.scorer_)
print('cv_results:\t', clf.cv_results_)
print('refit_time:\t', clf.refit_time_)

best_estimator:	 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=250,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
best_score:	 0.7428
best_params:	 {'min_samples_leaf': 1, 'min_samples_split': 2}
best_index:	 0
scorer:	 <function _passthrough_scorer at 0x1a1ce9c710>
cv_results:	 {'mean_fit_time': array([6.29438066, 6.3808579 , 6.35299459, 5.57460928, 5.61101842,
       5.49739528, 5.54592533, 5.53019524, 6.20381484, 6

In [23]:
gradient_boosting_classifier_roc = roc_auc_score(
   Yva, clf.predict_proba(Xva)[:,1])
print("AUC: ", gradient_boosting_classifier_roc)

print("training error:", 1 - clf.score(Xtr, Ytr))
print("validation error:", 1 - clf.score(Xva, Yva))

AUC:  0.8148286885539365
training error: 0.251765625
validation error: 0.2684375


### 4. Tuning max_features

In [24]:
clf = GridSearchCV(GradientBoostingClassifier(loss="deviance", learning_rate=0.1, 
                                              n_estimators=250, max_depth=3,
                                              min_samples_split=2,min_samples_leaf=1), parameters4, cv=5)

print('training start')
starting_time = time.time()
clf.fit(Xtr[:5000], Ytr[:5000])
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

training start
training finished, took 8.63338017463684 seconds


In [25]:
print('best_estimator:\t', clf.best_estimator_)
print('best_score:\t', clf.best_score_)
print('best_params:\t', clf.best_params_)
print('best_index:\t', clf.best_index_)
print('scorer:\t', clf.scorer_)
print('cv_results:\t', clf.cv_results_)
print('refit_time:\t', clf.refit_time_)

best_estimator:	 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=250,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
best_score:	 0.7358
best_params:	 {'max_features': 'sqrt'}
best_index:	 1
scorer:	 <function _passthrough_scorer at 0x1a1ce9c710>
cv_results:	 {'mean_fit_time': array([0.61297674, 0.89302468]), 'std_fit_time': array([0.0221916 , 0.01699773]), 'mean_score_time': array([0.00444722, 0.004916  ]), 'std_score

In [26]:
gradient_boosting_classifier_roc = roc_auc_score(
   Yva, clf.predict_proba(Xva)[:,1])
print("AUC: ", gradient_boosting_classifier_roc)

print("training error:", 1 - clf.score(Xtr, Ytr))
print("validation error:", 1 - clf.score(Xva, Yva))

AUC:  0.8182076802585363
training error: 0.250484375
validation error: 0.26356250000000003


### 5. Tuning subsample

In [27]:
clf = GridSearchCV(GradientBoostingClassifier(loss="deviance", learning_rate=0.1, 
                                              n_estimators=250, max_depth=3,
                                              min_samples_split=2,min_samples_leaf=1,
                                              max_features='sqrt'), parameters5, cv=5)

print('training start')
starting_time = time.time()
clf.fit(Xtr[:5000], Ytr[:5000])
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

training start
training finished, took 30.260679960250854 seconds


In [28]:
print('best_estimator:\t', clf.best_estimator_)
print('best_score:\t', clf.best_score_)
print('best_params:\t', clf.best_params_)
print('best_index:\t', clf.best_index_)
print('scorer:\t', clf.scorer_)
print('cv_results:\t', clf.cv_results_)
print('refit_time:\t', clf.refit_time_)

best_estimator:	 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=250,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=0.5, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
best_score:	 0.7406
best_params:	 {'subsample': 0.5}
best_index:	 0
scorer:	 <function _passthrough_scorer at 0x1a1ce9c710>
cv_results:	 {'mean_fit_time': array([0.75178308, 0.78481722, 0.84637012, 0.85710168, 0.87065721,
       0.93545694, 0.79726086]), 'std_fit_time': array([0.02164824, 0.06138873, 0.0

In [29]:
gradient_boosting_classifier_roc = roc_auc_score(
   Yva, clf.predict_proba(Xva)[:,1])
print("AUC: ", gradient_boosting_classifier_roc)

print("training error:", 1 - clf.score(Xtr, Ytr))
print("validation error:", 1 - clf.score(Xva, Yva))

AUC:  0.8190869141501758
training error: 0.25396874999999997
validation error: 0.26349999999999996


### Final GB model after gridsearch

In [31]:
gb_grid = GradientBoostingClassifier(loss="deviance", learning_rate=0.1, 
                                     n_estimators=250, max_depth=3,
                                     min_samples_split=2,min_samples_leaf=1,
                                     max_features='sqrt', subsample=0.5)

print('training start')
starting_time = time.time()
gb_grid.fit(Xtr, Ytr)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

gradient_boosting_classifier_roc = roc_auc_score(
   Yva, gb_grid.predict_proba(Xva)[:,1])
print("AUC: ", gradient_boosting_classifier_roc)

print("training acc:", gb_grid.score(Xtr, Ytr))
print("validation acc:", gb_grid.score(Xva, Yva))

training start
training finished, took 17.08224606513977 seconds
AUC:  0.8337655506622237
training acc: 0.76834375
validation acc: 0.7521875


In [32]:
roc = roc_auc_score(Y_test, gb_grid.predict_proba(X_test)[:,1])
print("ROC AUC score on Test data:\t", roc)

ROC AUC score on Test data:	 0.8357213362820417


## Bayesian Optimization

In [33]:
from skopt.space import Integer
from skopt.space import Real
from skopt.utils import use_named_args
from skopt import gp_minimize
from numpy import mean

import warnings
warnings.filterwarnings('ignore')

### 1. Tuning "n_estimators" and "learning_rate"

In [34]:
search_space1 = [Integer(1000, 2000, name='n_estimators'), Real(0.001, 1, name="learning_rate")]

clf1 = GradientBoostingClassifier(loss="deviance",max_features="sqrt")

@use_named_args(search_space1)
def evaluate_model1(**params):
    # something
    clf1.set_params(**params)
    # calculate 5-fold cross validation
    result = cross_val_score(clf1, Xtr[:5000], Ytr[:5000], cv=5, n_jobs=-1, scoring='accuracy')
    # calculate the mean of the scores
    estimate = mean(result)
    return 1.0 - estimate

result1 = gp_minimize(evaluate_model1, search_space1)
print('Best Accuracy: %.3f' % (1.0 - result1.fun))
print('Best Parameters: n_estimators=%d\t learning_rate=%f' % (result1.x[0], result1.x[1]))

Best Accuracy: 0.743
Best Parameters: n_estimators=1000	 learning_rate=0.056248


### 2. Tuning "subsample"

In [35]:
search_space2 = [Real(0.1, 1, name="subsample")]

clf2 = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.056248,
                                  loss="deviance",max_features="sqrt")

@use_named_args(search_space2)
def evaluate_model2(**params):
    # something
    clf2.set_params(**params)
    # calculate 5-fold cross validation
    result = cross_val_score(clf2, Xtr[:5000], Ytr[:5000], cv=5, n_jobs=-1, scoring='accuracy')
    # calculate the mean of the scores
    estimate = mean(result)
    return 1.0 - estimate

result2 = gp_minimize(evaluate_model2, search_space2)
print('Best Accuracy: %.3f' % (1.0 - result2.fun))
print('Best Parameters: subsample=%f' % (result2.x[0]))

Best Accuracy: 0.744
Best Parameters: subsample=0.640235


### 3. Tuning "max_depth"

In [37]:
search_space3 = [Integer(3, 10, name="max_depth")]

clf3 = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.056248,
                                  subsample=0.640235,
                                  loss="deviance",max_features="sqrt")

@use_named_args(search_space3)
def evaluate_model3(**params):
    # something
    clf3.set_params(**params)
    # calculate 5-fold cross validation
    result = cross_val_score(clf3, Xtr[:5000], Ytr[:5000], cv=5, n_jobs=-1, scoring='accuracy')
    # calculate the mean of the scores
    estimate = mean(result)
    return 1.0 - estimate

result3 = gp_minimize(evaluate_model3, search_space3)
print('Best Accuracy: %.3f' % (1.0 - result3.fun))
print('Best Parameters: max_depth=%d' % (result3.x[0]))

Best Accuracy: 0.745
Best Parameters: max_depth=10


### 4. Tuning "min_samples_split" and "min_samples_leaf"

In [38]:
search_space4 = [Integer(2, 60, name="min_samples_split"), Integer(1, 10, name="min_samples_leaf")]

clf4 = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.056248,
                                  subsample=0.640235, max_depth=10,
                                  loss="deviance",max_features="sqrt")

@use_named_args(search_space4)
def evaluate_model4(**params):
    # something
    clf4.set_params(**params)
    # calculate 5-fold cross validation
    result = cross_val_score(clf4, Xtr[:5000], Ytr[:5000], cv=5, n_jobs=-1, scoring='accuracy')
    # calculate the mean of the scores
    estimate = mean(result)
    return 1.0 - estimate

result4 = gp_minimize(evaluate_model4, search_space4)
print('Best Accuracy: %.3f' % (1.0 - result4.fun))
print('Best Parameters: min_samples_split=%d\t min_samples_leaf=%d' % (result4.x[0], result4.x[1]))

Best Accuracy: 0.748
Best Parameters: min_samples_split=2	 min_samples_leaf=10


### Final GB model after Bayesian Optimization

In [39]:
gb_bayeOptimal = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.056248,
                                  subsample=0.640235, max_depth=10, min_samples_split=2, 
                                  min_samples_leaf=10, loss="deviance",max_features="sqrt")

print('training start')
starting_time = time.time()
gb_bayeOptimal.fit(Xtr, Ytr)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

gradient_boosting_classifier_roc = roc_auc_score(
   Yva, gb_bayeOptimal.predict_proba(Xva)[:,1])
print("AUC: ", gradient_boosting_classifier_roc)

print("training acc:", gb_bayeOptimal.score(Xtr, Ytr))
print("validation acc:", gb_bayeOptimal.score(Xva, Yva))

training start
training finished, took 403.4930191040039 seconds
AUC:  0.8421545440391209
training acc: 0.993515625
validation acc: 0.7619375


In [40]:
roc = roc_auc_score(Y_test, gb_bayeOptimal.predict_proba(X_test)[:,1])
print("ROC AUC score on Test data:\t", roc)

ROC AUC score on Test data:	 0.8431092672800585


## Save the best GB model

In [None]:
import joblib

joblib.dump(best, '../new_data/best_gb.sav')