# Gradient Boosting Classifier Training

## Load Data

In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

X = load = scipy.sparse.load_npz('../new_data/X_sparse.npz')
Y = np.genfromtxt('../new_data/Y.csv', delimiter=',')[1:]

In [2]:
X.shape

(100000, 5000)

In [3]:
Y.shape

(100000,)

In [4]:
X_comp, X_test, Y_comp, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
Xtr, Xva, Ytr, Yva = train_test_split(X_comp, Y_comp, test_size=0.2)

In [5]:
Xtr.shape

(64000, 5000)

In [6]:
Xva.shape

(16000, 5000)

In [7]:
X_test.shape

(20000, 5000)

## Run a baseline model without tuning

In [12]:
import time

gb_base = GradientBoostingClassifier()

print('training start')
starting_time = time.time()
gb_base.fit(Xtr, Ytr)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

roc = roc_auc_score(Yva, gb_base.predict_proba(Xva)[:,1])
print("ROC AUC score:\t", roc)

print("training error:\t", gb_base.score(Xtr, Ytr))
print("validation error:\t ", gb_base.score(Xva, Yva))

training start
training finished, took 3.35256290435791 seconds
ROC AUC score:	 0.7671358764009053
training error:	 0.69871875
validation error:	  0.6970625


In [13]:
roc = roc_auc_score(Y_test, gb_base.predict_proba(X_test)[:,1])
print("ROC AUC score on Test data:\t", roc)

ROC AUC score on Test data:	 0.7655406268454239


## Grid Search

In [14]:
from sklearn.model_selection import GridSearchCV

In [15]:
# using a smaller subset (5000) of training data to do gridsearch

In [16]:
parameters1 = {"learning_rate": [0.01, 0.05, 0.075, 0.1,0.2],
               "n_estimators":[10, 100, 250, 500, 1000, 1250, 1500, 1750]}

parameters2 = {"max_depth":[3,5,8]}

parameters3 = {"min_samples_split": [2,6,10,20,40,60],
               "min_samples_leaf": [1,3,5,7,9]}

parameters4 = {"max_features":["log2","sqrt"]}

parameters5 = {"subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0]}

### 1. Tuning learning_rate and n_estimators

In [21]:
clf = GridSearchCV(GradientBoostingClassifier(loss="deviance"), parameters1, cv=5)

print('training start')
starting_time = time.time()
clf.fit(Xtr[:5000], Ytr[:5000])
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

training start
training finished, took 1972.924947977066 seconds


In [22]:
print('best_estimator:\t', clf.best_estimator_)
print('best_score:\t', clf.best_score_)
print('best_params:\t', clf.best_params_)
print('best_index:\t', clf.best_index_)
print('scorer:\t', clf.scorer_)
print('cv_results:\t', clf.cv_results_)
print('refit_time:\t', clf.refit_time_)

best_estimator6:	 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.2, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=1500,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
best_score6:	 0.7072
best_params6:	 {'learning_rate': 0.2, 'n_estimators': 1500}
best_index6:	 38
scorer6:	 <function _passthrough_scorer at 0x1a18dd0710>
cv_results6:	 {'mean_fit_time': array([ 0.12456379,  1.19323473,  3.06718607,  6.17734776, 12.29856892,
       15.15646286, 18.58393478, 21.79404864, 

In [23]:
gradient_boosting_classifier_roc = roc_auc_score(
   Yva, clf.predict_proba(Xva)[:,1])
print("AUC: ", gradient_boosting_classifier_roc)

print("training error:", 1 - clf.score(Xtr, Ytr))
print("validation error:", 1 - clf.score(Xva, Yva))

AUC:  0.7899945892146074
training error: 0.26960937500000004
validation error: 0.2830625


### 2. Tuning max_depth

In [24]:
clf = GridSearchCV(GradientBoostingClassifier(loss="deviance", learning_rate=0.2, 
                                              n_estimators=1500), parameters2, cv=5)

print('training start')
starting_time = time.time()
clf.fit(Xtr[:5000], Ytr[:5000])
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

training start
training finished, took 346.23646211624146 seconds


In [25]:
print('best_estimator:\t', clf.best_estimator_)
print('best_score:\t', clf.best_score_)
print('best_params:\t', clf.best_params_)
print('best_index:\t', clf.best_index_)
print('scorer:\t', clf.scorer_)
print('cv_results:\t', clf.cv_results_)
print('refit_time:\t', clf.refit_time_)

best_estimator:	 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.2, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=1500,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
best_score:	 0.7064
best_params:	 {'max_depth': 3}
best_index:	 0
scorer:	 <function _passthrough_scorer at 0x1a18dd0710>
cv_results:	 {'mean_fit_time': array([18.38099461, 21.22659183, 25.30114784]), 'std_fit_time': array([0.36820228, 0.80794296, 0.93825758]), 'mean_score_time': array([0.0255198 , 0.0417

In [26]:
gradient_boosting_classifier_roc = roc_auc_score(
   Yva, clf.predict_proba(Xva)[:,1])
print("AUC: ", gradient_boosting_classifier_roc)

print("training error:", 1 - clf.score(Xtr, Ytr))
print("validation error:", 1 - clf.score(Xva, Yva))

AUC:  0.7893865809366011
training error: 0.269234375
validation error: 0.28393749999999995


### 3. Tuning min_samples_split and min_samples_leaf

In [27]:
clf = GridSearchCV(GradientBoostingClassifier(loss="deviance", learning_rate=0.2, 
                                              n_estimators=1500, max_depth=3), parameters3, cv=5)

print('training start')
starting_time = time.time()
clf.fit(Xtr[:5000], Ytr[:5000])
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

training start
training finished, took 2490.830836057663 seconds


In [28]:
print('best_estimator:\t', clf.best_estimator_)
print('best_score:\t', clf.best_score_)
print('best_params:\t', clf.best_params_)
print('best_index:\t', clf.best_index_)
print('scorer:\t', clf.scorer_)
print('cv_results:\t', clf.cv_results_)
print('refit_time:\t', clf.refit_time_)

best_estimator:	 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.2, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=6,
                           min_weight_fraction_leaf=0.0, n_estimators=1500,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
best_score:	 0.707
best_params:	 {'min_samples_leaf': 1, 'min_samples_split': 6}
best_index:	 1
scorer:	 <function _passthrough_scorer at 0x1a18dd0710>
cv_results:	 {'mean_fit_time': array([18.27026734, 16.98340802, 18.55240593, 18.29862819, 15.98394623,
       15.76463747, 15.99805918, 16.66479754, 15.85

In [29]:
gradient_boosting_classifier_roc = roc_auc_score(
   Yva, clf.predict_proba(Xva)[:,1])
print("AUC: ", gradient_boosting_classifier_roc)

print("training error:", 1 - clf.score(Xtr, Ytr))
print("validation error:", 1 - clf.score(Xva, Yva))

AUC:  0.7887109163567952
training error: 0.269828125
validation error: 0.28456250000000005


### 4. Tuning max_features

In [31]:
clf = GridSearchCV(GradientBoostingClassifier(loss="deviance", learning_rate=0.2, 
                                              n_estimators=1500, max_depth=3,
                                             min_samples_split=6,min_samples_leaf=1), parameters4, cv=5)

print('training start')
starting_time = time.time()
clf.fit(Xtr[:5000], Ytr[:5000])
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

training start
training finished, took 26.237683057785034 seconds


In [32]:
print('best_estimator:\t', clf.best_estimator_)
print('best_score:\t', clf.best_score_)
print('best_params:\t', clf.best_params_)
print('best_index:\t', clf.best_index_)
print('scorer:\t', clf.scorer_)
print('cv_results:\t', clf.cv_results_)
print('refit_time:\t', clf.refit_time_)

best_estimator:	 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.2, loss='deviance', max_depth=3,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=6,
                           min_weight_fraction_leaf=0.0, n_estimators=1500,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
best_score:	 0.7026
best_params:	 {'max_features': 'sqrt'}
best_index:	 1
scorer:	 <function _passthrough_scorer at 0x1a18dd0710>
cv_results:	 {'mean_fit_time': array([1.96429586, 2.59483891]), 'std_fit_time': array([0.08706508, 0.06227113]), 'mean_score_time': array([0.02735553, 0.026579  ]), 'std_scor

In [33]:
gradient_boosting_classifier_roc = roc_auc_score(
   Yva, clf.predict_proba(Xva)[:,1])
print("AUC: ", gradient_boosting_classifier_roc)

print("training error:", 1 - clf.score(Xtr, Ytr))
print("validation error:", 1 - clf.score(Xva, Yva))

AUC:  0.7879401892042075
training error: 0.273953125
validation error: 0.2820625


### 5. Tuning subsample

In [34]:
clf = GridSearchCV(GradientBoostingClassifier(loss="deviance", learning_rate=0.2, 
                                              n_estimators=1500, max_depth=3,
                                             min_samples_split=6,min_samples_leaf=1,
                                             max_features='sqrt'), parameters5, cv=5)

print('training start')
starting_time = time.time()
clf.fit(Xtr[:5000], Ytr[:5000])
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

training start
training finished, took 98.57253408432007 seconds


In [35]:
print('best_estimator:\t', clf.best_estimator_)
print('best_score:\t', clf.best_score_)
print('best_params:\t', clf.best_params_)
print('best_index:\t', clf.best_index_)
print('scorer:\t', clf.scorer_)
print('cv_results:\t', clf.cv_results_)
print('refit_time:\t', clf.refit_time_)

best_estimator:	 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.2, loss='deviance', max_depth=3,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=6,
                           min_weight_fraction_leaf=0.0, n_estimators=1500,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=0.95, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
best_score:	 0.7048
best_params:	 {'subsample': 0.95}
best_index:	 5
scorer:	 <function _passthrough_scorer at 0x1a18dd0710>
cv_results:	 {'mean_fit_time': array([2.67482667, 2.61542406, 2.81198616, 2.84666705, 2.91049404,
       2.77735562, 2.26036119]), 'std_fit_time': array([0.06290559, 0.09579523, 

In [36]:
gradient_boosting_classifier_roc = roc_auc_score(
   Yva, clf.predict_proba(Xva)[:,1])
print("AUC: ", gradient_boosting_classifier_roc)

print("training error:", 1 - clf.score(Xtr, Ytr))
print("validation error:", 1 - clf.score(Xva, Yva))

AUC:  0.7877566421886792
training error: 0.27351562500000004
validation error: 0.28574999999999995


### Final GB model after gridsearch

In [37]:
gb_grid = GradientBoostingClassifier(loss="deviance", learning_rate=0.2, 
                                     n_estimators=1500, max_depth=3,
                                     min_samples_split=6,min_samples_leaf=1,
                                     max_features='sqrt', subsample=0.95)

print('training start')
starting_time = time.time()
gb_grid.fit(Xtr, Ytr)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

gradient_boosting_classifier_roc = roc_auc_score(
   Yva, gb_grid.predict_proba(Xva)[:,1])
print("AUC: ", gradient_boosting_classifier_roc)

print("training acc:", gb_grid.score(Xtr, Ytr))
print("validation acc:", gb_grid.score(Xva, Yva))

training start
training finished, took 19.872843742370605 seconds
AUC:  0.8334768881307424
training acc: 0.792515625
validation acc: 0.76075


In [38]:
roc = roc_auc_score(Y_test, gb_grid.predict_proba(X_test)[:,1])
print("ROC AUC score on Test data:\t", roc)

ROC AUC score on Test data:	 0.8338549983465341


## Bayesian Optimization

In [39]:
from skopt.space import Integer
from skopt.space import Real
from skopt.utils import use_named_args
from skopt import gp_minimize
from numpy import mean

import warnings
warnings.filterwarnings('ignore')

### 1. Tuning "n_estimators" and "learning_rate"

In [40]:
search_space1 = [Integer(1000, 2000, name='n_estimators'), Real(0.001, 1, name="learning_rate")]

clf1 = GradientBoostingClassifier(loss="deviance",max_features="sqrt")

@use_named_args(search_space1)
def evaluate_model1(**params):
    # something
    clf1.set_params(**params)
    # calculate 5-fold cross validation
    result = cross_val_score(clf1, Xtr[:5000], Ytr[:5000], cv=5, n_jobs=-1, scoring='accuracy')
    # calculate the mean of the scores
    estimate = mean(result)
    return 1.0 - estimate

result1 = gp_minimize(evaluate_model1, search_space1)
print('Best Accuracy: %.3f' % (1.0 - result1.fun))
print('Best Parameters: n_estimators=%d\t learning_rate=%f' % (result1.x[0], result1.x[1]))

Best Accuracy: 0.705
Best Parameters: n_estimators=1515	 learning_rate=0.214818


### 2. Tuning "subsample"

In [45]:
search_space2 = [Real(0.1, 1, name="subsample")]

clf2 = GradientBoostingClassifier(n_estimators=1515, learning_rate=0.214818,
                                  loss="deviance",max_features="sqrt")

@use_named_args(search_space2)
def evaluate_model2(**params):
    # something
    clf2.set_params(**params)
    # calculate 5-fold cross validation
    result = cross_val_score(clf2, Xtr[:5000], Ytr[:5000], cv=5, n_jobs=-1, scoring='accuracy')
    # calculate the mean of the scores
    estimate = mean(result)
    return 1.0 - estimate

result2 = gp_minimize(evaluate_model2, search_space2)
print('Best Accuracy: %.3f' % (1.0 - result2.fun))
print('Best Parameters: subsample=%f' % (result2.x[0]))

Best Accuracy: 0.707
Best Parameters: subsample=0.827280


### 3. Tuning "max_depth"

In [46]:
search_space3 = [Integer(1, 10, name="max_depth")]

clf3 = GradientBoostingClassifier(n_estimators=1515, learning_rate=0.214818,
                                  subsample=0.827280,
                                  loss="deviance",max_features="sqrt")

@use_named_args(search_space3)
def evaluate_model3(**params):
    # something
    clf3.set_params(**params)
    # calculate 5-fold cross validation
    result = cross_val_score(clf3, Xtr[:5000], Ytr[:5000], cv=5, n_jobs=-1, scoring='accuracy')
    # calculate the mean of the scores
    estimate = mean(result)
    return 1.0 - estimate

result3 = gp_minimize(evaluate_model3, search_space3)
print('Best Accuracy: %.3f' % (1.0 - result3.fun))
print('Best Parameters: max_depth=%d' % (result3.x[0]))

Best Accuracy: 0.705
Best Parameters: max_depth=1


In [53]:
search_space3 = [Integer(3, 10, name="max_depth")]

clf3 = GradientBoostingClassifier(n_estimators=1515, learning_rate=0.214818,
                                  subsample=0.827280,
                                  loss="deviance",max_features="sqrt")

@use_named_args(search_space3)
def evaluate_model3(**params):
    # something
    clf3.set_params(**params)
    # calculate 5-fold cross validation
    result = cross_val_score(clf3, Xtr[:5000], Ytr[:5000], cv=5, n_jobs=-1, scoring='accuracy')
    # calculate the mean of the scores
    estimate = mean(result)
    return 1.0 - estimate

result3 = gp_minimize(evaluate_model3, search_space3)
print('Best Accuracy: %.3f' % (1.0 - result3.fun))
print('Best Parameters: max_depth=%d' % (result3.x[0]))

Best Accuracy: 0.707
Best Parameters: max_depth=3


### 4. Tuning "min_samples_split" and "min_samples_leaf"

In [47]:
search_space4 = [Integer(2, 60, name="min_samples_split"), Integer(1, 10, name="min_samples_leaf")]

clf4 = GradientBoostingClassifier(n_estimators=1515, learning_rate=0.214818,
                                  subsample=0.827280, max_depth=1,
                                  loss="deviance",max_features="sqrt")

@use_named_args(search_space4)
def evaluate_model4(**params):
    # something
    clf4.set_params(**params)
    # calculate 5-fold cross validation
    result = cross_val_score(clf4, Xtr[:5000], Ytr[:5000], cv=5, n_jobs=-1, scoring='accuracy')
    # calculate the mean of the scores
    estimate = mean(result)
    return 1.0 - estimate

result4 = gp_minimize(evaluate_model4, search_space4)
print('Best Accuracy: %.3f' % (1.0 - result4.fun))
print('Best Parameters: min_samples_split=%d\t min_samples_leaf=%d' % (result4.x[0], result4.x[1]))

Best Accuracy: 0.707
Best Parameters: min_samples_split=2	 min_samples_leaf=1


### Final GB model after Bayesian Optimization

In [54]:
gb_bayeOptimal = GradientBoostingClassifier(n_estimators=1515, learning_rate=0.214818,
                                  subsample=0.827280, max_depth=3, min_samples_split=2, 
                                  min_samples_leaf=1, loss="deviance",max_features="sqrt")

print('training start')
starting_time = time.time()
gb_bayeOptimal.fit(Xtr, Ytr)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

gradient_boosting_classifier_roc = roc_auc_score(
   Yva, gb_bayeOptimal.predict_proba(Xva)[:,1])
print("AUC: ", gradient_boosting_classifier_roc)

print("training acc:", gb_bayeOptimal.score(Xtr, Ytr))
print("validation acc:", gb_bayeOptimal.score(Xva, Yva))

training start
training finished, took 21.741851806640625 seconds
AUC:  0.8322982387908391
training acc: 0.795375
validation acc: 0.7588125


In [55]:
roc = roc_auc_score(Y_test, gb_bayeOptimal.predict_proba(X_test)[:,1])
print("ROC AUC score on Test data:\t", roc)

ROC AUC score on Test data:	 0.8331215162977716
