In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import time

X = np.genfromtxt('../data/X.csv', delimiter=',')
Y = np.genfromtxt('../data/Y.csv', delimiter=',')

X_comp, X_test, Y_comp, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
Xtr, Xva, Ytr, Yva = train_test_split(X_comp, Y_comp, test_size=0.2)

In [2]:
from skopt.space import Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from numpy import mean

In [3]:
from skopt.space import Real

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
# 1st: Tuning "n_estimators" and "learning_rate"

search_space1 = [Integer(1000, 2000, name='n_estimators'), Real(0.001, 1, name="learning_rate")]

In [6]:
clf1 = GradientBoostingClassifier()

@use_named_args(search_space1)
def evaluate_model1(**params):
    # something
    clf1.set_params(**params)
    # calculate 5-fold cross validation
    result = cross_val_score(clf1, Xtr[:5000], Ytr[:5000], cv=5, n_jobs=-1, scoring='accuracy')
    # calculate the mean of the scores
    estimate = mean(result)
    return 1.0 - estimate

result1 = gp_minimize(evaluate_model1, search_space1)
print('Best Accuracy: %.3f' % (1.0 - result1.fun))
print('Best Parameters: n_estimators=%d\t learning_rate=%f' % (result1.x[0], result1.x[1]))

KeyboardInterrupt: 

In [7]:
# 2nd: Tuning "subsample"

search_space2 = [Real(0, 1, name="subsample")]

clf2 = GradientBoostingClassifier()

@use_named_args(search_space2)
def evaluate_model2(**params):
    # something
    clf2.set_params(**params)
    # calculate 5-fold cross validation
    result = cross_val_score(clf2, Xtr[:5000], Ytr[:5000], cv=5, n_jobs=-1, scoring='accuracy')
    # calculate the mean of the scores
    estimate = mean(result)
    return 1.0 - estimate

result2 = gp_minimize(evaluate_model2, search_space2)
print('Best Accuracy: %.3f' % (1.0 - result2.fun))
print('Best Parameters: subsample=%f' % (result2.x[0]))

KeyboardInterrupt: 

In [5]:
# 3rd: Tuning "max_depth"

search_space3 = [Integer(1, 10, name="max_depth")]

clf3 = GradientBoostingClassifier()

@use_named_args(search_space3)
def evaluate_model3(**params):
    # something
    clf3.set_params(**params)
    # calculate 5-fold cross validation
    result = cross_val_score(clf3, Xtr[:5000], Ytr[:5000], cv=5, n_jobs=-1, scoring='accuracy')
    # calculate the mean of the scores
    estimate = mean(result)
    return 1.0 - estimate

result3 = gp_minimize(evaluate_model3, search_space3)
print('Best Accuracy: %.3f' % (1.0 - result3.fun))
print('Best Parameters: max_depth=%d' % (result3.x[0]))

Best Accuracy: 0.667
Best Parameters: max_depth=4


In [None]:
# 4th: Tuning "min_samples_split" and "min_samples_leaf"

search_space4 = [Integer(1, 60, name="min_samples_split"), Integer(1, 10, name="min_samples_leaf")]

clf4 = GradientBoostingClassifier()

@use_named_args(search_space4)
def evaluate_model4(**params):
    # something
    clf4.set_params(**params)
    # calculate 5-fold cross validation
    result = cross_val_score(clf4, Xtr[:5000], Ytr[:5000], cv=5, n_jobs=-1, scoring='accuracy')
    # calculate the mean of the scores
    estimate = mean(result)
    return 1.0 - estimate

result4 = gp_minimize(evaluate_model4, search_space4)
print('Best Accuracy: %.3f' % (1.0 - result4.fun))
print('Best Parameters: min_samples_split=%d\t min_samples_leaf=%d' % (result4.x[0], result4.x[1]))

In [None]:
# final classifier

clf = GradientBoostingClassifier(max_depth=3,loss="deviance",learning_rate=0.1,
                                 n_estimators=1500,min_samples_split=20,min_samples_leaf=9,
                                 max_features="sqrt", subsample=0.5)

print('training start')
starting_time = time.time()
clf.fit(Xtr, Ytr)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

gradient_boosting_classifier_roc = roc_auc_score(
   Yva, clf.predict_proba(Xva)[:,1])
print(gradient_boosting_classifier_roc)

print("training acc:", clf.score(Xtr, Ytr))
print("validation acc:", clf.score(Xva, Yva))

In [9]:
search_space = [Real(0, 1, name="subsample")]

clf6 = GradientBoostingClassifier(max_depth=3,loss="deviance",learning_rate=0.1,
                                 n_estimators=1500,min_samples_split=20,min_samples_leaf=9,
                                 max_features="sqrt")

In [14]:
@use_named_args(search_space)
def evaluate_model(**params):
    # something
    clf6.set_params(**params)
    # calculate 5-fold cross validation
    result = cross_val_score(clf6, Xtr[:5000], Ytr[:5000], cv=5, n_jobs=-1, scoring='accuracy')
    # calculate the mean of the scores
    estimate = mean(result)
    return 1.0 - estimate

In [16]:
result = gp_minimize(evaluate_model, search_space)
print('Best Accuracy: %.3f' % (1.0 - result.fun))
print('Best Parameters: subsample=%f' % (result.x[0]))



Best Accuracy: 0.678
Best Parameters: subsample=0.637095


In [17]:
clf = GradientBoostingClassifier(max_depth=3,loss="deviance",learning_rate=0.1,
                                 n_estimators=1500,min_samples_split=20,min_samples_leaf=9,
                                 max_features="sqrt", subsample=0.637095)
print('training start')
starting_time = time.time()
clf.fit(Xtr, Ytr)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

gradient_boosting_classifier_roc = roc_auc_score(
   Yva, clf.predict_proba(Xva)[:,1])
print(gradient_boosting_classifier_roc)

print("training acc:", clf.score(Xtr, Ytr))
print("validation acc:", clf.score(Xva, Yva))

training start
training finished, took 68.53286004066467 seconds
0.7616694713965693
training acc: 0.7048427171721907
validation acc: 0.6950434402150134


In [18]:
gradient_boosting_classifier_roc = roc_auc_score(
   Y_test, clf.predict_proba(X_test)[:,1])
print(gradient_boosting_classifier_roc)

0.7602428367100331


In [11]:
import warnings
warnings.filterwarnings('ignore')

search_space2 = [Integer(1000, 2000, name='n_estimators'), Real(0.001, 1, name="learning_rate")]

In [13]:
clf1 = GradientBoostingClassifier(max_depth=3,loss="deviance",min_samples_split=20,
                                  min_samples_leaf=9,max_features="sqrt",subsample=0.637095)

@use_named_args(search_space2)
def evaluate_model2(**params):
    # something
    clf1.set_params(**params)
    # calculate 5-fold cross validation
    result = cross_val_score(clf1, Xtr[:10000], Ytr[:10000], cv=5, n_jobs=-1, scoring='accuracy')
    # calculate the mean of the scores
    estimate = mean(result)
    return 1.0 - estimate

result2 = gp_minimize(evaluate_model2, search_space2)
print('Best Accuracy: %.3f' % (1.0 - result2.fun))
print('Best Parameters: n_estimators=%d\t learning_rate=%f' % (result2.x[0], result2.x[1]))

Best Accuracy: 0.687
Best Parameters: n_estimators=1000	 learning_rate=0.160222


In [14]:
clf1 = GradientBoostingClassifier(max_depth=3,loss="deviance",min_samples_split=20,
                                  min_samples_leaf=9,max_features="sqrt",subsample=0.637095)

@use_named_args(search_space2)
def evaluate_model2(**params):
    # something
    clf1.set_params(**params)
    # calculate 5-fold cross validation
    result = cross_val_score(clf1, Xtr[:5000], Ytr[:5000], cv=5, n_jobs=-1, scoring='accuracy')
    # calculate the mean of the scores
    estimate = mean(result)
    return 1.0 - estimate

result2 = gp_minimize(evaluate_model2, search_space2)
print('Best Accuracy: %.3f' % (1.0 - result2.fun))
print('Best Parameters: n_estimators=%d\t learning_rate=%f' % (result2.x[0], result2.x[1]))

Best Accuracy: 0.685
Best Parameters: n_estimators=1000	 learning_rate=0.081586
