In [24]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV

### Загружаем данные из pickle-файлов

In [25]:
import pickle

In [26]:
with open('X_train.pickle', 'rb') as f:
    X_train = pickle.load(f)
with open('y_train.pickle', 'rb') as f:
    y_train = pickle.load(f)

In [27]:
with open('X_holdout.pickle', 'rb') as f:
    X_holdout = pickle.load(f)
with open('y_holdout.pickle', 'rb') as f:
    y_holdout = pickle.load(f)

### XGBoost

In [28]:
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train);

### Качество моделей (оценка на основе отложенной выборки)

In [29]:
pred_holdout_xgbc = xgbc.predict(X_holdout)
XGB_accuracy = accuracy_score(y_holdout, pred_holdout_xgbc)
print("Accuracy: %.2f%%" % (XGB_accuracy * 100.0))

Accuracy: 50.00%


### Кросс-валидация

In [30]:
#StratifiedKfold returns stratified folds, 
#i.e while making the folds it maintains the percentage of samples for each class in every fold. 

skf = StratifiedKFold(n_splits = 7, random_state=1, shuffle = True)

In [31]:
xgb_cvs = cross_val_score(xgbc, X_train, y_train, scoring='accuracy', cv = skf)
xgb_cvs

array([0.5, 0.5, 0.5, 0.5, 1. , 0.5, 1. ])

In [32]:
print("Mean accuracy: %.2f%%" % (xgb_cvs.mean()* 100.0))

Mean accuracy: 64.29%


In [33]:
clf = GridSearchCV(xgbc,{'max_depth': [3,5,7],
                    'n_estimators': [50,100,200],
                        'min_child_weight': [1,3,5]}, 
                    verbose=1, 
                    scoring='accuracy')

clf.fit(X_train, y_train)
print(clf.best_score_)
print(clf.best_params_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6428571428571429
{'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 50}


[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:    0.9s finished


In [34]:
cv_params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5]}

In [35]:
ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic'}

In [36]:
optimized_GBM = GridSearchCV(XGBClassifier(**ind_params), cv_params, scoring = 'accuracy', cv = 5, n_jobs = -1) 

In [37]:
optimized_GBM.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bytree=0.8,
                                     gamma=0, learning_rate=0.1,
                                     max_delta_step=0, max_depth=3,
                                     min_child_weight=1, missing=None,
                                     n_estimators=1000, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=0, silent=True,
                                     subsample=0.8),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [38]:
print(optimized_GBM.best_params_)
print(optimized_GBM.best_score_)

{'max_depth': 3, 'min_child_weight': 1}
0.6428571428571429


In [39]:
cv_params = {'learning_rate': [0.1, 0.01], 'subsample': [0.7,0.8,0.9]}
ind_params = {'n_estimators': 1000, 'seed':0, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic', 'max_depth': 3, 'min_child_weight': 1}


optimized_GBM = GridSearchCV(XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'accuracy', cv = 5, n_jobs = -1)
optimized_GBM.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bytree=0.8,
                                     gamma=0, learning_rate=0.1,
                                     max_delta_step=0, max_depth=3,
                                     min_child_weight=1, missing=None,
                                     n_estimators=1000, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=0, silent=True,
                                     subsample=1),
             iid='warn', n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.01],
                         'subsample': [0.7, 0.8, 0.9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             sc

In [40]:
print(optimized_GBM.best_params_)
print(optimized_GBM.best_score_)

{'learning_rate': 0.01, 'subsample': 0.7}
0.7857142857142857


In [41]:
xgbc_opt = XGBClassifier(max_depth=3, min_child_weight=1, learning_rate=0.1, subsample=0.7)
xgbc_opt.fit(X_train, y_train);

In [42]:
pred_holdout_xgbc_opt = xgbc_opt.predict(X_holdout)
accuracy_score(y_holdout, pred_holdout_xgbc)

0.5

In [43]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from time import time
from scipy.stats import randint as sp_randint

#### Step 1: Tune max_depth and min_child_weight

In [44]:
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.5f} (std: {1:.5f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

    return

In [45]:
fixed_params_rd1 = {
    'n_estimators': 500
}

# specify parameters and distributions to sample from
cv_params_rd1 = {"max_depth": sp_randint(3,11),
                 "min_child_weight": sp_randint(1,6)}

# randomized search
n_iter_search = 25
optimized_XGB = RandomizedSearchCV(XGBClassifier(**fixed_params_rd1),
                               cv_params_rd1,
                               scoring = 'neg_mean_squared_error',
                               cv = 5,
                               n_iter = n_iter_search,
                               n_jobs = 4,
                               verbose = 1)


start = time()

optimized_XGB.fit(X_train, y_train)

print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(optimized_XGB.cv_results_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


RandomizedSearchCV took 2.16 seconds for 25 candidates parameter settings.
Model with rank: 1
Mean validation score: -0.35714 (std: 0.29451)
Parameters: {'max_depth': 9, 'min_child_weight': 1}

Model with rank: 1
Mean validation score: -0.35714 (std: 0.29451)
Parameters: {'max_depth': 4, 'min_child_weight': 1}

Model with rank: 1
Mean validation score: -0.35714 (std: 0.29451)
Parameters: {'max_depth': 8, 'min_child_weight': 1}



[Parallel(n_jobs=4)]: Done 125 out of 125 | elapsed:    2.1s finished


#### Tune gamma

In [46]:
from scipy.stats import uniform as sp_uniform
fixed_params_rd2 = {
    'n_estimators': 500,
    "max_depth": 3,
    "min_child_weight": 6
}

# specify parameters and distributions to sample from
rvs = sp_uniform(0.,0.3)
cv_params_rd2 = {"gamma": rvs}

# randomized search
n_iter_search = 15
optimized_XGB = RandomizedSearchCV(XGBClassifier(**fixed_params_rd2),
                               cv_params_rd2,
                               scoring = 'neg_mean_squared_error',
                               cv = 5,
                               n_iter = n_iter_search,
                               n_jobs = 4,
                               verbose = 1)


start = time()

optimized_XGB.fit(X_train, y_train)

print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 15 candidates, totalling 75 fits
RandomizedSearchCV took 1.31 seconds for 15 candidates parameter settings.


[Parallel(n_jobs=4)]: Done  68 out of  75 | elapsed:    1.2s remaining:    0.1s
[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed:    1.3s finished


In [47]:
report(optimized_XGB.cv_results_)

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'gamma': 0.18302819102885612}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'gamma': 0.065988925998143}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'gamma': 0.08517109842350347}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'gamma': 0.28348151280054773}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'gamma': 0.23651376770194227}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'gamma': 0.24790147468878368}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'gamma': 0.2719915568755403}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'gamma': 0.19194177084499797}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'gamma': 0.28741959822475344}

Model with ra

#### Tune subsample and colsample_bytree

In [48]:
fixed_params_rd3 = {
    'n_estimators': 500,
    "max_depth": 3,
    "min_child_weight": 6, 
    'gamma': 0.122
}

# specify parameters and distributions to sample from
rvs_1 = sp_uniform(0.6,0.4)
rvs_2 = sp_uniform(0.6,0.4)
cv_params_rd3 = {"subsample": rvs_1,
                 "colsample_bytree": rvs_2}

# randomized search
n_iter_search = 50
optimized_XGB = RandomizedSearchCV(XGBClassifier(**fixed_params_rd3),
                               cv_params_rd3,
                               scoring = 'neg_mean_squared_error',
                               cv = 5,
                               n_iter = n_iter_search,
                               n_jobs = 4,
                               verbose = 1)


start = time()

optimized_XGB.fit(X_train, y_train)

print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(optimized_XGB.cv_results_)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Done 144 tasks      | elapsed:    2.6s


RandomizedSearchCV took 4.53 seconds for 50 candidates parameter settings.
Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'colsample_bytree': 0.76941352173128, 'subsample': 0.6987974514420823}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'colsample_bytree': 0.6438433680224384, 'subsample': 0.969339862578239}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'colsample_bytree': 0.7003606657587043, 'subsample': 0.8143265063201174}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'colsample_bytree': 0.9082345379482861, 'subsample': 0.8483203478531486}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'colsample_bytree': 0.9150832574022808, 'subsample': 0.6458437491897856}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'colsample_bytree': 0.9362958725003898, 'subsample': 0.8575911966440829}

Model with

[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:    4.5s finished


#### Tune regularization parameters

In [49]:
fixed_params_rd4 = {
    'n_estimators': 500,
    'subsample': 0.917,
    'colsample_bytree': 0.631,
    "max_depth": 3,
    "min_child_weight": 6,
    'gamma': 0.122
}

# specify parameters to search across
cv_params_rd4 = {'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05],
                 'reg_lambda': [0.6, 0.7, 0.8, 0.9, 1.]}

# grid search
optimized_XGB = GridSearchCV(XGBClassifier(**fixed_params_rd4),
                         cv_params_rd4,
                         scoring = 'neg_mean_squared_error',
                         cv = 5,
                         n_jobs = 4,
                         verbose = 1)


start = time()

optimized_XGB.fit(X_train, y_train)

print("GridSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), 25)) #25 = number of reg_alpha * number of reg_lambda entries

report(optimized_XGB.cv_results_)


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 25 candidates, totalling 125 fits
GridSearchCV took 1.99 seconds for 25 candidates parameter settings.
Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'reg_alpha': 0, 'reg_lambda': 0.6}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'reg_alpha': 0, 'reg_lambda': 0.7}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'reg_alpha': 0, 'reg_lambda': 0.8}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'reg_alpha': 0, 'reg_lambda': 0.9}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'reg_alpha': 0, 'reg_lambda': 1.0}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'reg_alpha': 0.001, 'reg_lambda': 0.6}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'reg_alpha': 0.001, 'reg_lambda': 0.7}

Model with rank: 1
Mean validation score: -0.50000 (std:

[Parallel(n_jobs=4)]: Done 125 out of 125 | elapsed:    1.9s finished


#### Tune learning_rate

In [50]:
fixed_params_rd5 = {
    'n_estimators': 500,
    'subsample': 0.917,
    'colsample_bytree': 0.631,
    "max_depth": 3,
    "min_child_weight": 6,
    'gamma': 0.122
}

# specify parameters to search across
rvs = sp_uniform(.00001, 0.4)
cv_params_rd5 = {'learning_rate': rvs}

# randomized search
n_iter_search = 20
optimized_XGB = RandomizedSearchCV(XGBClassifier(**fixed_params_rd5),
                               cv_params_rd5,
                               scoring = 'neg_mean_squared_error',
                               cv = 5,
                               n_iter = n_iter_search,
                               n_jobs = 4,
                               verbose = 1)


start = time()

optimized_XGB.fit(X_train, y_train)

print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

report(optimized_XGB.cv_results_)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits
RandomizedSearchCV took 1.68 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'learning_rate': 0.09725826721288826}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'learning_rate': 0.2973260022041909}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'learning_rate': 0.03554907940731689}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'learning_rate': 0.023924336789571735}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'learning_rate': 0.05110347204369988}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'learning_rate': 0.020605940329857676}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'learning_rate': 0.09218974771369608}

Model with rank: 1
Mean validation s

[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.6s finished


####  Tune n_estimators

In [51]:
fixed_params_rd6 = {
    'learning_rate': 0.0597,
    'subsample': 0.917,
    'colsample_bytree': 0.631,
    "max_depth": 3,
    "min_child_weight": 6,
    'gamma': 0.122
}

# specify parameters to search across
cv_params_rd6 = {'n_estimators':[10, 100, 500, 1000, 5000]}

# grid search
optimized_XGB = GridSearchCV(XGBClassifier(**fixed_params_rd6),
                         cv_params_rd6,
                         scoring = 'neg_mean_squared_error',
                         cv = 5,
                         n_jobs = 4,
                         verbose = 1)


start = time()

optimized_XGB.fit(X_train, y_train)

print("GridSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), 5)) 

report(optimized_XGB.cv_results_)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 5 candidates, totalling 25 fits
GridSearchCV took 1.32 seconds for 5 candidates parameter settings.
Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'n_estimators': 10}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'n_estimators': 100}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'n_estimators': 500}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'n_estimators': 1000}

Model with rank: 1
Mean validation score: -0.50000 (std: 0.00000)
Parameters: {'n_estimators': 5000}



[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    1.3s finished
