# XGBoost gridsearch

This notebook handles arbitrary data input and executes a complete XGBoost parameter gridsearch. Finally, a dictionary with all optimal parameter and value pairs is returned.

In [1]:
import xgboost as xgb
from xgb_utils import gridsearch, update
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import make_classification

In [2]:
# generate toy dataset
x_train, y_train = make_classification(n_samples=1000, n_features=5, n_informative=2, 
                                       n_redundant=3, random_state=42, class_sep=.5)

# create train-test split
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.25)

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

## n_estimator

In [3]:
# Set initial parameters and find optimal number of boosting rounds

xgb_params = {
    'objective': 'multi:softmax',
    'eval_metric': 'merror',
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 2017,
    'silent': 1,
    'num_parallel_tree': 1,
    'num_class': 5
}

res = xgb.cv(xgb_params,
             dtrain,
             num_boost_round=750,
             nfold=4,
             seed=2017,
             stratified=False,
             early_stopping_rounds=15,
             verbose_eval=20,
             show_stdv=True,
             maximize=False)

[0]	train-merror:0.16533+0.0149866	test-merror:0.215241+0.00694639
[20]	train-merror:0.0877897+0.00912173	test-merror:0.185829+0.0276258


## Hyperparameter tuning

In [4]:
# formulate initial parameters
params = {
    'objective': 'binary:logistic',
    'num_class': 2,
    'scoring': 'f1',
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'scale_pos_weight': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 1337,
    'silent': 1,
    'num_parallel_tree': 1,
    'gamma': 0,
    'n_estimator': res.shape[0]
}

### max_depth and min_child_weight

In [6]:
# formulate the grid
tune_params = {
 'max_depth': list(range(3,10,2)),
 'min_child_weight': list(range(1,6,2))}
tmp = gridsearch(x_train, y_train, params, tune_params)

# narrow down grid
tune_params = {
 'max_depth': [tmp['max_depth'] + i for i in range(-1, 2)],
 'min_child_weight': [tmp['min_child_weight'] + i for i in range(-1, 2)]}
tmp = gridsearch(x_train, y_train, params, tune_params)

# update base parameters with optimal values
params = update(params, tmp)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:    0.3s finished


{'min_child_weight': 1, 'max_depth': 7}
Fitting 3 folds for each of 9 candidates, totalling 27 fits
{'min_child_weight': 0, 'max_depth': 7}


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    0.3s finished


### gamma

In [7]:
tune_params = {
 'gamma':[i/10.0 for i in list(range(0,5))]
}
tmp = gridsearch(x_train, y_train, params, tune_params)
params = update(params, tmp)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
{'gamma': 0.0}


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.2s finished


## Update n_estimator

In [8]:
pars = ['scale_pos_weight','gamma','colsample_bytree','max_depth'
        ,'subsample','num_parallel_tree','min_child_weight']
for par in pars:
    xgb_params[par] = params[par]

res = xgb.cv(xgb_params,
             dtrain,
             num_boost_round=750,
             nfold=4,
             seed=2017,
             stratified=False,
             early_stopping_rounds=15,
             verbose_eval=20,
             show_stdv=True,
             maximize=False)

[0]	train-merror:0.146168+0.0133394	test-merror:0.207219+0.010943
[20]	train-merror:0.0668453+0.00388487	test-merror:0.179145+0.0158185


### subsample and colsample_bytree 

In [9]:
tune_params = {
 'subsample': [i/10.0 for i in list(range(5,10))],
 'colsample_bytree': [i/10.0 for i in list(range(5,10))]
}
tmp = gridsearch(x_train, y_train, params, tune_params)
params = update(params, tmp)

tune_params = {
 'subsample': [i/100.0 for i in list(range(int(tmp['subsample'] * 100) - 15
                                           , int(tmp['subsample'] * 100) + 15, 5))],
 'colsample_bytree': [i/100.0 for i in list(range(int(tmp['subsample'] * 100) - 15
                                                  , int(tmp['subsample'] * 100) + 15, 5))]
}
tmp = gridsearch(x_train, y_train, params, tune_params)
params = update(params, tmp)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    0.7s finished


{'colsample_bytree': 0.6, 'subsample': 0.7}
Fitting 3 folds for each of 36 candidates, totalling 108 fits
{'colsample_bytree': 0.6, 'subsample': 0.7}


[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:    1.0s finished


### alpha

In [10]:
tune_params = {
 'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05]
}
tmp = gridsearch(x_train, y_train, params, tune_params)
params = update(params, tmp)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
{'reg_alpha': 0}


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.2s finished


## Results
This dictionary contains all optimal parameter values

In [11]:
params

{'colsample_bytree': 0.6,
 'gamma': 0.0,
 'learning_rate': 0.1,
 'max_depth': 7,
 'min_child_weight': 0,
 'n_estimator': 15,
 'num_class': 2,
 'num_parallel_tree': 1,
 'objective': 'binary:logistic',
 'reg_alpha': 0,
 'scale_pos_weight': 1,
 'scoring': 'f1',
 'seed': 1337,
 'silent': 1,
 'subsample': 0.7}