In [3]:
# Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV   #Perforing grid search.

import matplotlib.pylab as plt
%matplotlib inline

from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

In [4]:
# Name of the label column inside the training table
target = 'Class'
# Read the training set; header=0 takes the column names from the first CSV row
train_df = pd.read_csv('./LargeTrain.csv', header=0)

In [5]:
# Feature columns: every column of the training frame other than the label
predictors = [col for col in train_df.columns if col != target]

# Baseline classifier; the grid searches in the following cells start from
# these settings and refine them a few parameters at a time.
xgb_model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=3,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    # NOTE: the estimator repr printed after fitting reports 'multi:softprob'
    objective="multi:softmax",
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

In [6]:
# Tune max_depth and min_child_weight first, because these two parameters
# have a large impact on the result.
param_test1 = {
    'max_depth': list(range(3, 11, 2)),         # 3, 5, 7, 9
    'min_child_weight': list(range(3, 11, 2)),  # 3, 5, 7, 9
}

# `iid` is deliberately not passed: the argument was removed from GridSearchCV
# in scikit-learn 0.24, and releases from 0.22 on always average the fold
# scores without weighting, which is what iid=False requested.
# NOTE: n_jobs=4 here combines with nthread=4 on the estimator, so up to 16
# threads may be active at once; lower one of them if the machine is overloaded.
gsearch1 = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_test1,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
)

gsearch1.fit(train_df[predictors], train_df[target])

# Show (mean, std, params) for every candidate, then the best parameters and
# the best score. `cv_results_` is used because the older `grid_scores_`
# attribute was removed in scikit-learn 0.20.
cv_results1 = gsearch1.cv_results_
(
    list(zip(cv_results1['mean_test_score'],
             cv_results1['std_test_score'],
             cv_results1['params'])),
    gsearch1.best_params_,
    gsearch1.best_score_,
)



([mean: 0.99678, std: 0.00096, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.99650, std: 0.00111, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.99650, std: 0.00138, params: {'max_depth': 3, 'min_child_weight': 7},
  mean: 0.99604, std: 0.00125, params: {'max_depth': 3, 'min_child_weight': 9},
  mean: 0.99669, std: 0.00089, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.99632, std: 0.00130, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.99623, std: 0.00134, params: {'max_depth': 5, 'min_child_weight': 7},
  mean: 0.99632, std: 0.00151, params: {'max_depth': 5, 'min_child_weight': 9},
  mean: 0.99678, std: 0.00071, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.99696, std: 0.00099, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: 0.99660, std: 0.00129, params: {'max_depth': 7, 'min_child_weight': 7},
  mean: 0.99632, std: 0.00151, params: {'max_depth': 7, 'min_child_weight': 9},
  mean: 0.99660, std: 0.00080, params: {

In [7]:
# Update the xgb_model: keep the best estimator found by the max_depth /
# min_child_weight search so the next grid search starts from it.
xgb_model = gsearch1.best_estimator_

In [8]:
# Display the estimator's repr to check which hyperparameters are now set
xgb_model

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=7,
       min_child_weight=5, missing=None, n_estimators=140, nthread=4,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=27, silent=True, subsample=0.8)

In [9]:
# Tune gamma over 0.0, 0.1, 0.2, 0.3, 0.4 with all other parameters held at
# the values currently stored in xgb_model.
param_test2 = {
    'gamma': [i / 10.0 for i in range(0, 5)],
}

# `iid` is deliberately not passed: the argument was removed from GridSearchCV
# in scikit-learn 0.24, and releases from 0.22 on always average the fold
# scores without weighting, which is what iid=False requested.
gsearch2 = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_test2,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
)

gsearch2.fit(train_df[predictors], train_df[target])

# Show (mean, std, params) for every candidate, then the best parameters and
# the best score. `cv_results_` is used because the older `grid_scores_`
# attribute was removed in scikit-learn 0.20.
cv_results2 = gsearch2.cv_results_
(
    list(zip(cv_results2['mean_test_score'],
             cv_results2['std_test_score'],
             cv_results2['params'])),
    gsearch2.best_params_,
    gsearch2.best_score_,
)



([mean: 0.99696, std: 0.00099, params: {'gamma': 0.0},
  mean: 0.99669, std: 0.00118, params: {'gamma': 0.1},
  mean: 0.99650, std: 0.00090, params: {'gamma': 0.2},
  mean: 0.99669, std: 0.00102, params: {'gamma': 0.3},
  mean: 0.99641, std: 0.00114, params: {'gamma': 0.4}],
 {'gamma': 0.0},
 0.99696428540599558)

In [10]:
# Update the xgb_model: keep the best estimator found by the gamma search so
# the next grid search starts from it, then display it to check its settings.
xgb_model = gsearch2.best_estimator_
xgb_model

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0.0, learning_rate=0.1, max_delta_step=0, max_depth=7,
       min_child_weight=5, missing=None, n_estimators=140, nthread=4,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=27, silent=True, subsample=0.8)

In [14]:
# Tune subsample and colsample_bytree jointly, each over 0.6, 0.7, 0.8, 0.9,
# with all other parameters held at the values currently stored in xgb_model.
param_test3 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}

# `iid` is deliberately not passed: the argument was removed from GridSearchCV
# in scikit-learn 0.24, and releases from 0.22 on always average the fold
# scores without weighting, which is what iid=False requested.
gsearch3 = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_test3,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
)

gsearch3.fit(train_df[predictors], train_df[target])

# Show (mean, std, params) for every candidate, then the best parameters and
# the best score. `cv_results_` is used because the older `grid_scores_`
# attribute was removed in scikit-learn 0.20.
cv_results3 = gsearch3.cv_results_
(
    list(zip(cv_results3['mean_test_score'],
             cv_results3['std_test_score'],
             cv_results3['params'])),
    gsearch3.best_params_,
    gsearch3.best_score_,
)



([mean: 0.99632, std: 0.00113, params: {'subsample': 0.6, 'colsample_bytree': 0.6},
  mean: 0.99678, std: 0.00167, params: {'subsample': 0.7, 'colsample_bytree': 0.6},
  mean: 0.99669, std: 0.00128, params: {'subsample': 0.8, 'colsample_bytree': 0.6},
  mean: 0.99669, std: 0.00121, params: {'subsample': 0.9, 'colsample_bytree': 0.6},
  mean: 0.99632, std: 0.00112, params: {'subsample': 0.6, 'colsample_bytree': 0.7},
  mean: 0.99660, std: 0.00118, params: {'subsample': 0.7, 'colsample_bytree': 0.7},
  mean: 0.99678, std: 0.00082, params: {'subsample': 0.8, 'colsample_bytree': 0.7},
  mean: 0.99650, std: 0.00122, params: {'subsample': 0.9, 'colsample_bytree': 0.7},
  mean: 0.99604, std: 0.00138, params: {'subsample': 0.6, 'colsample_bytree': 0.8},
  mean: 0.99669, std: 0.00079, params: {'subsample': 0.7, 'colsample_bytree': 0.8},
  mean: 0.99696, std: 0.00099, params: {'subsample': 0.8, 'colsample_bytree': 0.8},
  mean: 0.99650, std: 0.00090, params: {'subsample': 0.9, 'colsample_bytree'

In [15]:
# Update the xgb_model with the winner of the subsample / colsample_bytree
# search. It must come from gsearch3: reading gsearch2 here would silently
# discard the result of this tuning stage and pass the gamma-stage model on.
xgb_model = gsearch3.best_estimator_
xgb_model

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0.0, learning_rate=0.1, max_delta_step=0, max_depth=7,
       min_child_weight=5, missing=None, n_estimators=140, nthread=4,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=27, silent=True, subsample=0.8)

In [17]:
# Tune reg_alpha over a coarse grid running from 1e-5 up to 100, with all
# other parameters held at the values currently stored in xgb_model.
param_test4 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}

# `iid` is deliberately not passed: the argument was removed from GridSearchCV
# in scikit-learn 0.24, and releases from 0.22 on always average the fold
# scores without weighting, which is what iid=False requested.
gsearch4 = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_test4,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
)

gsearch4.fit(train_df[predictors], train_df[target])

# Show (mean, std, params) for every candidate, then the best parameters and
# the best score. `cv_results_` is used because the older `grid_scores_`
# attribute was removed in scikit-learn 0.20.
cv_results4 = gsearch4.cv_results_
(
    list(zip(cv_results4['mean_test_score'],
             cv_results4['std_test_score'],
             cv_results4['params'])),
    gsearch4.best_params_,
    gsearch4.best_score_,
)



([mean: 0.99696, std: 0.00099, params: {'reg_alpha': 1e-05},
  mean: 0.99687, std: 0.00125, params: {'reg_alpha': 0.01},
  mean: 0.99669, std: 0.00089, params: {'reg_alpha': 0.1},
  mean: 0.99660, std: 0.00125, params: {'reg_alpha': 1},
  mean: 0.98749, std: 0.00262, params: {'reg_alpha': 100}],
 {'reg_alpha': 1e-05},
 0.99696428540599558)