In [20]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import cross_val_score, RandomizedSearchCV, KFold
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier

In [2]:
data = pd.read_csv('data_featureselection.csv')
random.seed(10)
X = data.iloc[:,1:]; y = data.iloc[:,0]
data.head()

Unnamed: 0,team_one_win,team_one_obj_damage,team_two_dragons,team_two_double_kills,team_one_dragons,team_one_first_inhibitor,team_one_assists,team_two_enemy_monsters_killed,team_two_cs,team_two_kills,...,team_two_barons,team_two_damage,team_one_jungle_avg_xp_diff,team_one_kills,team_one_bot_avg_xp_diff,team_two_obj_damage,team_one_enemy_monsters_killed,team_one_healing,team_one_first_tower,team_one_barons
0,0,0.198563,4,4,2,0,0.588652,0.297521,0.460517,0.740741,...,2,0.459944,3.441667,0.597826,-103.466667,0.582378,0.126126,0.384546,1,0
1,0,0.399344,1,2,4,1,0.099291,0.380165,0.518819,0.37037,...,1,0.358009,-77.2,0.184783,-113.4,0.35175,0.027027,0.198796,1,0
2,1,0.459898,0,1,3,1,0.312057,0.0,0.342435,0.185185,...,0,0.191652,3.0,0.369565,104.4,0.063946,0.081081,0.169561,1,0
3,1,0.129875,0,1,2,1,0.248227,0.033058,0.219188,0.197531,...,0,0.116541,0.0,0.369565,163.2,0.079341,0.036036,0.134021,0,0
4,0,0.214706,3,2,1,0,0.198582,0.198347,0.464945,0.259259,...,1,0.343545,-63.366667,0.152174,-124.266667,0.609278,0.081081,0.26182,0,0


In [3]:
# XGB with default setting

xgb = XGBClassifier(nthread = -1)

[0.9729619  0.97254098 0.97090164 0.97377049 0.96885246]


In [4]:
cross_score = cross_val_score(xgb, X, y, cv=5, n_jobs = -1)
print(cross_score)
print(sum(cross_score)/5)

[0.9729619  0.97254098 0.97090164 0.97377049 0.96885246]
0.9718054949261591


In [14]:
xgb = XGBClassifier(objective = 'binary:logistic', nthread = -1)
params = {'n_estimators': [200,500,800,1600],
              'learning_rate': [0.01,0.1,0.3,0.6],
              'subsample': [0.3,0.6,0.9],
              'max_depth': [3, 4, 5, 6, 7, 8, 9],
              'colsample_bytree': [0.5,0.7,0.9],
              'min_child_weight': [1, 2, 3, 4]
             }
kfold_5 = KFold(shuffle = True, n_splits = 5)
clf = RandomizedSearchCV(xgb ,param_distributions = params, cv=kfold_5, scoring="accuracy", n_jobs= 10, verbose = 1)

In [15]:
clf.fit(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=...
                                           reg_lambda=None,
                                           scale_pos_weight=None,
                                           subsample=None, tree_method=None,
                                           validate_parameters=N

In [16]:
clf.cv_results_

{'mean_fit_time': array([139.37292099,  40.17091885,  10.88570628,  84.69857383,
         67.3568419 ,  38.19006243,  14.79657845,  13.34882207,
         80.61134429,  93.23330479]),
 'std_fit_time': array([0.99216997, 0.81145401, 0.35412435, 1.68833403, 0.70071923,
        0.22186548, 0.15845392, 0.13066745, 0.48258729, 1.11315078]),
 'mean_score_time': array([1.01869822, 0.15808344, 0.09824281, 0.38229003, 0.31276369,
        0.13673573, 0.0508646 , 0.10890894, 0.32753644, 0.1847074 ]),
 'std_score_time': array([0.09031883, 0.02598954, 0.02176731, 0.02097512, 0.03022133,
        0.03143912, 0.02760267, 0.02136582, 0.0631424 , 0.02721315]),
 'param_subsample': masked_array(data=[0.3, 0.6, 0.3, 0.6, 0.6, 0.9, 0.3, 0.3, 0.9, 0.9],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[1600, 500, 200, 800, 800, 500, 200, 200, 1600, 800],
  

In [17]:
clf.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=3,
              min_child_weight=2, missing=nan, monotone_constraints='()',
              n_estimators=1600, n_jobs=-1, nthread=-1, num_parallel_tree=1,
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.9, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [19]:
xgb_grid = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=3,
              min_child_weight=2,monotone_constraints='()',
              n_estimators=1600, n_jobs=-1, nthread=-1, num_parallel_tree=1,
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.9, tree_method='exact', validate_parameters=1,
              verbosity=None)
cross_score = cross_val_score(xgb_grid, X, y, cv=5, n_jobs = -1)
print(cross_score)
print(sum(cross_score)/5)

[0.97501024 0.97172131 0.97008197 0.97459016 0.96885246]
0.9720512286687129


cross validation score: 0.9720512286687129

learning_rate: step size shrinkage used to prevent overfitting. Range is [0,1]
max_depth: determines how deeply each tree is allowed to grow during any boosting round.
subsample: percentage of samples used per tree. Low value can lead to underfitting.
colsample_bytree: percentage of features used per tree. High value can lead to overfitting.
n_estimators: number of trees you want to build.
objective: determines the loss function to be used like reg:linear for regression problems, reg:logistic for classification problems with only decision, binary:logistic for classification problems with probability.
XGBoost also supports regularization parameters to penalize models as they become more complex and reduce them to simple (parsimonious) models.

gamma: controls whether a given node will split based on the expected reduction in loss after the split. A higher value leads to fewer splits. Supported only for tree-based learners.
alpha: L1 regularization on leaf weights. A large value leads to more regularization.
lambda: L2 regularization on leaf weights and is smoother than L1 regularization.

https://xgboost.readthedocs.io/en/latest/parameter.html

# add bagging

In [None]:
bagging_XGB = BaggingClassifier(base_estimator=xgb_grid,
                                n_estimators=1, random_state=10)
cross_score = cross_val_score(bagging_XGB, X, y, cv=5, n_jobs = -1)
print(cross_score)
print(sum(cross_score)/5)

In [None]:
xgb_grid = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=3,
              min_child_weight=2,monotone_constraints='()',
              n_estimators=1600, n_jobs=-1, nthread=-1, num_parallel_tree=1,
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.9,  tree_method='exact', validate_parameters=1,
              verbosity=None)
bagging_XGB = BaggingClassifier(base_estimator=xgb_grid,
                                n_estimators=1, random_state=10)
cross_score = cross_val_score(bagging_XGB, X, y, cv=5, n_jobs = -1)
print(cross_score)
print(sum(cross_score)/5)                         