# 12 - Modelling, hyperparamater tuning, and test data.

It's been a ride! Let's tune our model and check how it does on the testing data, comparing it to our dummy model.

In [20]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler

from imblearn.pipeline import Pipeline, make_pipeline

from sklearn.metrics import f1_score

from warnings import simplefilter
simplefilter("ignore", category=UserWarning)

In [21]:
train = pd.read_csv('encoded_train.csv')
test = pd.read_csv('encoded_test.csv')

y_train = train['casualty_severity']
X_train = train.drop('casualty_severity', axis=1)

y_test = test['casualty_severity']
X_test = test.drop('casualty_severity', axis=1)

In [22]:
pipeline = make_pipeline(RandomUnderSampler(random_state=42),
                         XGBClassifier())

kf = StratifiedKFold(n_splits=5, shuffle=True)
mean_score = cross_val_score(pipeline, X_train, y_train, scoring='f1', cv=kf).mean()
print("The mean F1 score for 5-fold stratified cross-validation on our randomly undersampled dataset is {:.2f}".format(mean_score))


The mean F1 score for 5-fold stratified cross-validation on our randomly undersampled dataset is 0.56


In [23]:
X_under, y_under = RandomUnderSampler(random_state=42).fit_resample(X_train, y_train)

Okay, let's try and boost that number a little bit.

We're using Bayesian Optimisaiton for hyperparameter tuning.


In [116]:
from sklearn.model_selection import RandomizedSearchCV

In [167]:
param_space={'learning_rate': np.arange(0.05, 0.35, 0.025),
             'max_depth': np.arange(3, 15, 1),
             'gamma': [0.5, 1, 1.5, 2, 5, 20],
             'colsample_bytree' : np.arange(0, 1, 0.1),
             'min_child_weight' : np.arange(1, 15, 1),
             'n_estimators': [1000, 1500],
    }

In [168]:
xgb_param_space = {'xgbclassifier__' + key: param_space[key] for key in param_space}

grid_search_pipeline = make_pipeline(RandomUnderSampler(random_state=42),
                         XGBClassifier())

random_search = RandomizedSearchCV(grid_search_pipeline, param_distributions=xgb_param_space, n_iter=1000, scoring='f1', cv=kf, verbose=2, random_state=42)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
[CV] END xgbclassifier__colsample_bytree=0.5, xgbclassifier__gamma=0.5, xgbclassifier__learning_rate=0.10000000000000002, xgbclassifier__max_depth=14, xgbclassifier__min_child_weight=10, xgbclassifier__n_estimators=1000; total time=   0.4s
[CV] END xgbclassifier__colsample_bytree=0.5, xgbclassifier__gamma=0.5, xgbclassifier__learning_rate=0.10000000000000002, xgbclassifier__max_depth=14, xgbclassifier__min_child_weight=10, xgbclassifier__n_estimators=1000; total time=   0.4s
[CV] END xgbclassifier__colsample_bytree=0.5, xgbclassifier__gamma=0.5, xgbclassifier__learning_rate=0.10000000000000002, xgbclassifier__max_depth=14, xgbclassifier__min_child_weight=10, xgbclassifier__n_estimators=1000; total time=   0.4s
[CV] END xgbclassifier__colsample_bytree=0.5, xgbclassifier__gamma=0.5, xgbclassifier__learning_rate=0.10000000000000002, xgbclassifier__max_depth=14, xgbclassifier__min_child_weight=10, xgbclassifier__n_estimators=

In [171]:
print('\n Best F1 Score:')
print(np.round(random_search.best_score_, 2))


 Best F1 Score:
0.57


In [172]:
print(random_search.best_estimator_)

Pipeline(steps=[('randomundersampler', RandomUnderSampler(random_state=42)),
                ('xgbclassifier',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=0.7000000000000001, device=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None, gamma=0.5, grow_poli...ne,
                               importance_type=None,
                               interaction_constraints=None,
                               learning_rate=0.17500000000000004, max_bin=None,
                               max_cat_threshold=None, max_cat_to_onehot=None,
                               max_delta_step=None, max_depth=14,
                               max_leaves=None, min_child_weight=2, missing=nan,
                    

In [173]:
# tuned_model = XGBClassifier(params=best_hyperparams,
#                             num_boost_round = 10000,
#                             verbose=False,
#                             eval_metric="aucpr")
tuned_pipeline = make_pipeline(RandomUnderSampler(random_state=42),
                         random_search.best_estimator_)

mean_score = cross_val_score(tuned_pipeline, X_train, y_train, scoring='f1', cv=kf).mean()
print("The mean F1 score for 5-fold stratified cross-validation on our randomly undersampled dataset is {:.4f}".format(mean_score))

The mean F1 score for 5-fold stratified cross-validation on our randomly undersampled dataset is 0.5616
