In [76]:
from xgboost import XGBRFClassifier as xgbclassifier

import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate

from matplotlib import pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [34]:
df = pd.read_csv('../data/titanic.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [35]:
'''
Normally, XGBoost only works with number, for this example
we just exclude text values for simplicity, but we can use encoding or mapping to 
convert text values into number values 
Name, Sex, Ticket, Cabin, Embarked
'''
df = df.loc[:, ~df.columns.isin(['Unnamed: 0', 'PassengerId', 'Name', 'Cabin', 'Ticket'])] #exclude Name, Cabin and Ticket 

'''
mapping categorical into numerical
'''
df.Sex = df.Sex.map({'male': 1, 'female': 0})
df.Embarked = df.Embarked.map({'S': 0, 'C': 1, 'Q': 1})

In [36]:
'''
train test split

Pay attention that we use X_train, X_validation, y_train, y_validation instead 
DMatrix format
'''
X = df.drop('Survived', axis='columns')
y = df.Survived
X.fillna(-999, inplace=True)
y.fillna(0, inplace=True)
X_train, X_validation, y_train, y_validation = train_test_split(X, y, 
                                                                train_size=0.75,
                                                                random_state=42)

## Cross validation using cross_val_score [scikit-learn]

Important: 

cross validation is not for returning the best model. It only serves as generalize for the model performance,
for example: if the cross_val_score are [0.7, 0.68, 0.71] you would expect if you trained your model, the perfomace would be around those numbers and wont get 0.2 in any circumstances.


- cross_val_score usually can take only 1 evaluation metric
- can't return the best estimators back

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html

In [74]:
'''
Important: cross_val_score cant take any non-number values, 
make sure all your values are in number first

Important: doing CV doesn't required a trained model, because
cv function will call it inside the function later
'''
params = {'base_score': 0.5, 
          'booster': 'gbtree',
          'max_depth': 3,
          'eval_metric': 'logloss'}
model_classifier = xgbclassifier(**params)

'''
set kfold
'''
kfold = StratifiedKFold(n_splits=3, shuffle=True)

'''
do the cross validation, if you want some more advance
you could make loop for the base model

for name, model in models:
    kfold = StratifiedKFold(n_splits=3, shuffle=True)
    result = cross_val_score(model, X_train, y_train, cv=kfold)
'''
results = cross_val_score(model_classifier, X_train, y_train, cv=kfold,
                          scoring='roc_auc')
print(f'ROC-AUC for cross validated model: {results.mean() * 100:.3f} with STD: {results.std() * 100:.3f}')

ROC-AUC for cross validated model: 78.029 with STD: 3.473


In [75]:
cv_validation =cross_val_predict(model_classifier, X_validation, y_validation,
                                 cv=kfold, method='predict_proba')
print(f'ROC-AUC score for cross validated model on validation dataset is {roc_auc_score(y_validation, cv_validation[:, 1])*100:.3f}')

ROC-AUC score for cross validated model on validation dataset is 77.507


***

## Cross validation using cross_validate[scikit-learn]

Important: Unlike cross_val_score, cross_validate can return the estimators for re-use, and can have multiple eval_metric.

- can have multiple metrics
- can return the estimator for re-use

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate

In [91]:
'''
Important: cross_validate cant take any non-number values, 
make sure all your values are in number first

Important: doing CV doesn't required a trained model, because
cv function will call it inside the function later
'''
params = {'base_score': 0.5, 
          'booster': 'gbtree',
          'max_depth': 3,
          'eval_metric': 'logloss'}
model_classifier = xgbclassifier(**params)

'''
set kfold
'''
kfold = StratifiedKFold(n_splits=3, shuffle=True)

'''
Important: we could return estimator using cross_validate
by doing this we can get the best estimators to re-use later

do the cross validation, if you want some more advance
you could make loop for the base model

for name, model in models:
    kfold = StratifiedKFold(n_splits=3, shuffle=True)
    result = cross_val_score(model, X_train, y_train, cv=kfold)
'''
cv_results = cross_validate(model_classifier, X_train, y_train, cv=kfold,
                           return_estimator=True, scoring='roc_auc')

print(f"Best ROC-AUC score is {np.max(cv_results['test_score']):.3f}, with return estimator number: {np.argmax(cv_results['test_score'])}")

Best ROC-AUC score is 0.798, with return estimator number: 2


In [92]:
'''
here are full return of cv_results
'''
cv_results

{'fit_time': array([0.09960628, 0.05326819, 0.15404534]),
 'score_time': array([0.00576663, 0.01828122, 0.01360035]),
 'estimator': [XGBRFClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                  colsample_bytree=1, enable_categorical=False,
                  eval_metric='logloss', gamma=0, gpu_id=-1, importance_type=None,
                  interaction_constraints='', max_delta_step=0, max_depth=3,
                  min_child_weight=1, missing=nan, monotone_constraints='()',
                  n_estimators=100, n_jobs=4, num_parallel_tree=100,
                  objective='binary:logistic', predictor='auto', random_state=0,
                  reg_alpha=0, scale_pos_weight=1, tree_method='exact',
                  validate_parameters=1, verbosity=None),
  XGBRFClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                  colsample_bytree=1, enable_categorical=False,
                  eval_metric='logloss', gamma=0, gpu_id=-1, importance_type=

In [97]:
'''
ROC-AUC score for best estimators
'''
cv_best_estimators = cv_results['estimator'][np.argmax(cv_results['test_score'])]
print(f"ROC-AUC using cv best estimator is {roc_auc_score(y_validation, cv_best_estimators.predict_proba(X_validation)[:, 1]):.3f}")

ROC-AUC using cv best estimator is 0.791


***