___
# 10 - Resampling

### Time to look into oversampling / undersampling techniques.

Resampling changes the distribution of the predictor class in our dataset. There are two main techniques: over- or under-sampling. Oversampling will randomly duplicate samples (increasing the number of observations of the underbalanced class), whilst undersampling will randomly delete samples (decreasing the number of observations in the overbalanced class).

In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score

from sklearn.model_selection import StratifiedGroupKFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

from imblearn.pipeline import make_pipeline

In [3]:
train = pd.read_csv('encoded_train.csv')
test = pd.read_csv('encoded_test.csv')

In [4]:
groups = train['accident_index'].copy().tolist()

y_train = train['casualty_severity']
X_train = train.drop(['casualty_severity', 'accident_index'], axis=1)

y_test = test['casualty_severity']
X_test = test.drop(['casualty_severity', 'accident_index'], axis=1)

In [5]:
def resample_dist(sampler_class, X_train=X_train, y_train=y_train):
  X_trans, y_trans = sampler_class.fit_resample(X_train, y_train)
  y_0 = y_trans.value_counts()[0] / len(y_trans)
  y_1 = y_trans.value_counts()[1] / len(y_trans)

  resampler_dict = {
    'Sampler type': sampler_class,
    'y=0': y_0,
    'y=1': y_1,
    'new_training_size': len(y_trans),
    'training_change': (len(y_trans) - len(y_train)) / len(y_train)
  }

  return resampler_dict

In [6]:
def evaluate_model(model_class, X_train=X_train, y_train=y_train):
    kf = StratifiedGroupKFold(n_splits=5, shuffle=True)
    # Stratified K fold cross validation and predict on training data
    accuracy_scores = cross_val_score(model_class, X_train, y_train.to_numpy().ravel(), cv=kf, scoring='accuracy', groups=groups) * 100
    precision_scores = cross_val_score(model_class, X_train, y_train.to_numpy().ravel(), cv=kf, scoring='precision', groups=groups) * 100
    recall_scores = cross_val_score(model_class, X_train, y_train.to_numpy().ravel(), cv=kf, scoring='recall', groups=groups) * 100
    f1_scores = cross_val_score(model_class, X_train, y_train.to_numpy().ravel(), cv=kf, scoring='f1', groups=groups) * 100
    

    metrics_dict = {
        'Model Type': model_class,
        'CV_mean_accuracy': np.round(accuracy_scores.mean(), 1), 
        'CV_mean_precision':np.round(precision_scores.mean(), 1), 
        'CV_mean_recall': np.round(recall_scores.mean(), 1), 
        'CV_mean_F1': np.round(f1_scores.mean(), 1)
        
       
    }   

    return metrics_dict

### Let's try random resampling first, starting with oversampling

In [7]:
random_overs_RF_pipeline = make_pipeline(RandomOverSampler(random_state=42), 
                              RandomForestClassifier())

random_overs_XGB_pipeline = make_pipeline(RandomOverSampler(random_state=42), 
                              XGBClassifier())

In [8]:
models = {'XGB_oversampled': random_overs_XGB_pipeline, 'RF_oversampled': random_overs_RF_pipeline}  
model_metric_dict = {}

for key, values in models.items():
  metrics_dict = evaluate_model(values)
  model_metric_dict.update({key: metrics_dict})

results = pd.DataFrame.from_dict(model_metric_dict).T.round(2)
results.sort_values(by='CV_mean_F1', ascending=False)

Unnamed: 0,Model Type,CV_mean_accuracy,CV_mean_precision,CV_mean_recall,CV_mean_F1
XGB_oversampled,"(RandomOverSampler(random_state=42), XGBClassi...",63.9,38.3,46.0,43.8
RF_oversampled,"(RandomOverSampler(random_state=42), RandomFor...",67.4,38.7,30.9,34.5


### Now, random undersampling

In [9]:
random_unders_RF_pipeline = make_pipeline(RandomUnderSampler(random_state=42), 
                              RandomForestClassifier())

random_unders_XGB_pipeline = make_pipeline(RandomUnderSampler(random_state=42), 
                              XGBClassifier())

models_under = {'XGB_undersampled': random_unders_XGB_pipeline, 'RF_undersampled': random_unders_RF_pipeline}
for key, values in models_under.items():
  metrics_dict = evaluate_model(values)
  model_metric_dict.update({key: metrics_dict})


In [10]:
results = pd.DataFrame.from_dict(model_metric_dict).T.round(2)
results.sort_values(by='CV_mean_F1', ascending=False)

Unnamed: 0,Model Type,CV_mean_accuracy,CV_mean_precision,CV_mean_recall,CV_mean_F1
XGB_undersampled,"(RandomUnderSampler(random_state=42), XGBClass...",61.6,37.5,56.3,44.7
XGB_oversampled,"(RandomOverSampler(random_state=42), XGBClassi...",63.9,38.3,46.0,43.8
RF_undersampled,"(RandomUnderSampler(random_state=42), RandomFo...",61.8,37.6,52.4,43.1
RF_oversampled,"(RandomOverSampler(random_state=42), RandomFor...",67.4,38.7,30.9,34.5


### Let's try something a little spicier: SMOTE (synthetic minority oversampling)

In [11]:
smote_XGB_pipeline = make_pipeline(SMOTE(random_state=42), XGBClassifier())

smote_RF_pipeline = make_pipeline(SMOTE(random_state=42), RandomForestClassifier())

models_smote = {'XGB_SMOTE': smote_XGB_pipeline, 'RF_SMOTE': smote_RF_pipeline}

for key, values in models_smote.items():
  metrics_dict = evaluate_model(values)
  model_metric_dict.update({key: metrics_dict})

results = pd.DataFrame.from_dict(model_metric_dict).T.round(2)
results.sort_values(by='CV_mean_F1', ascending=False)

Unnamed: 0,Model Type,CV_mean_accuracy,CV_mean_precision,CV_mean_recall,CV_mean_F1
XGB_undersampled,"(RandomUnderSampler(random_state=42), XGBClass...",61.6,37.5,56.3,44.7
XGB_oversampled,"(RandomOverSampler(random_state=42), XGBClassi...",63.9,38.3,46.0,43.8
RF_undersampled,"(RandomUnderSampler(random_state=42), RandomFo...",61.8,37.6,52.4,43.1
RF_oversampled,"(RandomOverSampler(random_state=42), RandomFor...",67.4,38.7,30.9,34.5
XGB_SMOTE,"(SMOTE(random_state=42), XGBClassifier(base_sc...",69.1,41.8,29.3,34.3
RF_SMOTE,"(SMOTE(random_state=42), RandomForestClassifie...",68.3,40.7,26.2,33.0


### Undersampling using Tomek links

In [12]:
tomek_XGB_pipeline = make_pipeline(TomekLinks(), XGBClassifier())

tomek_RF_pipeline = make_pipeline(TomekLinks(), RandomForestClassifier())

models_tomek = {'XGB_tomek': tomek_XGB_pipeline, 'RF_tomek': tomek_RF_pipeline}

for key, values in models_tomek.items():
  metrics_dict = evaluate_model(values)
  model_metric_dict.update({key: metrics_dict})

results = pd.DataFrame.from_dict(model_metric_dict).T.round(2)
results.sort_values(by='CV_mean_F1', ascending=False)

Unnamed: 0,Model Type,CV_mean_accuracy,CV_mean_precision,CV_mean_recall,CV_mean_F1
XGB_undersampled,"(RandomUnderSampler(random_state=42), XGBClass...",61.6,37.5,56.3,44.7
XGB_oversampled,"(RandomOverSampler(random_state=42), XGBClassi...",63.9,38.3,46.0,43.8
RF_undersampled,"(RandomUnderSampler(random_state=42), RandomFo...",61.8,37.6,52.4,43.1
XGB_tomek,"(TomekLinks(), XGBClassifier(base_score=None, ...",69.3,43.2,30.5,34.6
RF_oversampled,"(RandomOverSampler(random_state=42), RandomFor...",67.4,38.7,30.9,34.5
XGB_SMOTE,"(SMOTE(random_state=42), XGBClassifier(base_sc...",69.1,41.8,29.3,34.3
RF_SMOTE,"(SMOTE(random_state=42), RandomForestClassifie...",68.3,40.7,26.2,33.0
RF_tomek,"(TomekLinks(), RandomForestClassifier())",68.5,39.8,29.0,32.9


In [13]:
resamplers = {'Random Oversampler': RandomOverSampler(random_state=42), 'Random undersampler': RandomUnderSampler(random_state=42), 'SMOTE': SMOTE(random_state=42), 'Tomkek links': TomekLinks()}

resampler_metric_dict = {}

for key, values in resamplers.items():
  resampler_dict = resample_dist(values)
  resampler_metric_dict.update({key: resampler_dict})

results = pd.DataFrame.from_dict(resampler_metric_dict).T.round(2)
results

Unnamed: 0,Sampler type,y=0,y=1,new_training_size,training_change
Random Oversampler,RandomOverSampler(random_state=42),0.5,0.5,16312,0.438829
Random undersampler,RandomUnderSampler(random_state=42),0.5,0.5,6362,-0.438829
SMOTE,SMOTE(random_state=42),0.5,0.5,16312,0.438829
Tomkek links,TomekLinks(),0.706306,0.293694,10831,-0.044633


___

### With resampling, we've increased the F1 score of the XGBoost classifier from 31.5% to 44.7% - a 42% relative increase in our F1 accuracy. The cross-validated F1 score of the RF classifier has gone from 29.9 to 43.1 - a 44% relative increase in predictive power!

Now we can move onto hyperparameter turning our model on the resampled training data. After that we'll finally see how we do on the testing data!