# Right way of using SMOTE with cross validation 
https://towardsdatascience.com/the-right-way-of-using-smote-with-cross-validation-92a8d09d00c7

In this notebook I follow all the steps of the article to check that I get similar results (or at least similar differences between the two approaches). 

Import statements. 

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification

## We start with what you shouldn't do

Load the dataset. 

In [3]:
# Read the merged dataset and discard the two 'Unnamed' columns
# (presumably index columns left over from earlier CSV exports; verify).
merged_df = (
    pd.read_csv('/Users/dionnespaltman/Desktop/V3/merged_df.csv', sep=',')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
)

# display(merged_df.head(5))

In [19]:
# Columns kept out of the feature matrix; 'VVR_group' is the prediction target.
columns_to_drop = ['ID', 'sum_12', 'sum_4567', 'sum_456', 'VVR_group', 'Condition']

y = merged_df['VVR_group']
X = merged_df.drop(columns=columns_to_drop)

Split into test and train. 

In [20]:
# Hold out 20% of the rows for testing; stratify on y so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=11
)

Then we apply SMOTE. 

In [21]:
# Oversample the complete training set up front, i.e. before any cross-validation
# folds are cut. This is the approach this section demonstrates as wrong.
smote = SMOTE(random_state=11)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Plain scikit-learn pipeline without a resampling step: scale, then classify.
pipeline = Pipeline(
    steps=[
        ('scaler', MinMaxScaler()),
        ('classifier', LogisticRegression(random_state=11, max_iter=1000)),
    ]
)

In [27]:
from collections import Counter

# Row/column counts of the training features before and after oversampling.
for features in (X_train, X_res):
    print(features.shape)

# Class counts before and after SMOTE (this cell only reports; SMOTE was applied earlier).
print('Original dataset shape %s' % Counter(y_train))
print('Smote dataset shape %s' % Counter(y_res))



(88, 121)
(134, 121)
Original dataset shape Counter({0: 67, 1: 21})
Smote dataset shape Counter({0: 67, 1: 67})


In [28]:
# Three shuffled, stratified folds with a fixed seed; passed as `cv` to the grid searches.
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)

We're looking at the roc_auc. To be extra clear: this is not how it should go! 

In [29]:
# Candidate values for the logistic regression's C (inverse regularisation strength).
param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=stratified_kfold,
    n_jobs=-1,
)

# X_res / y_res were oversampled before the folds are cut, so the validation
# folds can contain synthetic samples. That is the flaw being demonstrated here.
grid_search.fit(X_res, y_res)

cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

Cross-validation score: 0.9736495388669302
Test score: 0.8333333333333335


In [30]:
# Same search as for ROC-AUC, but model selection and reporting use accuracy.
param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=stratified_kfold,
    n_jobs=-1,
)

# Still fitted on the pre-oversampled data (X_res / y_res).
grid_search.fit(X_res, y_res)

cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

Cross-validation score: 0.9555555555555556
Test score: 0.782608695652174


In [31]:
# Accuracy again, this time fitted on the un-resampled training split.
param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=stratified_kfold,
    n_jobs=-1,
)

# X_train / y_train come straight from train_test_split, i.e. without SMOTE applied.
grid_search.fit(X_train, y_train)

cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

Cross-validation score: 0.8291187739463601
Test score: 0.782608695652174


In [32]:
# Recall on the un-resampled training split.
param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=stratified_kfold,
    n_jobs=-1,
)

# X_train / y_train come straight from train_test_split, i.e. without SMOTE applied.
grid_search.fit(X_train, y_train)

cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

Cross-validation score: 0.4761904761904762
Test score: 0.4


## Then what you should do 

Now we're moving on to the second part of the article. 

In [33]:
# Rebuild features and target from the full dataset for the second part.
# 'VVR_group' is the target; the listed columns are excluded from the features.
columns_to_drop = ['ID', 'sum_12', 'sum_4567', 'sum_456', 'VVR_group', 'Condition']

y = merged_df['VVR_group']
X = merged_df.drop(columns=columns_to_drop)

In [34]:
# Same stratified 80/20 split and seed as in the first part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=11
)

The ROC-AUC is very good.

In [35]:
# SMOTE is now the first step of an imbalanced-learn pipeline, so during
# cross-validation it is fitted on the training part of each fold only.
pipeline = imbpipeline(
    steps=[
        ('smote', SMOTE(random_state=11)),
        ('scaler', MinMaxScaler()),
        ('classifier', LogisticRegression(random_state=11, max_iter=1000)),
    ]
)

stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)

# Candidate values for the logistic regression's C (inverse regularisation strength).
param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=stratified_kfold,
    n_jobs=-1,
)

# Fit on the original training split; oversampling happens inside the pipeline.
grid_search.fit(X_train, y_train)

cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

Cross-validation score: 0.7853378505552419
Test score: 0.8666666666666667


Recall is bad (0.4 on test). 

In [36]:
# Same SMOTE -> scaler -> logistic regression pipeline, selected and scored on recall.
pipeline = imbpipeline(
    steps=[
        ('smote', SMOTE(random_state=11)),
        ('scaler', MinMaxScaler()),
        ('classifier', LogisticRegression(random_state=11, max_iter=1000)),
    ]
)

stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)

param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=stratified_kfold,
    n_jobs=-1,
)

# Fit on the original training split; oversampling happens inside the pipeline.
grid_search.fit(X_train, y_train)

cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

Cross-validation score: 0.4761904761904762
Test score: 0.4


Precision is also not very good (0.4 on test). 

In [48]:
# Same SMOTE -> scaler -> logistic regression pipeline, selected and scored on precision.
pipeline = imbpipeline(
    steps=[
        ('smote', SMOTE(random_state=11)),
        ('scaler', MinMaxScaler()),
        ('classifier', LogisticRegression(random_state=11, max_iter=1000)),
    ]
)

stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)

param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='precision',
    cv=stratified_kfold,
    n_jobs=-1,
)

# Fit on the original training split; oversampling happens inside the pipeline.
grid_search.fit(X_train, y_train)

cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

Cross-validation score: 0.75
Test score: 0.4


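F1 is not very good either (0.29 on test). Note that this cell and the next one use a random forest classifier instead of logistic regression. 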

In [46]:
from sklearn.ensemble import RandomForestClassifier

# SMOTE -> scaler -> random forest. The n_estimators=2 given here is only a
# placeholder: the grid below sets n_estimators for every candidate.
pipeline = imbpipeline(
    steps=[
        ('smote', SMOTE(random_state=11)),
        ('scaler', MinMaxScaler()),
        ('classifier', RandomForestClassifier(n_estimators=2, random_state=0)),
    ]
)

stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)

# 3 x 3 x 3 = 27 random-forest configurations.
param_grid = {
    'classifier__n_estimators': [10, 50, 100],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=stratified_kfold,
    n_jobs=-1,
)

# Fit on the original training split; oversampling happens inside the pipeline.
grid_search.fit(X_train, y_train)

cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

Cross-validation score: 0.4666666666666666
Test score: 0.2857142857142857


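Accuracy looks good at first sight (0.80 in cross-validation, 0.74 on test, using a random forest), but it is a weak metric here: about 76% of the training samples (67 of 88) belong to the majority class, so always predicting the majority class would already score in the same range. 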

In [45]:
from sklearn.ensemble import RandomForestClassifier

# SMOTE -> scaler -> random forest. The n_estimators=2 given here is only a
# placeholder: the grid below sets n_estimators for every candidate.
pipeline = imbpipeline(
    steps=[
        ('smote', SMOTE(random_state=11)),
        ('scaler', MinMaxScaler()),
        ('classifier', RandomForestClassifier(n_estimators=2, random_state=0)),
    ]
)

stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)

# 3 x 3 x 3 = 27 random-forest configurations.
param_grid = {
    'classifier__n_estimators': [10, 50, 100],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=stratified_kfold,
    n_jobs=-1,
)

# Fit on the original training split; oversampling happens inside the pipeline.
grid_search.fit(X_train, y_train)

cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

Cross-validation score: 0.7954022988505747
Test score: 0.7391304347826086


In [13]:
def model(X, y, smote=True, scoring='recall', n_splits=5, test_size=0.3,
          random_state=11):
    """Grid-search a scaled logistic regression with SMOTE inside or outside CV.

    The data is split into a stratified train/test set. Depending on `smote`,
    oversampling is either a step of the cross-validated pipeline or is applied
    once to the whole training set before cross-validation starts.

    Parameters
    ----------
    X, y : array-like
        Features and class labels; passed unchanged to `train_test_split`.
    smote : bool, default True
        If truthy, SMOTE is the first step of an imbalanced-learn pipeline, so
        it is applied inside each cross-validation fold. If falsy, the training
        split is oversampled once up front and a plain scikit-learn pipeline
        (without a resampling step) is cross-validated on the oversampled data.
    scoring : str, default 'recall'
        Metric passed to `GridSearchCV`; also used for the test score.
    n_splits : int, default 5
        Number of stratified, shuffled cross-validation folds.
    test_size : float, default 0.3
        Fraction of the data held out as the test set.
    random_state : int, default 11
        Seed shared by the split, SMOTE, the classifier and the fold shuffling.

    Returns
    -------
    dict
        ``{'cv_score': ..., 'test_score': ...}``: the best mean
        cross-validation score and the score of the refitted best estimator on
        the held-out test set.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Steps shared by both variants: scale to [0, 1], then classify.
    model_steps = [
        ('scaler', MinMaxScaler()),
        ('classifier', LogisticRegression(random_state=random_state,
                                          max_iter=1000)),
    ]

    if smote:
        # Oversampling is part of the pipeline.
        pipeline = imbpipeline(
            steps=[('smote', SMOTE(random_state=random_state)), *model_steps]
        )
    else:
        # Oversample before cross-validation. A separate local name is used so
        # the boolean `smote` argument is not overwritten by the sampler.
        sampler = SMOTE(random_state=random_state)
        X_train, y_train = sampler.fit_resample(X_train, y_train)
        pipeline = Pipeline(steps=model_steps)

    stratified_kfold = StratifiedKFold(n_splits=n_splits,
                                       shuffle=True,
                                       random_state=random_state)

    param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    grid_search = GridSearchCV(estimator=pipeline,
                               param_grid=param_grid,
                               scoring=scoring,
                               cv=stratified_kfold,
                               n_jobs=-1)

    grid_search.fit(X_train, y_train)
    return {'cv_score': grid_search.best_score_,
            'test_score': grid_search.score(X_test, y_test)}

In [40]:
# Compare SMOTE inside the pipeline vs. SMOTE before cross-validation on ten
# synthetic imbalanced datasets. One result row is collected per dataset and the
# DataFrame is built once at the end, instead of growing it with pd.concat in
# every iteration.
records = []
for i in range(10):

    # Seed per iteration so each dataset configuration is reproducible by itself.
    np.random.seed(i)

    # Draw the dataset configuration. The order of these draws determines the
    # values, so it must not change.
    N_SAMPLES = np.random.randint(low=10000, high=50000, size=1)[0]
    N_FEATURES = np.random.randint(low=20, high=40, size=1)[0]
    N_INFORMATIVE = N_FEATURES - np.random.randint(low=2, high=5, size=1)[0]
    CLASS_SEP = np.random.uniform(low=0.4, high=0.8, size=1)[0]
    MINORITY_CLASS_WEIGHT = np.random.uniform(low=0.05, high=0.3, size=1)[0]

    # `weights` is an array-like ordered by class label: class 0 is the
    # majority class, class 1 the minority class.
    CLASS_WEIGHTS = [1 - MINORITY_CLASS_WEIGHT, MINORITY_CLASS_WEIGHT]

    # NOTE: this rebinds the notebook-level X and y to synthetic data.
    X, y = make_classification(n_samples=N_SAMPLES,
                               n_features=N_FEATURES,
                               n_informative=N_INFORMATIVE,
                               n_redundant=0,
                               class_sep=CLASS_SEP,
                               weights=CLASS_WEIGHTS,
                               random_state=11)

    model_smote_in_pipeline = model(X, y, smote=True)
    model_smote_out_pipeline = model(X, y, smote=False)

    records.append({
        'N_SAMPLES': N_SAMPLES,
        'N_FEATURES': N_FEATURES,
        'N_INFORMATIVE': N_INFORMATIVE,
        'CLASS_SEP': CLASS_SEP,
        'MINORITY_CLASS_WEIGHT': MINORITY_CLASS_WEIGHT,
        'SMOTE_IN_PIPELINE_CV_SCORE': model_smote_in_pipeline['cv_score'],
        'SMOTE_IN_PIPELINE_TEST_SCORE': model_smote_in_pipeline['test_score'],
        'SMOTE_OUTSIDE_PIPELINE_CV_SCORE': model_smote_out_pipeline['cv_score'],
        'SMOTE_OUTSIDE_PIPELINE_TEST_SCORE': model_smote_out_pipeline['test_score'],
    })

    # Progress indicator: one line per finished dataset.
    print(i)

# One row per iteration; the default index 0..9 equals the iteration number.
main_df = pd.DataFrame(records)

0
1
2
3
4
5
6
7
8
9


In [41]:
# Show the dataset settings with the CV and test scores for both SMOTE placements.
display(main_df)

Unnamed: 0,N_SAMPLES,N_FEATURES,N_INFORMATIVE,CLASS_SEP,MINORITY_CLASS_WEIGHT,SMOTE_IN_PIPELINE_CV_SCORE,SMOTE_IN_PIPELINE_TEST_SCORE,SMOTE_OUTSIDE_PIPELINE_CV_SCORE,SMOTE_OUTSIDE_PIPELINE_TEST_SCORE
0,12732,35,32,0.737706,0.264486,0.804186,0.799701,0.817712,0.799536
1,43003,32,30,0.400046,0.125583,0.691067,0.69474,0.708868,0.694715
2,33720,35,32,0.772616,0.286933,0.841677,0.849414,0.854466,0.849486
3,15994,23,21,0.448531,0.192328,0.693345,0.698883,0.710698,0.698962
4,27530,34,31,0.789074,0.228704,0.845102,0.850348,0.861404,0.850043
5,45683,34,31,0.482688,0.279653,0.716573,0.717216,0.729643,0.717216
6,41626,29,27,0.728492,0.060424,0.846904,0.842133,0.869467,0.842085
7,20742,23,21,0.523205,0.115968,0.767139,0.756672,0.790411,0.756637
8,14547,37,34,0.747678,0.182714,0.891236,0.890141,0.907261,0.890141
9,30828,21,17,0.750888,0.084382,0.839469,0.841249,0.85452,0.841249
