# Split and then SMOTE
- First the merged data will be loaded. 
- Then the data will be split into train and test. There won't be a validation set, because of the small size of the data set. 
- Finally, SMOTE will be applied. 

This way, data leakage gets prevented. 

In [1]:
# Import statements 
import pandas as pd 
import json

First, the data will get loaded. 

In [2]:
# Read the file 
merged_df = pd.read_csv('/Users/dionnespaltman/Desktop/V3/merged_df.csv', sep=',')

# On accident, two columns are always added but they shouldn't be there 
merged_df.drop('Unnamed: 0', axis=1, inplace=True)
merged_df.drop('Unnamed: 0.1', axis=1, inplace=True)

display(merged_df.head(5))

Unnamed: 0,ID,sum_12,sum_4567,sum_456,VVR_group,Condition,VVR_1,VVR_2,AU01_r__sum_values,AU01_r__variance,...,AU26_r__minimum,AU26_r__mean,AU26_r__mean_abs_change,AU45_r__sum_values,AU45_r__variance,AU45_r__standard_deviation,AU45_r__maximum,AU45_r__minimum,AU45_r__mean,AU45_r__mean_abs_change
0,23,24.0,37.0,27.0,0,2,13.0,11.0,4982.48,0.425041,...,0.0,0.633284,0.076328,9231.74,0.825039,0.908316,4.91,0.0,0.627753,0.133624
1,24,23.0,37.0,28.0,0,2,12.0,11.0,9390.23,0.448366,...,0.0,1.484701,0.125851,11887.0,0.634554,0.796589,5.0,0.0,0.436942,0.098134
2,25,28.0,44.0,33.0,1,2,16.0,12.0,6954.35,0.599805,...,0.0,0.862301,0.101969,9020.78,0.750701,0.86643,4.04,0.0,0.550652,0.08572
3,26,30.0,37.0,29.0,0,1,15.0,15.0,9707.43,0.87328,...,0.0,0.552359,0.069582,6585.31,0.609348,0.780607,4.9,0.0,0.371673,0.056287
4,27,22.0,39.0,31.0,1,2,11.0,11.0,21049.9,1.475421,...,-3.92,0.142027,0.386527,23027.73,1.160635,1.077328,5.04,-4.29,1.094318,0.231853


In [3]:
# Count the number of instances of people in VVR_group = 1 and VVR_group = 0
count_vvr_group = merged_df['VVR_group'].value_counts()

# Print the counts
print("Number of instances in VVR_group = 1:", count_vvr_group[1])
print("Number of instances in VVR_group = 0:", count_vvr_group[0])

Number of instances in VVR_group = 1: 26
Number of instances in VVR_group = 0: 85


We will also loasd the names of the features. 

In [4]:
with open('/Users/dionnespaltman/Desktop/V3/columns_action_units.json', 'r') as f:
    columns_action_units = json.load(f)

print(len(columns_action_units))
# print(columns_action_units)

119


Then create X and y. 

In [5]:
columns_to_drop = [ 'ID', 'sum_12', 'sum_4567', 'sum_456', 'VVR_group', 'Condition'] 

X = merged_df.drop(columns_to_drop, axis=1)
y = merged_df['VVR_group']

# Article 1 

https://ploomber.io/blog/nested-cv/

In [6]:
from sklearn.model_selection import cross_validate

def do_cross_validation(clf, print_model=False):
    cv = cross_validate(clf, X, y, scoring='recall', cv=3)
    scores = ' + '.join(f'{s:.2f}' for s in cv["test_score"])
    mean_ = cv["test_score"].mean()
    msg = f'Cross-validated accuracy: ({scores}) / 3 = {mean_:.2f}'
    
    if print_model:
        msg = f'{clf}:\n\t{msg}\n'
    
    print(msg)

### Random Forest 

Then make the classifier. In this case Random Forest. 

In [7]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=2, random_state=0)
# X is our training data
clf.fit(X, y)

If you would just use the same data, then you get a super high accuracy score. 

In [9]:
from sklearn.metrics import precision_score, recall_score, f1_score

# This is an overly optimistic estimation since we are using X again!
y_pred = clf.predict(X)
recall = recall_score(y, y_pred)

print(f'Recall: {recall:.2f}')

Recall: 0.62


Now we want to split into a train and test part. We get a lower accuracy but it's better. 

In [10]:
from sklearn.model_selection import train_test_split

# split in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

clf = RandomForestClassifier(n_estimators=2, random_state=0)
clf.fit(X_train, y_train)

# test with unseen data
y_pred = clf.predict(X_test)
recall = recall_score(y_test, y_pred)

print(f'Recall: {recall:.2f}')

Recall: 0.14


Now let's implement cross validation. 

In [11]:
do_cross_validation(clf)

Cross-validated accuracy: (0.00 + 0.22 + 0.00) / 3 = 0.07


### Support Vector Machine

In [12]:
from sklearn.svm import SVC

svc = SVC(random_state=0)
do_cross_validation(svc)

Cross-validated accuracy: (0.00 + 0.00 + 0.00) / 3 = 0.00


### Some experiments

There's an error here, becuase we are brute forcing our way into finding the best model. When we use this cross-validation method to optimize hyperparameters and model selection. 

In [13]:
# do_cross_validation(SVC(kernel='linear', random_state=0), print_model=True)
# do_cross_validation(SVC(kernel='poly', random_state=0), print_model=True)
# do_cross_validation(RandomForestClassifier(n_estimators=2, random_state=0), print_model=True)
# do_cross_validation(RandomForestClassifier(n_estimators=5, random_state=0), print_model=True)

### We enter nested cross-validation 

In [13]:
from sklearn.model_selection import GridSearchCV

# random forest inner loop
clf_grid = GridSearchCV(RandomForestClassifier(random_state=0), param_grid={'n_estimators': [2, 5]})
# random forest outer loop
do_cross_validation(clf_grid, print_model=True)

# svc inner loop
svc_grid = GridSearchCV(SVC(random_state=0), param_grid={'kernel': ['linear', 'poly']})
# svc outer loop
do_cross_validation(svc_grid, print_model=True)

GridSearchCV(estimator=RandomForestClassifier(random_state=0),
             param_grid={'n_estimators': [2, 5]}):
	Cross-validated accuracy: (0.00 + 0.22 + 0.00) / 3 = 0.07



I also want to try gridsearch on XGBoost and the Neural Network. 

In [9]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

param_grid = {
    'n_estimators': [2, 5, 10],
    'max_depth': [3, 6, 9]
}

# XGBoost inner loop
clf_grid = GridSearchCV(XGBClassifier(random_state=0), param_grid=param_grid)
# XGBoost  outer loop
do_cross_validation(clf_grid, print_model=True)

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, device=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     grow_policy=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_cat_threshold=None,
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                   

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

param_grid = {
    'hidden_layer_sizes': [(10), (50,), (100,)],  # Different sizes of hidden layers
    'activation': ['relu', 'tanh'],                  # Activation functions to try
    'solver': ['adam'],                              # Solver for weight optimization
    'alpha': [0.0001, 0.001],                        # L2 penalty (regularization term)
}

# Neural Network inner loop
clf_grid = GridSearchCV(MLPClassifier(random_state=0), param_grid=param_grid)
# Neural Network  outer loop
do_cross_validation(clf_grid, print_model=True)

GridSearchCV(estimator=MLPClassifier(random_state=0),
             param_grid={'activation': ['relu', 'tanh'],
                         'alpha': [0.0001, 0.001],
                         'hidden_layer_sizes': [10, (50,), (100,)],
                         'solver': ['adam']}):
	Cross-validated accuracy: (0.78 + 0.78 + 0.73) / 3 = 0.77



Final step is to run a final cross-validation procedure to find the optimal parameters. 

In [12]:
# do_cross_validation(SVC(kernel='linear', random_state=0), print_model=True)
# do_cross_validation(SVC(kernel='poly', random_state=0), print_model=True)

# Different article 

https://machinelearningmastery.com/nested-cross-validation-for-machine-learning-with-python/

In [6]:
# automatic nested cross-validation for random forest on a classification dataset
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

print("part 1")

# configure the cross-validation procedure
cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)

# define the model
model = RandomForestClassifier(random_state=1)

print("part 2")

# define search space
space = dict()
space['n_estimators'] = [10, 100, 500]
space['max_features'] = [2, 4, 6]

# define search
search = GridSearchCV(model, space, scoring='recall', n_jobs=1, cv=cv_inner, refit=True)

print("part 3")

# configure the cross-validation procedure
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)

# execute the nested cross-validation
scores = cross_val_score(search, X, y, scoring='recall', cv=cv_outer, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

part 1
part 2
part 3


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.075 (0.160)


# SMOTE

In [16]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score

# Define the original dataset (X, y)
X = merged_df.drop(columns_to_drop, axis=1)
y = merged_df['VVR_group']

# print(len(X))
# print(len(y))

# Configure the outer cross-validation procedure
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)

# # Configure the outer cross-validation procedure with StratifiedKFold
# cv_outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)


# Loop through each outer fold
for train_index, test_index in cv_outer.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Apply SMOTE to the training data
    sm = SMOTE(random_state=42, k_neighbors=5)
    X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
    # print(len(X_resampled))
    # print(len(y_resampled))
    
    # Define and fit the model
    model = RandomForestClassifier(random_state=1)
    model.fit(X_resampled, y_resampled)

    # Evaluate the model on the test fold
    y_pred = model.predict(X_test)
    recall = recall_score(y_test, y_pred)
    print("Recall:", recall)


Recall: 0.0
Recall: 0.25


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall: 0.0
Recall: 0.0
Recall: 0.16666666666666666


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall: 0.0
Recall: 0.0
Recall: 0.0
Recall: 0.6666666666666666
Recall: 0.0


In [21]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score
import numpy as np

# Define the original dataset (X, y)
X = merged_df.drop(columns_to_drop, axis=1)
y = merged_df['VVR_group']

display(X)

# Configure the outer cross-validation procedure with StratifiedKFold
cv_outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# Loop through each outer fold
for train_index, test_index in cv_outer.split(X=y, y=y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Apply SMOTE to the training data
    print('Original dataset shape %s' % Counter(y))

    # sm = SMOTE(random_state=42, k_neighbors=5)
    # X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

    # Apply SMOTE to the training data with sampling strategy set to 'auto' or 'minority'
    sm = SMOTE(random_state=42, sampling_strategy='minority', k_neighbors=5)
    X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

    print('Resampled dataset shape %s' % Counter(y_resampled))

    # Define and fit the model
    model = RandomForestClassifier(random_state=1)
    model.fit(X_resampled, y_resampled)

    # Evaluate the model on the test fold
    y_pred = model.predict(X_test)
    
    # Calculate recall with zero_division='warn'
    recall = recall_score(y_test, y_pred, zero_division='warn')
    print("Recall:", recall)


Unnamed: 0,VVR_1,VVR_2,AU01_r__sum_values,AU01_r__variance,AU01_r__standard_deviation,AU01_r__maximum,AU01_r__minimum,AU01_r__mean,AU01_r__mean_abs_change,AU02_r__sum_values,...,AU26_r__minimum,AU26_r__mean,AU26_r__mean_abs_change,AU45_r__sum_values,AU45_r__variance,AU45_r__standard_deviation,AU45_r__maximum,AU45_r__minimum,AU45_r__mean,AU45_r__mean_abs_change
0,13.0,11.0,4982.48,0.425041,0.651952,5.00,0.00,0.338806,0.051614,2244.43,...,0.00,0.633284,0.076328,9231.74,0.825039,0.908316,4.91,0.00,0.627753,0.133624
1,12.0,11.0,9390.23,0.448366,0.669601,5.00,0.00,0.345166,0.042494,5122.35,...,0.00,1.484701,0.125851,11887.00,0.634554,0.796589,5.00,0.00,0.436942,0.098134
2,16.0,12.0,6954.35,0.599805,0.774471,4.53,0.00,0.424512,0.048749,2192.40,...,0.00,0.862301,0.101969,9020.78,0.750701,0.866430,4.04,0.00,0.550652,0.085720
3,15.0,15.0,9707.43,0.873280,0.934495,4.73,0.00,0.547885,0.033221,2641.00,...,0.00,0.552359,0.069582,6585.31,0.609348,0.780607,4.90,0.00,0.371673,0.056287
4,11.0,11.0,21049.90,1.475421,1.214669,5.99,-4.07,1.000328,0.187191,16193.17,...,-3.92,0.142027,0.386527,23027.73,1.160635,1.077328,5.04,-4.29,1.094318,0.231853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,11.0,9.0,11750.48,0.739290,0.859820,5.00,0.00,0.509915,0.052530,4726.74,...,0.00,0.448007,0.076771,10547.03,0.506052,0.711374,3.53,0.00,0.457691,0.055759
107,14.0,11.0,13978.96,0.763932,0.874032,5.00,0.00,0.490541,0.040663,5675.33,...,0.00,0.960124,0.142141,6782.20,0.152370,0.390346,2.81,0.00,0.237997,0.035817
108,12.0,12.0,7175.00,0.461053,0.679009,4.97,0.00,0.383628,0.040392,3864.99,...,0.00,0.551210,0.075307,7319.31,0.418456,0.646882,3.78,0.00,0.391344,0.072455
109,11.0,9.0,11354.47,0.516678,0.718803,5.00,0.00,0.362647,0.035199,7260.07,...,0.00,0.641290,0.064991,16768.57,0.748187,0.864978,4.31,0.00,0.535566,0.118258


Original dataset shape Counter({0: 85, 1: 26})
Resampled dataset shape Counter({0: 76, 1: 76})
Recall: 0.3333333333333333
Original dataset shape Counter({0: 85, 1: 26})
Resampled dataset shape Counter({0: 76, 1: 76})
Recall: 0.0
Original dataset shape Counter({0: 85, 1: 26})
Resampled dataset shape Counter({0: 76, 1: 76})
Recall: 0.0
Original dataset shape Counter({0: 85, 1: 26})
Resampled dataset shape Counter({0: 76, 1: 76})
Recall: 0.5
Original dataset shape Counter({0: 85, 1: 26})
Resampled dataset shape Counter({1: 76, 0: 76})
Recall: 0.0
Original dataset shape Counter({0: 85, 1: 26})
Resampled dataset shape Counter({0: 77, 1: 77})
Recall: 0.0
Original dataset shape Counter({0: 85, 1: 26})
Resampled dataset shape Counter({0: 77, 1: 77})
Recall: 0.3333333333333333
Original dataset shape Counter({0: 85, 1: 26})
Resampled dataset shape Counter({0: 77, 1: 77})
Recall: 0.0
Original dataset shape Counter({0: 85, 1: 26})
Resampled dataset shape Counter({0: 77, 1: 77})
Recall: 0.333333333