# Random forest 

In [51]:
# Import necessary libraries
import json
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, auc, precision_recall_curve, confusion_matrix
import pandas as pd
import logging
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import confusion_matrix as cm
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

In [52]:
merged_df = pd.read_csv('/Users/dionnespaltman/Desktop/V3/merged_df.csv', sep=',')

merged_df.drop('Unnamed: 0', axis=1, inplace=True)
merged_df.drop('Unnamed: 0.1', axis=1, inplace=True)

display(merged_df.head(5))

Unnamed: 0,ID,sum_12,sum_4567,sum_456,VVR_group,Condition,VVR_1,VVR_2,AU01_r__sum_values,AU01_r__variance,...,AU26_r__minimum,AU26_r__mean,AU26_r__mean_abs_change,AU45_r__sum_values,AU45_r__variance,AU45_r__standard_deviation,AU45_r__maximum,AU45_r__minimum,AU45_r__mean,AU45_r__mean_abs_change
0,23,24.0,37.0,27.0,0,2,13.0,11.0,4982.48,0.425041,...,0.0,0.633284,0.076328,9231.74,0.825039,0.908316,4.91,0.0,0.627753,0.133624
1,24,23.0,37.0,28.0,0,2,12.0,11.0,9390.23,0.448366,...,0.0,1.484701,0.125851,11887.0,0.634554,0.796589,5.0,0.0,0.436942,0.098134
2,25,28.0,44.0,33.0,1,2,16.0,12.0,6954.35,0.599805,...,0.0,0.862301,0.101969,9020.78,0.750701,0.86643,4.04,0.0,0.550652,0.08572
3,26,30.0,37.0,29.0,0,1,15.0,15.0,9707.43,0.87328,...,0.0,0.552359,0.069582,6585.31,0.609348,0.780607,4.9,0.0,0.371673,0.056287
4,27,22.0,39.0,31.0,1,2,11.0,11.0,21049.9,1.475421,...,-3.92,0.142027,0.386527,23027.73,1.160635,1.077328,5.04,-4.29,1.094318,0.231853


In [53]:
# Count the number of instances of people in VVR_group = 1 and VVR_group = 0
count_vvr_group = merged_df['VVR_group'].value_counts()

# Print the counts
print("Number of instances in VVR_group = 1:", count_vvr_group[1])
print("Number of instances in VVR_group = 0:", count_vvr_group[0])

Number of instances in VVR_group = 1: 26
Number of instances in VVR_group = 0: 85


In [54]:
with open('/Users/dionnespaltman/Desktop/V3/columns_au_12.json', 'r') as f:
    columns_au_12 = json.load(f)

print(len(columns_au_12))
# print(columns_au_12)

121


In [55]:
with open('/Users/dionnespaltman/Desktop/V3/columns_action_units.json', 'r') as f:
    columns_action_units = json.load(f)

print(len(columns_action_units))
# print(columns_action_units)

119


In [56]:
columns_to_drop = [ 'ID', 'sum_12', 'sum_4567', 'sum_456', 'VVR_group', 'Condition'] 

First we'll split the data into a train and test set. 

The train set has 88 participants, the test set has 23 participants. 

In [57]:
train, test = train_test_split(merged_df, test_size=0.2, random_state=123)
# train, val = train_test_split(train, stratify=train['VVR_group'], random_state=123)

print(train.shape)
print(test.shape)

(88, 127)
(23, 127)


In [68]:
columns_to_drop = [ 'ID', 'sum_12', 'sum_4567', 'sum_456', 'VVR_group', 'Condition'] 

X_test = test.drop(columns_to_drop, axis=1)
y_test = test['VVR_group']

# Print original class distribution
print('Original dataset shape %s' % Counter(y_test))

Original dataset shape Counter({0: 19, 1: 4})


In [58]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

In [61]:
columns_to_drop = [ 'ID', 'sum_12', 'sum_4567', 'sum_456', 'VVR_group', 'Condition'] 

X_train = train.drop(columns_to_drop, axis=1)
y_train = train['VVR_group']

# Print original class distribution
print('Original dataset shape %s' % Counter(y_train))

# Apply SMOTE to the training data with sampling strategy set to 'auto' (default)
sm = SMOTE(sampling_strategy='not majority', random_state=42, k_neighbors=10)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

# Print resampled class distribution
print('Resampled dataset shape %s' % Counter(y_resampled))

# Merge resampled features and target variable into a new DataFrame
new_merged_df = pd.merge(X_resampled, y_resampled, how='outer', left_index=True, right_index=True)


# Check the shape of the new merged DataFrame
print('New merged dataset shape:', new_merged_df.shape)

Original dataset shape Counter({0: 66, 1: 22})
Resampled dataset shape Counter({0: 66, 1: 66})
New merged dataset shape: (132, 122)


In [62]:
featurizer = ColumnTransformer(transformers=[("numeric", StandardScaler(), columns_au_12)], remainder='drop')

In [67]:
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

dummy = make_pipeline(featurizer, DummyClassifier(strategy='most_frequent'))
rf = make_pipeline(featurizer, RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=0))
svm = make_pipeline(featurizer, SVC())
multiclass_svm = make_pipeline(featurizer, SVC(decision_function_shape='ovr'))
xgb = make_pipeline(featurizer, XGBClassifier())
mlp = make_pipeline(featurizer, MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000))
decision_tree = make_pipeline(featurizer, DecisionTreeClassifier())

models = {
    "Dummy": dummy,
    "RandomForest": rf,
    "SVM": svm,
    "Multiclass SVM": multiclass_svm,
    "XGBoost": xgb,
    "MLP": mlp,
    "DecisionTree": decision_tree
}


In [66]:
logging.info("Fitting models")

for name, model in models.items():
    model.fit(train.drop('VVR_group', axis=1), train['VVR_group'].values)
    logging.info(f"Evaluating {name} on validation data")
    pred = model.predict(test.drop('VVR_group', axis=1))
    accuracy = accuracy_score(test['VVR_group'].values, pred)
    report = classification_report(test['VVR_group'].values, pred)
    
    # Calculating Precision-Recall curve and its AUC
    precision, recall, _ = precision_recall_curve(test['VVR_group'].values, pred)
    auc_pr = auc(recall, precision)

    logging.info(f"{name} Accuracy: {accuracy}")
    logging.info(f"{name} AUC-PR: {auc_pr}")  # Including AUC-PR in logging
    logging.info(f"{name} Classification Report:")
    logging.info(report)
    logging.info(f"{name} Confusion Matrix:")
    logging.info(cm)

best_model_name = max(models, key=lambda x: accuracy_score(test['VVR_group'].values, models[x].predict(test.drop('VVR_group', axis=1))))
best_model = models[best_model_name]

logging.info(f"Predicting on test using best model: {best_model_name}")

pred = best_model.predict(test.drop('VVR_group', axis=1))

accuracy = accuracy_score(test['VVR_group'].values, pred)
report = classification_report(test['VVR_group'].values, pred)
cm = confusion_matrix(test['VVR_group'].values, pred)

logging.info(f"{best_model_name} Accuracy on Test Data: {accuracy}")
logging.info(f"{best_model_name} Classification Report on Test Data:")
logging.info(report)
logging.info(f"{best_model_name} Confusion Matrix:")
logging.info(cm)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
