# Figure 3 Demo: Random-Forest Based Activity Classification

*Disclaimer: This sample code and the reported results represent the processing reported in the Pilot Study. Because this processing uses ML models that incorporate stochasticity, the results shown in this demonstration notebook may differ slightly from those in the Pilot Study.*

In [1]:
import os
import re
import numpy as np
import pandas as pd
import mne
import shap
import boruta
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold

from tqdm import tqdm

___

## Data Loading

In [2]:
# Activity names, ordered so that each name's position in the list is the
# integer task ID assigned to it.
_ordered_task_names = [
    'Angry', 'Chewing', 'Eye', 'Eye-Iso', 'In-Iso', 'Jaw', 'L Gaze-L', 'L Gaze-R',
    'Out-Iso', 'Sad', 'Smile-Iso', 'Surprise', 'Swallowing', 'Talk', 'Up Gaze',
    'Wrinkle-Iso',
]

# Forward lookup: string type task ID -> integer task ID
task_label_map = {name: task_id for task_id, name in enumerate(_ordered_task_names)}

# Inverse lookup: integer task ID -> string type task ID
reverse_task_label_map = dict(zip(task_label_map.values(), task_label_map.keys()))

# Subject IDs 'subject0' through 'subject9'
subject_ids = ['subject{}'.format(subject_number) for subject_number in range(10)]

In [3]:
# Per-subject containers keyed by subject ID: `features` gathers each subject's
# activity feature rows and `labels` gathers the matching integer task IDs.
features = dict((sid, []) for sid in subject_ids)
labels = dict((sid, []) for sid in subject_ids)

# Each subject has one feature csv per time of day ('Morning' and 'Evening')
feature_data_dir = '../data/feature_data'
for sid in tqdm(subject_ids):
    for time in ['Morning', 'Evening']:
        csv_name = '{}_{}_variables.csv'.format(sid, time)
        df = pd.read_csv(os.path.join(feature_data_dir, sid, csv_name))

        # Take the label column out of the frame and translate every string
        # task ID into its integer task ID.
        y = np.array(df.pop('Task Label').apply(lambda x: task_label_map[x]))

        # Event Duration only applies to tasks performed for specified
        # durations, so it is not kept as a feature.
        df = df.drop(columns=['Event Duration (s)'])

        # Whatever columns remain are the features
        X = df.to_numpy()

        features[sid].append(X)
        labels[sid].append(y)

    # Merge this subject's per-file pieces into a single array each
    features[sid] = np.concatenate(features[sid], axis=0)
    labels[sid] = np.concatenate(labels[sid], axis=0)

100%|██████████| 10/10 [00:00<00:00, 37.91it/s]


___

## Activity Classification

In [4]:
# Pool the activity features and labels from all subjects
X = np.vstack([features[sid] for sid in subject_ids])
y = np.hstack([labels[sid] for sid in subject_ids])

# Draw every permutation in this cell from a single seeded generator so that
# Restart & Run All reproduces the same train/test split.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

shuffled_inds = split_rng.permutation(X.shape[0])
X = X[shuffled_inds]
y = y[shuffled_inds]

# Perform a stratified split of the pooled set of actions
# (from all subjects) into training and test sets: within each class, a
# random `test_size` fraction of the samples (rounded down) goes to the test
# set and the remainder goes to the training set.
test_size = 0.2
test_inds = []
train_inds = []
for class_id in task_label_map.values():
    class_sample_idxs = split_rng.permutation(np.where(y == class_id)[0])
    class_sample_count = len(class_sample_idxs)
    # Number of this class's samples held out for testing
    class_test_count = int(test_size * class_sample_count)
    test_inds.append(class_sample_idxs[:class_test_count])
    train_inds.append(class_sample_idxs[class_test_count:])
train_inds = np.hstack(train_inds)
test_inds = np.hstack(test_inds)

# Every pooled sample must land in exactly one of the two sets
assert len(train_inds) + len(test_inds) == len(y), 'split does not partition the samples'

X_train, X_test = X[train_inds], X[test_inds]
y_train, y_test = y[train_inds], y[test_inds]

In [5]:
# Standardize the features using statistics estimated on the training set
# only; the same training-set mean and standard deviation are applied to the
# test set.
train_feature_means = np.mean(X_train, axis=0)
train_feature_stds = np.std(X_train, axis=0)

# A feature that is constant over the training set has a standard deviation of
# zero, and dividing by it would produce NaN/inf values. Use a divisor of 1 for
# such features so they are only centered.
train_feature_stds = np.where(train_feature_stds == 0, 1.0, train_feature_stds)

X_train = (X_train-train_feature_means)/train_feature_stds
X_test = (X_test-train_feature_means)/train_feature_stds

# Seed the forest's own randomness so that, for a given training set, refitting
# yields the same model. The reported scores still depend on how the
# train/test split was drawn.
RF_SEED = 42
clf = RandomForestClassifier(n_estimators=500, random_state=RF_SEED)
clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out test set
print('Test Set Classification Report Summary:')
print(classification_report(y_test, y_hat))

Test Set Classification Report Summary:
              precision    recall  f1-score   support

           0       0.33      0.27      0.30        15
           1       0.94      1.00      0.97        16
           2       0.89      1.00      0.94        16
           3       0.53      0.56      0.55        16
           4       0.50      0.44      0.47        16
           5       0.84      1.00      0.91        16
           6       0.79      0.79      0.79        14
           7       0.86      0.86      0.86        14
           8       0.43      0.38      0.40        16
           9       0.43      0.56      0.49        16
          10       0.72      0.81      0.76        16
          11       0.78      0.44      0.56        16
          12       1.00      1.00      1.00        16
          13       0.94      1.00      0.97        15
          14       0.92      0.80      0.86        15
          15       0.35      0.38      0.36        16

    accuracy                           0