# Baseline Classification Model: Functions for Random Forest

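This notebook contains functions for training and evaluating a Random Forest classifier with leave-one-subject-out cross-validation (LOOCV): each fold holds out every window from one subject and trains on all remaining subjects. It also contains code for evaluating the feature importances produced by the random forest.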

__INPUT: .csv file containing the sliding-window summary statistics with engineered features (engineered_features.csv)__

__OUTPUT: Random Forest Multi-Classification Model w/ Feature Importances__

## Imports 

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix
from sklearn import preprocessing

## Read in Data

In [26]:
# Show every column when a DataFrame is displayed (the engineered frame has
# far more columns than pandas shows by default).
pd.set_option('display.max_columns', None)
# Load the engineered sliding-window features. The path is relative, so it
# resolves against the working directory the notebook is run from.
df = pd.read_csv("../../10_code/40_usable_data_for_models/41_Duke_Data/engineered_features.csv")

In [33]:
# Collect the distinct subject identifiers present in the data.
# NOTE(review): `ids` is not referenced by any other cell visible in this
# notebook, and an assignment displays nothing, so the saved output below
# presumably came from an earlier version of this cell - confirm.
ids = set(df['Subject_ID'])

{'19-001',
 '19-002',
 '19-003',
 '19-004',
 '19-005',
 '19-006',
 '19-007',
 '19-008',
 '19-009',
 '19-010',
 '19-011',
 '19-012',
 '19-013',
 '19-014',
 '19-015',
 '19-016',
 '19-017',
 '19-018',
 '19-019',
 '19-020',
 '19-021',
 '19-022',
 '19-023',
 '19-024',
 '19-025',
 '19-026',
 '19-027',
 '19-029',
 '19-030',
 '19-031',
 '19-032',
 '19-033',
 '19-034',
 '19-035',
 '19-036',
 '19-037',
 '19-038',
 '19-039',
 '19-040',
 '19-041',
 '19-042',
 '19-043',
 '19-044',
 '19-045',
 '19-046',
 '19-047',
 '19-048',
 '19-049',
 '19-050',
 '19-051',
 '19-052',
 '19-053',
 '19-054',
 '19-055',
 '19-056'}

## Feature Selection

In [5]:
def LOOCV_featureselection(data, ids, outcomevar, dropcols, idcolumn, numestimators=750):
    """
    Rank features for a single leave-one-subject-out fold.

    A random forest is fitted on every row that does NOT belong to the
    held-out subject, and the forest's ``feature_importances_`` are returned
    so the caller can discard weak features before the real fit.

    Args:
        data (pandas DataFrame): Features, outcome and subject ID for all subjects.
            The frame is not modified.
        ids: Identifier of the held-out subject. It is compared as a string, so
            integer (label-encoded) and string identifiers both work.
        outcomevar (str): Name of the outcome column.
        dropcols (list): Columns removed before fitting (the caller is expected
            to include ``idcolumn`` here).
        idcolumn (str): Name of the subject-ID column.
        numestimators (int): Number of trees in the forest. Default 750.

    Returns:
        pandas DataFrame: one row per feature with columns ``value`` (feature
        name) and ``importances`` (importance in this fold's forest).
    """
    # Compare IDs as strings on BOTH sides. Casting only the column (and not
    # the held-out ID) makes an integer ID never match, so no subject is
    # excluded and the held-out subject leaks into feature selection.
    held_out = str(ids)
    # Work on a derived Series so the caller's DataFrame keeps its dtype.
    subject_ids = data[idcolumn].map(str)

    # Training rows: all other subjects, minus dropped columns and minus any
    # row that still contains a missing value.
    data_train = data[subject_ids != held_out].drop(columns=dropcols).dropna()
    X_train = data_train.drop(columns=[outcomevar])

    feature_list = list(X_train.columns)
    X_train = np.array(X_train)
    y_train = np.array(data_train[outcomevar])  # outcome variable

    # Same forest configuration as the final model so the ranking is comparable.
    rf = RandomForestClassifier(n_estimators=numestimators, max_depth=25,
                                min_samples_leaf=1, class_weight='balanced_subsample')
    rf.fit(X_train, y_train)

    return pd.DataFrame({
        'value': feature_list,
        'importances': list(rf.feature_importances_),
    })

## Random Forest: Intermediate Function

In [6]:
def RFLOOCV(data, ids, outcomevar, dropcols, idcolumn, numestimators=750, fs=0.01):
    """
    Run one leave-one-subject-out fold: select features, train, and test.

    Steps:
      1. Rank features with ``LOOCV_featureselection`` on the training subjects.
      2. Drop every feature whose importance is below ``fs``.
      3. Fit a random forest on all subjects except ``ids``.
      4. Score the forest on the held-out subject.

    Args:
        data (pandas DataFrame): Features, outcome and subject ID for all subjects.
            This function itself does not modify the frame.
        ids: Identifier of the held-out subject (compared as a string).
        outcomevar (str): Name of the outcome column.
        dropcols (list): Columns removed before fitting (the caller is expected
            to include ``idcolumn`` here).
        idcolumn (str): Name of the subject-ID column.
        numestimators (int): Number of trees in each forest. Default 750.
        fs (float): Importance cutoff; features strictly below it are removed.
            Default 0.01.

    Returns:
        tuple: ``(Accuracy, F1, important)`` where ``Accuracy`` is the accuracy
        on the held-out subject, ``F1`` is the weighted F1 score on the held-out
        subject, and ``important`` is a DataFrame with columns ``value``
        (feature), ``importances`` and ``id`` (the held-out subject as a string).

    Raises:
        ValueError: if the held-out subject or the training set has no usable
            rows after dropping missing values.
    """
    held_out = str(ids)

    # Hand the ID over as a string so feature selection excludes exactly the
    # same subject that is held out below, whatever the original ID dtype.
    listimportances = LOOCV_featureselection(data, held_out, outcomevar, dropcols, idcolumn, numestimators)
    weak_features = list(listimportances.loc[listimportances['importances'] < fs, 'value'])

    # Derived Series: the caller's DataFrame keeps its original ID dtype.
    subject_ids = data[idcolumn].map(str)
    is_held_out = subject_ids == held_out

    # Test data - the subject left out of training
    data_test = data[is_held_out].drop(columns=dropcols).drop(columns=weak_features).dropna()
    # Train data - all other subjects in the dataframe
    data_train = data[~is_held_out].drop(columns=dropcols).drop(columns=weak_features).dropna()

    if data_test.empty:
        raise ValueError('No usable rows for held-out subject ' + held_out)
    if data_train.empty:
        raise ValueError('No usable training rows when holding out subject ' + held_out)

    train_features = data_train.drop(columns=[outcomevar])
    feature_list = list(train_features.columns)
    X_train = np.array(train_features)
    y_train = np.array(data_train[outcomevar])  # outcome variable

    # Use an ndarray for the test features as well: the forest is fitted on an
    # ndarray, and predicting on a DataFrame triggers a feature-name warning.
    X_test = np.array(data_test.drop(columns=[outcomevar]))
    y_test = data_test[outcomevar]  # outcome variable

    rf = RandomForestClassifier(n_estimators=numestimators, max_depth=25,
                                min_samples_leaf=1, class_weight='balanced_subsample')
    rf.fit(X_train, y_train)

    # Predict the held-out subject and score the fold.
    y_pred = rf.predict(X_test)
    Accuracy = accuracy_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, average='weighted')

    # Importances of the final (post-selection) forest, tagged with the fold.
    important = pd.DataFrame({
        'value': feature_list,
        'importances': list(rf.feature_importances_),
    })
    important['id'] = held_out

    return Accuracy, F1, important


## Random Forest: Main Function

In [7]:

def loocvRF(data, idcolumn, outcomevar, dropcols=None, numestimators=750, fs=0.01):
    """
    Main LOOCV random-forest function: for every subject, run feature
    selection, training and testing with that subject held out, then report
    the spread of the per-fold scores.

    Args:
        data (pandas DataFrame): Dataframe containing each participant's features and outcome variable.
        idcolumn (string): Name of the column containing the participant number or ID (case sensitive).
        outcomevar (string): Name of the outcome-variable column (case sensitive).
        dropcols (list): Names of additional columns to drop before modelling.
            Default is None, meaning no additional columns.
        numestimators (integer): Number of trees built in each random forest. Default=750.
        fs (float): Cutoff importance for feature selection. Features below this
            importance are removed before the random forest is trained. Default=0.01.

    Returns:
        F1 (list): Weighted F1 score of each fold, in the order the IDs first appear in ``data``.
        accuracy (list): Accuracy of each fold, in the same order.
        importances (pandas DataFrame): 3 columns: value (feature), importances
            (importance of the feature), and id (fold over which this feature
            importance was derived).

    Side effects:
        Prints a progress line per fold and the mean / standard deviation of
        accuracy and F1 over all folds.
    """
    # Make list of all IDs in idcolumn
    IDlist = list(data[idcolumn].unique())
    # The ID column itself must never be used as a model feature.
    drop = [idcolumn] + list(dropcols or [])

    F1 = []
    accuracy = []
    # Collect the per-fold frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and appending in a loop copies the frame each time.
    fold_importances = []

    # Run LOOCV Random Forest!
    for i in IDlist:
        acc, f1, imp = RFLOOCV(data, i, outcomevar, drop, idcolumn, numestimators, fs)
        accuracy.append(acc)
        F1.append(f1)
        fold_importances.append(imp)
        idt = str(i)
        print('...' + idt + ' processing complete.')

    if fold_importances:
        importances = pd.concat(fold_importances)
    else:
        # No subjects at all: keep the documented column layout.
        importances = pd.DataFrame(columns=['value', 'importances', 'id'])

    # Compute mean and std Accuracy, F1
    meanaccuracy = np.mean(accuracy)
    stdaccuracy = np.std(accuracy)
    meanF1 = np.mean(F1)
    stdF1 = np.std(F1)

    # Print Accuracy, F1 Stats
    print('Mean Accuracy:' + str(meanaccuracy))
    print('Std Accuracy:' + str(stdaccuracy))
    print('Mean F1:' + str(meanF1))
    print('Std F1:' + str(stdF1))

    return F1, accuracy, importances

## Label Encoding Subject ID & Window Count

In [8]:
# Number the windows inside each run of consecutive rows that share the same
# Activity (1, 2, 3, ...); the counter restarts whenever Activity changes
# from one row to the next.
# NOTE(review): runs are detected on Activity alone, so a run is not broken at
# a subject boundary if two adjacent subjects have the same Activity - confirm
# this is intended.
df = df.assign(
    count=(
        df.groupby(df['Activity'].ne(df['Activity'].shift()).cumsum())
        .cumcount()
        .add(1)
    )
)

In [9]:
# Replace the subject identifiers with integer codes using scikit-learn's
# LabelEncoder. This overwrites df['Subject_ID'], so the original identifiers
# are only recoverable through `le` (e.g. le.inverse_transform).
# NOTE(review): the saved outputs further down show folds named both "0, 1, ..."
# and "19-001, ...", so some runs were executed before this cell and some after.
le = preprocessing.LabelEncoder()
df['Subject_ID'] = le.fit_transform(df['Subject_ID'])

In [10]:
# One-hot encode the subject code (SID_*) and the within-activity window
# counter (count_*), appending the indicator columns to the right of the
# existing ones. The raw `count` column is removed; `Subject_ID` is kept
# because the LOOCV functions need it to split the folds.
# NOTE(review): this cell is not re-runnable as-is - `count` no longer exists
# after the first execution.
df = pd.concat(
    [
        df.drop('count', axis=1),
        pd.get_dummies(df['Subject_ID'], prefix='SID'),
        pd.get_dummies(df['count'], prefix='count'),
    ],
    axis=1,
)

In [11]:
# Preview the first rows to check that the SID_* and count_* indicator
# columns were appended.
df.head()

Unnamed: 0,ACC1,ACC2,ACC3,TEMP,EDA,BVP,HR,Magnitude,Subject_ID,Activity,Round,ACC1_mean,ACC2_mean,ACC3_mean,TEMP_mean,EDA_mean,BVP_mean,HR_mean,Magnitude_mean,ACC1_std,ACC2_std,ACC3_std,TEMP_std,EDA_std,BVP_std,HR_std,Magnitude_std,ACC1_skew,ACC2_skew,ACC3_skew,TEMP_skew,EDA_skew,BVP_skew,HR_skew,Magnitude_skew,ACC1_min,ACC2_min,ACC3_min,TEMP_min,EDA_min,BVP_min,HR_min,Magnitude_min,ACC1_max,ACC2_max,ACC3_max,TEMP_max,EDA_max,BVP_max,HR_max,Magnitude_max,SID_0,SID_1,SID_2,SID_3,SID_4,SID_5,SID_6,SID_7,SID_8,SID_9,SID_10,SID_11,SID_12,SID_13,SID_14,SID_15,SID_16,SID_17,SID_18,SID_19,SID_20,SID_21,SID_22,SID_23,SID_24,SID_25,SID_26,SID_27,SID_28,SID_29,SID_30,SID_31,SID_32,SID_33,SID_34,SID_35,SID_36,SID_37,SID_38,SID_39,SID_40,SID_41,SID_42,SID_43,SID_44,SID_45,SID_46,SID_47,SID_48,SID_49,SID_50,SID_51,SID_52,SID_53,SID_54,count_1,count_2,count_3,count_4,count_5,count_6,count_7,count_8,count_9,count_10,count_11,count_12,count_13,count_14,count_15,count_16,count_17,count_18,count_19,count_20,count_21,count_22,count_23,count_24,count_25,count_26,count_27,count_28,count_29,count_30,count_31,count_32,count_33,count_34,count_35,count_36,count_37,count_38,count_39,count_40,count_41,count_42,count_43,count_44,count_45,count_46,count_47,count_48,count_49,count_50,count_51,count_52,count_53,count_54,count_55,count_56,count_57,count_58
0,[41.0 41.0 41.0 41.0 41.0 41.0 41.0 41.0 41.0 ...,[27.2 27.3 27.4 27.5 27.6 27.7 27.8 27.9 28.0 ...,[40.0 40.0 40.0 40.0 40.0 40.0 40.0 40.0 40.0 ...,[32.39 32.39 32.39 32.39 32.34 32.34 32.34 32....,[0.275354 0.276634 0.270231 0.270231 0.26895 0...,[15.25 -12.75 -42.99 18.39 13.61 -9.66 -35.47 ...,[78.98 78.83500000000002 78.69 78.545 78.4 78....,[63.410093833710725 63.453053512025726 63.4961...,0,Baseline,1,40.24837,28.01288,38.824457,32.35,0.262354,-0.109875,73.931187,62.553853,0.701573,0.68759,0.632616,0.017607,0.004877,18.439453,2.574676,0.609756,-0.082592,-0.558848,0.705668,0.714533,0.896382,-0.392823,0.296262,0.531557,39.0,26.456522,38.0,32.33,0.254862,-42.99,69.765,61.692787,41.543478,29.0,40.0,32.39,0.276634,34.83,78.98,63.757353,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,[39.0 39.06521739130435 39.130434782608695 39....,[29.0 28.93478260869565 28.869565217391305 28....,[38.0 38.02173913043478 38.04347826086956 38.0...,[32.34 32.34 32.34 32.34 32.33 32.33 32.33 32....,[0.25998499999999997 0.25998499999999997 0.258...,[-18.83 -0.3 11.03 6.09 -15.3 14.61 6.75 -2.38...,[73.52 73.435 73.35 73.265 73.18 73.0925 73.00...,[61.69278726074872 61.7168170027034 61.7409828...,0,Baseline,1,40.82,26.815,38.1925,32.339,0.261058,0.321375,69.48175,62.021872,1.192214,1.149559,0.529382,0.01261,0.003007,20.104717,2.608254,0.542348,0.515544,0.109446,-0.188071,0.787066,0.212943,-0.3229,-0.170923,-0.438037,39.0,24.6,37.2,32.31,0.254862,-48.52,64.8025,60.778286,43.8,29.0,39.0,32.37,0.266389,37.72,73.52,62.936476,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,[41.60869565217392 41.67391304347826 41.739130...,[26.39130434782609 26.32608695652174 26.260869...,[38.869565217391305 38.89130434782609 38.91304...,[32.34 32.34 32.34 32.34 32.33 32.33 32.33 32....,[0.265108 0.263827 0.266389 0.265108 0.266389 ...,[-27.69 30.51 14.64 1.97 -13.65 -48.52 17.77 2...,[69.63 69.515 69.4 69.285 69.17 69.04 68.91 68...,[62.758486272725364 62.78782873035957 62.81730...,0,Baseline,1,43.252235,25.312684,37.488043,32.337,0.259585,0.684,64.893188,62.621785,2.109896,0.815025,0.647914,0.010536,0.004337,23.756276,2.639037,0.942868,-0.47302,0.227966,1.329886,0.620801,0.072564,-0.274279,0.185657,-0.382833,39.0,24.0,37.0,32.31,0.252301,-48.52,60.995,60.778286,45.532258,27.0,39.0,32.37,0.266389,47.14,69.63,64.010791,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,[43.971428571428575 44.14285714285715 44.31428...,[24.514285714285712 24.42857142857143 24.34285...,[37.17142857142857 37.142857142857146 37.11428...,[32.33 32.33 32.33 32.33 32.34 32.34 32.34 32....,[0.258704 0.258704 0.258704 0.257424 0.257424 ...,[17.18 2.41 -22.11 5.12 18.43 5.34 -14.33 -22....,[64.68 64.555 64.43 64.305 64.18 64.0475 63.91...,[62.579164557660036 62.649331804179724 62.7200...,0,Baseline,1,44.905798,24.915984,37.638218,32.356,0.25451,-0.180875,61.157687,63.734171,1.832017,1.509593,1.773398,0.025377,0.002396,25.635645,1.674001,0.841361,-4.941414,-1.040349,3.435987,0.672586,0.734482,-0.828441,0.406263,-0.532117,32.0,20.985816,37.0,32.33,0.25102,-101.74,58.8025,61.392182,46.0,27.0,48.0,32.41,0.262546,47.14,64.68,65.711491,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,[45.54838709677418 45.564516129032256 45.58064...,[25.64516129032258 25.69354838709677 25.741935...,[37.0 37.0 37.0 37.0 37.0 37.0 37.0 37.0 37.0 ...,[32.34 32.34 32.34 32.34 32.33 32.33 32.33 32....,[0.253581 0.253581 0.253581 0.252301 0.252301 ...,[-32.0 15.14 24.41 3.88 -22.97 -0.34 17.89 3.0...,[60.92 60.8475 60.77500000000001 60.7025 60.63...,[64.04162603123257 64.07248675362091 64.103373...,0,Baseline,1,43.577055,22.974382,38.971144,32.389,0.252733,-0.20975,59.226438,62.913435,2.115371,2.585687,1.809092,0.027,0.002055,25.593597,0.684349,1.365652,-1.857224,0.511935,1.380509,-0.686481,1.239716,-0.833856,0.996232,0.408229,32.0,20.702128,37.0,32.33,0.249739,-101.74,58.53,61.392182,46.0,27.0,48.0,32.43,0.258704,39.0,60.92,65.711491,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Run Random Forest

### Run this For Both ACC & PHYS

In [12]:
# ACC + PHYS run: only the raw signal columns and `Round` are dropped, so every
# engineered summary statistic (accelerometer and physiological) is a candidate
# feature. Fold-wise F1, accuracy and feature importances are kept for the
# cells below.
F1, accuracy, importances = loocvRF(
    df,
    'Subject_ID',
    'Activity',
    [
        'ACC1', 'ACC2', 'ACC3',
        'TEMP', 'EDA', 'BVP', 'HR',
        'Magnitude',
        'Round',
    ],
    numestimators=750,
)

...0 processing complete.
...1 processing complete.
...2 processing complete.
...3 processing complete.
...4 processing complete.
...5 processing complete.
...6 processing complete.
...7 processing complete.
...8 processing complete.
...9 processing complete.
...10 processing complete.
...11 processing complete.
...12 processing complete.
...13 processing complete.
...14 processing complete.
...15 processing complete.
...16 processing complete.
...17 processing complete.
...18 processing complete.
...19 processing complete.
...20 processing complete.
...21 processing complete.
...22 processing complete.
...23 processing complete.
...24 processing complete.
...25 processing complete.
...26 processing complete.
...27 processing complete.
...28 processing complete.
...29 processing complete.
...30 processing complete.
...31 processing complete.
...32 processing complete.
...33 processing complete.
...34 processing complete.
...35 processing complete.
...36 processing complete.
...37 proce

### Run this For ACC Only

In [49]:
# ACC-only run: in addition to the raw signals and `Round`, every summary
# statistic derived from TEMP, EDA, BVP and HR is dropped, leaving the
# accelerometer (ACC1-3, Magnitude) statistics as features.
# This overwrites F1, accuracy and importances from any earlier run.
F1, accuracy, importances = loocvRF(
    df,
    'Subject_ID',
    'Activity',
    [
        # raw signals and round
        'ACC1', 'ACC2', 'ACC3', 'TEMP', 'EDA', 'BVP', 'HR', 'Magnitude', 'Round',
        # physiological summary statistics
        'TEMP_mean', 'EDA_mean', 'BVP_mean', 'HR_mean',
        'TEMP_std', 'EDA_std', 'BVP_std', 'HR_std',
        'TEMP_skew', 'EDA_skew', 'BVP_skew', 'HR_skew',
        'TEMP_min', 'EDA_min', 'BVP_min', 'HR_min',
        'TEMP_max', 'EDA_max', 'BVP_max', 'HR_max',
    ],
    numestimators=750,
)

...19-001 processing complete.
...19-002 processing complete.
...19-003 processing complete.
...19-004 processing complete.
...19-005 processing complete.
...19-006 processing complete.
...19-007 processing complete.
...19-008 processing complete.
...19-009 processing complete.
...19-010 processing complete.
...19-011 processing complete.
...19-012 processing complete.
...19-013 processing complete.
...19-014 processing complete.
...19-015 processing complete.
...19-016 processing complete.
...19-017 processing complete.
...19-018 processing complete.
...19-019 processing complete.
...19-020 processing complete.
...19-021 processing complete.
...19-022 processing complete.
...19-023 processing complete.
...19-024 processing complete.
...19-025 processing complete.
...19-026 processing complete.
...19-027 processing complete.
...19-029 processing complete.
...19-030 processing complete.
...19-031 processing complete.
...19-032 processing complete.
...19-033 processing complete.
...19-03

### Feature Importances

In [17]:
# Average each feature's importance over all LOOCV folds and rank the features
# from most to least important.
# Only the `importances` column is aggregated: the frame also carries the
# string `id` column, and taking the mean of a non-numeric column raises a
# TypeError on pandas >= 2.0 instead of silently dropping it. The explicit
# float cast guards against the column having been stored as object dtype.
importance_grouped = (
    importances
    .astype({'importances': 'float64'})
    .groupby('value')[['importances']]
    .mean()
    .sort_values(by='importances', ascending=False)
)

In [19]:
# Display the ranked mean feature importances (most important first).
importance_grouped

Unnamed: 0_level_0,importances
value,Unnamed: 1_level_1
ACC3_mean,0.104077
ACC3_max,0.073751
Magnitude_std,0.072004
ACC2_mean,0.064231
ACC2_max,0.059433
ACC3_min,0.055872
Magnitude_max,0.048158
ACC2_min,0.044349
ACC1_std,0.042591
ACC2_std,0.039545


In [24]:
# List the column names of df.
# NOTE(review): the saved output contains no SID_* / count_* columns, so it
# appears to reflect df before the one-hot encoding cell was run - re-run the
# notebook top to bottom to refresh it.
df.columns

Index(['ACC1', 'ACC2', 'ACC3', 'TEMP', 'EDA', 'BVP', 'HR', 'Magnitude',
       'Subject_ID', 'Activity', 'Round', 'ACC1_mean', 'ACC2_mean',
       'ACC3_mean', 'TEMP_mean', 'EDA_mean', 'BVP_mean', 'HR_mean',
       'Magnitude_mean', 'ACC1_std', 'ACC2_std', 'ACC3_std', 'TEMP_std',
       'EDA_std', 'BVP_std', 'HR_std', 'Magnitude_std', 'ACC1_skew',
       'ACC2_skew', 'ACC3_skew', 'TEMP_skew', 'EDA_skew', 'BVP_skew',
       'HR_skew', 'Magnitude_skew', 'ACC1_min', 'ACC2_min', 'ACC3_min',
       'TEMP_min', 'EDA_min', 'BVP_min', 'HR_min', 'Magnitude_min', 'ACC1_max',
       'ACC2_max', 'ACC3_max', 'TEMP_max', 'EDA_max', 'BVP_max', 'HR_max',
       'Magnitude_max'],
      dtype='object')