## Feature Engineering

This notebook outlines the process for engineering features of our data that will be used in the Random Forest models.

We have chosen to engineer five features:
1. Window mean
2. Window standard deviation
3. Window skew
4. Window minimum
5. Window maximum

**INPUT: CSV output of 31_outlier_removal** (plain_data.csv)

**OUTPUT: CSV file containing all engineered features** (engineered_features.csv)

### Import libraries

In [210]:
import pandas as pd
import numpy as np
import wearablevar
import datetime
from scipy import stats

### Import data

In [211]:
# Load the outlier-removed readings (plain_data.csv) named as this notebook's input.
df = pd.read_csv('plain_data.csv')

In [212]:
df

Unnamed: 0,ACC1,ACC2,ACC3,TEMP,EDA,BVP,HR,Magnitude,Activity,Subject_ID,Round
0,41.000000,27.200000,40.000000,32.39,0.275354,15.25,78.9800,63.410094,Baseline,19-001,1
1,41.000000,27.300000,40.000000,32.39,0.276634,-12.75,78.8350,63.453054,Baseline,19-001,1
2,41.000000,27.400000,40.000000,32.39,0.270231,-42.99,78.6900,63.496142,Baseline,19-001,1
3,41.000000,27.500000,40.000000,32.39,0.270231,18.39,78.5450,63.539358,Baseline,19-001,1
4,41.000000,27.600000,40.000000,32.34,0.268950,13.61,78.4000,63.582702,Baseline,19-001,1
...,...,...,...,...,...,...,...,...,...,...,...
279835,21.176471,-11.176471,64.823529,32.09,0.708502,0.85,92.8275,69.104605,Type,19-056,1
279836,24.235294,-12.235294,62.764706,32.09,0.694414,-1.00,92.8800,68.384649,Type,19-056,1
279837,27.294118,-13.294118,60.705882,32.09,0.672642,5.22,92.9400,67.874197,Type,19-056,1
279838,30.352941,-14.352941,58.647059,32.09,0.664957,-1.47,93.0000,67.577995,Type,19-056,1


### Create sliding windows

In [9]:
from window_slider import Slider

def make_windows(df, bucket_size, overlap_count):
    """Cut the sensor readings into sliding windows, one group at a time.

    Rows of ``df`` are isolated by subject, then activity, then round (each in
    order of first appearance in ``df``). For every non-empty group a
    ``Slider`` is run over the readings, and each window it yields becomes one
    output row whose sensor cells hold the array of readings in that window.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ACC1, ACC2, ACC3, TEMP, EDA, BVP, HR,
        Magnitude, Activity, Subject_ID and Round.
    bucket_size
        First argument handed to ``Slider`` (its bucket size).
    overlap_count
        Second argument handed to ``Slider`` (its overlap count).

    Returns
    -------
    pandas.DataFrame
        Columns ACC1 ... Magnitude and SID (per-window arrays; SID is the
        windowed Subject_ID column), followed by the scalar Subject_ID,
        Activity and Round of the group. Row labels restart at 0 for every
        group, so the index is not unique. If no group produces a window, an
        empty frame with these columns is returned.
    """
    source_cols = ['ACC1', 'ACC2', 'ACC3', 'TEMP', 'EDA', 'BVP', 'HR', 'Magnitude', 'Subject_ID']
    window_cols = ['ACC1', 'ACC2', 'ACC3', 'TEMP', 'EDA', 'BVP', 'HR', 'Magnitude', 'SID']
    output_cols = window_cols + ['Subject_ID', 'Activity', 'Round']

    subject_ids = list(df['Subject_ID'].unique())
    activities = list(df['Activity'].unique())
    rounds = list(df['Round'].unique())

    group_frames = []
    for subject in subject_ids:
        df_subject = df[df['Subject_ID'] == subject]
        for activity in activities:
            df_subject_activity = df_subject[df_subject['Activity'] == activity]
            for round_id in rounds:
                group = df_subject_activity[df_subject_activity['Round'] == round_id]
                if group.empty:
                    continue

                # One row per source column, each row holding every reading of the group.
                readings = group[source_cols].T.values

                slider = Slider(bucket_size, overlap_count)
                slider.fit(readings)
                windows = []
                while True:
                    window_data = slider.slide()
                    # The end-of-list check runs before the append, so the window
                    # returned by the slide() call that reaches the end is not kept.
                    # NOTE(review): confirm that discarding it is intended.
                    if slider.reached_end_of_list():
                        break
                    windows.append(list(window_data))

                if not windows:
                    # Nothing was kept for this group; skip it instead of failing
                    # on a column-count mismatch.
                    continue

                group_windows = pd.DataFrame(windows, columns=window_cols)
                group_windows['Subject_ID'] = subject
                group_windows['Activity'] = activity
                group_windows['Round'] = round_id
                group_frames.append(group_windows)

    if not group_frames:
        return pd.DataFrame(columns=output_cols)

    # A single concat replaces the removed DataFrame.append and keeps each
    # group's own 0..n-1 row labels.
    return pd.concat(group_frames)

In [213]:
# Window the readings with bucket_size=80 and overlap_count=40
# (presumably 80-sample windows overlapping by 40 samples — confirm against window_slider).
# NOTE: this rebinds `df` from the raw table to the windowed table, so re-running
# this cell without re-reading the CSV passes the already-windowed frame back in.
df = make_windows(df, 80, 40)

### Create engineered dataframes

Protocol:
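1. Copy the windowed dataframe (df) into df1
2. Apply the feature engineering function to each sensor column of df1
3. Copy df1 into a feature-specific dataframe (e.g. dfmean)
4. Rename the columns of that copy to reflect the new feature (e.g. ACC1_mean)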

#### Mean

In [214]:
df1 = df.copy()

In [216]:
# Replace each window (array of readings) in every sensor column with its mean.
sensor_columns = ['ACC1', 'ACC2', 'ACC3', 'TEMP', 'EDA', 'BVP', 'HR', 'Magnitude']
for sensor in sensor_columns:
    window_means = df1[sensor].map(lambda window: np.mean(window))
    df1[sensor] = window_means

In [218]:
dfmean = df1.copy()

In [220]:
# Suffix every column name with "_mean"; this also renames the identifier
# columns (SID, Subject_ID, Activity, Round), not only the sensor columns.
# Re-running this cell on its own appends the suffix a second time.
dfmean.columns = dfmean.columns+"_mean"

In [221]:
dfmean

Unnamed: 0,ACC1_mean,ACC2_mean,ACC3_mean,TEMP_mean,EDA_mean,BVP_mean,HR_mean,Magnitude_mean,SID_mean,Subject_ID_mean,Activity_mean,Round_mean
0,40.248370,28.012880,38.824457,32.3500,0.262354,-0.109875,73.931187,62.553853,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
1,40.820000,26.815000,38.192500,32.3390,0.261058,0.321375,69.481750,62.021872,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
2,43.252235,25.312684,37.488043,32.3370,0.259585,0.684000,64.893188,62.621785,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
3,44.905798,24.915984,37.638218,32.3560,0.254510,-0.180875,61.157687,63.734171,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
4,43.577055,22.974382,38.971144,32.3890,0.252733,-0.209750,59.226438,62.913435,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
...,...,...,...,...,...,...,...,...,...,...,...,...
0,25.927500,-12.236250,46.733750,32.1945,0.443490,2.167500,97.022312,59.921697,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1
1,13.338895,-5.276078,65.181670,32.1705,0.541785,-0.566125,95.047437,66.976772,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1
2,13.452000,-4.651000,56.675250,32.1590,0.576188,-0.373250,93.412750,58.688932,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1
3,14.930137,-8.046923,47.678158,32.1435,0.593157,-0.307875,92.144750,50.678627,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1


#### Standard deviation

In [222]:
df1 = df.copy()

In [223]:
# Replace each window (array of readings) in every sensor column with its
# standard deviation (np.std with its default arguments).
sensor_columns = ['ACC1', 'ACC2', 'ACC3', 'TEMP', 'EDA', 'BVP', 'HR', 'Magnitude']
for sensor in sensor_columns:
    window_stds = df1[sensor].map(lambda window: np.std(window))
    df1[sensor] = window_stds

In [224]:
dfstd = df1.copy()

In [226]:
# Suffix every column name with "_std"; this also renames the identifier
# columns (SID, Subject_ID, Activity, Round), not only the sensor columns.
# Re-running this cell on its own appends the suffix a second time.
dfstd.columns = dfstd.columns + '_std'

In [227]:
dfstd

Unnamed: 0,ACC1_std,ACC2_std,ACC3_std,TEMP_std,EDA_std,BVP_std,HR_std,Magnitude_std,SID_std,Subject_ID_std,Activity_std,Round_std
0,0.701573,0.687590,0.632616,0.017607,0.004877,18.439453,2.574676,0.609756,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
1,1.192214,1.149559,0.529382,0.012610,0.003007,20.104717,2.608254,0.542348,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
2,2.109896,0.815025,0.647914,0.010536,0.004337,23.756276,2.639037,0.942868,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
3,1.832017,1.509593,1.773398,0.025377,0.002396,25.635645,1.674001,0.841361,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
4,2.115371,2.585687,1.809092,0.027000,0.002055,25.593597,0.684349,1.365652,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
...,...,...,...,...,...,...,...,...,...,...,...,...
0,17.017661,5.972112,17.824375,0.020609,0.099720,21.499873,1.280447,7.640022,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1
1,2.920879,4.369355,5.616240,0.019615,0.064759,12.241906,0.998228,5.262136,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1
2,1.503319,3.232566,12.371872,0.015780,0.054757,3.497406,0.921024,11.664220,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1
3,1.350929,1.577617,5.591732,0.019046,0.061583,2.692347,0.656914,5.303471,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1


#### Skew

In [228]:
df1 = df.copy()

In [229]:
# Replace each window (array of readings) in every sensor column with its
# skewness as computed by scipy.stats.skew.
sensor_columns = ['ACC1', 'ACC2', 'ACC3', 'TEMP', 'EDA', 'BVP', 'HR', 'Magnitude']
for sensor in sensor_columns:
    window_skews = df1[sensor].map(lambda window: stats.skew(window))
    df1[sensor] = window_skews

  lambda m2, m3: m3 / m2**1.5,


In [230]:
dfskew = df1.copy()

In [231]:
# Suffix every column name with "_skew"; this also renames the identifier
# columns (SID, Subject_ID, Activity, Round), not only the sensor columns.
# Re-running this cell on its own appends the suffix a second time.
dfskew.columns = dfskew.columns+"_skew"

In [232]:
dfskew

Unnamed: 0,ACC1_skew,ACC2_skew,ACC3_skew,TEMP_skew,EDA_skew,BVP_skew,HR_skew,Magnitude_skew,SID_skew,Subject_ID_skew,Activity_skew,Round_skew
0,-0.082592,-0.558848,0.705668,0.714533,0.896382,-0.392823,0.296262,0.531557,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
1,0.515544,0.109446,-0.188071,0.787066,0.212943,-0.322900,-0.170923,-0.438037,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
2,-0.473020,0.227966,1.329886,0.620801,0.072564,-0.274279,0.185657,-0.382833,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
3,-4.941414,-1.040349,3.435987,0.672586,0.734482,-0.828441,0.406263,-0.532117,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
4,-1.857224,0.511935,1.380509,-0.686481,1.239716,-0.833856,0.996232,0.408229,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.968658,-0.961744,-0.392065,-0.339878,0.191488,-0.633952,-0.305278,-0.077458,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1
1,0.554282,0.458399,-1.035384,0.963675,-0.205927,-3.052303,0.631581,-0.993077,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1
2,0.199743,1.355432,-0.134220,1.539263,0.642033,1.462432,-0.528450,-0.104840,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1
3,-0.360138,0.274186,0.199360,0.144487,0.549711,0.214730,1.023359,0.092249,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1


#### Minimum

In [233]:
df1 = df.copy()

In [234]:
# Replace each window (array of readings) in every sensor column with its minimum.
sensor_columns = ['ACC1', 'ACC2', 'ACC3', 'TEMP', 'EDA', 'BVP', 'HR', 'Magnitude']
for sensor in sensor_columns:
    window_minima = df1[sensor].map(lambda window: np.min(window))
    df1[sensor] = window_minima

In [235]:
dfmin = df1.copy()

In [236]:
# Suffix every column name with "_min"; this also renames the identifier
# columns (SID, Subject_ID, Activity, Round), not only the sensor columns.
# Re-running this cell on its own appends the suffix a second time.
dfmin.columns = dfmin.columns+"_min"

In [237]:
dfmin

Unnamed: 0,ACC1_min,ACC2_min,ACC3_min,TEMP_min,EDA_min,BVP_min,HR_min,Magnitude_min,SID_min,Subject_ID_min,Activity_min,Round_min
0,39.000000,26.456522,38.000000,32.33,0.254862,-42.99,69.7650,61.692787,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
1,39.000000,24.600000,37.200000,32.31,0.254862,-48.52,64.8025,60.778286,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
2,39.000000,24.000000,37.000000,32.31,0.252301,-48.52,60.9950,60.778286,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
3,32.000000,20.985816,37.000000,32.33,0.251020,-101.74,58.8025,61.392182,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
4,32.000000,20.702128,37.000000,32.33,0.249739,-101.74,58.5300,61.392182,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
...,...,...,...,...,...,...,...,...,...,...,...,...
0,8.000000,-41.000000,8.000000,32.16,0.279462,-80.79,94.6775,48.763216,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1
1,8.000000,-11.965517,49.965517,32.15,0.363990,-80.79,93.7275,53.343234,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1
2,11.000000,-8.720000,39.000000,32.13,0.488219,-9.37,91.7125,41.868843,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1
3,12.263158,-10.000000,39.000000,32.11,0.508710,-6.35,91.5000,41.868843,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1


#### Maximum

In [238]:
df1 = df.copy()

In [239]:
# Replace each window (array of readings) in every sensor column with its maximum.
sensor_columns = ['ACC1', 'ACC2', 'ACC3', 'TEMP', 'EDA', 'BVP', 'HR', 'Magnitude']
for sensor in sensor_columns:
    window_maxima = df1[sensor].map(lambda window: np.max(window))
    df1[sensor] = window_maxima

In [240]:
dfmax = df1.copy()

In [172]:
# Suffix every column name with "_max"; this also renames the identifier
# columns (SID, Subject_ID, Activity, Round), not only the sensor columns.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours and the recorded dfmax display still shows unsuffixed names, which
# suggests it was not run in the recorded session. Run it before the
# concatenation step, otherwise the max columns keep the raw sensor names.
dfmax.columns = dfmax.columns+"_max"

In [241]:
dfmax

Unnamed: 0,ACC1,ACC2,ACC3,TEMP,EDA,BVP,HR,Magnitude,SID,Subject_ID,Activity,Round
0,41.543478,29.0,40.0,32.39,0.276634,34.83,78.98,63.757353,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
1,43.800000,29.0,39.0,32.37,0.266389,37.72,73.52,62.936476,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
2,45.532258,27.0,39.0,32.37,0.266389,47.14,69.63,64.010791,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
3,46.000000,27.0,48.0,32.41,0.262546,47.14,64.68,65.711491,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
4,46.000000,27.0,48.0,32.43,0.258704,39.00,60.92,65.711491,"[19-001, 19-001, 19-001, 19-001, 19-001, 19-00...",19-001,Baseline,1
...,...,...,...,...,...,...,...,...,...,...,...,...
0,66.000000,1.4,69.0,32.23,0.643185,50.55,98.63,70.835302,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1
1,20.000000,4.0,71.0,32.21,0.670080,42.24,97.15,72.097157,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1
2,16.533333,4.0,71.0,32.21,0.704660,16.75,94.62,72.097157,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1
3,17.000000,-6.0,57.8,32.18,0.714905,6.38,93.68,60.278388,"[19-056, 19-056, 19-056, 19-056, 19-056, 19-05...",19-056,Type,1


### Concatenate engineered dataframes

In [242]:
# Collect the five engineered frames in feature order.
# NOTE(review): df_list does not appear to be referenced by the cells that follow — confirm it is still needed.
df_list = [dfmean, dfstd, dfskew, dfmin, dfmax]

In [243]:
# Start the output table from the windowed frame minus the per-window
# Subject_ID array column (SID).
# NOTE(review): the per-window arrays in ACC1 ... Magnitude are kept here and so
# end up in the exported table next to the features — confirm this is intended.
df1 = df.drop(['SID'], axis = 1)

In [244]:
# Append the first eight (sensor) columns of each engineered frame to the right
# of df1, in the order mean, std, skew, min, max.
feature_blocks = [frame.iloc[:, :8] for frame in (dfmean, dfstd, dfskew, dfmin, dfmax)]
df1 = pd.concat([df1] + feature_blocks, axis = 1)

In [250]:
# Export the combined table without the row index. The path previously began
# with a stray double-quote character, which pointed at a directory literally
# named '"..' instead of the parent directory.
df1.to_csv('../../40_usable_data_for_models/41_Duke_Data/engineered_features.csv', index = False)