# Model Template - STEP Data
This notebook outlines the data prep process for inputting the STEP data into a neural network. This code is essentially replicated in our full walkthrough notebooks for an end-to-end deep learning model.

### Import libraries and data

In [2]:
import pandas as pd
import numpy as np
import more_itertools
import random

In [3]:
df = pd.read_csv('/Users/N1/Data-2020/10_code/csv/no_aw_no_28.csv') #read in the csv file
df

Unnamed: 0,ACC1,ACC2,ACC3,TEMP,EDA,BVP,HR,Magnitude,Activity,Subject_ID,Round
0,41.000000,27.200000,40.000000,32.39,0.275354,15.25,78.9800,63.410094,Baseline,19-001,1
1,41.000000,27.300000,40.000000,32.39,0.276634,-12.75,78.8350,63.453054,Baseline,19-001,1
2,41.000000,27.400000,40.000000,32.39,0.270231,-42.99,78.6900,63.496142,Baseline,19-001,1
3,41.000000,27.500000,40.000000,32.39,0.270231,18.39,78.5450,63.539358,Baseline,19-001,1
4,41.000000,27.600000,40.000000,32.34,0.268950,13.61,78.4000,63.582702,Baseline,19-001,1
...,...,...,...,...,...,...,...,...,...,...,...
279835,21.176471,-11.176471,64.823529,32.09,0.708502,0.85,92.8275,69.104605,Type,19-056,1
279836,24.235294,-12.235294,62.764706,32.09,0.694414,-1.00,92.8800,68.384649,Type,19-056,1
279837,27.294118,-13.294118,60.705882,32.09,0.672642,5.22,92.9400,67.874197,Type,19-056,1
279838,30.352941,-14.352941,58.647059,32.09,0.664957,-1.47,93.0000,67.577995,Type,19-056,1


### Label Encode

In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Subject_ID'] = le.fit_transform(df['Subject_ID'])

In [5]:
le1 = LabelEncoder()
df['Activity'] = le1.fit_transform(df['Activity'])

activity_name_mapping = dict(zip(le1.classes_, le1.transform(le1.classes_)))
print(activity_name_mapping)

{'Activity': 0, 'Baseline': 1, 'DB': 2, 'Type': 3}


### Sliding Window

In [7]:
from window_slider import Slider

def make_windows(df, bucket_size, overlap_count):
    window_list = []
    final = pd.DataFrame()
    activity_list = list(df['Activity'].unique()) #list of the four activities
    sub_id_list = list(df['Subject_ID'].unique()) #list of the subject ids
    round_list = list(df['Round'].unique())
    df_list = []


    for i in sub_id_list:
        df_subject = df[df['Subject_ID'] == i] #isolate a single subject id
        for j in activity_list:
            df_subject_activity = df_subject[df_subject['Activity'] == j] #isolate by activity
            for k in round_list:
                df_subject_activity_round = df_subject_activity[df_subject_activity['Round'] == k]
                final_df = pd.DataFrame()
                if df_subject_activity_round.empty:
                      pass
                else:
                    df_flat = df_subject_activity_round[['ACC1', 'ACC2','ACC3','TEMP','EDA','BVP','HR','Magnitude', 'Subject_ID']].T.values #array of arrays, each row is every single reading in an array for a sensor in that isolation 

                    slider = Slider(bucket_size,overlap_count)
                    slider.fit(df_flat)
                    while True:
                        window_data = slider.slide()

                        if slider.reached_end_of_list(): break
                        window_list.append(list(window_data))
                    final_df = final.append(window_list)
                    final_df.columns = [['ACC1', 'ACC2','ACC3','TEMP','EDA','BVP','HR','Magnitude', 'SID']]
                    final_df.insert(9, "Subject_ID", [i]*len(final_df), True)
                    final_df.insert(10, "Activity", [j]*len(final_df), True)
                    final_df.insert(11, "Round", [k]*len(final_df), True)
                    df_list.append(final_df)
                    window_list = []

    final = pd.DataFrame(columns = df_list[0].columns)

    for l in df_list:
        final = final.append(l)


    final
    final.columns = final.columns.map(''.join)
    return final


#### Please edit the cell below to change windows

In [8]:
window_size = 80
window_overlap = 40

In [9]:
windowed = make_windows(df, window_size, window_overlap)

In [10]:
windowed.head(1)

Unnamed: 0,ACC1,ACC2,ACC3,TEMP,EDA,BVP,HR,Magnitude,SID,Subject_ID,Activity,Round
0,"[41.0, 41.0, 41.0, 41.0, 41.0, 41.0, 41.0, 41....","[27.2, 27.3, 27.4, 27.5, 27.6, 27.7, 27.8, 27....","[40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40....","[32.39, 32.39, 32.39, 32.39, 32.34, 32.34, 32....","[0.275354, 0.276634, 0.270231, 0.270231, 0.268...","[15.25, -12.75, -42.99, 18.39, 13.61, -9.66, -...","[78.98, 78.83500000000002, 78.69, 78.545, 78.4...","[63.410093833710725, 63.453053512025726, 63.49...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,1,1


In [11]:
windowed.shape

(6572, 12)

### Split Train Test

In [12]:
ID_list = list(windowed['Subject_ID'].unique())
random.shuffle(ID_list)
train = pd.DataFrame()
test = pd.DataFrame()

#change size of train/test split
train = windowed[windowed['Subject_ID'].isin(ID_list[:45])]
test = windowed[windowed['Subject_ID'].isin(ID_list[45:])]

print(train.shape, test.shape)

(5394, 12) (1178, 12)


In [13]:
X_train = train[['TEMP', 'EDA', 'HR', 'BVP', 'Magnitude', 'ACC1', 'ACC2', 'ACC3', 'SID']]
X_train = X_train.apply(pd.Series.explode).reset_index().drop(['index'], axis = 1)

X_test = test[['TEMP', 'EDA', 'HR', 'BVP', 'Magnitude', 'ACC1', 'ACC2', 'ACC3', 'SID']]
X_test = X_test.apply(pd.Series.explode).reset_index().drop(['index'], axis = 1)

print(X_train.shape, X_test.shape, X_train.shape[0] + X_test.shape[0])

(431520, 9) (94240, 9) 525760


In [14]:
y_train = train['Activity'].values
y_test = test['Activity'].values
print(y_train.shape, y_test.shape, y_train.shape[0] + y_test.shape[0])

(5394,) (1178,) 6572


### One-Hot Encoding Subject_ID

In [15]:
X_train['train'] =1
X_test['train'] = 0

In [19]:
combined = pd.concat([X_train, X_test])

In [20]:
combined = pd.concat([combined, pd.get_dummies(combined['SID'])], axis =1)

In [21]:
X_train = combined[combined['train'] == 1]
X_test = combined[combined['train'] == 0]

X_train.drop(["train", "SID"], axis = 1, inplace = True)
X_test.drop(["train", "SID"], axis = 1, inplace = True)
print(X_train.shape, X_test.shape, X_train.shape[0] + X_test.shape[0])

(431520, 63) (94240, 63) 525760


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [188]:
X_train.head(1)

Unnamed: 0,TEMP,EDA,HR,BVP,Magnitude,ACC1,ACC2,ACC3,0.0,1.0,...,45.0,46.0,47.0,48.0,49.0,50.0,51.0,52.0,53.0,54.0
0,32.39,0.275354,78.98,15.25,63.4101,41,27.2,40,1,0,...,0,0,0,0,0,0,0,0,0,0


### Normalize data

In [189]:
from sklearn.preprocessing import StandardScaler

In [190]:
ss = StandardScaler()

X_train_SID = X_train.iloc[:, 8:]
X_test_SID = X_test.iloc[:, 8:]

X_train_cont = pd.DataFrame(ss.fit_transform(X_train.iloc[:,0:8]))
X_test_cont = pd.DataFrame(ss.transform(X_test.iloc[:,0:8]))

In [191]:
X_train = pd.concat([X_train_cont, X_train_SID], axis = 1)
X_test = pd.concat([X_test_cont, X_test_SID], axis = 1)

In [192]:
X_train

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,0.0.1,1.0.1,...,45.0,46.0,47.0,48.0,49.0,50.0,51.0,52.0,53.0,54.0
0,-0.643040,-0.600909,-0.492016,0.212738,-0.347034,0.884116,0.472796,0.568600,1,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.643040,-0.600662,-0.501243,-0.177845,-0.343532,0.884116,0.475263,0.568600,1,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.643040,-0.601900,-0.510471,-0.599675,-0.340020,0.884116,0.477730,0.568600,1,0,...,0,0,0,0,0,0,0,0,0,0
3,-0.643040,-0.601900,-0.519698,0.256539,-0.336497,0.884116,0.480198,0.568600,1,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.683076,-0.602148,-0.528925,0.189861,-0.332964,0.884116,0.482665,0.568600,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431515,-0.883257,-0.517127,0.389191,0.011867,0.117124,0.398418,-0.474051,1.493869,0,0,...,0,0,0,0,0,0,0,0,0,1
431516,-0.883257,-0.519852,0.392532,-0.013940,0.058441,0.473363,-0.500175,1.417129,0,0,...,0,0,0,0,0,0,0,0,0,1
431517,-0.883257,-0.524063,0.396351,0.072825,0.016834,0.548307,-0.526299,1.340389,0,0,...,0,0,0,0,0,0,0,0,0,1
431518,-0.883257,-0.525550,0.400169,-0.020496,-0.007309,0.623251,-0.552423,1.263648,0,0,...,0,0,0,0,0,0,0,0,0,1


### Reshaping windows as arrays

In [193]:
# Convert to transposed arrays
X_test = X_test.T.values
X_train = X_train.T.values

In [194]:
X_test = X_test.astype('float64')
X_train = X_train.astype('float64')

# Reshape to -1, window_size, # features
X_train = X_train.reshape((-1, window_size, X_train.shape[0]))
X_test = X_test.reshape((-1, window_size, X_test.shape[0])) 

print(X_train.shape,  y_train.shape, X_test.shape, y_test.shape)

(5394, 80, 63) (5394,) (1178, 80, 63) (1178,)


#### If using TensorFlow, run this cell to convert y_train and y_test to One-Hot

In [195]:
from keras.utils import np_utils
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)