## 1. Load dataset

Load the dataset. The dataset is composed of the subject information sheet as well as 16 different data collection sessions and 24 subjects. Each session is for a different activity type: ['dws', 'ups', 'std', 'sit', 'jog', 'wlk']. The device data is time-series data for each subject. Each avtivity session has its own folder containing the time series data for each subject.

In [15]:
import os
import numpy as np
import pandas as pd

subjects_data_file = 'dataset/data_subjects_info.csv'
device_data_dir = 'dataset/A_DeviceMotion_data/'

def get_all_dataset_paths(input_dir):
    input_files = []
    for dirs, subdirs, files in os.walk(input_dir):
        for file in files:
            if file.endswith('.csv'):
                input_files.append(os.path.join(dirs, file))
    return input_files

def load_dataset(paths, meta):
    df = pd.DataFrame()

    for p in paths:
        c_dir, c_file = p.split('/')[-2], p.split('/')[-1]
        c_cat, c_ses = c_dir.split('_')[-2], c_dir.split('_')[-1]
        c_sub = c_file.split('_')[-1].split('.')[-2]

        tdf = pd.read_csv(p, encoding='utf-8')

        tdf = tdf.assign(subject_id = int(c_sub))
        tdf = tdf.assign(session_id = int(c_ses))
        tdf = tdf.assign(category = str(c_cat))
        tdf = tdf.assign(age = int(meta.age[int(c_sub) - 1]))
        tdf = tdf.assign(gender = int(meta.gender[int(c_sub) - 1]))
        tdf = tdf.assign(height = int(meta.height[int(c_sub) - 1]))
        tdf = tdf.assign(weight = int(meta.weight[int(c_sub) - 1]))

        df = pd.concat([df, tdf])

    df.reset_index(drop=True, inplace=True)
    return df

subject_df = pd.DataFrame(pd.read_csv(subjects_data_file, encoding='utf-8'))
all_ds_paths = get_all_dataset_paths(device_data_dir)
data_frame = load_dataset(all_ds_paths, subject_df)

print('[INFO] Dataframe shape: ', data_frame.shape)

[INFO] Dataframe shape:  (1412865, 20)


## 2. Preprocessing

The Unnamed 0, weight, height, subject id, session id, age, gender columns are removed as they are not used in the analysis

In [16]:
df = data_frame.copy()

df.drop('Unnamed: 0', axis=1, inplace=True)
df.drop('subject_id', axis=1, inplace=True)
df.drop('session_id', axis=1, inplace=True)
df.drop('age', axis=1, inplace=True)
df.drop('gender', axis=1, inplace=True)
df.drop('height', axis=1, inplace=True)
df.drop('weight', axis=1, inplace=True)

## 2.1 Encoding the category

In [17]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
labels = le.fit(df['category'])
df['code'] = le.transform(df['category'])
df.drop('category', axis=1, inplace=True)

print('[INFO] Dataframe shape: ', df.shape)
print('[INFO] Dataframe columns: ', df.columns)
print('[INFO] Dataframe labels: ', labels.classes_)
print('[INFO] Dataframe head: ', df.head())

[INFO] Dataframe shape:  (1412865, 13)
[INFO] Dataframe columns:  Index(['attitude.roll', 'attitude.pitch', 'attitude.yaw', 'gravity.x',
       'gravity.y', 'gravity.z', 'rotationRate.x', 'rotationRate.y',
       'rotationRate.z', 'userAcceleration.x', 'userAcceleration.y',
       'userAcceleration.z', 'code'],
      dtype='object')
[INFO] Dataframe labels:  ['dws' 'jog' 'sit' 'std' 'ups' 'wlk']
[INFO] Dataframe head:     attitude.roll  attitude.pitch  attitude.yaw  gravity.x  gravity.y  \
0      -2.116381       -1.077507     -2.261502  -0.404768   0.880780   
1      -2.148154       -1.049759     -2.284278  -0.417081   0.867303   
2      -2.153824       -1.026749     -2.297008  -0.432082   0.855621   
3      -2.142509       -1.012749     -2.290595  -0.445311   0.848291   
4      -2.130486       -1.007262     -2.274149  -0.452661   0.845372   

   gravity.z  rotationRate.x  rotationRate.y  rotationRate.z  \
0   0.245713       -1.264215       -1.027909       -0.947909   
1   0.271686    

In [None]:
df.info()

# 2.2 sequencing

In [18]:
from scipy.stats import mode

WINDOW_SIZE = 150
STRIDE = 10
NUM_CLASSES = 6
NUM_FEATURES = 12
BATCH_SIZE = 100
EPOCHS_SIZE = 10

def sliding_window(x, y, length, stride):
    seq_x, seq_y = [], []
    data_len = len(x)

    for i in range(0, data_len - length + 1, stride):
        input_sec = x.iloc[i:i + length]
        target_sec = y.iloc[i:i + length]
        target = mode(target_sec)[0][0]
        seq_x.append(input_sec)
        seq_y.append(target)
    return np.array(seq_x), np.array(seq_y)

x_cols = df.iloc[:, 0:NUM_FEATURES]
y_cols = df.iloc[:, NUM_FEATURES:NUM_FEATURES + 1]

x_cols, y_cols = sliding_window(x_cols, y_cols, WINDOW_SIZE, STRIDE)
print("[INFO] x_cols shape: ", x_cols.shape)
print("[INFO] y_cols shape: ", y_cols.shape)

[INFO] x_cols shape:  (141272, 150, 12)
[INFO] y_cols shape:  (141272,)


# 2.3 Test train split

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x_cols, y_cols, test_size=0.2, stratify=y_cols)
print("[INFO] X_train shape: ", X_train.shape)
print("[INFO] X_test shape: ", X_test.shape)
print("[INFO] y_train shape: ", y_train.shape)
print("[INFO] y_test shape: ", y_test.shape)

[INFO] X_train shape:  (113017, 150, 12)
[INFO] X_test shape:  (28255, 150, 12)
[INFO] y_train shape:  (113017,)
[INFO] y_test shape:  (28255,)


# 2.4 One hot encoding

In [20]:
from tensorflow.keras.utils import to_categorical

y_train = to_categorical(y_train, num_classes=NUM_CLASSES)
y_test = to_categorical(y_test, num_classes=NUM_CLASSES)

# 3 Analisys

This is where the model comes in, either use LSTM based model or 2d CNNJk 

In [36]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

from keras.layers import LSTM, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization, Softmax
from keras.layers import Input
from keras.models import Sequential

model = Sequential()

# play with model structure here
# model.add(Input((WINDOW_SIZE, NUM_FEATURES)))
# model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
# model.add(MaxPooling1D(pool_size=2))
# model.add(Dropout(0.2))
# model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
# model.add(MaxPooling1D(pool_size=2))
# model.add(Dropout(0.2))
# model.add(Flatten())
# model.add(Dense(100, activation='relu'))
# model.add(Dense(NUM_CLASSES, activation='softmax'))

model = Sequential()
model.add(Input((WINDOW_SIZE, NUM_FEATURES)))
model.add(LSTM(6, input_shape=(NUM_FEATURES, NUM_FEATURES), return_sequences=True))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(NUM_CLASSES, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 150, 6)            456       
                                                                 
 flatten_1 (Flatten)         (None, 900)               0         
                                                                 
 dense_2 (Dense)             (None, 128)               115328    
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                                 
Total params: 116558 (455.30 KB)
Trainable params: 116558 (455.30 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Par

In [37]:
history = model.fit(X_train, y_train, epochs=EPOCHS_SIZE, batch_size=BATCH_SIZE, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [38]:
model.evaluate(X_test, y_test, batch_size=BATCH_SIZE)



[0.06194934621453285, 0.9847460389137268]

In [24]:
# print the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

cm = confusion_matrix(y_true, y_pred)
print(cm)

[[2532    2    2    0   57   42]
 [   4 2661    0    0    8   12]
 [   0    1 6775    0    0    0]
 [   0    0    0 6121    1    5]
 [   8    1    0    2 3099   36]
 [   1    0    0   12    8 6865]]


# 4.1 Stratified K-Fold Cross validation

In [25]:
# from sklearn.model_selection import StratifiedKFold

# NUM_SPLIT = 5

# skf = StratifiedKFold(n_splits=NUM_SPLIT, shuffle=True)
# lst_accu_stratified = []

# for train_index, test_index in skf.split(x_cols, y_cols):
#     X_train_fold, X_test_fold = x_cols[train_index], x_cols[test_index]
#     y_train_fold, y_test_fold = y_cols[train_index], y_cols[test_index]
    
#     # convert to one-hot encoding
#     y_train_fold = to_categorical(y_train_fold, num_classes=NUM_CLASSES)
#     y_test_fold = to_categorical(y_test_fold, num_classes=NUM_CLASSES)

#     model.fit(X_train_fold, y_train_fold, epochs=EPOCHS_SIZE, batch_size=BATCH_SIZE)
#     test_loss, accuracy = model.evaluate(X_test_fold, y_test_fold, batch_size=BATCH_SIZE)
#     lst_accu_stratified.append(accuracy)

# 4.2 Analyzing Model Cross validation results

In [26]:
print('List of possible accuracy:', lst_accu_stratified)
print('Maximum Accuracy: ', max(lst_accu_stratified))
print('Minimum Accuracy: ', min(lst_accu_stratified))
print('Overall Accuracy: ', np.mean(lst_accu_stratified))
print('Standard Deviation: ', np.std(lst_accu_stratified))

NameError: name 'lst_accu_stratified' is not defined

# 5.1 Personally recorded data test

In [30]:
import pandas as pd
ethan_df = pd.DataFrame(pd.read_csv('personal_test/DeviceMotion.csv', encoding='utf-8'))

ethan_df.drop('timestamp', axis=1, inplace=True)
ethan_df.drop('timeIntervalSince1970', axis=1, inplace=True)
ethan_df.drop('magneticField.x', axis=1, inplace=True)
ethan_df.drop('magneticField.y', axis=1, inplace=True)
ethan_df.drop('magneticField.z', axis=1, inplace=True)
ethan_df.drop('magneticField.accuracy', axis=1, inplace=True)

# add catagory column
ethan_df = ethan_df.assign(category = 'wlk')

# transform the catagory column
ethan_df['code'] = le.transform(ethan_df['category'])
ethan_df.drop('category', axis=1, inplace=True)

ethan_df.info()
ethan_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   attitude.roll       1465 non-null   float64
 1   attitude.pitch      1465 non-null   float64
 2   attitude.yaw        1465 non-null   float64
 3   gravity.x           1465 non-null   float64
 4   gravity.y           1465 non-null   float64
 5   gravity.z           1465 non-null   float64
 6   rotationRate.x      1465 non-null   float64
 7   rotationRate.y      1465 non-null   float64
 8   rotationRate.z      1465 non-null   float64
 9   userAcceleration.x  1465 non-null   float64
 10  userAcceleration.y  1465 non-null   float64
 11  userAcceleration.z  1465 non-null   float64
 12  code                1465 non-null   int64  
dtypes: float64(12), int64(1)
memory usage: 148.9 KB


Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,code
0,0.392283,-0.218175,0.043516,0.373236,0.216448,-0.902133,0.051881,-0.043821,-0.068015,-0.014548,-0.020815,-0.057309,5
1,0.39063,-0.218001,0.042478,0.371759,0.216278,-0.902784,-0.002829,-0.08068,-0.022565,0.029868,0.014328,0.053449,5
2,0.388539,-0.218712,0.042582,0.369812,0.216973,-0.903417,-0.07382,-0.124515,0.006733,0.021897,-0.002709,0.022191,5
3,0.385479,-0.220498,0.043769,0.3669,0.218716,-0.904184,-0.130309,-0.214334,0.027788,0.019453,-0.010891,0.019296,5
4,0.381166,-0.222687,0.045496,0.362818,0.220851,-0.905311,-0.115988,-0.210759,0.056921,-0.013163,-0.030559,-0.013466,5


In [39]:
ethan_x_cols = ethan_df.iloc[:, 0:NUM_FEATURES]
ethan_y_cols = ethan_df.iloc[:, NUM_FEATURES:NUM_FEATURES + 1]

ethan_x_cols, ethan_y_cols = sliding_window(ethan_x_cols, ethan_y_cols, WINDOW_SIZE, STRIDE)

ethan_y_cols = to_categorical(ethan_y_cols, num_classes=NUM_CLASSES)
print("[INFO] ethan_x_cols shape: ", ethan_x_cols.shape)

print(model.predict(ethan_x_cols))

model.evaluate(ethan_x_cols, ethan_y_cols)

[INFO] ethan_x_cols shape:  (132, 150, 12)
[[6.39754535e-06 6.69893196e-10 4.82887230e-17 9.75983627e-10
  5.48457913e-03 9.94509041e-01]
 [9.63270122e-06 1.42112558e-10 8.76480002e-24 2.63527102e-13
  4.63109789e-03 9.95359242e-01]
 [5.74459079e-07 8.27727036e-12 1.34319435e-24 4.36118460e-15
  2.47489545e-04 9.99751866e-01]
 [2.25956406e-04 1.41103578e-11 8.47727185e-25 1.09075745e-14
  1.17704202e-03 9.98596966e-01]
 [4.25586011e-04 1.77239500e-07 4.52373519e-25 6.07650044e-15
  1.56438164e-02 9.83930409e-01]
 [1.73048710e-03 3.73892517e-09 5.41776604e-21 1.84506396e-10
  1.05764550e-04 9.98163760e-01]
 [9.39831734e-05 2.37710823e-10 6.21923097e-25 1.32626245e-12
  1.98723637e-06 9.99904037e-01]
 [4.83784243e-04 4.97355224e-09 6.84131972e-34 2.00621389e-21
  3.00907686e-06 9.99513268e-01]
 [1.35747669e-02 2.99643726e-10 3.43455894e-26 2.26723683e-14
  7.94908119e-05 9.86345708e-01]
 [3.65060955e-01 1.29999425e-08 3.98506653e-29 4.25467148e-15
  8.08940036e-04 6.34130120e-01]
 [9.943

[6.982105731964111, 0.31060606241226196]