## 1. Load dataset

Load the dataset. The dataset is composed of the subject information sheet as well as 16 different data collection sessions and 24 subjects. Each session is for a different activity type: ['dws', 'ups', 'std', 'sit', 'jog', 'wlk']. The device data is time-series data for each subject. Each avtivity session has its own folder containing the time series data for each subject.

In [1]:
import os
import numpy as np
import pandas as pd

subjects_data_file = 'dataset/data_subjects_info.csv'
device_data_dir = 'dataset/A_DeviceMotion_data/'

def get_all_dataset_paths(input_dir):
    input_files = []
    for dirs, subdirs, files in os.walk(input_dir):
        for file in files:
            if file.endswith('.csv'):
                input_files.append(os.path.join(dirs, file))
    return input_files

def load_dataset(paths, meta):
    df = pd.DataFrame()

    for p in paths:
        c_dir, c_file = p.split('/')[-2], p.split('/')[-1]
        c_cat, c_ses = c_dir.split('_')[-2], c_dir.split('_')[-1]
        c_sub = c_file.split('_')[-1].split('.')[-2]

        tdf = pd.read_csv(p, encoding='utf-8')

        tdf = tdf.assign(subject_id = int(c_sub))
        tdf = tdf.assign(session_id = int(c_ses))
        tdf = tdf.assign(category = str(c_cat))
        tdf = tdf.assign(age = int(meta.age[int(c_sub) - 1]))
        tdf = tdf.assign(gender = int(meta.gender[int(c_sub) - 1]))
        tdf = tdf.assign(height = int(meta.height[int(c_sub) - 1]))
        tdf = tdf.assign(weight = int(meta.weight[int(c_sub) - 1]))

        df = pd.concat([df, tdf])

    df.reset_index(drop=True, inplace=True)
    return df

subject_df = pd.DataFrame(pd.read_csv(subjects_data_file, encoding='utf-8'))
all_ds_paths = get_all_dataset_paths(device_data_dir)
data_frame = load_dataset(all_ds_paths, subject_df)

print('[INFO] Dataframe shape: ', data_frame.shape)

[INFO] Dataframe shape:  (1412865, 20)


## 2. Preprocessing

The Unnamed 0, weight, height, subject id, session id, age, gender columns are removed as they are not used in the analysis

In [2]:
df = data_frame.copy()

df.drop('Unnamed: 0', axis=1, inplace=True)
df.drop('subject_id', axis=1, inplace=True)
df.drop('session_id', axis=1, inplace=True)
df.drop('age', axis=1, inplace=True)
df.drop('gender', axis=1, inplace=True)
df.drop('height', axis=1, inplace=True)
df.drop('weight', axis=1, inplace=True)

## 2.1 Encoding the category

In [3]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
labels = le.fit(df['category'])
df['code'] = le.transform(df['category'])
df.drop('category', axis=1, inplace=True)

print('[INFO] Dataframe shape: ', df.shape)
print('[INFO] Dataframe columns: ', df.columns)
print('[INFO] Dataframe labels: ', labels.classes_)
print('[INFO] Dataframe head: ', df.head())

[INFO] Dataframe shape:  (1412865, 13)
[INFO] Dataframe columns:  Index(['attitude.roll', 'attitude.pitch', 'attitude.yaw', 'gravity.x',
       'gravity.y', 'gravity.z', 'rotationRate.x', 'rotationRate.y',
       'rotationRate.z', 'userAcceleration.x', 'userAcceleration.y',
       'userAcceleration.z', 'code'],
      dtype='object')
[INFO] Dataframe labels:  ['dws' 'jog' 'sit' 'std' 'ups' 'wlk']
[INFO] Dataframe head:     attitude.roll  attitude.pitch  attitude.yaw  gravity.x  gravity.y  \
0      -2.116381       -1.077507     -2.261502  -0.404768   0.880780   
1      -2.148154       -1.049759     -2.284278  -0.417081   0.867303   
2      -2.153824       -1.026749     -2.297008  -0.432082   0.855621   
3      -2.142509       -1.012749     -2.290595  -0.445311   0.848291   
4      -2.130486       -1.007262     -2.274149  -0.452661   0.845372   

   gravity.z  rotationRate.x  rotationRate.y  rotationRate.z  \
0   0.245713       -1.264215       -1.027909       -0.947909   
1   0.271686    

# 2.2 Test train split

In [4]:
from sklearn.model_selection import train_test_split

x_cols = df.iloc[:, 0:12]
y_cols = df.iloc[:, 12:13]

# Do not shuffle as the data is time series
X_train, X_test, y_train, y_test = train_test_split(x_cols, y_cols, test_size=0.2, shuffle=False)
print("[INFO] X_train shape: ", X_train.shape)
print("[INFO] X_test shape: ", X_test.shape)
print("[INFO] y_train shape: ", y_train.shape)
print("[INFO] y_test shape: ", y_test.shape)

[INFO] X_train shape:  (1130292, 12)
[INFO] X_test shape:  (282573, 12)
[INFO] y_train shape:  (1130292, 1)
[INFO] y_test shape:  (282573, 1)


# 2.3 sequencing

In [5]:
from scipy.stats import mode

WINDOW_SIZE = 150
STRIDE = 10
NUM_CLASSES = 6
NUM_FEATURES = 12
BATCH_SIZE = 100
EPOCHS_SIZE = 10

def sliding_window(x, y, length, stride):
    seq_x, seq_y = [], []
    data_len = len(x)

    for i in range(0, data_len - length + 1, stride):
        input_sec = x.iloc[i:i + length]
        target_sec = y.iloc[i:i + length]
        target = mode(target_sec)[0][0]
        seq_x.append(input_sec)
        seq_y.append(target)
    return np.array(seq_x), np.array(seq_y)

tx, ty = sliding_window(X_train, y_train, WINDOW_SIZE, STRIDE)
vx, vy = sliding_window(X_test, y_test, WINDOW_SIZE, STRIDE)
print("[INFO] tx shape: ", tx.shape)
print("[INFO] ty shape: ", ty.shape)
print("[INFO] vx shape: ", vx.shape)
print("[INFO] vy shape: ", vy.shape)

[INFO] tx shape:  (113015, 150, 12)
[INFO] ty shape:  (113015,)
[INFO] vx shape:  (28243, 150, 12)
[INFO] vy shape:  (28243,)


# 2.4 One hot encoding

In [6]:
from tensorflow.keras.utils import to_categorical

tty = to_categorical(ty, num_classes=NUM_CLASSES)
vvy = to_categorical(vy, num_classes=NUM_CLASSES)

print("[INFO] tty shape: ", tty.shape)
print("[INFO] vvy shape: ", vvy.shape)

[INFO] tty shape:  (113015, 6)
[INFO] vvy shape:  (28243, 6)


# 3 Analisys

This is where the model comes in, either use LSTM based model or 2d CNNJk 

In [8]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

from keras.layers import LSTM, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization, Softmax
from keras.layers import Input
from keras.models import Sequential

model = Sequential()

# play with model structure here
model.add(Input((WINDOW_SIZE, NUM_FEATURES)))
model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(NUM_CLASSES, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_1 (Conv1D)           (None, 148, 32)           1184      
                                                                 
 max_pooling1d (MaxPooling1  (None, 74, 32)            0         
 D)                                                              
                                                                 
 dropout (Dropout)           (None, 74, 32)            0         
                                                                 
 conv1d_2 (Conv1D)           (None, 72, 64)            6208      
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 36, 64)            0         
 g1D)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 36, 64)           

In [9]:
history = model.fit(tx, tty, epochs=EPOCHS_SIZE, batch_size=BATCH_SIZE, validation_data=(vx, vvy))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
model.evaluate(vx, vvy, batch_size=BATCH_SIZE)



[0.01295988168567419, 0.9978047609329224]

In [17]:
# print the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(vx)
y_pred = np.argmax(y_pred, axis=1)
y_true = np.argmax(vvy, axis=1)

print(y_pred.shape)
print(y_true.shape)

print(np.max(y_pred))
print(np.min(y_pred))

print(np.max(y_true))
print(np.min(y_true))

cm = confusion_matrix(y_true, y_pred)
print(cm)

(28243,)
(28243,)
5
0
3
2
[[    0     0     0     0]
 [    0  5768     1     0]
 [   21    22 22413    18]
 [    0     0     0     0]]
