In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle

# EDA - Human activity recognition

In [86]:
# Each of the data-files contains 54 columns per row, the columns contain the following data:
# – 1 timestamp (s)
# – 2 activityID (see II.2. for the mapping to the activities)
# – 3 heart rate (bpm)
# – 4-20 IMU hand
# – 21-37 IMU chest
# – 38-54 IMU ankle
# The IMU sensory data contains the following columns:
# – 1 temperature (°C)
# – 2-4 3D-acceleration data (ms-2), scale: ±16g, resolution: 13-bit (recommended to use)
# – 5-7 3D-acceleration data (ms-2), scale: ±6g, resolution: 13-bit
# – 8-10 3D-gyroscope data (rad/s)
# – 11-13 3D-magnetometer data (μT)
# – 14-17 orientation (invalid in this data collection)

# Activity IDs:
# – 1 lying
# – 2 sitting
# – 3 standing
# – 4 walking
# – 5 running
# – 6 cycling
# – 7 Nordic walking
# – 9 watching TV
# – 10 computer work
# – 11 car driving
# – 12 ascending stairs
# – 13 descending stairs
# – 16 vacuum cleaning
# – 17 ironing
# – 18 folding laundry
# – 19 house cleaning
# – 20 playing soccer
# – 24 rope jumping
# – 0 other (transient activities)

In [2]:
# Lookup tables between PAMAP2 activity IDs and activity names.
# The keys are the IDs exactly as they appear in the `activityID` column of
# the raw data (note the gaps: there is no 8, 14, 15, 21, 22 or 23), stored as
# strings because callers look them up with `str(activity_id)` and convert
# back with `int(label)`.
number_to_activity = {
    '0': 'other',
    '1': 'lying',
    '2': 'sitting',
    '3': 'standing',
    '4': 'walking',
    '5': 'running',
    '6': 'cycling',
    '7': 'Nordic walking',
    '9': 'watching TV',
    '10': 'computer work',
    '11': 'car driving',
    '12': 'ascending stairs',
    '13': 'descending stairs',
    '16': 'vacuum cleaning',
    '17': 'ironing',
    '18': 'folding laundry',
    '19': 'house cleaning',
    '20': 'playing soccer',
    '24': 'rope jumping',
}

# Inverse table (activity name -> ID string), derived so the two can never drift apart.
activity_to_number = {name: number for number, name in number_to_activity.items()}


In [3]:
def read_data_and_preprocessing(data_dir='Protocol', subject_numbers=range(101, 110)):
    """Load the per-subject protocol files into numeric DataFrames.

    Each `subject<N>.dat` file is whitespace separated with 54 values per row
    (timestamp, activityID, heart rate, then 17 values for each of the hand,
    chest and ankle IMUs). The 4 orientation values of every IMU are skipped
    while reading, and the ±6g accelerometer channels are dropped afterwards,
    which leaves 33 columns per subject.

    Parameters
    ----------
    data_dir : str
        Directory that contains the `subject<N>.dat` files.
    subject_numbers : iterable of int
        Subject identifiers to load.

    Returns
    -------
    dict
        Maps each subject number to its DataFrame. Values that cannot be
        parsed as numbers become NaN.
    """
    imu_locations = ('hand', 'chest', 'ankle')
    # Order of the 13 values kept per IMU (everything except orientation).
    imu_channels = ('temperature',
                    'acc-x', 'acc-y', 'acc-z',
                    'x6', 'y6', 'z6',
                    'gyr-x', 'gyr-y', 'gyr-z',
                    'mag-x', 'mag-y', 'mag-z')
    columns_name = ['timestamp', 'activityID', 'heart-rate'] + [
        f'{location}-{channel}' for location in imu_locations for channel in imu_channels]
    low_range_acc_columns = [f'{location}-{axis}6' for location in imu_locations for axis in 'xyz']

    data = {}
    for subject_number in subject_numbers:
        path = os.path.join(data_dir, f'subject{subject_number}.dat')
        # Context manager so the file handle is closed; blank lines are skipped.
        with open(path) as dat_file:
            rows = [line.split() for line in dat_file if line.strip()]
        # Keep, per row:
        #   [0:16]  timestamp, activityID, heart rate + hand IMU without orientation
        #   [20:33] chest IMU without orientation
        #   [37:50] ankle IMU without orientation
        rows = [row[:16] + row[20:33] + row[37:50] for row in rows]
        df = pd.DataFrame(rows, columns=columns_name)
        # The values were read as text; convert every column to a numeric dtype.
        for col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        df = df.drop(columns=low_range_acc_columns)

        data[subject_number] = df

    return data

In [5]:
# Parse the raw protocol files: a dict mapping subject number -> DataFrame.
subjects_data = read_data_and_preprocessing()

In [104]:
# Cache the parsed per-subject DataFrames on disk so later sessions can load
# them instead of re-parsing the raw .dat files.
with open('subjects_data_new.pickle', 'wb') as f:
    pickle.dump(subjects_data, f)

In [105]:
# Reload the cached dict of per-subject DataFrames written by the cell above.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles that were produced locally by this notebook.
with open('subjects_data_new.pickle', 'rb') as handle:
    loaded_file = pickle.load(handle)

In [106]:
# Pick subject 101's DataFrame for a quick inspection.
df = loaded_file[101]

In [112]:
# Number of recorded rows per raw activity ID for subject 101.
loaded_file[101]["activityID"].value_counts()

0     126460
1      27187
6      23575
17     23573
2      23480
16     22941
4      22253
3      21717
5      21265
7      20265
12     15890
13     14899
24     12912
Name: activityID, dtype: int64

In [68]:
def barh_plot_activity(df, subject_number):
    """Draw a horizontal bar chart of how a subject's rows split across activities.

    Parameters
    ----------
    df : DataFrame
        Subject data; only its `activityID` column is used.
    subject_number : int
        Subject identifier, used in the plot title only.

    The bar lengths are the normalized value counts of `activityID`, i.e.
    fractions in [0, 1] that sum to 1.
    """
    shares = df.activityID.value_counts(normalize=True)
    # Fall back to a placeholder label instead of raising KeyError when an ID
    # is missing from the lookup table.
    activities = [number_to_activity.get(str(activity_id), f'unknown ({activity_id})')
                  for activity_id in shares.index]
    plt.figure(figsize = (8, 5))
    # creating the bar plot
    plt.barh(activities, shares.values, color ='maroon')
    plt.ylabel("activity name")
    plt.xlabel("percentage of subject recorded time")
    plt.title(f"The ratio of each activity of a subject {subject_number}")
    # Add x, y gridlines
    plt.grid(visible = True, color ='grey',linestyle ='-.', linewidth = 0.5,alpha = 0.2)
    plt.show()

In [69]:
# Subject 102's DataFrame; used by the time-series and correlation plots below.
df_2 = subjects_data[102]

In [None]:
# Per-subject overview: summary statistics, fraction of missing values per
# column, and the activity distribution plot.
for subject in subjects_data.keys():
    print(f'subject {subject} dataframe details:')
    df = subjects_data[subject]
    display(df.describe())
    print('percentage of na values:')
    # isna().mean() is a fraction in [0, 1] per column, not a value in percent.
    display(df.isna().mean())
    print('activity histogram:')
    barh_plot_activity(df,subject)

In [224]:
def plot_ts(df):
    """Plot every signal of `df` against its `timestamp` column, one panel per signal.

    The first two columns are skipped, so one vertically stacked subplot is
    drawn for each remaining column, all sharing the x axis.
    """
    signal_columns = df.columns[2:]
    fig, ax = plt.subplots(len(signal_columns), 1, figsize=(14,50), sharex=True)
    for row, name in enumerate(signal_columns):
        panel = ax[row]
        panel.plot(df['timestamp'], df[name], color='r', alpha=0.5)
        panel.set_title(name)
        panel.grid(visible = True, color ='grey',linestyle ='-.', linewidth = 0.5,alpha = 0.2)
    plt.show()

In [None]:
# Visualise all signals recorded for subject 102.
plot_ts(subjects_data[102])

In [None]:
# Plot every column of df_2 in its own panel, laid out on an 11 x 3 grid.
# NOTE: plt.style.use changes the matplotlib style globally, so it also
# affects every figure drawn after this cell.
plt.style.use('fivethirtyeight')
df_2.plot(subplots=True,
        layout=(11, 3),
        figsize=(22,22),
        fontsize=10, 
        linewidth=2,
        sharex=False,
        title='Visualization of the original Time Series')
plt.show()

In [None]:
# Column labels for the heatmap axes; the two heatmap cells below reuse `col`.
col = df_2.columns
# Plot the pairwise correlation of all columns of df_2 (all activities together).
plt.figure(figsize=(30,30), dpi= 80)
sns.heatmap(df_2.corr(), xticklabels=col, yticklabels=col, cmap='RdYlGn', center=0, annot=True)
# Decorations
plt.title('Correlation matrix of features', fontsize=22)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Correlation heatmap restricted to rows whose activity ID is 4, 5, 6, 7 or
# greater than 11 (the activities treated as "dynamic" here).
plt.figure(figsize=(30,30), dpi= 80)
sns.heatmap(
    df_2[df_2.activityID.isin([4, 5, 6, 7]) | (df_2.activityID > 11)].corr(),
    xticklabels=col,
    yticklabels=col,
    cmap='RdYlGn',
    center=0,
    annot=True,
)
# Title and tick styling
plt.title(f'Correlation matrix for dynamic activity', fontsize=22)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Correlation heatmap restricted to rows whose activity ID is 1, 2, 3, 9, 10
# or 11 (the activities treated as "static" here).
plt.figure(figsize=(30,30), dpi= 80)
sns.heatmap(
    df_2[df_2.activityID.isin([1, 2, 3, 9, 10, 11])].corr(),
    xticklabels=col,
    yticklabels=col,
    cmap='RdYlGn',
    center=0,
    annot=True,
)
# Title and tick styling
plt.title(f'Correlation matrix for static activity', fontsize=22)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

# Section d - construct, fit and evaluate a neural network

In [97]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchsummary 
from torch.utils.data import Dataset, DataLoader
import time
import neptune.new as neptune
from sklearn import pipeline, metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

In [98]:
def sliding_window(df, window_size, stride):
    """Cut `df` into overlapping fixed-length windows.

    Parameters
    ----------
    df : DataFrame
        Rows are time steps, columns are features; all columns must be numeric.
    window_size : int
        Number of consecutive rows per window.
    stride : int
        Row offset between the starts of two consecutive windows (must be > 0).

    Returns
    -------
    torch.Tensor
        float32 tensor of shape (n_windows, window_size, n_features). Only
        complete windows are produced, including the one that ends exactly on
        the last row. When `df` has fewer than `window_size` rows the result
        is an empty tensor of shape (0, window_size, n_features) rather than
        None, so it can always be passed to `torch.cat`.
    """
    values = df.reset_index(drop=True).to_numpy()
    # Last valid start is len - window_size, hence the +1 on the exclusive bound.
    starts = range(0, len(values) - window_size + 1, stride)
    if len(starts) == 0:
        return torch.empty(0, window_size, *values.shape[1:])
    # Stack once in numpy; building a tensor from a list of arrays is very slow.
    windows = np.stack([values[start:start + window_size] for start in starts])
    return torch.from_numpy(windows.astype(np.float32))

# Fill missing samples so every signal stays continuous over time.
def interpolate_nans(df):
    """Return a copy of `df` with NaNs filled.

    Gaps are filled column-wise by linear interpolation; any NaNs that remain
    afterwards are filled with the previous valid value and finally with the
    next valid value.
    """
    filled = df.interpolate(method='linear', axis=0)
    filled = filled.ffill()
    return filled.bfill()

In [74]:
# Windowed training data, keyed by (subject number, activity label string).
train_data = {}

keys = loaded_file.keys()
# Subjects 107 and 108 are held out; they are windowed separately as test data.
keys = [key for key in keys if key not in [107,108]]
list_of_classes = list(number_to_activity.keys())

window_size = 400
num_features = 33  # not used in this cell
stride = 100

for key in keys:
    print(f"Now reading subject number {key}")
    df = loaded_file[key].copy()
    for label in list_of_classes:
        # Keep only this subject's rows for the current activity.
        # NOTE(review): the filter joins all rows with this ID, so separate
        # recording segments of the same activity end up adjacent - confirm
        # that windows spanning such a boundary are acceptable.
        new_data = df[df['activityID'] == int(label)]
        # Interpolate NaNs per subject per activity ( No mixup of data)
        new_data = interpolate_nans(new_data)
        # Sliding window per subject per activity
        train_data[key,label] = sliding_window(new_data, window_size, stride)

Now reading subject number 101
Now reading subject number 102
Now reading subject number 103
Now reading subject number 104
Now reading subject number 105
Now reading subject number 106
Now reading subject number 109


In [75]:
# Windowed test data, keyed by (subject number, activity label string).
test_data = {}


# Held-out subjects; the training cell excludes exactly these two.
keys = [107,108]
list_of_classes = list(number_to_activity.keys())

window_size = 400
num_features = 33  # not used in this cell
stride = 100

for key in keys:
    print(f"Now reading subject number {key}")
    df = loaded_file[key].copy()
    for label in list_of_classes:
        # Keep only this subject's rows for the current activity.
        new_data = df[df['activityID'] == int(label)]
        # Interpolate NaNs per subject per activity ( No mixup of data)
        new_data = interpolate_nans(new_data)
        # Sliding window per subject per activity
        test_data[key,label] = sliding_window(new_data, window_size, stride)

Now reading subject number 107
Now reading subject number 108


In [5]:
def calc_trend(df,feature):
    """Binary trend label for one window.

    `df` is indexed as `df[row, column]` (a 2-D tensor or array, not a
    DataFrame). Returns 1 when the value of column `feature` in the last row
    is strictly greater than in the first row, otherwise 0.
    """
    first_value = df[0,feature]
    last_value = df[-1,feature]
    return 1 if last_value > first_value else 0

In [25]:
# Load the pre-computed windows. Despite the name this is a dict keyed by
# (subject number, activity ID string), not a DataFrame.
# NOTE(review): absolute, machine-specific path - the cell fails on any other
# machine; prefer a path relative to a configurable data directory.
# NOTE(review): read_pickle unpickles the file; only load trusted pickles.
pretraining_df = pd.read_pickle("/Users/adirserruya/Desktop/4thYear/Assignment2/human-activity-classification/subject_data_pooled.pickle")

In [26]:
# Inspect the available (subject number, activity ID string) keys.
pretraining_df.keys()

dict_keys([(101, '0'), (101, '1'), (101, '2'), (101, '3'), (101, '4'), (101, '5'), (101, '6'), (101, '7'), (101, '9'), (101, '10'), (101, '11'), (101, '12'), (101, '13'), (101, '16'), (101, '17'), (101, '18'), (101, '19'), (101, '20'), (101, '24'), (102, '0'), (102, '1'), (102, '2'), (102, '3'), (102, '4'), (102, '5'), (102, '6'), (102, '7'), (102, '9'), (102, '10'), (102, '11'), (102, '12'), (102, '13'), (102, '16'), (102, '17'), (102, '18'), (102, '19'), (102, '20'), (102, '24'), (103, '0'), (103, '1'), (103, '2'), (103, '3'), (103, '4'), (103, '5'), (103, '6'), (103, '7'), (103, '9'), (103, '10'), (103, '11'), (103, '12'), (103, '13'), (103, '16'), (103, '17'), (103, '18'), (103, '19'), (103, '20'), (103, '24'), (104, '0'), (104, '1'), (104, '2'), (104, '3'), (104, '4'), (104, '5'), (104, '6'), (104, '7'), (104, '9'), (104, '10'), (104, '11'), (104, '12'), (104, '13'), (104, '16'), (104, '17'), (104, '18'), (104, '19'), (104, '20'), (104, '24'), (105, '0'), (105, '1'), (105, '2'), (

In [146]:
# Pool the windows of all subjects together, separately for each activity.
# `activities` lists the raw activity IDs (as strings) used as dict keys.
activities = ["0","1","2","3","4","5","6","7","9","10","11","12","13","16","17","18","19","20","24"]
#activities = ["0","1","2","3","4","5","6","7","9","10","11","12","13","16","17","18","19","20","21"]

subjects = [101,102,103,104,105,106,109]  # training subjects
subjects_test = [107,108]                 # held-out subjects

# activity ID string -> tensor of pooled windows, shape (n_windows, 400, 33)
x_train = {}
x_test = {}

# Training pool: concatenate each subject's windows along the first dimension.
for activity in activities:
    x_train[activity] = torch.empty(0,400,33)
    for subject in subjects:
        # An entry of None means no windows exist for this (subject, activity).
        # NOTE(review): a missing key raises KeyError here - every
        # (subject, activity) pair is assumed to be present in the dict.
        if pretraining_df[(subject,activity)] is not None :
            x_train[activity] = torch.cat((x_train[activity],pretraining_df[(subject,activity)]),0)

# Test pool: same procedure for the held-out subjects.
for activity in activities:
    x_test[activity] = torch.empty(0,400,33)
    for subject in subjects_test:
        if pretraining_df[(subject,activity)] is not None :
            x_test[activity] = torch.cat((x_test[activity],pretraining_df[(subject,activity)]),0)



In [38]:

# Column index (within the full 33-column window) whose first-vs-last value
# defines the trend label. It is defined here because this cell runs before
# the later cell that assigns the same value, so the notebook would otherwise
# raise NameError on a fresh kernel.
feature_for_trend = 22

# activity ID string -> list with one 0/1 trend label per pooled window
y_test = {}
y_train = {}
for activity in activities:
    y_train[activity] = []
    for sample in range(x_train[activity].shape[0]):
        trend = calc_trend(x_train[activity][sample],feature_for_trend)
        y_train[activity].append(trend)

for activity in activities:
    y_test[activity] = []
    for sample in range(x_test[activity].shape[0]):
        trend = calc_trend(x_test[activity][sample],feature_for_trend)
        y_test[activity].append(trend)

In [154]:
feature_for_trend = 22
# Build the pre-training set. The label of each window is the trend of one
# feature (column 22 of the full 33-column window): 1 when its last value is
# greater than its first value, otherwise 0.
# The model's pre-training task is to predict that trend from the window.
# NOTE: the trend is computed on the full window, before the first two
# columns are dropped, so column 22 there is column 20 of the model input.

y_train_pooled= []
x_train_pooled = []
for act in activities:
    for sequence in x_train[act]:
        x_train_pooled.append(sequence[:,2:])   # drop time stamp and activity ID 
        y_train_pooled.append(calc_trend(sequence,feature_for_trend))

y_test_pooled= []
x_test_pooled = []
for act in activities:
    for sequence in x_test[act]:
        x_test_pooled.append(sequence[:,2:])   # drop time stamp and activity ID 
        y_test_pooled.append(calc_trend(sequence,feature_for_trend))

# torch.Tensor(...) yields float32 labels; the training and validation loops
# cast them to long before computing the loss.
y_train_pooled = torch.Tensor(y_train_pooled)
y_test_pooled = torch.Tensor(y_test_pooled)



torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size([400, 33])
torch.Size

In [155]:
# Sanity check: the number of windows must equal the number of labels in each split.
print(len(x_train_pooled),len(y_train_pooled),len(x_test_pooled),len(y_test_pooled))

21246 21246 7128 7128


In [156]:
# Turn the lists of per-window tensors into single tensors with a new leading
# window dimension.
# NOTE: not idempotent - the inputs are overwritten, so re-running this cell
# without re-running the cell that builds the lists gives a different result.
x_train_pooled = torch.stack(x_train_pooled)
x_test_pooled = torch.stack(x_test_pooled)


In [114]:
def remap_labels(data):
    """Map raw activity IDs (which have gaps) onto contiguous class indices 0..18.

    IDs 0-7 are unchanged, 9-13 become 8-12, 16-20 become 13-17 and the last
    activity becomes 18. Both 24 (the ID present in the data) and 21 (the
    value this function historically listed) map to 18. Any other value is
    passed through unchanged.

    Parameters
    ----------
    data : torch.Tensor, numpy.ndarray, list, tuple or other iterable
        Activity IDs.

    Returns
    -------
    A new container of the same kind as `data` (tensor, array, list or tuple;
    any other iterable yields a list) holding the remapped labels. Tensors and
    arrays keep their dtype and shape. The input is not modified.
    """
    mapping = {9: 8, 10: 9, 11: 10, 12: 11, 13: 12,
               16: 13, 17: 14, 18: 15, 19: 16, 20: 17,
               21: 18, 24: 18}

    def _remap(values):
        # Rebinding the loop variable, as the previous implementation did,
        # never changes the container, so a new list is built instead.
        # Dict lookup also matches float labels such as 9.0, since 9.0 == 9.
        return [mapping.get(value, value) for value in values]

    if isinstance(data, torch.Tensor):
        remapped = _remap(data.flatten().tolist())
        return torch.tensor(remapped, dtype=data.dtype, device=data.device).reshape(data.shape)
    if isinstance(data, np.ndarray):
        return np.array(_remap(data.flatten().tolist()), dtype=data.dtype).reshape(data.shape)
    if isinstance(data, tuple):
        return tuple(_remap(data))
    return _remap(data)

In [157]:
class activity_dataset(Dataset):
    """Map-style dataset pairing samples `X` with labels `y`.

    `transform` is applied to each returned sample and `trarget_transform`
    (sic - the spelling is part of the public signature) to each returned
    label, when given. The dataset length is `len(y)`.
    """

    def __init__(self, X, y, transform=None, trarget_transform=None):
        self.X, self.y = X, y
        self.transform, self.target_transform = transform, trarget_transform

    def __len__(self):
        return len(self.y)

    def __getitem__(self,idx):
        sample, label = self.X[idx], self.y[idx]
        # Either transform is optional; a falsy value means "no transform".
        sample = self.transform(sample) if self.transform else sample
        label = self.target_transform(label) if self.target_transform else label
        return sample, label

In [158]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Computation resource: {device}')

Computation resource: cpu


In [159]:
class LSTM_Classifier(nn.Module):
    '''
    Stacked LSTM followed by a linear layer applied to the last time step.
    '''
    def __init__(self, input_dim=31, hidden_dim=256, num_layers=2, output_dim=22, dropout=0):
        '''
        input_dim  = number of features at each time step
        hidden_dim = size of the hidden state of every LSTM layer
        num_layers = number of stacked LSTM layers
        output_dim = number of output logits (one per class)
        dropout    = dropout probability passed to nn.LSTM
        '''
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # batch_first=True: inputs are shaped (batch, time, features)
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, X):
        '''Return the logits computed from the LSTM output at the final time step.'''
        # Initial hidden and cell states default to zeros; the final states are not needed.
        sequence_output, _ = self.lstm(X)
        last_step = sequence_output[:, -1, :]
        return self.fc(last_step)

In [160]:
# Throw-away loader used only to sanity-check dataset size and batch shapes below.
sample_loader = DataLoader(activity_dataset(x_train_pooled, y_train_pooled), batch_size=32, shuffle=True)

In [161]:
# Number of training windows in the dataset.
len(sample_loader.dataset)

21246

In [162]:
# Print the shapes of the first batch only, then stop.
for batch,(X,y) in enumerate(sample_loader):
    print(X.shape)
    print(y.shape)
    break

torch.Size([32, 400, 31])
torch.Size([32])


In [164]:
def train_loop(data_loader, model,device,loss_fn,optimizer,print_every_n=200):
    """Train `model` for one epoch.

    Parameters
    ----------
    data_loader : DataLoader
        Yields (X, y) batches; y is cast to long before the loss is computed.
    model : nn.Module
        Model to train; switched to train mode.
    device : torch.device
        Device the batches are moved to.
    loss_fn : callable
        Loss called as `loss_fn(pred, y)`.
    optimizer : torch.optim.Optimizer
        Optimizer stepped once per batch.
    print_every_n : int
        The current batch loss is printed every `print_every_n` batches.

    Returns
    -------
    (train_loss, train_acc)
        `train_loss` is the mean batch loss as a 0-dim tensor detached from
        the autograd graph; `train_acc` is the fraction of correctly
        classified samples.
    """
    model.train()
    size = len(data_loader.dataset)
    num_batches = len(data_loader)
    train_loss = torch.zeros((), device=device)
    tp = 0
    for batch,(X,y) in enumerate(data_loader):
        X = X.to(device)
        y = y.to(device=device, dtype=torch.long)
        pred = model(X)
        loss = loss_fn(pred,y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Accumulate a detached copy: adding `loss` itself would keep every
        # batch's autograd history alive until the end of the epoch.
        train_loss += loss.detach()
        tp += (y==pred.argmax(1)).type(torch.float).sum().item()
        if batch%print_every_n==0:
            print(f'loss={loss.item():.3f}, {batch*len(X)} / {size}')

    # Guard against an empty loader instead of dividing by zero.
    train_loss /= max(num_batches, 1)
    train_acc = tp/size if size else 0.0

    return train_loss,train_acc

def validation_loop(data_loader,model,device,loss_fn):
    """Evaluate `model` on `data_loader` without computing gradients.

    Prints and returns `(val_loss, val_acc)`: the mean batch loss as a float
    and the fraction of correctly classified samples. Labels are cast to long
    before the loss is computed.
    """
    model.eval()
    n_samples = len(data_loader.dataset)
    n_batches = len(data_loader)
    loss_sum = 0
    correct = 0
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs = inputs.to(device)
            targets = targets.to(dtype=torch.long).to(device)
            logits = model(inputs)
            loss_sum += loss_fn(logits, targets).item()
            correct += (targets == logits.argmax(1)).type(torch.float).sum().item()

    val_loss = loss_sum / n_batches
    val_acc = correct / n_samples
    print(f'accuracy = {val_acc}, val_loss = {val_loss:2f}')
    return val_loss,val_acc

In [165]:
# Training configuration: data loaders, model, loss and optimizer.
batch_size=32
# Only the training loader is shuffled; validation order is kept fixed.
train_loader = DataLoader(activity_dataset(x_train_pooled,y_train_pooled), shuffle=True, batch_size=batch_size)
validation_loader = DataLoader(activity_dataset(x_test_pooled,y_test_pooled), shuffle=False, batch_size=batch_size)
lr = 0.001
n_epochs = 1   # NOTE(review): not used by the training cell, which sets its own `epochs`
best_acc = 0   # NOTE(review): not used by the training cell

# Uses the class defaults input_dim=31 and output_dim=22.
# NOTE(review): the pre-training labels are binary (0/1), so only two of the
# 22 output logits can ever be targets - confirm the intended output_dim.
model = LSTM_Classifier(dropout=0.75)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
save_each_epoch = True  # NOTE(review): not used by the training cell

In [166]:
# Track the pre-training run in Neptune.
# The API token is read from the NEPTUNE_API_TOKEN environment variable so
# that no credential is stored in the notebook. Any token that was previously
# committed in this cell should be revoked.
run = neptune.init(
    project="astarteam/PDLW-assignment-2",
    api_token=os.environ.get("NEPTUNE_API_TOKEN"),
)
run["algorithm"] = "LSTM"

# The optimizer is Adam and the loss is cross-entropy; the two are logged
# under separate keys.
params = {"batch_size": batch_size,
        "learning_rate": lr,
        "optimizer": "Adam",
        "loss": "CrossEntropyLoss"}

run["parameters"] = params

results = []
# define the number of epochs and early stopping patience
epochs = 5
patience = 5
best_loss = np.inf
# Initialised up front so the counter exists even if the first epoch does not
# improve on best_loss (e.g. when the validation loss is NaN).
early_stopping_counter = 0
try:
    for epoch in range(epochs):
        start_time = time.time()
        train_loss, train_acc = train_loop(train_loader, model, device, criterion, optimizer)
        total_train_time = (time.time() - start_time)/60  # minutes
        val_loss, val_acc = validation_loop(validation_loader, model, device, criterion)
        run["train/accuracy"].log(train_acc)
        run["train/loss"].log(train_loss)
        run['train/run_time'].log(total_train_time)
        run["validation/accuracy"].log(val_acc)
        run["validation/loss"].log(val_loss)
        results.append({'epoch_number':epoch,'train_loss':train_loss.detach().cpu().numpy(),'val_loss':val_loss,'train_acc':train_acc,'val_acc':val_acc, 'train_time':total_train_time})
        # if the validation loss is the best seen so far, checkpoint the model,
        # update the best loss and reset the early stopping counter
        if val_loss < best_loss:
            torch.save(model.state_dict(), f'model_pretraining_{epoch}.pth')
            # The model is unchanged since the validation pass above, so it is
            # not evaluated a second time here.
            best_loss = val_loss
            early_stopping_counter = 0
        # otherwise, increment the early stopping counter
        else:
            early_stopping_counter += 1
        # if the early stopping counter has reached the patience, stop training
        if early_stopping_counter >= patience:
            break
finally:
    # Always close the Neptune run, even when training raises.
    run.stop()

https://app.neptune.ai/astarteam/PDLW-assignment-2/e/PDLWAS-29
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
loss=3.099, 0 / 21246
loss=0.703, 6400 / 21246
loss=0.697, 12800 / 21246
loss=0.592, 19200 / 21246
accuracy = 0.5879629629629629, val_loss = 0.669024


NameError: name 'best_loss' is not defined