In [1]:
import os
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
from scipy import interpolate
from scipy.signal import savgol_filter
from scipy.io import loadmat
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.signal import find_peaks
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the activity label names stored in label_names.mat (read from the
# current working directory); the 'activity_names_indexed' entry holds the
# activity names.
label_names = loadmat('label_names.mat', squeeze_me=True)
activity_names_indexed = label_names['activity_names_indexed']

In [None]:
# Function to extract raw data from files stored in allData folder
def compute_raw_data(dir_name, time_to_exclude=2, sample_rate=32):
    """Extract the labelled accelerometer and barometer samples of one user folder.

    For every ``*-accel.txt`` file in ``dir_name`` whose activity name (parsed
    from the file name) is listed in ``label_names.mat``, the function loads
    the acceleration samples and the matching ``*-pressure.txt`` file,
    stretches and smooths the pressure readings onto the acceleration rows,
    trims the start and end of the recording and keeps whole blocks of 128
    rows.

    Parameters
    ----------
    dir_name : str
        Path of the folder holding one user's files. Its last path component
        is used as the user id and must match the user id embedded in the
        file names.
    time_to_exclude : int or float, optional
        Seconds dropped at both the start and the end of each recording.
    sample_rate : int, optional
        Rows per second, used to turn ``time_to_exclude`` into a row count.

    Returns
    -------
    pandas.DataFrame
        Columns ``ts, accel_x, accel_y, accel_z, y_bar, label, user,
        activity``. Empty (with the same columns) when no file qualifies.

    Notes
    -----
    ``label_names.mat`` is read from the current working directory. Errors
    raised by ``loadmat`` / ``np.loadtxt`` (for example a missing pressure
    file) are not caught here.
    """
    # Load label names
    label_names = loadmat('label_names.mat', squeeze_me=True)
    activity_names_indexed = label_names['activity_names_indexed']

    # Map every activity name to its 1-based numeric label
    string_to_number = {string: number for number, string in enumerate(activity_names_indexed, start=1)}

    column_names_accel = ['ts', 'accel_x', 'accel_y', 'accel_z']
    column_names_out = column_names_accel + ['y_bar', 'label', 'user', 'activity']

    # The user id is the folder name. basename/normpath works with either path
    # separator and with a trailing separator, unlike searching for "allData/".
    substr = os.path.basename(os.path.normpath(dir_name))

    samples_per_block = 128                 # only whole blocks of 128 rows are kept
    smoothing_window = 4 * samples_per_block

    # Rows dropped at each end of a recording (first and last few seconds)
    start_idx = int(time_to_exclude * sample_rate)
    end_idx = -start_idx if start_idx > 0 else None

    frames = []
    accel_files = sorted(f for f in os.listdir(dir_name) if f.endswith('-accel.txt'))
    for file_name in accel_files:
        # The activity name sits between a fixed-width prefix and the
        # "-<user>-accel.txt" suffix. The prefix is 25 characters when the
        # IMEI is missing ("null") and 36 characters with a 15 digit IMEI.
        prefix_len = 25 if file_name.startswith('null') else 36
        activity_name = file_name[prefix_len:-(11 + len(substr))]

        # Skip unknown activities before doing any file loading or filtering
        if activity_name not in string_to_number:
            continue

        # File name without the trailing "-accel.txt"
        file_name_common = file_name[:-10]

        # Loading acceleration data
        accel_data = np.loadtxt(os.path.join(dir_name, file_name), delimiter=',')

        # Removing rows whose timestamp equals the next row's timestamp
        ts = accel_data[:, 0]
        ts_same = np.where(ts[:-1] == ts[1:])[0]
        accel_data = np.delete(accel_data, ts_same, axis=0)

        # Sort by timestamp and fill missing values. The index is reset first
        # because spline interpolation uses the index values as x positions,
        # which must follow the sorted row order.
        df_accel_data = pd.DataFrame(accel_data, columns=column_names_accel)
        df_accel_data = df_accel_data.sort_values('ts').reset_index(drop=True)
        df_accel_data = df_accel_data.interpolate(method='spline', order=2)
        df_accel_data_trim = df_accel_data.iloc[start_idx:end_idx]

        # For the same recording, load the pressure values as well
        file_name_baro = file_name_common + "-pressure.txt"
        bar_data = np.loadtxt(os.path.join(dir_name, file_name_baro), delimiter=',')

        # Remove duplicate timestamps
        ts_bar = bar_data[:, 0]
        ts_same_bar = np.where(ts_bar[:-1] == ts_bar[1:])[0]
        bar_data = np.delete(bar_data, ts_same_bar, axis=0)

        # Stretch the barometer series (by row position) to the number of
        # acceleration rows
        desired_length = len(df_accel_data)
        bar_data_len = len(bar_data)
        interp_indices = np.linspace(0, bar_data_len - 1, desired_length)
        y_bar_interp = interp1d(np.arange(bar_data_len), bar_data[:, 1], kind='linear', fill_value='extrapolate')
        y_bar_interpolated = y_bar_interp(interp_indices)

        # Smooth the stretched barometer data, then trim it like the acceleration
        y_bar_smoothed = savgol_filter(y_bar_interpolated, smoothing_window, 1, mode='nearest')
        y_bar_trimmed = y_bar_smoothed[start_idx:end_idx]

        # Only whole sets of 128 rows are processed
        multiple_of_128 = samples_per_block * (len(y_bar_trimmed) // samples_per_block)
        if multiple_of_128 == 0:
            continue

        df_file = df_accel_data_trim.iloc[:multiple_of_128].copy()
        df_file['y_bar'] = y_bar_trimmed[:multiple_of_128]
        df_file['label'] = string_to_number[activity_name]
        df_file['user'] = substr
        df_file['activity'] = activity_name
        frames.append(df_file)

    # Concatenate once instead of growing the result inside the loop
    if not frames:
        return pd.DataFrame(columns=column_names_out)
    return pd.concat(frames, ignore_index=True)[column_names_out]

TO DO 

Set your NetID below.

In [None]:
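# Set your netid as a string, e.g. my_netid = 'abc123' (it is compared against the 'user' column, i.e. the name of your folder under allData)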
my_netid = 

In [None]:
# Location of data directory: the 'allData' folder inside the current working directory
data_dir = os.path.join(os.getcwd(), 'allData')

In [None]:
# Column layout of the combined raw dataset, and an empty dataframe with that layout
col_names_raw = ['ts', 'accel_x', 'accel_y','accel_z','y_bar','label','user','activity']
df_raw = pd.DataFrame(columns= col_names_raw)

In [None]:
# Extracting all raw data by iterating through the sub-directories of data_dir

# One dataframe per sub-directory is collected and concatenated once at the end.
# df_raw is rebuilt from scratch, so re-running this cell does not append
# duplicate rows, and the sorted listing makes the row order reproducible.
raw_frames = []
for data_dir_name in sorted(os.listdir(data_dir)):
    dir_path = os.path.join(data_dir, data_dir_name)
    # Only real directories are processed; hidden entries (leading '.') are skipped
    if data_dir_name.startswith('.') or not os.path.isdir(dir_path):
        continue

    print(f'Processing directory {data_dir_name}')
    # Compute raw data
    df_raw_temp = compute_raw_data(dir_path)

    df_raw_temp = df_raw_temp.sort_values(by = ['label', 'ts'], ignore_index=True)
    if not df_raw_temp.empty:
        raw_frames.append(df_raw_temp)

if raw_frames:
    df_raw = pd.concat(raw_frames, ignore_index=True)
else:
    # Nothing was found: keep an empty dataframe with the expected columns
    df_raw = pd.DataFrame(columns=col_names_raw)


In [None]:
# How many samples were recorded for each activity
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(20, 5))
sns.countplot(data=df_raw, x="activity", ax=ax)
ax.set_title("Number of samples by activity")
plt.show()

In [None]:
# How the samples of every user are spread over the activities

fig, ax = plt.subplots(figsize=(18, 6))
sns.countplot(data=df_raw, x="user", hue="activity", ax=ax)
ax.set_title("Activities by Users")
plt.show()

In [None]:
# Display the combined raw dataframe as the cell output
df_raw

In [None]:
# Hold out the rows recorded by my_netid as test data; all other users form the training data
is_own_row = df_raw['user'] == my_netid
df_train = df_raw[~is_own_row]
df_test = df_raw[is_own_row]

In [None]:
# Visual representation of raw data
# Note how the magnitudes and signal varies for each activity

for activity_to_plot in ["Stationary", "Running", "Walking-flat-surface"]:
    # First 1000 rows of this activity recorded by my_netid
    own_activity_rows = (df_raw["user"] == my_netid) & (df_raw["activity"] == activity_to_plot)
    data_x = df_raw[own_activity_rows].head(1000)

    fig, ax = plt.subplots(figsize=(15, 6))
    for accel_column in ["accel_x", "accel_y", "accel_z"]:
        sns.lineplot(data=data_x, x="ts", y=accel_column, ax=ax)
    ax.legend(["accel_x", "accel_y", "accel_z"])
    ax.set_ylabel(activity_to_plot)
    ax.set_title(activity_to_plot, fontsize=15)
    plt.show()

TO DO
 
Do you see any interesting trends when comparing the signals from different activities (such as walking, running, and stationary)? Explain what you see.
Can you think of any specific feature that might help us differentiate between the activities? Write a paragraph on this.

In [None]:
# Empty containers for the training windows; they are filled one entry per window.
# (presumably x/y/z acceleration, barometer and labels — confirm when filling them in)
x_list_train = []
y_list_train = []
z_list_train = []
b_list_train = []
train_labels = []

# TO DO
# Similarly create the test lists

# Setting window size of 100 datapoints with an overlap of 50%
# (a step of 50 rows is half of the 100-row window)
window_size = 100
step_size = 50

TO DO

You will extract features from the accelerometer magnitude and barometric pressure time series data. The windowing will be done by a sliding window where the length of the window will be 100 data points with an overlap of 50%. 

In [None]:
# Creating overlapping windows of `window_size` rows from the training data.
# The windows are cut from df_train, not df_raw: df_train is a filtered subset,
# so row positions in df_raw do not line up with it and would mix the held-out
# user's samples into the training windows.
accel_x_train = df_train['accel_x'].values
# "+ 1" keeps the last full window when the data length fits the step exactly
for i in range(0, df_train.shape[0] - window_size + 1, step_size):
    xs = accel_x_train[i: i + window_size]

    x_list_train.append(xs)


TO DO

Compute Different Time Domain features

In [None]:
# Computing Time Domain Features

TO DO

Compute Different Frequency Domain Features


In case there are NaN values in the extracted features, fill them in from neighbouring values (for example, the mean of the values directly above and below).

In [None]:
# Computing Frequency domain features

In [None]:
# While computing PSD features, you can use the sampling rate below.
# You can use the signal.welch function to compute the features.
import scipy.signal as signal
# presumably rows per second (Hz): compute_raw_data also trims 32 rows per excluded second
sample_rate=32

TO DO

Prepare a t-SNE plot from the extracted features. See whether the plot can differentiate between stationary and moving activity labels, and briefly comment on what you observe.

You can use 'from sklearn.manifold import TSNE' package


In [None]:
# TSNE

TO DO

KNN and Random Forest Classification.
Compute the confusion matrix and then compute the precision, recall and F1 score for each activity separately.

Compare the performance of these two classifiers and comment on it.

In [None]:
# KNN with all features

In [None]:
# Random Forest with all features

TO DO

Time Domain vs Frequency Domain

Use only the time domain features and only the frequency domain features, separately, and run the same classifier on each. Which domain (frequency domain vs. time domain) helps you the most in building your activity recognition system? Comment on it.

In [None]:
# TD vs FD

TO DO

Cross validation: you will implement Leave-One-Subject-Out Cross Validation.

For this task, you are free to select any classifier you like.


In [None]:
# Cross Validation

BONUS POINTS

Try building a basic CNN model that uses either the raw time series data or the extracted feature set as input, and try to classify the different activities.
