In [7]:
import  numpy as np
import  matplotlib.pyplot as plt
import pandas as pd
import os
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, losses, metrics
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
from time import time
import librosa
from scipy.spatial.distance import cosine
from scipy.signal import find_peaks
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr
import librosa




---
# 1. Data loading

---

## 1.1 Define labels

In [3]:
# Single-letter activity codes used in the data, mapped to readable activity names.
activity_codes_mapping = dict(
    A='walking',
    D='sitting',
    E='standing',
    F='typing',
    J='eating',
    K='drinking',
    L='fetching',
)

# Plot colour for every activity, keyed by the readable activity name
# (looked up through activity_codes_mapping so the two tables cannot drift apart).
activity_color_map = {
    activity_codes_mapping[code]: color
    for code, color in [
        ('A', 'lime'),
        ('D', 'orange'),
        ('E', 'yellow'),
        ('F', 'lightgreen'),
        ('J', 'cyan'),
        ('K', 'purple'),
        ('L', 'red'),
    ]
}

## 1.2 Load the data

In [10]:
# Column names for the raw watch-accelerometer files (they are read with header=None).
columns = ['user', 'activity', 'time', 'x', 'y', 'z']

# Directory that holds the raw watch-accelerometer files.
watch_accel_dir = '../wisdm/wisdm-dataset/wisdm-dataset/wisdm-dataset/raw/watch/accel'

# Read every file into its own frame and concatenate ONCE at the end:
# calling pd.concat inside the loop re-copies all previously loaded rows on
# every iteration, which is quadratic in the number of rows.
frames = []
for dirname, _, filenames in os.walk(watch_accel_dir):
    for filename in filenames:
        # Join with the directory os.walk is currently visiting so that files
        # found in sub-directories resolve to the right path.
        df = pd.read_csv(os.path.join(dirname, filename), sep=",", header=None)
        frames.append(pd.DataFrame(data=df.values, columns=columns))

# Fall back to an empty, correctly-labelled frame when no file was found.
data_watch_accel_sum = pd.concat(frames) if frames else pd.DataFrame(data=None, columns=columns)

print(data_watch_accel_sum)

# every raw line ends with ';', which ends up in the z column -> strip it (literal match, not a regex)
data_watch_accel_sum['z'] = data_watch_accel_sum['z'].str.replace(';', '', regex=False)

# convert the x, y, z columns to float
data_watch_accel_sum = data_watch_accel_sum.astype({'x': 'float', 'y': 'float', 'z': 'float'})

# keep only the rows whose activity code is one of the activities used in this notebook
data_watch_accel_sum = data_watch_accel_sum[data_watch_accel_sum['activity'].isin(['A', 'D', 'E', 'F', 'J', 'K'])]

# remove the user column (returns a new frame instead of mutating a filtered slice in place)
data_watch_accel_sum = data_watch_accel_sum.drop(columns=['user'])
print(data_watch_accel_sum)

        user activity              time         x         y            z
0       1619        A   351205245071760  9.306112 -1.640178   -2.385074;
1       1619        A   351205294571760  8.958953 -1.609053  -2.3108535;
2       1619        A   351205344071760  8.044368 -0.943465   -2.282123;
3       1619        A   351205393571760   8.84882 -0.177321  -2.5981576;
4       1619        A   351205443071760  8.173655 -0.117466   -2.459294;
...      ...      ...               ...       ...       ...          ...
160794  1638        S  1135293554939000 -4.071533  -7.89159  -1.3747413;
160795  1638        S  1135293575112000 -3.877537 -7.846084  -1.4370118;
160796  1638        S  1135293595208000   -3.7027  -7.91554  -1.5687379;
160797  1638        S  1135293615414000 -3.578159 -8.176597  -1.8130299;
160798  1638        S  1135293635510000 -3.499124 -8.449629  -1.8968556;

[3710454 rows x 6 columns]
      activity              time         x         y         z
0            A   351205245071760 

## 1.3 Replace selected activities of the original dataset with locally recorded data

In [11]:
def _load_local_accel(csv_path, activity_code):
    """Load one locally recorded sensor CSV as an activity/time/x/y/z frame.

    The file's own header row is discarded (header=0 together with names=...)
    and replaced by fixed column names. Only the time stamp and the three
    acceleration columns are kept; they are renamed to 'time', 'x', 'y', 'z'
    and an 'activity' column holding `activity_code` is inserted in front.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    activity_code : str
        Single-letter activity code assigned to every row of the file.

    Returns
    -------
    pandas.DataFrame
        Columns: 'activity', 'time', 'x', 'y', 'z'.
    """
    local_columns = ['time', 'sensor', 'accelx', 'accely', 'accelz',
                     'AVx', 'AVy', 'AVz', 'Ax', 'Ay', 'Az', 'temp']
    recording = pd.read_csv(csv_path, sep=",", header=0, index_col=False, names=local_columns)
    recording = recording.drop(['sensor', 'AVx', 'AVy', 'AVz', 'Ax', 'Ay', 'Az', 'temp'], axis=1)
    recording.columns = ['time', 'x', 'y', 'z']
    recording.insert(0, 'activity', activity_code)
    return recording


# Each step below drops one activity from the data collected so far and appends
# the locally recorded rows for that activity instead.

# use local walking accel data as training data
local_watch_accel_walking = _load_local_accel('../local/walking/walking/walking.csv', 'A')

data_watch_accel_sum = data_watch_accel_sum[data_watch_accel_sum['activity'].isin(['D', 'E', 'F', 'J', 'K'])]
data_accel_local_sum_0 = pd.concat([data_watch_accel_sum, local_watch_accel_walking], ignore_index=True)
print(data_accel_local_sum_0)

# use local standing accel data as training data
local_watch_accel_standing = _load_local_accel('../local/standing/standing.csv', 'E')

data_accel_local_sum_0 = data_accel_local_sum_0[data_accel_local_sum_0['activity'].isin(['A', 'D', 'F', 'J', 'K'])]
data_accel_local_sum_1 = pd.concat([data_accel_local_sum_0, local_watch_accel_standing], ignore_index=True)
print(data_accel_local_sum_1)

# use local sitting accel data as training data
local_watch_accel_sitting = _load_local_accel('../local/sitting/sitting.csv', 'D')

data_accel_local_sum_1 = data_accel_local_sum_1[data_accel_local_sum_1['activity'].isin(['A', 'E', 'F', 'J', 'K'])]
data_accel_local_sum_2 = pd.concat([data_accel_local_sum_1, local_watch_accel_sitting], ignore_index=True)
print(data_accel_local_sum_2)

# add fetching data (a new activity, so nothing has to be removed first)
local_watch_accel_fetching = _load_local_accel('../local/fetching/fetching.csv', 'L')

data_accel_local_sum_3 = pd.concat([data_accel_local_sum_2, local_watch_accel_fetching], ignore_index=True)
print(data_accel_local_sum_3)

# use local typing accel data as training data
local_watch_accel_typing = _load_local_accel('../local/typing/typing.csv', 'F')

data_accel_local_sum_3 = data_accel_local_sum_3[data_accel_local_sum_3['activity'].isin(['A', 'E', 'D', 'J', 'K', 'L'])]
data_accel_local_sum_4 = pd.concat([data_accel_local_sum_3, local_watch_accel_typing], ignore_index=True)
print(data_accel_local_sum_4)

        activity             time         x         y         z
0              D  349285545742917  0.190339 -5.708974  8.047360
1              D  349285595242917  0.132878 -5.749675  7.968352
2              D  349285644742917  0.211887 -5.704186  7.819911
3              D  349285694242917  0.209493 -5.823895  8.066514
4              D  349285743742917  0.125696 -5.577293  7.927651
...          ...              ...       ...       ...       ...
1174534        A     10:00:48.873 -0.079000  0.662000 -0.080000
1174535        A     10:00:48.873 -0.079000  0.631000 -0.072000
1174536        A     10:00:48.873 -0.081000  0.631000 -0.072000
1174537        A     10:00:48.873 -0.081000  0.605000 -0.055000
1174538        A     10:00:48.873 -0.079000  0.605000 -0.055000

[1174539 rows x 5 columns]
        activity             time         x         y         z
0              D  349285545742917  0.190339 -5.708974  8.047360
1              D  349285595242917  0.132878 -5.749675  7.968352
2           


---
# 2. Preprocessing

---

## 2.1 Compute features

In [12]:
# define a function to compute all the features

def compute_features(window, n_bins=10, sample_rate=100, n_mfcc=13):
    """Compute one flat feature dictionary for a window of accelerometer samples.

    Per axis ('x', 'y', 'z') the following entries are produced, in this order:
      * '{axis}0' .. '{axis}{n_bins-1}': fraction of the window's samples that
        fall into each of `n_bins` equal-width histogram bins,
      * '{axis}AVG': mean,
      * '{axis}PEAK': mean distance (in samples) between consecutive peaks found
        by scipy's find_peaks, or 0 when fewer than two peaks exist,
      * '{axis}ABSOLDEV': mean absolute deviation from the mean,
      * '{axis}STANDDEV' / '{axis}VAR': standard deviation and variance,
      * '{axis}MFCC0' .. '{axis}MFCC{n_mfcc-1}': each librosa MFCC coefficient
        averaged over all frames.
    Then, for the pairs (x, y), (x, z), (y, z): '{a}{b}COS' (scipy cosine
    distance) and '{a}{b}COR' (Pearson correlation), and finally 'RESULTANT',
    the mean Euclidean norm of the (x, y, z) samples.

    Parameters
    ----------
    window : pandas.DataFrame
        Must provide numeric columns 'x', 'y' and 'z'.
    n_bins : int, default 10
        Number of histogram bins per axis.
    sample_rate : int, default 100
        Sampling rate handed to librosa.feature.mfcc as `sr`.
    n_mfcc : int, default 13
        Number of MFCC coefficients per axis.

    Returns
    -------
    dict
        Feature name -> value.
    """
    # Convert every axis to a float array once instead of re-reading the column.
    axes = {axis: window[axis].to_numpy(dtype=float) for axis in ['x', 'y', 'z']}

    features = {}
    for axis, values in axes.items():
        # Histogram features: share of samples per bin. Normalising the raw
        # counts (instead of density * (ptp / n_bins)) gives the same fractions
        # for ordinary windows and stays correct for a constant window, where
        # ptp is 0 and the old formula returned all zeros.
        counts, _ = np.histogram(values, bins=n_bins)
        fractions = counts / counts.sum()
        features.update({f'{axis}{i}': fraction for i, fraction in enumerate(fractions)})

        # Average
        features[f'{axis}AVG'] = np.mean(values)

        # Peak Time
        peaks, _ = find_peaks(values)
        if len(peaks) > 1:
            features[f'{axis}PEAK'] = np.mean(np.diff(peaks))
        else:
            features[f'{axis}PEAK'] = 0  # No peaks or single peak

        # Absolute Deviation
        features[f'{axis}ABSOLDEV'] = np.mean(np.abs(values - features[f'{axis}AVG']))

        # Standard Deviation and Variance
        features[f'{axis}STANDDEV'] = np.std(values)
        features[f'{axis}VAR'] = np.var(values)

        # MFCCs: one value per coefficient, averaged over the frames librosa returns
        mfccs = librosa.feature.mfcc(y=values, sr=sample_rate, n_mfcc=n_mfcc)
        for i, mfcc in enumerate(mfccs):
            features[f'{axis}MFCC{i}'] = np.mean(mfcc)

    # Cosine distances and Correlations
    for first, second in [('x', 'y'), ('x', 'z'), ('y', 'z')]:
        correlation, _ = pearsonr(axes[first], axes[second])
        features[f'{first}{second}COS'] = cosine(axes[first], axes[second])
        features[f'{first}{second}COR'] = correlation

    # Resultant
    resultant = np.sqrt(axes['x']**2 + axes['y']**2 + axes['z']**2)
    features['RESULTANT'] = np.mean(resultant)

    return features


# Majority activity code of every processed window; filled by apply_to_windows.
label = []

def apply_to_windows(data, window_size=200):
    """Cut `data` into consecutive full windows and compute features for each.

    Rows are taken positionally in blocks of `window_size`; a trailing partial
    window is dropped. When `data` has an 'activity' column, the most frequent
    activity of each window is appended to the module-level `label` list, so
    the labels always come from the frame that was actually passed in rather
    than from a hard-coded global frame. Without an 'activity' column nothing
    is appended to `label`.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns required by compute_features ('x', 'y', 'z').
    window_size : int, default 200
        Number of rows per window.

    Returns
    -------
    pandas.DataFrame
        One row of features per window.
    """
    num_windows = len(data) // window_size  # Ensure full windows only
    has_labels = 'activity' in data.columns
    all_features = []

    for i in range(num_windows):
        start = i * window_size
        window = data.iloc[start:start + window_size]
        all_features.append(compute_features(window))
        if has_labels:
            # majority vote over the window's own rows
            label.append(window['activity'].value_counts().idxmax())

    return pd.DataFrame(all_features)

# Build the per-window feature table from the combined data set
features_df = apply_to_windows(data_accel_local_sum_4)

# Attach the per-window activity labels collected while windowing
features_df['activity'] = label

# Reorder the columns so that the last one (activity) comes first
cols = features_df.columns[-1:].tolist() + features_df.columns[:-1].tolist()
features_df = features_df[cols]

# yy holds the labels (activity column), XX holds the features without the labels
yy = features_df['activity']
XX = features_df.drop(columns='activity')

print(features_df)



     activity     x0     x1     x2     x3     x4     x5     x6     x7     x8  \
0           J  0.005  0.000  0.025  0.105  0.065  0.075  0.025  0.110  0.475   
1           J  0.055  0.040  0.050  0.040  0.025  0.025  0.060  0.515  0.165   
2           J  0.095  0.075  0.015  0.065  0.285  0.055  0.015  0.055  0.205   
3           J  0.065  0.080  0.095  0.075  0.330  0.180  0.060  0.035  0.060   
4           J  0.130  0.030  0.040  0.095  0.325  0.190  0.025  0.040  0.045   
...       ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
4087        F  0.010  0.020  0.020  0.030  0.100  0.210  0.305  0.220  0.040   
4088        F  0.010  0.040  0.020  0.020  0.020  0.100  0.155  0.255  0.200   
4089        F  0.030  0.030  0.010  0.060  0.155  0.150  0.325  0.180  0.050   
4090        F  0.010  0.000  0.010  0.020  0.080  0.205  0.100  0.220  0.250   
4091        F  0.030  0.030  0.050  0.060  0.180  0.270  0.120  0.130  0.080   

      ...   zMFCC10    zMFCC11   zMFCC1

## 2.2 Prepare the data for training

In [None]:
# Feature columns that are rescaled with MaxAbsScaler (all other columns are left as they are).
scaled_columns = ['xAVG', 'yAVG', 'zAVG', 'xPEAK', 'yPEAK', 'zPEAK',
                  'xABSOLDEV', 'yABSOLDEV', 'zABSOLDEV', 'RESULTANT']

# split the data into training and testing data;
# random_state is fixed so that re-running the notebook reproduces the same split
X_train, X_test, y_train, y_test = train_test_split(XX, yy,
                                                    train_size = 0.75,
                                                    test_size = 0.25,
                                                    shuffle = True,
                                                    stratify = yy,
                                                    random_state = 42)

# work on explicit copies so the scaled values are not written through a view of XX
X_train = X_train.copy()
X_test = X_test.copy()

# scaling the data: the scaler is fitted on the training set only and then
# applied to both sets, so no information from the test set is used for fitting
scaling_transformer = MaxAbsScaler().fit(X_train[scaled_columns])
X_train[scaled_columns] = scaling_transformer.transform(X_train[scaled_columns])
X_test[scaled_columns] = scaling_transformer.transform(X_test[scaled_columns])

# give all four pieces a fresh 0..n-1 index so features and labels stay aligned by position
X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

---
# 3. Training and test

---

## KNN is one of the simplest classification methods in machine learning; many technical articles about it are available online

![jupyter](./knn.png)

In [None]:
# Cross-validation: 5 stratified random 70/30 splits of the training data.
# random_state is fixed so the grid search and the scores below are reproducible.
my_cv = StratifiedShuffleSplit(n_splits=5, train_size=0.7, test_size=0.3, random_state=42)

# KNN model, tuned with an exhaustive grid search scored by accuracy
knn_classifier = KNeighborsClassifier()
my_param_grid = {'n_neighbors': [5, 10, 20], 'leaf_size': [20, 30, 40]}
knn_model_gs = GridSearchCV(estimator = knn_classifier,
                            param_grid = my_param_grid,
                            cv = my_cv,
                            scoring ='accuracy')

knn_model_gs.fit(X_train, y_train)

knn_best_classifier = knn_model_gs.best_estimator_

print(knn_model_gs.best_params_)

# Only the last expression of a cell is displayed, so intermediate results are printed explicitly.
scores = cross_val_score(knn_best_classifier, X_train, y_train, cv=my_cv, scoring='accuracy')
print('cross-validation accuracy per split:', list(scores))

y_train_pred=knn_best_classifier.predict(X_train)
print('training accuracy:', accuracy_score(y_true=y_train, y_pred=y_train_pred))

y_test_pred = knn_best_classifier.predict(X_test)

# Derive row/column names from the classifier's own class order instead of
# relying on the dictionary order matching it by coincidence.
class_codes = list(knn_best_classifier.classes_)
class_names = [activity_codes_mapping.get(code, code) for code in class_codes]

# labels= pins the matrix to one row/column per known class, even if a class
# is missing from the test set.
cm = confusion_matrix(y_true=y_test,
                      y_pred=y_test_pred,
                      labels=class_codes)

cm_act = pd.DataFrame(cm,
                      index = class_names,
                      columns = class_names)
print(cm_act)

sns.set(font_scale=1.6)
fig, ax = plt.subplots(figsize=(12,10))
_ = sns.heatmap(cm_act, cmap="YlGnBu", ax=ax)
ax.set_xlabel('predicted activity')
ax.set_ylabel('true activity')
plt.savefig('knn_confusion_matrix.svg')

sns.set(font_scale=1.2)

# Per activity: correctly classified windows divided by all windows of that
# true activity (diagonal / row sum), for however many classes there are.
accuracy_per_activity = pd.DataFrame(np.diag(cm) / cm.sum(axis=1), index=class_names)
print(accuracy_per_activity)

print(classification_report(y_true=y_test,
                            y_pred=y_test_pred))

accuracy_score(y_true = y_test, y_pred = y_test_pred)