# K-nearest neighbor for classification

In [48]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

import utils

In [2]:
from sklearn import neighbors, datasets, preprocessing 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, StandardScaler

###### Load Data


In [3]:
### Naming explanation:
# punchtype_subject No._run_number_datatype
# cross_01_01_Gyroscope: Gyroscope data of the 1st Subject collected in 1st run

dirpath_experimental_data = "Data/Raw Data/Experimental"
dict_exp_data = dict()


# Walk through each folder in Experimental Data Folder 
for foldername in os.listdir(dirpath_experimental_data):
    
    # punch_name = "_".join(foldername.split("_")[:2])

    # Walk through each file in each folder. Fetch and load the Linear Acceleration data and Gyroscope data
    for filename in os.listdir(os.path.join(dirpath_experimental_data,foldername)):
        
        if any([filename.startswith("Gyroscope"), filename.startswith("Linear")]):

            data_name = str(foldername) + "_" + str(filename)
            
            # Modify the file name to make the format consistent
            data_name = "_".join(data_name.replace(".csv","").replace("Accelerometer", "Acceleration").replace("sub-", "").replace("run-","").split("_"))

            # Assign each csv as a DataFrame to a dictionary
            data_content = pd.read_csv(os.path.join(dirpath_experimental_data, foldername, filename))
            dict_exp_data.setdefault(data_name, data_content)

print(dict_exp_data.keys())

dict_keys(['cross_01_01_Gyroscope', 'cross_01_01_Linear Acceleration', 'cross_02_01_Gyroscope', 'cross_02_01_Linear Acceleration', 'cross_03_01_Gyroscope', 'cross_03_01_Linear Acceleration', 'jab_01_01_Gyroscope', 'jab_01_01_Linear Acceleration', 'jab_02_01_Gyroscope', 'jab_02_01_Linear Acceleration', 'jab_03_01_Gyroscope', 'jab_03_01_Linear Acceleration', 'lefthook_01_01_Gyroscope', 'lefthook_01_01_Linear Acceleration', 'lefthook_02_01_Gyroscope', 'lefthook_02_01_Linear Acceleration', 'lefthook_03_01_Gyroscope', 'lefthook_03_01_Linear Acceleration', 'leftnopunch_01_01_Gyroscope', 'leftnopunch_01_01_Linear Acceleration', 'leftnopunch_02_01_Gyroscope', 'leftnopunch_02_01_Linear Acceleration', 'leftnopunch_03_01_Gyroscope', 'leftnopunch_03_01_Linear Acceleration', 'leftuppercut_02_01_Gyroscope', 'leftuppercut_02_01_Linear Acceleration', 'leftuppercut_03_01_Gyroscope', 'leftuppercut_03_01_Linear Acceleration', 'leftupper_01_01_Gyroscope', 'leftupper_01_01_Linear Acceleration', 'righthook_

In [4]:
# Changing colnames for consistency

acc_columns = {"Linear Acceleration x (m/s^2)" : "X (m/s^2)",
               "Linear Acceleration y (m/s^2)" : "Y (m/s^2)",
               "Linear Acceleration z (m/s^2)" : "Z (m/s^2)"
               }

gyr_columns = {"Gyroscope x (rad/s)" : "X (rad/s)",
               "Gyroscope y (rad/s)" : "Y (rad/s)",
               "Gyroscope z (rad/s)" : "Z (rad/s)"
               }

for df_name, df_content in dict_exp_data.items():
    
    if df_name.endswith("Linear Acceleration"):
        df_content.rename(columns=acc_columns, inplace=True)

    elif df_name.endswith("Gyroscope"):
        df_content.rename(columns=gyr_columns, inplace=True)

In [5]:
for df in dict_exp_data.keys():
    print(df)

cross_01_01_Gyroscope
cross_01_01_Linear Acceleration
cross_02_01_Gyroscope
cross_02_01_Linear Acceleration
cross_03_01_Gyroscope
cross_03_01_Linear Acceleration
jab_01_01_Gyroscope
jab_01_01_Linear Acceleration
jab_02_01_Gyroscope
jab_02_01_Linear Acceleration
jab_03_01_Gyroscope
jab_03_01_Linear Acceleration
lefthook_01_01_Gyroscope
lefthook_01_01_Linear Acceleration
lefthook_02_01_Gyroscope
lefthook_02_01_Linear Acceleration
lefthook_03_01_Gyroscope
lefthook_03_01_Linear Acceleration
leftnopunch_01_01_Gyroscope
leftnopunch_01_01_Linear Acceleration
leftnopunch_02_01_Gyroscope
leftnopunch_02_01_Linear Acceleration
leftnopunch_03_01_Gyroscope
leftnopunch_03_01_Linear Acceleration
leftuppercut_02_01_Gyroscope
leftuppercut_02_01_Linear Acceleration
leftuppercut_03_01_Gyroscope
leftuppercut_03_01_Linear Acceleration
leftupper_01_01_Gyroscope
leftupper_01_01_Linear Acceleration
righthook_01_01_Gyroscope
righthook_01_01_Linear Acceleration
righthook_02_01_Gyroscope
righthook_02_01_Linear A

## 1. Data Transformation

In [6]:
# Concatenate data: First we concatenate them vertically (by same metrics), then we concatinate them horizontally (by same type of punch)

punch_types = ['leftnopunch', 'rightnopunch','jab','cross', 'lefthook', 'righthook', 'leftuppercut','rightuppercut']


def concat_df(dict_original_data, concat_step, axis):

    result_dict = dict()

    i = 0
    j = concat_step
    
    while i < len(dict_original_data):
        concat_df_name = [key for key in dict_original_data.keys()][i]
        concat_df_content = [val for val in dict_original_data.values()][i:j]
        result_dict.setdefault(concat_df_name, pd.concat(concat_df_content, axis=axis))
    
        i += concat_step
        j += concat_step
    
    return result_dict

def simplify_df_names(dict_df, list_new_names):

    dict_with_new_dfname = dict()

    for old_name, df_content in dict_df.items():

        for new_name in list_new_names:

            if any([new_name in old_name, old_name.startswith(new_name)]):
                dict_with_new_dfname.setdefault(new_name, df_content)

    return dict_with_new_dfname

dict_df_concated = concat_df(dict_exp_data, 2, 1)

dict_df_concated_final = simplify_df_names(concat_df(dict_df_concated, 3, 0), punch_types)

In [10]:
# Label the data

# Forward fill the missing value

def dataframe_labeller(df,label):
    df["Punch Type"] = label

for df, label in list(zip([df_content for df_content in dict_df_concated_final.values()],sorted(punch_types))):
    dataframe_labeller(df, label)
    df.ffill()

for punch_type, df_content in dict_df_concated_final.items():
    print(punch_type, ":")
    print(df_content)

cross :
         Time (s)  X (rad/s)  Y (rad/s)  Z (rad/s)    Time (s)  X (m/s^2)  \
0        0.065232   0.240676  -0.045339   0.668833    0.065232   1.683050   
1        0.085103   0.451193   0.059653   0.542470    0.085103   1.408963   
2        0.105020   0.605252   0.159319   0.360182    0.105020   0.555544   
3        0.124894   0.701722   0.136949   0.213113    0.124894  -0.377898   
4        0.144788   0.704385   0.005792   0.125697    0.144788  -0.791464   
...           ...        ...        ...        ...         ...        ...   
14995  299.926293  -0.307508   2.953404  -2.347397  299.926293  -0.143904   
14996  299.946296  -0.500159   3.585149  -1.779337  299.946296   0.225561   
14997  299.966297  -0.560263   4.260557  -1.421962  299.966297  -0.275865   
14998  299.986298  -0.623602   4.781751  -1.339635  299.986298  -1.019574   
14999  300.006300  -0.804991   5.138650  -1.371697  300.006300  -1.558969   

       Y (m/s^2)  Z (m/s^2) Punch Type  
0      -1.668346  -1.43239

In [9]:
# View the raw info of transformed data

for punch_type, df_content in dict_df_concated_final.items():
    print(punch_type, ":")
    print(df_content.describe())

cross :
           Time (s)     X (rad/s)     Y (rad/s)     Z (rad/s)      Time (s)  \
count  52752.000000  52752.000000  52752.000000  52752.000000  52751.000000   
mean     182.370743      0.010967      0.063197     -0.026782    182.368512   
std      113.776164      1.692620      3.565020      3.169228    113.776089   
min        0.000183    -16.179197    -34.887180    -24.224096      0.000183   
25%       87.560418     -0.442109     -0.597692     -0.486286     87.558405   
50%      175.099950     -0.004498     -0.000909     -0.000720    175.098877   
75%      262.839930      0.495264      1.143176      0.674835    262.831805   
max      450.566186     16.443207     27.484556     25.414774    450.566186   

          X (m/s^2)     Y (m/s^2)     Z (m/s^2)  
count  52751.000000  52751.000000  52751.000000  
mean       0.498109     -1.145481     -0.665079  
std       11.058316     13.689176      7.272231  
min      -80.683165   -115.448820   -123.536062  
25%       -1.179261     -2.262

In [20]:
# Remove outliers

# Before that, we drop time and return a new dict of df

def remove_outliers(df):

    # Remove the label from the original df because string is not used for statistics
    df_no_outliers = df.iloc[:,:-1].copy()
    df_label = df["Punch Type"].unique()

    accelerometer_feature = "(m/s^2)"

    for feature in df_no_outliers.columns:
        
        # Calculate Q1 (25th percentile) and Q3 (75th percentile) of each feature
        Q1 = df_no_outliers[feature].quantile(0.25)
        Q3 = df_no_outliers[feature].quantile(0.75)
        
        # Calculate IQR
        IQR = Q3 - Q1
        
        # Define outlier criteria
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Remove outliers, but treat Acceleration and other features differently (see report for details)
        if accelerometer_feature in feature:
            df_no_outliers = df_no_outliers[~((df_no_outliers[feature] < lower_bound))]
            
        else:
            df_no_outliers = df_no_outliers[~((df_no_outliers[feature] < lower_bound) | (df_no_outliers[feature] > upper_bound))]

    # Assign the label back
    df_no_outliers["Punch Type"] = df_label[0]

    return df_no_outliers


In [22]:
# # Removing outliers on raw data causes too many information loss. 
# # We remove outliers after the aggregation

# dict_df_concated_final_clean = dict()

# for punch_type, df_content in dict_df_concated_final.items():

#     dict_df_concated_final_clean.setdefault(punch_type, remove_outliers(df_content.drop(columns="Time (s)")))

# for punch_type, df_content in dict_df_concated_final_clean.items():

#     print(punch_type, ":")
#     print(df_content)

cross :
       X (rad/s)  Y (rad/s)  Z (rad/s)  X (m/s^2)  Y (m/s^2)  Z (m/s^2)  \
0       0.240676  -0.045339   0.668833   1.683050  -1.668346  -1.432390   
1       0.451193   0.059653   0.542470   1.408963  -0.833281  -1.033995   
2       0.605252   0.159319   0.360182   0.555544   0.084102  -0.627524   
3       0.701722   0.136949   0.213113  -0.377898   1.033287  -0.123379   
4       0.704385   0.005792   0.125697  -0.791464   1.543793   0.366435   
...          ...        ...        ...        ...        ...        ...   
14961  -0.201308  -0.881923   0.182616  -0.702299  -0.265639  -0.786810   
14962  -0.236596  -1.112740   0.253688  -0.626520  -0.247557  -1.134742   
14963  -0.243850  -1.404703   0.366524  -0.541992  -0.238490  -1.362985   
14964  -0.273569  -1.756004   0.497306  -0.822844  -0.071925  -1.604195   
14966  -0.479062  -2.636327   0.704278  -2.071029   0.707636  -1.608228   

      Punch Type  
0          cross  
1          cross  
2          cross  
3          cros

In [None]:
# def make_violinplot_feature(df, cls, figsize = (10,10), savefig=False):

#     df_copy = df.copy()
    
#     if "X (hPa)" in df.columns:
#         df_copy = df_copy.drop('X (hPa)', axis=1)

#     plt.figure(figsize=figsize)    
#     sns.violinplot(df_copy)
#     plt.title(f"Violinplot of features in {cls}")

#     if savefig:
#         plt.savefig(f"Figures\\violin_{cls}.png")
        
#     plt.show()

# def make_violinplot_punchtype(feature, figsize = (10,10), savefig=False):
#     df_joined = pd.concat([df for df in dict_transformed_data.values()],axis=0)

#     plt.figure(figsize=figsize)
#     df_melted = df_joined.melt(id_vars='Punch Type', value_vars=[feature], var_name='Feature', value_name='Value')
    
#     plt.figure(figsize=figsize)
#     sns.violinplot(data=df_melted, x='Feature', y='Value', hue='Punch Type', split=True)
#     plt.title(f"Violinplot of {feature} across classes")
    
#     if savefig:
#         plt.savefig(f"violinplot_{feature}.png")
    
#     plt.show()


In [39]:
# Remove time from the data
dict_df_concated_NoTime = dict()

for punch_type, df_content in dict_df_concated_final.items():

    dict_df_concated_NoTime.setdefault(punch_type, df_content.drop(columns="Time (s)"))

dict_df_concated_NoTime

{'cross':        X (rad/s)  Y (rad/s)  Z (rad/s)  X (m/s^2)  Y (m/s^2)  Z (m/s^2)  \
 0       0.240676  -0.045339   0.668833   1.683050  -1.668346  -1.432390   
 1       0.451193   0.059653   0.542470   1.408963  -0.833281  -1.033995   
 2       0.605252   0.159319   0.360182   0.555544   0.084102  -0.627524   
 3       0.701722   0.136949   0.213113  -0.377898   1.033287  -0.123379   
 4       0.704385   0.005792   0.125697  -0.791464   1.543793   0.366435   
 ...          ...        ...        ...        ...        ...        ...   
 14995  -0.307508   2.953404  -2.347397  -0.143904   1.580076   1.579204   
 14996  -0.500159   3.585149  -1.779337   0.225561   0.673862  -0.569482   
 14997  -0.560263   4.260557  -1.421962  -0.275865   0.650340  -0.795494   
 14998  -0.623602   4.781751  -1.339635  -1.019574   1.862951  -0.137417   
 14999  -0.804991   5.138650  -1.371697  -1.558969   2.324484  -1.287985   
 
       Punch Type  
 0          cross  
 1          cross  
 2          cross

In [47]:
### Create 2 versions of aggregated data with window size of 50 and 100. Both of them are groupped by mean absolute value

print(help(utils.aggregate_data))

dict_df_transformed_50 = dict()
dict_df_transformed_100 = dict()

for punch_type, df_content in dict_df_concated_NoTime.items():
    dict_df_transformed_50.setdefault(punch_type, utils.aggregate_data(df_content, 50, punch_type))
    dict_df_transformed_100.setdefault(punch_type, utils.aggregate_data(df_content, 100, punch_type))

Help on function aggregate_data in module utils:

aggregate_data(df, window)
    Returns an aggregated dataframe on a given window size by the mean absolut value of each window.
    
        Discards the remainders that have less datapoints than the window size. Usually the 'tail' of the dataframe.
        Does not take the influence of time series into account. Usually applied for data preprocessing
            for a classical ML algorithm.
    
    :param df: The Raw, Unprocessed Data (collected by Phyphox or other device)
    :param int window: The window size of data aggregation.
    :returns df_agg: A dataframe with features aggregated by the mean absolut value of each window.

None


TypeError: aggregate_data() takes 2 positional arguments but 3 were given

In [None]:
# Standardize the data
scaler = MinMaxScaler()

for df in dict_transformed_data.keys():
    feature, label = dict_transformed_data[df].iloc[:,:-1], dict_transformed_data[df].iloc[:,-1]
    feature = scaler.fit_transform(feature)
    dict_transformed_data[df] = pd.concat([pd.DataFrame(feature), label], axis=1)
    dict_transformed_data[df] = dict_transformed_data[df].dropna()


In [None]:
for i in dict_transformed_data.values():
    print(i)

In [None]:
# Divide the data into train-test sets and merge all subdataframes into a whole one

x_train_whole = dict()
y_train_whole = dict()
x_test_whole = dict()
y_test_whole = dict()

for data in dict_transformed_data.keys():
    df_to_process = dict_transformed_data[data]
    data_name = data.replace("transformed_", "")
    x,y = df_to_process.iloc[:,:-1], df_to_process.iloc[:,-1]
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.25, random_state=50)
    
    x_train_whole.setdefault(data_name, x_train)
    x_test_whole.setdefault(data_name, x_test)
    y_train_whole.setdefault(data_name,y_train)
    y_test_whole.setdefault(data_name, y_test)

In [None]:
x_train_whole = pd.concat([df for df in x_train_whole.values()], axis=0)
y_train_whole = pd.concat([df for df in y_train_whole.values()], axis=0)

x_test_whole = pd.concat([df for df in x_test_whole.values()], axis=0)
y_test_whole = pd.concat([df for df in y_test_whole.values()], axis=0)

In [None]:
def shuffle_data(feature, label):
    
    df_unshuffled = pd.concat([feature, label], axis=1)
    df_shuffled = df_unshuffled.sample(frac=1.0, random_state=50)
    shuffled_feature, shuffled_label = df_shuffled.iloc[:,:-1], df_shuffled.iloc[:,-1]
    return shuffled_feature, shuffled_label

# Train test data that have been shuffled and preprocessed.
x_train_shuffled, y_train_shuffled = shuffle_data(x_train_whole, y_train_whole)
x_test_shuffled, y_test_shuffled = shuffle_data(x_test_whole, y_test_whole)

# Model training and classification report

In [None]:
k = 8
knn = KNeighborsClassifier(n_neighbors=k)

knn.fit(x_train_shuffled, y_train_shuffled)

y_pred = knn.predict(x_test_shuffled)

print("Confusion Matrix:")
print(confusion_matrix(y_test_shuffled, y_pred))
print("\nClassification Report:")
print(classification_report(y_test_shuffled, y_pred))

In [None]:
# Compare with the dummy classifier

dummy_clf = DummyClassifier(strategy='uniform')
dummy_clf.fit(x_train_shuffled, y_train_shuffled)

dummy_pred = dummy_clf.predict(x_test_shuffled)

print("Confusion Matrix:")
print(confusion_matrix(y_test_shuffled, dummy_pred))
print("\nClassification Report:")
print(classification_report(y_test_shuffled, dummy_pred))

# Try different value of k
We will observe the difference of precision, recall and accuracy at different k, from 1 to 100

In [None]:
# Set up the variables to save precision, recall and accuracy at k
dict_precision_at_ks = dict()
dict_recall_at_ks = dict()
dict_f1_at_ks = dict()

accuracy_at_ks = []

for punch_type in y_train_shuffled.unique():
    dict_precision_at_ks.setdefault(punch_type, list())
    dict_recall_at_ks.setdefault(punch_type, list())
    dict_f1_at_ks.setdefault(punch_type, list())

In [None]:
# classification report is a string. We will do some string formatting to retrieve and assign the value we need

def get_values_from_report(report, feature_name):
    """
    get the precision, recall, and the accuracy of the given feature from the classification report
    """

    precision = 0
    recall = 0
    f1 = 0
    accuracy = 0
    
    report_l1=report.split("\n")    
    report_l1.pop(0)
    report_l1.pop(0)

    report_l2 = []
    
    for i in report_l1:
        i2 = i.strip()
        report_l2.append(i2.split("       "))
    
    for i in report_l2:
        if len(i) == 1:
            report_l2.remove(i)

    for idx,j in enumerate(report_l2):
        
        if j[0] == feature_name:
            
            scores = j[1].split("     ")
        
            precision = float(scores[0].strip())
            recall = float(scores[1].strip()) 
            f1 = float(scores[2].strip())
            
        if j[0] == 'accuracy':
            accuracy = float(report_l2[idx][3].strip())
  
    return precision, recall, f1, accuracy

In [None]:
# Model train and evaluation at an arbiatry k
def knn_model_pred(k, returns_report = False, print_results = False):
    
    """
    Trains a knn model at a given k, and makes predictions.
    Assume data have already been preprocessed. 

    Optional (False by default):
    - Returns a classification report if returns_report = True. 
    - Prints the evaluation results if print_results = True.
    """

    knn = KNeighborsClassifier(n_neighbors=k)

    knn.fit(x_train_shuffled, y_train_shuffled)
    
    y_pred = knn.predict(x_test_shuffled)

    print(f"training at {k} neighbors complete!")

    if print_results:
        print("Confusion Matrix:")
        print(confusion_matrix(y_test_shuffled, y_pred))
        print("\nClassification Report:")
        print(classification_report(y_test_shuffled, y_pred))

    if returns_report:
        return classification_report(y_test_shuffled, y_pred)

In [None]:
for k in range(1,101):
    report = knn_model_pred(k, returns_report=True)
    
    accuracy_at_k = 0
    for feature_name in y_train_shuffled.unique():
        precision, recall, f1, accuracy = get_values_from_report(report, feature_name)

        dict_precision_at_ks[feature_name].append(precision)
        dict_recall_at_ks[feature_name].append(recall)
        dict_f1_at_ks[feature_name].append(f1)

        accuracy_at_k = accuracy

    accuracy_at_ks.append(accuracy_at_k)

plot the result

In [None]:
def plot_metrics_at_ks(metrics, data, savefig=False):

    if metrics == 'Accuracy':
        accuracy_x = list(range(1,101))
        plt.plot(accuracy_x, data)
        
        plt.xlabel("k-neighbor")
        plt.ylabel("accuracy")
        plt.title("Overall accuracy at ks")
    
        if savefig:
            plt.savefig("Figures/Curve_accuracy_at_ks.png")
            
        plt.show()

    else:
        plt.figure(figsize=(12,10))
    
        sns.lineplot(data)
        
        plt.xlabel("k-neighbour")
        plt.ylabel(f'{metrics}')
        plt.title(f"{metrics} of punch types at ks")

        if savefig:
            plt.savefig(f"Figures/Curve_{metrics}_at_ks.png")
            
        plt.show()

In [None]:
dict_precision_at_ks = pd.DataFrame(dict_precision_at_ks)
dict_recall_at_ks = pd.DataFrame(dict_recall_at_ks)
dict_f1_at_ks = pd.DataFrame(dict_f1_at_ks)

plot_metrics_at_ks("Precision", dict_precision_at_ks, savefig=True)
plot_metrics_at_ks("Recall", dict_recall_at_ks, savefig=True)
plot_metrics_at_ks("F1-score", dict_f1_at_ks, savefig=True)
plot_metrics_at_ks("Accuracy", accuracy_at_ks, savefig=True)