# K-nearest neighbor for classification

In [22]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import sklearn
import matplotlib.pyplot as plt
import seaborn

In [114]:
from sklearn import neighbors, datasets, preprocessing 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [91]:
# Load data
# Both locations are relative paths.
# Raw recordings; not referenced by any of the cells visible in this notebook.
dirpath_rawdata = "Data/Raw Data/Preliminary"
# Folder of preprocessed CSV files; the loading cell resolves it against the
# current working directory and reads every file in it.
dirpath_transformed = "Data/Preprocessed Data/"

In [92]:
# Read every preprocessed CSV into a dict keyed by the file name without its
# ".csv" suffix (e.g. "transformed_cross").
dict_transformed_data = dict()

# os.listdir returns entries in arbitrary order, and the insertion order of this
# dict decides the row order of the merged train/test frames built later (and
# therefore the result of the seeded shuffle). Sorting keeps runs reproducible.
for data in sorted(os.listdir(dirpath_transformed)):

    # Skip anything that is not a CSV (hidden OS files, checkpoint folders, ...);
    # pd.read_csv would otherwise fail on it.
    if not data.endswith(".csv"):
        continue

    data_name = data[:-len(".csv")]
    dict_transformed_data[data_name] = pd.read_csv(os.path.join(dirpath_transformed, data))

print(dict_transformed_data.keys())
dict_transformed_data['transformed_cross']

dict_keys(['transformed_cross', 'transformed_jab', 'transformed_lft_nopunch', 'transformed_lh', 'transformed_lu', 'transformed_rght_nopunch', 'transformed_rh', 'transformed_ru'])


Unnamed: 0,X (m/s^2),Y (m/s^2),Z (m/s^2),X (rad/s),Y (rad/s),Z (rad/s),X (hPa),Punch Type
0,6.137,6.095,3.791,0.206,0.902,0.285,1012.885818,cross
1,14.568,28.642,10.195,2.934,5.756,2.912,1012.895355,cross
2,12.615,22.545,10.761,2.541,7.395,2.865,1012.904816,cross
3,13.644,21.431,8.318,2.365,6.340,2.121,1012.911606,cross
4,13.778,29.763,11.123,3.318,8.487,3.265,1012.919388,cross
...,...,...,...,...,...,...,...,...
297,10.143,13.389,6.017,0.790,3.033,2.237,1012.927442,cross
298,12.279,13.465,6.075,1.250,3.691,2.446,1012.927442,cross
299,7.412,12.693,4.174,0.863,3.172,2.020,1012.927442,cross
300,9.400,13.402,4.653,1.272,3.341,2.358,1012.927442,cross


In [93]:
# Divide the data into train-test sets, one split per punch type. The per-type
# pieces are collected in dicts here and merged into whole frames in the next cell.

x_train_whole = dict()
y_train_whole = dict()
x_test_whole = dict()
y_test_whole = dict()

for data in dict_transformed_data.keys():
    df_to_process = dict_transformed_data[data]
    data_name = data.replace("transformed_", "")

    # The loaded frames carry an "Unnamed: 0" column which, judging by the
    # displayed frame (0, 1, 2, ...), is the row index that was saved into the
    # CSV. It is not a sensor reading, and its 0..300 range would dominate the
    # unscaled distances used by KNN, so it must not be used as a feature.
    # errors="ignore" keeps this a no-op for frames that do not have the column.
    df_to_process = df_to_process.drop(columns=["Unnamed: 0"], errors="ignore")

    # Last column is the label; everything before it is a feature.
    x,y = df_to_process.iloc[:,:-1], df_to_process.iloc[:,-1]
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.25, random_state=50)

    x_train_whole[data_name] = x_train
    x_test_whole[data_name] = x_test
    y_train_whole[data_name] = y_train
    y_test_whole[data_name] = y_test

In [94]:
# Stack the per-punch-type pieces row-wise so that each of the four names ends
# up bound to a single pandas object instead of a dict of pieces.
# NOTE(review): because the dict names are rebound here, this cell cannot be
# re-run without first re-running the split cell above.
x_train_whole, y_train_whole, x_test_whole, y_test_whole = (
    pd.concat(list(pieces.values()), axis=0)
    for pieces in (x_train_whole, y_train_whole, x_test_whole, y_test_whole)
)

In [95]:
def shuffle_data(feature, label, random_state=50):
    """Shuffle features and labels together so every row keeps its own label.

    The two inputs are joined column-wise, the joined rows are permuted, and the
    result is split back into features and label.

    Parameters
    ----------
    feature : pandas.DataFrame
        Feature columns.
    label : pandas.Series
        Label values. It is joined to ``feature`` on the index, so both inputs
        are expected to carry the same index.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample``. Defaults to 50, the value that
        was previously hard-coded, so existing calls behave as before.

    Returns
    -------
    tuple
        ``(shuffled_feature, shuffled_label)``: every column but the last of the
        shuffled frame, and its last column.
    """
    df_unshuffled = pd.concat([feature, label], axis=1)
    # frac=1.0 samples every row exactly once, i.e. a full permutation.
    df_shuffled = df_unshuffled.sample(frac=1.0, random_state=random_state)
    shuffled_feature, shuffled_label = df_shuffled.iloc[:, :-1], df_shuffled.iloc[:, -1]
    return shuffled_feature, shuffled_label

# Shuffle the merged train and test sets separately, so that rows of one punch
# type are no longer stored contiguously.
x_train_shuffled, y_train_shuffled = shuffle_data(feature=x_train_whole, label=y_train_whole)
x_test_shuffled, y_test_shuffled = shuffle_data(feature=x_test_whole, label=y_test_whole)

# Model training and classification report

In [126]:
# Fit a k-nearest-neighbours classifier on the shuffled training set and
# evaluate it on the shuffled test set.
# NOTE(review): the features are passed in unscaled although KNN is
# distance-based; consider standardising them, but first check that the
# pressure column does not simply identify the recording session.
k = 8
knn = KNeighborsClassifier(n_neighbors=k).fit(x_train_shuffled, y_train_shuffled)
y_pred = knn.predict(x_test_shuffled)

# sep="\n" puts each piece on its own line, giving the same text as four
# separate print calls.
print(
    "Confusion Matrix:",
    confusion_matrix(y_test_shuffled, y_pred),
    "\nClassification Report:",
    classification_report(y_test_shuffled, y_pred),
    sep="\n",
)

Confusion Matrix:
[[55 13  6  0  0  2  0  0]
 [ 8 57  6  0  0  5  0  0]
 [ 0  0 29  0  0  1  0  0]
 [ 0  0  0 64  4  0  7  1]
 [ 0  0  0  6 62  0  1  7]
 [ 0  7  1  0  0 22  0  0]
 [ 0  0  0 18  1  0 56  1]
 [ 0  0  0  8 21  0  2 45]]

Classification Report:
              precision    recall  f1-score   support

       cross       0.87      0.72      0.79        76
         jab       0.74      0.75      0.75        76
 lft_nopunch       0.69      0.97      0.81        30
          lh       0.67      0.84      0.74        76
          lu       0.70      0.82      0.76        76
rght_nopunch       0.73      0.73      0.73        30
          rh       0.85      0.74      0.79        76
          ru       0.83      0.59      0.69        76

    accuracy                           0.76       516
   macro avg       0.76      0.77      0.76       516
weighted avg       0.77      0.76      0.75       516



In [132]:
# Compare with the dummy classifier
# strategy='uniform' ignores the features and draws each prediction uniformly at
# random from the classes seen during fit, giving a chance-level baseline.
# It is seeded so that the baseline numbers are the same on every run.

dummy_clf = DummyClassifier(strategy='uniform', random_state=50)
dummy_clf.fit(x_train_shuffled, y_train_shuffled)

dummy_pred = dummy_clf.predict(x_test_shuffled)

print("Confusion Matrix:")
print(confusion_matrix(y_test_shuffled, dummy_pred))
print("\nClassification Report:")
print(classification_report(y_test_shuffled, dummy_pred))

Confusion Matrix:
[[11  8 12  6 11 10  6 12]
 [ 7 15  6  8 12  7  9 12]
 [ 3  2  2  3  6  5  5  4]
 [ 9  7 14  7 17  4 11  7]
 [ 9  7  8 11  7 16 11  7]
 [ 6  2  5  6  2  1  2  6]
 [20  8  8  9  5 11  6  9]
 [10  8 12  8 12 11  6  9]]

Classification Report:
              precision    recall  f1-score   support

       cross       0.15      0.14      0.15        76
         jab       0.26      0.20      0.23        76
 lft_nopunch       0.03      0.07      0.04        30
          lh       0.12      0.09      0.10        76
          lu       0.10      0.09      0.09        76
rght_nopunch       0.02      0.03      0.02        30
          rh       0.11      0.08      0.09        76
          ru       0.14      0.12      0.13        76

    accuracy                           0.11       516
   macro avg       0.11      0.10      0.11       516
weighted avg       0.13      0.11      0.12       516



In [None]:
### ToDo:

# Plot a line chart of precision and recall (or perhaps just accuracy) for different values of k in the KNN model (possibly also AUC)
# Write the report