## Import Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Access to Google Drive
# The google.colab package is provided by the Colab runtime, so this cell is
# meant to be run inside Google Colab.
from google.colab import drive
# Mount Drive at /content/drive; the CSV loaded later in this notebook is read
# from a path underneath this mount point.
drive.mount('/content/drive')

## Load the SELECTED (Top 30) Features Dataset
* Results of ML3-1 and ML3-2

In [None]:
# Read the saved feature table (the file has no header row) and transpose it
# in the same expression, so rows and columns of the CSV are swapped.
FeatureSelected = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/SavedFiles/FeatureSelected.csv',
    header=None,
).T

# Show (rows, columns) of the transposed table as the cell output
FeatureSelected.shape

## Standardize the feature values

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize each feature column with scikit-learn's StandardScaler
# (default settings: remove the mean and scale to unit variance).
# NOTE(review): the scaler is fitted on the complete dataset, i.e. before the
# train/test split performed later in this notebook, so the held-out samples
# contribute to the scaling statistics -- confirm this is acceptable here.
scaler = StandardScaler()
FeatureSelected_std = scaler.fit_transform(FeatureSelected)

# Show the shape of the standardized array as the cell output
FeatureSelected_std.shape

## Split Dataset into Training and Test Sets
- Use 'train_test_split' function
- It randomly samples the training and testing data according to the designated ratio.

In [None]:
# Half of the rows belong to each condition
# (the original note states 180 samples per condition)
NoOfData = FeatureSelected_std.shape[0] // 2

# The first half of the rows is treated as the normal set and the remaining
# rows as the abnormal set; all feature columns are kept in both.
NormalSet, AbnormalSet = (
    FeatureSelected_std[:NoOfData, :],
    FeatureSelected_std[NoOfData:, :],
)

# Show both shapes as the cell output
NormalSet.shape, AbnormalSet.shape

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of each condition's samples that is held out for testing
TestData_Ratio = 0.2

# Both conditions are split with the same ratio and the same fixed seed,
# which keeps the random sampling reproducible between runs.
split_options = dict(test_size=TestData_Ratio, random_state=777)
TrainData_Nor, TestData_Nor = train_test_split(NormalSet, **split_options)
TrainData_Abn, TestData_Abn = train_test_split(AbnormalSet, **split_options)

# Report (train shape, test shape): normal condition first, then abnormal
for train_part, test_part in (
    (TrainData_Nor, TestData_Nor),
    (TrainData_Abn, TestData_Abn),
):
    print(train_part.shape, test_part.shape)

## Label the data using np.zeros and np.ones
- In this tutorial, 0 refers to 'Normal' and 1 refers to 'Abnormal'

In [None]:
# Build one label vector per subset, with one entry per row of that subset.
# Normal subsets are labelled 0 ...
TrainLabel_Nor = np.zeros(len(TrainData_Nor))
TestLabel_Nor  = np.zeros(len(TestData_Nor))
# ... and abnormal subsets are labelled 1.
TrainLabel_Abn = np.ones(len(TrainData_Abn))
TestLabel_Abn  = np.ones(len(TestData_Abn))

# Report (train, test) label shapes: normal condition first, then abnormal
for train_labels, test_labels in (
    (TrainLabel_Nor, TestLabel_Nor),
    (TrainLabel_Abn, TestLabel_Abn),
):
    print(train_labels.shape, test_labels.shape)

## Prepare the final Data and Label for ML modeling


In [None]:
# Stack the feature matrices row-wise: normal rows first, abnormal rows after
TrainData = np.vstack([TrainData_Nor, TrainData_Abn])
TestData  = np.vstack([TestData_Nor, TestData_Abn])

# Join the 1-D label vectors in the same order so that every label stays
# aligned with its data row
TrainLabel = np.hstack([TrainLabel_Nor, TrainLabel_Abn])
TestLabel  = np.hstack([TestLabel_Nor, TestLabel_Abn])

# Report the final shapes of the data and of the labels
print(TrainData.shape, TestData.shape)
print(TrainLabel.shape, TestLabel.shape)

.

.

.

.

.

.

.



## Grid search for K-Nearest Neighbor (KNN) hyperparameters

### [Main hyperparameters of KNN]

1. **Weights**: The weights setting determines how much each neighbor contributes to a prediction. There are two common approaches for weighting:

- *Uniform*: All neighbors have equal weight when contributing to the prediction.
- *Distance*: Neighbors closer to the query point have a higher weight when contributing to the prediction. This can help to give more importance to closer points, which might be more relevant in some cases.

.

2. **n_neighbors**: The number of neighbors to consider when making a prediction. A small value of n_neighbors leads to a more flexible model that may be more sensitive to noise, while a large value of n_neighbors can result in a smoother decision boundary that may not capture the underlying patterns in the data. Selecting the appropriate value for n_neighbors is critical for the model's performance.

.

3. **Metric**: The metric defines how the distance between data points is calculated. Different metrics can result in different neighbor selections and, consequently, different predictions. Some commonly used distance metrics in KNN are:

- *Euclidean*: The straight-line distance between two points, given by the square root of the sum of the squared differences between their coordinates:
  - $d(x, y) = \sqrt{\sum_i (x_i - y_i)^2}$
- *Manhattan*: The sum of the absolute differences between the coordinates of two points, which represents the shortest path when only horizontal and vertical movements are allowed:
  - $d(x, y) = \sum_i |x_i - y_i|$

### Prepare lists of hyperparameters for grid search

In [None]:
# Candidate values for each KNN hyperparameter explored by the grid search
param_weight      = ['uniform', 'distance']     # weight type
param_n_neighbors = [3, 9, 15]                  # number of neighbors for prediction
param_metric      = ['euclidean', 'manhattan']  # metric type

# Calculate the number of cases: the grid search evaluates every
# (weight, n_neighbors, metric) combination, so the count is the product of
# the three list lengths and follows any change made to the lists above.
NoOfCases = len(param_weight) * len(param_n_neighbors) * len(param_metric)
NoOfCases

In [None]:
# Create an empty dataframe to store the accuracy results



### Train the KNN models with different combinations of hyperparameters and save them

In [None]:
# Import necessary packages for KNN



# Initialize a count value to store the performance of each model



# Iterate through all possible combinations of weight, n_neighbors, and metric values









# Display the resulting dataframe with model performances


### Confirm the grid search results

In [None]:
# Sort the Accuracy_df by 'Accuracy' column in descending order


# Output the best case



In [None]:
# Calculate mean and standard deviation of accuracy for each weight



In [None]:
# Calculate mean and standard deviation of accuracy for each n_neighbors



In [None]:
# Calculate mean and standard deviation of accuracy for each metric



### Visualize the performance comparison for the selected hyperparameter

In [None]:
# Set an index to select a hyperparameter
# 0: weight // 1: n_neighbors // 2: metric
# With idx = 1 the comparison below targets n_neighbors (see the legend above).
idx = 1

# Automatically define variables based on the selected index
# NOTE(review): this step is left blank in the visible template; nothing in
# this cell reads idx yet.





# Draw a bar chart to compare the model performance (diagnostic accuracy) for each hyperparameter
# figsize is (width, height) in inches.
fig, ax = plt.subplots(figsize=(10, 5))

# Create a bar plot with error bars
# NOTE(review): no plotting call on ax is present in the visible code, so the
# figure is displayed empty until the bar plot is added here.





# Adjust the layout to fit the figure area, then render the figure
plt.tight_layout()
plt.show()