**1.4 K Nearest Neighbors**

Calculates the distance between the test point (test observation) and all the points in the train set. The most common distance metric to use is the euclidean distance:

d(p, q) = sqrt((q1-p1)^2 + (q2-p2)^2 + ... + (qN-pN)^2)

Then, it identifies the K closest train neighbors based on the calculated distance. Those points are called "the nearest neighbors". KNN then counts the nearest neighbors classifications for each class, and finally the predicted ~
class for the new test point is decided based on vote majority.

In [1]:
%%capture
%run preprocessing.ipynb

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from time import time
import numpy as np
import pandas as pd

# Load data
X_train = np.load('X_train.npy')
X_test = np.load('X_test.npy')
y_train = np.load('y_train.npy')
y_test = np.load('y_test.npy')

# Define the hyperparameters
hyperparameters = {
    'n_neighbors': list(range(1, 21)),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],
    'leaf_size': list(range(20, 40)),
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
}

# Create a KNeighborsClassifier model
model = KNeighborsClassifier()

# Create a scorer for the F1 score
f1_scorer = make_scorer(f1_score, average='weighted')

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(model, hyperparameters, scoring=f1_scorer, cv=5, n_jobs=-1, n_iter=100, random_state=42)

# Fit the RandomizedSearchCV object to the data
start = time()
random_search.fit(X_train, y_train)
end = time()

# Predict on the test set
y_pred = random_search.predict(X_test)

# Calculate evaluation metrics for the model
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')  
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted') 

# Save the results to a DataFrame
results = pd.DataFrame(random_search.cv_results_)
results['Execution Time'] = end - start
results['Recall'] = recall
results['Precision'] = precision
results['Accuracy'] = accuracy
results['F1'] = f1

# Extract the parameters from the 'params' column and add them as separate columns
params_df = results['params'].apply(pd.Series)
results = pd.concat([results, params_df], axis=1).drop(columns='params')

# Save the DataFrame to a CSV file
results.to_csv('knn_results.csv', index=False)

In [5]:
# Order the results by F1 score
results = results.sort_values(by='F1', ascending = False)
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_weights,param_p,param_n_neighbors,param_metric,param_leaf_size,param_algorithm,...,Recall,Precision,Accuracy,F1,weights,p,n_neighbors,metric,leaf_size,algorithm
0,0.130725,0.020409,20.862074,0.966677,uniform,2,14,minkowski,33,brute,...,0.970149,0.970136,0.970149,0.97008,uniform,2,14,minkowski,33,brute
63,0.221809,0.010573,67.118376,8.501104,uniform,2,17,chebyshev,25,ball_tree,...,0.970149,0.970136,0.970149,0.97008,uniform,2,17,chebyshev,25,ball_tree
73,0.259639,0.044741,3.269823,0.381547,distance,1,15,minkowski,21,kd_tree,...,0.970149,0.970136,0.970149,0.97008,distance,1,15,minkowski,21,kd_tree
72,0.149476,0.011758,22.195426,1.716306,distance,1,2,minkowski,33,ball_tree,...,0.970149,0.970136,0.970149,0.97008,distance,1,2,minkowski,33,ball_tree
71,0.151565,0.010276,30.570383,2.621352,distance,1,14,manhattan,36,ball_tree,...,0.970149,0.970136,0.970149,0.97008,distance,1,14,manhattan,36,ball_tree


In [7]:
# Order by execution time
results = results.sort_values(by='mean_fit_time', ascending = True)
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_weights,param_p,param_n_neighbors,param_metric,param_leaf_size,param_algorithm,...,Recall,Precision,Accuracy,F1,weights,p,n_neighbors,metric,leaf_size,algorithm
38,0.017517,0.00127,34.088537,0.793603,uniform,2,10,manhattan,35,brute,...,0.970149,0.970136,0.970149,0.97008,uniform,2,10,manhattan,35,brute
96,0.017789,0.001239,33.128068,0.985563,distance,2,11,manhattan,26,brute,...,0.970149,0.970136,0.970149,0.97008,distance,2,11,manhattan,26,brute
32,0.01782,0.000242,15.374322,0.250802,distance,2,8,minkowski,39,brute,...,0.970149,0.970136,0.970149,0.97008,distance,2,8,minkowski,39,brute
36,0.018317,0.002424,15.192327,0.693791,uniform,2,1,minkowski,25,brute,...,0.970149,0.970136,0.970149,0.97008,uniform,2,1,minkowski,25,brute
89,0.018616,0.002224,102.85616,4.092939,distance,2,12,chebyshev,32,brute,...,0.970149,0.970136,0.970149,0.97008,distance,2,12,chebyshev,32,brute
