In [8]:
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from dataloader import load_and_split_data
import os
from utils import find_optimal_hyperparameters, load_model_from_json, fit_and_evaluate

### Load data using our custom dataloader

In [9]:
# Read the preprocessed CSV and split it into train/test partitions using the
# project's dataloader helper.
# NOTE(review): `class_zero` presumably names the label that is encoded as
# class 0, `test_size=0.2` presumably holds out 20% of the rows, and
# `random_state=0` presumably fixes the split — confirm in dataloader.py.
X_train, X_test, y_train, y_test = load_and_split_data(
    "data/preprocessed_dataset.csv",
    target_column='increase_stock',
    class_zero='low_bike_demand',
    test_size=0.2,
    random_state=0,
)

### KNN without hyperparameter tuning

In [10]:
# Baseline: KNN with an arbitrarily chosen n_neighbors=5 (no tuning). No other
# estimator argument is passed, so everything else keeps its library default.
params = dict(n_neighbors=5)
knn = KNeighborsClassifier(**params)

# The metrics are printed because verbose=True; the return value is discarded.
_ = fit_and_evaluate(
    knn,
    X_train, y_train,
    X_test, y_test,
    verbose=True,
    float_precision=4,
)

Evaluating KNeighborsClassifier
Accuracy: 0.8219
Precision: 0.5102
Recall: 0.4310
F1: 0.4673
ROC AUC: 0.7884
Confusion Matrix: 
[[238  24]
 [ 33  25]]



### Find optimal hyperparameters

In [7]:
knn = KNeighborsClassifier()

# Candidate values for the hyperparameter search.
#
# "minkowski" is intentionally NOT listed under "metric": "p" is not varied, so
# it would run with scikit-learn's default p=2, which is exactly the
# "euclidean" distance. Listing both only re-fits models that were already
# evaluated. Re-add "minkowski" together with the "p" entry below if other
# Minkowski orders should be explored.
#
# NOTE(review): "algorithm" only selects the neighbour-search strategy, so it
# looks like this axis could be dropped too (cutting the search to a third).
# Confirm before removing: it would also change the keys written to the JSON.
param_grid = {
    "n_neighbors": range(1, 31),
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan", "chebyshev"],
    "algorithm": ["ball_tree", "kd_tree", "brute"],
    #"leaf_size": range(1, 51), # Only relevant if algorithm="ball_tree", "kd_tree"
    #"p": [1, 2, 3],   # Only relevant if metric="minkowski"
}

# Run the search on the training split only (the test split is not passed in)
# and save the winning parameters as JSON under save_dir/save_file.
best_params = find_optimal_hyperparameters(
    knn,
    param_grid,
    X_train,
    y_train,
    scoring='accuracy',
    save_dir="output/best_params",
    save_file='knn_best_params.json',
)

Best parameters found:  {'algorithm': 'ball_tree', 'metric': 'manhattan', 'n_neighbors': 21, 'weights': 'distance'}
Saving best parameters to 'output/best_params/knn_best_params.json'


### Use optimal hyperparameters to train and evaluate

In [11]:
# Rebuild the tuned model from the saved search result. The estimator *class*
# (not an instance) is handed to the loader.
# NOTE(review): presumably the loader instantiates the class with the
# parameters stored in the JSON file — confirm in utils.py.
knn = load_model_from_json(
    KNeighborsClassifier,
    'output/best_params/knn_best_params.json',
)

# Same evaluation call as the untuned baseline, so the printed metrics are
# directly comparable; the return value is discarded.
_ = fit_and_evaluate(
    knn,
    X_train, y_train,
    X_test, y_test,
    verbose=True,
    float_precision=4,
)

Evaluating KNeighborsClassifier
Accuracy: 0.8313
Precision: 0.5417
Recall: 0.4483
F1: 0.4906
ROC AUC: 0.8438
Confusion Matrix: 
[[240  22]
 [ 32  26]]

