# Instance-Based Learning: K-Nearest Neighbours (kNN)

## 1. Data Preprocessing

In [None]:
#importing the libraries needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Load the pre-split training and test sets from CSV files located in the
# current working directory.
TRAIN_CSV = 'data_train.csv'
TEST_CSV = 'data_test.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

Transform the categorical attributes into numeric codes.

In [None]:
# Encode the categorical attributes as integer codes.
from sklearn.preprocessing import LabelEncoder

# Categorical columns that have to be converted to numbers.
cat_columns = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "gender", "native-country"]

# One encoder per column, fitted ONCE and applied to BOTH data sets, so that a
# given category always maps to the same integer in train and in test.
# Fitting a separate encoder on each set (as was done before) can assign
# different codes to the same category whenever the two sets do not contain
# exactly the same set of categories, which silently corrupts the test features.
#
# The vocabulary is built from the union of both sets so that a category which
# only occurs in the test set does not make transform() raise on an unseen label.
#
# NOTE(review): integer codes impose an arbitrary order/distance between
# categories, which a distance-based model such as kNN will take literally;
# one-hot encoding may be a better fit - confirm against the project goals.
label_encoders = {}
for col in cat_columns:
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
    # Keep the fitted encoder so the mapping can be inspected or reused later.
    label_encoders[col] = le

In [None]:

# Separate the target column ('income') from the predictor columns.

# training set
y_train = train['income']
x_train_trans = train.drop('income', axis=1)

# test set
y_test = test['income']
x_test_trans = test.drop('income', axis=1)


Standard Scaling


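StandardScaler standardizes the data by removing the mean and scaling to unit variance. This is a common preprocessing step in machine learning to ensure that all features are on the same scale, which can improve the performance and accuracy of many algorithms. It matters in particular for kNN, because the distance between two samples would otherwise be dominated by the features with the largest numeric ranges.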

In [None]:
# Standard scaling
from sklearn import preprocessing

# Learn the per-feature mean and standard deviation from the TRAINING data only
# and apply that same transformation to both sets. Fitting a second scaler on
# the test set (as was done before) standardises the test features with
# different statistics than the ones the model was trained on, so train and
# test samples would no longer live in the same feature space.
scaler = preprocessing.StandardScaler().fit(x_train_trans.astype(float))

x_train = scaler.transform(x_train_trans.astype(float))
x_test = scaler.transform(x_test_trans.astype(float))


## 2. Using default kNN

Evaluate the performance of the default kNN model and plot its confusion matrix.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay


# Baseline model: kNN with scikit-learn's default hyperparameters.
default_knn = KNeighborsClassifier()
default_knn.fit(x_train, y_train)

# Hard class predictions: used for the classification report and confusion matrix.
y_pred = default_knn.predict(x_test)

# Probability of the positive class: used for the ROC analysis. A ROC curve
# needs a continuous score; feeding it hard 0/1 predictions collapses the curve
# to a single operating point and understates the AUC.
# classes_ is sorted, so column 1 belongs to the larger label, which is the one
# roc_auc_score treats as positive.
# NOTE(review): assumes 'income' is a binary target - confirm.
y_score = default_knn.predict_proba(x_test)[:, 1]


# Summarize result:
# precision, recall, f1-score, support, accuracy, macro avg, weighted avg
print(classification_report(y_test, y_pred))

# ROC score
auc = roc_auc_score(y_test, y_score)
print('ROC_AUC score: ', auc)

# Plotting the ROC curve
fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=default_knn.classes_[1])
plt.plot(fpr, tpr, label="AUC=" + str(auc))
plt.title('ROC curve - default kNN')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.show()

# Confusion matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred, labels=default_knn.classes_)
print('Confusion Matrix: \n', cm)
# Confusion matrix plot
disp = ConfusionMatrixDisplay(cm, display_labels=default_knn.classes_)
disp.plot()
plt.show()



## 4. Hyperparameter Tuning

Tune the number of neighbours (k), the weighting scheme and the distance metric, and validate each setting using accuracy, precision, recall and F1-score.

The candidate settings are ranked by their validation score.

- GridSearchCV: 
use `sklearn.model_selection.GridSearchCV`

- Training/validation split of 10% 
    - goal for the validation: check how robust it is, i.e. that the score does not vary much when the validation is run again (find a robust percentage - is it really 10%?)

- Find out whether a training/validation split or k-fold cross-validation works best for kNN


### Method: K-folds Cross Validation

K-fold cross-validation is used because it uses all of the data for both training and validation.

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer, get_scorer


# Hyperparameter grid: every combination of the values listed below is tried.
# Each key is a KNeighborsClassifier constructor argument and each value is the
# list of settings to evaluate for it (this is how a search range is given).
#
# NOTE(review): with the default p=2, 'minkowski' is the same distance as
# 'euclidean', and leaf_size only affects the speed/memory of the tree-based
# neighbour search, not the predictions. Both could be dropped from the grid
# to cut the search time without changing the best score.
param_grid = [{
    'n_neighbors': list(range(1, 30, 2)),
    'metric': ['euclidean', 'manhattan', 'cosine', 'minkowski'],
    'weights': ['uniform', 'distance'],
    #'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': list(range(1, 50, 5))
}]

# Scorer used to rank the candidates.
# The built-in 'roc_auc' scorer feeds the classifier's continuous scores to
# roc_auc_score. The previous code passed the string 'ROC_AUC' to GridSearchCV,
# which is not a registered scorer name (names are lower-case) and makes fit()
# fail; the hand-made make_scorer(...) object was never used and passed
# pos_label/average='binary' arguments that roc_auc_score does not accept.
ROC_AUC = get_scorer('roc_auc')

# Grid search model; cv is the number of cross-validation folds.
knn_grid = KNeighborsClassifier()
grid_search = GridSearchCV(estimator=knn_grid, param_grid=param_grid, cv=5, scoring=ROC_AUC)

# Fit the grid search to the data
grid_search.fit(x_train, y_train)

The data is divided into k = 5 equal parts (folds), and the model is trained and validated k times. 

- 1200 candidates is the number of hyperparameter combinations in the grid (15 values of k × 4 metrics × 2 weightings × 10 leaf sizes)
- 6000 fits are the 1200 candidates × k = 5 folds

In [None]:
# Tabulate the per-candidate cross-validation results of the grid search
# (one row per hyperparameter combination) and display the table.
cv_results = pd.DataFrame.from_dict(grid_search.cv_results_)
cv_results



Find the best score and the hyperparameters that achieve it.

In [None]:
# Report the winning candidate of the grid search: its mean cross-validated
# score and the hyperparameter combination that produced it.
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print('Best scorer: ', best_score)
print("The list of hyperparmeter in order to achieve this best score: ", best_params)

## 5. Model with the best Hyperparameter

In [None]:
# Build the final model from the hyperparameters selected by the grid search.
# Taking them directly from grid_search.best_params_ (instead of copying the
# values in by hand) keeps this model in sync with the search: hand-typed
# values go stale as soon as the grid, the scoring or the data changes, and the
# evaluation section prints grid_search.best_params_ as this model's settings.
best_model = KNeighborsClassifier(**grid_search.best_params_)

In [None]:
# Train the tuned model on the training set, then predict the class of every
# test sample (fit() returns the estimator itself, so the calls can be chained).
y_pred = best_model.fit(x_train, y_train).predict(x_test)

Evaluate the performance of the tuned model and plot its confusion matrix.

## 6. Validation and Confusion Matrix

Evaluation of the best model with the tuned hyperparameters.

In [None]:
# Summarize result:
# precision, recall, f1-score, support, accuracy, macro avg, weighted avg
print(classification_report(y_test, y_pred))

# Probability of the positive class for the ROC analysis. A ROC curve needs a
# continuous score; hard 0/1 predictions collapse it to a single operating
# point and understate the AUC.
# classes_ is sorted, so column 1 belongs to the larger label, which is the one
# roc_auc_score treats as positive.
# NOTE(review): assumes 'income' is a binary target - confirm.
y_score = best_model.predict_proba(x_test)[:, 1]

# ROC score
auc = roc_auc_score(y_test, y_score)
print('ROC_AUC score: ', auc)
# Plotting the ROC curve
fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=best_model.classes_[1])
plt.plot(fpr, tpr, label="AUC=" + str(auc))
plt.title('ROC curve - tuned kNN')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.show()

# Confusion matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred, labels=best_model.classes_)
print('Confusion Matrix: \n', cm)
# Confusion matrix plot
disp = ConfusionMatrixDisplay(cm, display_labels=best_model.classes_)
disp.plot()
plt.show()

# Print the best hyperparameters found by the grid search again
print(grid_search.best_params_)

Finally, we can check whether the accuracy has improved between `default_knn` and `best_model`, whose hyperparameters have been tuned.

## 7. Exporting the model

In [None]:
import joblib

# Persist the fitted model to disk so it can be reloaded with joblib.load().
# compress is the compression level, 0-9: 9 gives the smallest file at the
# cost of slower writing/reading. It is optional; it only trades file size
# against speed and does not change the stored model.
MODEL_FILE = 'kNN.pkl'
joblib.dump(best_model, MODEL_FILE, compress=9)