# Instance-Based Learning: K-Nearest Neighbours(kNN)

## 1. Data Preprocessing

In [108]:
#importing the libraries needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#scikit-learn library - not complete

In [109]:
train = pd.read_csv('data_train.csv')
test = pd.read_csv('data_test.csv')

Data transformation of the categorical attrbutes to numeric

In [110]:
#transformation 
from sklearn.preprocessing import LabelEncoder

# Select the categorical columns to encode
cat_columns = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "gender", "native-country"]

# Encode categorical columns using Label Encoding for data_train
le = LabelEncoder()
for col in cat_columns:
    train[col] = le.fit_transform(train[col])

# Encode categorical columns using Label Encoding for data_test
le = LabelEncoder()
for col in cat_columns:
    test[col] = le.fit_transform(test[col])


# Print the first 5 rows of the transformed dataset
#print(train.head())
#print(test.head())

In [111]:

#train
x_train_trans = train.drop(columns = ['income'])
y_train = train['income']

#test
x_test_trans = test.drop(columns =['income'])
y_test = test['income']

print(x_train_trans)
#print(y_train_trans)
#print(x_test_trans)


       age  workclass  fnlwgt  education  educational_num  marital-status  \
0       39          6   77516          9               13               4   
1       50          5   83311          9               13               2   
2       38          3  215646         11                9               0   
3       53          3  234721          1                7               2   
4       28          3  338409          9               13               2   
...    ...        ...     ...        ...              ...             ...   
32556   27          3  257302          7               12               2   
32557   40          3  154374         11                9               2   
32558   58          3  151910         11                9               6   
32559   22          3  201490         11                9               4   
32560   52          4  287927         11                9               2   

       occupation  relationship  race  gender  capital-gain  capital-loss  

Standard Scaling


StandardScaler is being used to standardize the data by removing the mean and scaling to unit variance. This is commn preprocessing step in machine learning to ensure that all features are on the same scale, which can improve the performance and accuracy of many algorithm

In [112]:
#Standard Scaling

from sklearn import preprocessing

x_train = preprocessing.StandardScaler().fit(x_train_trans).transform(x_train_trans.astype(float))
#print(x_train)

x_test = preprocessing.StandardScaler().fit(x_test_trans).transform(x_test_trans.astype(float))
print(x_test)

#x_test have nan value

[[-0.99412926 -0.09851079  0.35347399 ... -0.21806206 -0.03143184
   0.25775643]
 [-0.05541716 -0.09851079 -0.94239062 ... -0.21806206  0.7699177
   0.25775643]
 [-0.77750339 -1.88752825  1.39544986 ... -0.21806206 -0.03143184
   0.25775643]
 ...
 [-0.05541716 -0.09851079  1.75522095 ... -0.21806206  0.7699177
   0.25775643]
 [ 0.37783458 -0.09851079 -0.99842039 ... -0.21806206 -0.03143184
   0.25775643]
 [-0.27204303  0.79599794 -0.0689392  ... -0.21806206  1.57126723
   0.25775643]]


## 2. Using default kNN

In [113]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


#base model
default_knn = KNeighborsClassifier()
default_knn.fit(x_train, y_train)

#predict
y_pred=default_knn.predict(x_test)
#Error:Input X contains NaN


#Summarize Result
#precision,recall,f1-score,support, accuracy, macro avg, weighted avg
print(classification_report(y_test,y_pred))
#ROC score
print(roc_auc_score(y_test, y_pred))
#confusion matrix
print(confusion_matrix(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.87      0.91      0.89     12435
           1       0.65      0.57      0.61      3846

    accuracy                           0.83     16281
   macro avg       0.76      0.74      0.75     16281
weighted avg       0.82      0.83      0.82     16281

0.7405577019220697
[[11269  1166]
 [ 1635  2211]]


Doing a validation of performance and confusion matrix

## 4. Hyperparameter Tuning

Hyperparameter Tuning for K-neighbours, Weight and Distance Metric and validating its performance using accuracy, precision, recall and F1-score

This will be done by making a ranking of validation

-GridSearchCV 
with a sklearn.model_selection.GridSearchCV

-Training and Validation Split 10% 
    -goals on validation: how robust the validation - doesnt have much variance when you the validation again - (find the robust percentage-is it really 10%)

-Find whether training/validation split and k-folds is the best for kNN


### Method: K-folds Cross Validation

K-folds Cross Validation is used as it uses all the data for training and validation.

In [114]:
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV


# Creating a parameter of the grid
param_grid = [{
    'n_neighbors': list(range(1, 30, 2)),
    'metric': ['euclidean', 'manhattan', 'cosine', 'minkowski'],
    'weights':['uniform','distance'],
    #'algorithm':['auto','ball_tree','kd-tree','brute'],
    'leaf_size' : list(range(1,50, 5))
}]
#is this how you put the hyperparameter range?


from sklearn.metrics import f1_score, make_scorer
f1 = make_scorer(f1_score, pos_label = 1, average = 'binary')

# Grid search model
knn_grid = KNeighborsClassifier()
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1)
grid_search = GridSearchCV(estimator = knn_grid, param_grid = param_grid, cv =cv, verbose = 2, scoring = f1, n_jobs=-1)
#go back and play around with this GridSearchCV parameter
#from youtube: grid_search = GridSearchCV(estimator=knn,param_grid=knn_param, n_jobs=1, cv=cv, scoring="accuracy", error_score=0)

# Fit the grid search to the data
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 1200 candidates, totalling 6000 fits


This data is divided into k of 5 of equal parts (folds), and the model is trained and validated k times. 

In [None]:
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results

Finding what is the best score/optimal accuracy score

In [None]:
#Finding what is the best score/ optimal accuracy score
print("Best F1-score: ",grid_search.best_score_)

Finding the hyperparameter to achieve this score

In [None]:
#Finding the hyperparameter to achieve this score
print("The list of hyperparmeter in order to achieve this best score: ", grid_search.best_params_)

## 5. Model with the best Hyperparameter

In [None]:
best_model = KNeighborsClassifier(n_neighbors=15, weights='uniform', leaf_size= 25, metric="manhattan",)

In [None]:
best_model.fit(x_train,y_train)
y_pred = best_model.predict(x_test)

#accuracy = best_model.score(x_train, y_train)
#print("Accuracy:", accuracy)

Doing a validation of performance and confusion matrix

## 6. Validation and Confusion Matrix

of the best model with the tuned hyperparameter

In [None]:
#Summarize Result
#precision,recall,f1-score,support, accuracy, macro avg, weighted avg
print(classification_report(y_test,y_pred))
#ROC score
print(roc_auc_score(y_test, y_pred))
#confusion matrix
print(confusion_matrix(y_test,y_pred))

#again print the best hyperparameter tuned for the best model
print(grid_search.best_params_)

Finally, we can check if the accuracy had improve between the default_knn and when the hyperparameter is tuned for the best_model

## 7. Exporting the model

In [None]:
import joblib
joblib.dump(tuned_model, 'kNN.pkl', compress=9)

#what is compress=9, do we need that