# KNN


# Imports

In [1]:
import sys

import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

# Make the project-local `utils` package importable before importing from it.
sys.path.append('../')

from utils.preprocessing import train_test_split, kfolds_cross_validation
from utils.output_utils import do_cv_and_get_metrics_classification
from utils.scalers import MinMaxScaler
from utils.metrics import accuracy, recall, precision, f1_score

# Process

In [2]:
# Load the KC2 csv into a 2-D float array (np.genfromtxt's default dtype).
data = np.genfromtxt('./data/kc2.csv', delimiter=',')
print('Shape:', data.shape)

# Every column except the last is a feature; the last column is the label.
# Indexing with the list [-1] keeps y two-dimensional, shape (n_rows, 1).
X = data[:, :-1]
y = data[:, [-1]]

Shape: (522, 22)


In [3]:
# Seeded split so the hold-out set is reproducible; the recorded shapes
# (417 train rows vs 105 test rows) show that 0.8 is the training fraction.
X_train, X_test, y_train, y_test = train_test_split(X, y, 0.8, random_seed=466852)

# Report the shape of each partition, one line per array.
for caption, split in (
    ('X_train shape:', X_train),
    ('y_train shape:', y_train),
    ('X_test shape:', X_test),
    ('y_test shape:', y_test),
):
    print(caption, split.shape)


X_train shape: (417, 21)
y_train shape: (417, 1)
X_test shape: (105, 21)
y_test shape: (105, 1)


In [4]:
class MyKNN():
    """Brute-force k-nearest-neighbours classifier.

    Supports two distance functions, selected by name:

    * ``'euclidian'``   - plain Euclidean distance (spelling kept for
      backward compatibility with existing callers);
    * ``'mahalanobis'`` - Mahalanobis distance using the pseudo-inverse of
      the sample covariance of the training features.

    Ties are resolved deterministically: neighbours at equal distance are
    ordered by label value, and when several labels are equally frequent
    among the K neighbours the smallest label wins.
    """

    def __init__(self, k_neighbours, distance='euclidian'):
        """Store the hyper-parameters.

        Parameters
        ----------
        k_neighbours : int
            Number of neighbours that vote for each prediction (>= 1).
        distance : str
            ``'euclidian'`` (default) or ``'mahalanobis'``. The name is
            validated when ``predict`` is called.

        Raises
        ------
        ValueError
            If ``k_neighbours`` is smaller than 1. A zero K would vote over
            an empty neighbourhood and a negative K would silently slice
            off the farthest neighbours instead of selecting the nearest.
        """
        if k_neighbours < 1:
            raise ValueError(
                'k_neighbours must be >= 1, got {}'.format(k_neighbours)
            )
        self.fitted = False
        self.K = k_neighbours
        self.distance = distance

    def get_cov_matrix(self):
        """Return the unbiased sample covariance of the training features.

        Returns
        -------
        numpy.ndarray of shape (n_features, n_features)
            ``(X - mean).T @ (X - mean) / (n_rows - 1)``, i.e. the same
            quantity as ``np.cov(X.T)``. With a single training row the
            division is 0/0 and the result is NaN.
        """
        centered = self.X - np.mean(self.X, axis=0)
        # One matrix product replaces the per-row sum of outer products.
        return (centered.T @ centered) / (self.X.shape[0] - 1)

    def euclidian_distance(self, i, j):
        """Row-wise Euclidean distance between ``i`` and ``j``.

        ``i`` and ``j`` are 2-D arrays that broadcast against each other
        (typically one query row against all training rows).

        Returns
        -------
        numpy.ndarray of shape (n_rows, 1)
        """
        distances = np.sqrt(np.sum((i - j)**2, axis=1))
        return distances.reshape(-1, 1)

    def mahalanobis_distance(self, i, j, sigma_inv):
        """Row-wise Mahalanobis distance between ``i`` and ``j``.

        Parameters
        ----------
        i, j : numpy.ndarray
            2-D arrays that broadcast against each other.
        sigma_inv : numpy.ndarray of shape (n_features, n_features)
            (Pseudo-)inverse of the covariance matrix.

        Returns
        -------
        numpy.ndarray of shape (n_rows, 1)
        """
        diff = i - j
        # Only the diagonal of diff @ sigma_inv @ diff.T is needed, so compute
        # it row by row instead of materialising the full n x n product.
        squared = np.sum((diff @ sigma_inv) * diff, axis=1)
        # Rounding can push a mathematically non-negative value slightly
        # below zero; clamp so the square root never yields NaN.
        distances = np.sqrt(np.maximum(squared, 0.0))
        return distances.reshape(-1, 1)

    def fit(self, X, y):
        """Memorise the training set (KNN has no real training step).

        Parameters
        ----------
        X : numpy.ndarray of shape (n_rows, n_features)
        y : numpy.ndarray with n_rows elements; stored as a column vector.
        """
        self.X = X
        self.y = y.reshape(-1, 1)
        self.fitted = True

    def get_more_frequently(self, arr):
        """Return the most frequent value in ``arr``.

        ``np.unique`` returns sorted values and ``np.argmax`` the first
        maximum, so on a tie in counts the smallest value is returned.
        """
        unique, counts = np.unique(arr, return_counts=True)
        return unique[np.argmax(counts)]

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_queries, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_queries, 1)
            Predicted labels. They come back as floats because labels are
            stacked next to the float distances before sorting.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        ValueError
            If ``self.distance`` is not a supported distance name.
        """
        if not getattr(self, 'fitted', False):
            raise RuntimeError('MyKNN.predict called before fit')

        if self.distance == 'euclidian':
            sigma_inv = None
        elif self.distance == 'mahalanobis':
            # Depends only on the training data: compute it once per call
            # rather than once per query row.
            sigma_inv = np.linalg.pinv(self.get_cov_matrix())
        else:
            raise ValueError(
                "Unknown distance '{}'; expected 'euclidian' or "
                "'mahalanobis'".format(self.distance)
            )

        preds = []
        for i in range(len(X)):
            query = X[[i], :]
            # self.distances keeps the distances of the last processed query,
            # as it always did.
            if sigma_inv is None:
                self.distances = self.euclidian_distance(query, self.X)
            else:
                self.distances = self.mahalanobis_distance(
                    query, self.X, sigma_inv
                )

            # Columns: [distance, label]. lexsort uses its LAST key as the
            # primary one, so rows are ordered by distance, then by label.
            knn = np.hstack([self.distances, self.y])
            knn = knn[np.lexsort((knn[:, 1], knn[:, 0]))]
            preds.append(self.get_more_frequently(knn[:self.K, 1]))

        return np.array(preds).reshape(-1, 1)

## K=5 and Euclidian Distance

In [5]:
# K=5 classifier using MyKNN's default distance ('euclidian').
my_knn = MyKNN(5)

# Build 10 shuffled folds from the training split only.
# NOTE(review): no seed is passed here - confirm whether
# kfolds_cross_validation shuffles deterministically across runs.
cv_splits = kfolds_cross_validation(
    data=X_train,
    n_folds=10,
    shuffle=True
)

# Project helper that prints the training / validation / test metrics shown
# below. How it applies the scaler to each fold is not visible from this
# notebook - presumably fit on the training part only; verify in utils.
do_cv_and_get_metrics_classification(
    classifier=my_knn, 
    cv_splits=cv_splits, 
    X_train=X_train, 
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    scaler=MinMaxScaler(),
    title=' My KNN - Euclidian Distance - K=5 '
)

#------------ My KNN - Euclidian Distance - K=5 -------------#

--->	Training Metrics
Accuracy Mean:     	0.8756 | Accuracy Std:   	0.0066
Recall Mean:     	0.5435 | Recall Std:       	0.0214
Precision Mean:     	0.7791 | Precision Std:   	0.0264
F1 Score Mean:     	0.6402 | F1 Score Std:   	0.0215

--->	Validation Metrics
Accuracy Mean:     	0.8251 | Accuracy Std:   	0.0332
Recall Mean:     	0.4503 | Recall Std:       	0.1690
Precision Mean:     	0.5953 | Precision Std:   	0.1372
F1 Score Mean:     	0.4915 | F1 Score Std:   	0.1341

--->	Test Metrics
Accuracy:     	0.8476
Recall:     	0.4091
Precision:     	0.7500
F1 Score:     	0.5294


In [6]:
# TO CHECK: reference run with scikit-learn's KNN (K=5, default metric),
# evaluated through the same CV helper so the numbers can be compared with
# the MyKNN run for the same K.
sk_knn = KNeighborsClassifier(5)

# Build 10 shuffled folds from the training split only.
cv_splits = kfolds_cross_validation(
    data=X_train,
    n_folds=10,
    shuffle=True
)

do_cv_and_get_metrics_classification(
    classifier=sk_knn,
    cv_splits=cv_splits,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    scaler=MinMaxScaler(),
    title=' Sklearn KNN - Euclidian Distance - K=5 '
)

#---------- Sklearn KNN - Euclidian Distance - K=5 ----------#

--->	Training Metrics
Accuracy Mean:     	0.8756 | Accuracy Std:   	0.0066
Recall Mean:     	0.5435 | Recall Std:       	0.0214
Precision Mean:     	0.7791 | Precision Std:   	0.0264
F1 Score Mean:     	0.6402 | F1 Score Std:   	0.0215

--->	Validation Metrics
Accuracy Mean:     	0.8251 | Accuracy Std:   	0.0332
Recall Mean:     	0.4503 | Recall Std:       	0.1690
Precision Mean:     	0.5953 | Precision Std:   	0.1372
F1 Score Mean:     	0.4915 | F1 Score Std:   	0.1341

--->	Test Metrics
Accuracy:     	0.8476
Recall:     	0.4091
Precision:     	0.7500
F1 Score:     	0.5294


## K=5 and Mahalanobis Distance

In [7]:
# K=5 classifier, this time ranking neighbours by Mahalanobis distance.
my_knn = MyKNN(5, distance='mahalanobis')

# Build 10 shuffled folds from the training split only.
# NOTE(review): no seed is passed here - confirm whether
# kfolds_cross_validation shuffles deterministically across runs.
cv_splits = kfolds_cross_validation(
    data=X_train,
    n_folds=10,
    shuffle=True
)

# Project helper that prints the training / validation / test metrics shown
# below; its handling of the scaler is not visible from this notebook.
do_cv_and_get_metrics_classification(
    classifier=my_knn, 
    cv_splits=cv_splits, 
    X_train=X_train, 
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    scaler=MinMaxScaler(),
    title=' My KNN - Mahalanobis Distance - K=5 '
)

#----------- My KNN - Mahalanobis Distance - K=5 ------------#

--->	Training Metrics
Accuracy Mean:     	0.8543 | Accuracy Std:   	0.0078
Recall Mean:     	0.3958 | Recall Std:       	0.0347
Precision Mean:     	0.7811 | Precision Std:   	0.0276
F1 Score Mean:     	0.5246 | F1 Score Std:   	0.0332

--->	Validation Metrics
Accuracy Mean:     	0.8417 | Accuracy Std:   	0.0420
Recall Mean:     	0.3615 | Recall Std:       	0.1368
Precision Mean:     	0.7171 | Precision Std:   	0.2304
F1 Score Mean:     	0.4698 | F1 Score Std:   	0.1649

--->	Test Metrics
Accuracy:     	0.8190
Recall:     	0.2273
Precision:     	0.7143
F1 Score:     	0.3448


In [8]:
# TO CHECK: scikit-learn reference for the Mahalanobis / K=5 configuration.
# Only test metrics are produced here: the model is fitted once on the whole
# scaled training split, without cross-validation.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 'VI' is the inverse covariance used by the Mahalanobis metric; it is taken
# from the scaled training features.
sk_knn = KNeighborsClassifier(
    5,
    metric='mahalanobis',
    metric_params={'VI': np.linalg.pinv(np.cov(X_train_scaled.T))},
)
sk_knn.fit(X_train_scaled, y_train.ravel())
y_test_pred = sk_knn.predict(X_test_scaled)

print('SKLEARN - KNN with Mahalanobis Distance (To check)')
print('\n--->\tTest Metrics')
# One output line per metric, in the same order as the CV helper's report.
for line_template, metric_fn in (
    ('Accuracy:     \t{0:.4f}', accuracy),
    ('Recall:     \t{0:.4f}', recall),
    ('Precision:     \t{0:.4f}', precision),
    ('F1 Score:     \t{0:.4f}', f1_score),
):
    print(line_template.format(metric_fn(y_test, y_test_pred)))

SKLEARN - KNN with Mahalanobis Distance (To check)

--->	Test Metrics
Accuracy:     	0.8190
Recall:     	0.2273
Precision:     	0.7143
F1 Score:     	0.3448


## K=1 and Euclidian Distance

In [9]:
# K=1: each prediction is the label of the single nearest training example
# (MyKNN's default 'euclidian' distance).
my_knn = MyKNN(1)

# Build 10 shuffled folds from the training split only.
# NOTE(review): no seed is passed here - confirm whether
# kfolds_cross_validation shuffles deterministically across runs.
cv_splits = kfolds_cross_validation(
    data=X_train,
    n_folds=10,
    shuffle=True
)

# Project helper that prints the training / validation / test metrics shown
# below; its handling of the scaler is not visible from this notebook.
do_cv_and_get_metrics_classification(
    classifier=my_knn, 
    cv_splits=cv_splits, 
    X_train=X_train, 
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    scaler=MinMaxScaler(),
    title=' My KNN - Euclidian Distance - K=1 '
)

#------------ My KNN - Euclidian Distance - K=1 -------------#

--->	Training Metrics
Accuracy Mean:     	0.9832 | Accuracy Std:   	0.0024
Recall Mean:     	0.9177 | Recall Std:       	0.0113
Precision Mean:     	1.0000 | Precision Std:   	0.0000
F1 Score Mean:     	0.9570 | F1 Score Std:   	0.0061

--->	Validation Metrics
Accuracy Mean:     	0.8057 | Accuracy Std:   	0.0438
Recall Mean:     	0.4610 | Recall Std:       	0.1873
Precision Mean:     	0.5077 | Precision Std:   	0.1680
F1 Score Mean:     	0.4743 | F1 Score Std:   	0.1729

--->	Test Metrics
Accuracy:     	0.8190
Recall:     	0.5909
Precision:     	0.5652
F1 Score:     	0.5778


In [10]:
# TO CHECK: reference run with scikit-learn's KNN (K=1, default metric).
# KNeighborsClassifier is already imported earlier in the notebook, so it is
# not imported again in this cell.
sk_knn = KNeighborsClassifier(1)

# Build 10 shuffled folds from the training split only.
cv_splits = kfolds_cross_validation(
    data=X_train,
    n_folds=10,
    shuffle=True
)

do_cv_and_get_metrics_classification(
    classifier=sk_knn,
    cv_splits=cv_splits,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    scaler=MinMaxScaler(),
    title=' Sklearn KNN - Euclidian Distance - K=1 '
)

#---------- Sklearn KNN - Euclidian Distance - K=1 ----------#

--->	Training Metrics
Accuracy Mean:     	0.9710 | Accuracy Std:   	0.0060
Recall Mean:     	0.9516 | Recall Std:       	0.0062
Precision Mean:     	0.9105 | Precision Std:   	0.0240
F1 Score Mean:     	0.9305 | F1 Score Std:   	0.0142

--->	Validation Metrics
Accuracy Mean:     	0.7720 | Accuracy Std:   	0.0319
Recall Mean:     	0.4694 | Recall Std:       	0.2013
Precision Mean:     	0.4242 | Precision Std:   	0.1596
F1 Score Mean:     	0.4381 | F1 Score Std:   	0.1721

--->	Test Metrics
Accuracy:     	0.8190
Recall:     	0.5909
Precision:     	0.5652
F1 Score:     	0.5778


The slight differences in metrics come from tie-breaking. When more than one training example lies at the same minimum distance from a query point, Sklearn's choice among them depends on the order of the training data, so it may diverge from my implementation. My implementation always predicts the negative class if it is present among the examples at the minimum distance.

## K=1 and Mahalanobis Distance

In [11]:
# K=1 with Mahalanobis distance: the label of the single nearest training
# example under that metric.
my_knn = MyKNN(1, distance='mahalanobis')

# Build 10 shuffled folds from the training split only.
# NOTE(review): no seed is passed here - confirm whether
# kfolds_cross_validation shuffles deterministically across runs.
cv_splits = kfolds_cross_validation(
    data=X_train,
    n_folds=10,
    shuffle=True
)

# Project helper that prints the training / validation / test metrics shown
# below; its handling of the scaler is not visible from this notebook.
# NOTE(review): unlike the other runs, this title has no leading space, which
# is why the printed banner is not padded on the left.
do_cv_and_get_metrics_classification(
    classifier=my_knn, 
    cv_splits=cv_splits, 
    X_train=X_train, 
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    scaler=MinMaxScaler(),
    title='My KNN - Mahalanobis Distance - K=1 '
)

#------------My KNN - Mahalanobis Distance - K=1 ------------#

--->	Training Metrics
Accuracy Mean:     	0.9832 | Accuracy Std:   	0.0024
Recall Mean:     	0.9177 | Recall Std:       	0.0113
Precision Mean:     	1.0000 | Precision Std:   	0.0000
F1 Score Mean:     	0.9570 | F1 Score Std:   	0.0061

--->	Validation Metrics
Accuracy Mean:     	0.8153 | Accuracy Std:   	0.0549
Recall Mean:     	0.3805 | Recall Std:       	0.2045
Precision Mean:     	0.5444 | Precision Std:   	0.1814
F1 Score Mean:     	0.4295 | F1 Score Std:   	0.1848

--->	Test Metrics
Accuracy:     	0.8190
Recall:     	0.4545
Precision:     	0.5882
F1 Score:     	0.5128


In [12]:
# TO CHECK: scikit-learn reference for the Mahalanobis / K=1 configuration.
# Only test metrics are produced here: the model is fitted once on the whole
# scaled training split, without cross-validation.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 'VI' is the inverse covariance used by the Mahalanobis metric; it is taken
# from the scaled training features.
sk_knn = KNeighborsClassifier(
    1,
    metric='mahalanobis',
    metric_params={'VI': np.linalg.pinv(np.cov(X_train_scaled.T))},
)
sk_knn.fit(X_train_scaled, y_train.ravel())
y_test_pred = sk_knn.predict(X_test_scaled)

print('SKLEARN - KNN with Mahalanobis Distance and K=1 (To check)')
print('\n--->\tTest Metrics')
# One output line per metric, in the same order as the CV helper's report.
for line_template, metric_fn in (
    ('Accuracy:     \t{0:.4f}', accuracy),
    ('Recall:     \t{0:.4f}', recall),
    ('Precision:     \t{0:.4f}', precision),
    ('F1 Score:     \t{0:.4f}', f1_score),
):
    print(line_template.format(metric_fn(y_test, y_test_pred)))

SKLEARN - KNN with Mahalanobis Distance and K=1 (To check)

--->	Test Metrics
Accuracy:     	0.8190
Recall:     	0.4545
Precision:     	0.5882
F1 Score:     	0.5128
