## Imports

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.neighbors import KNeighborsClassifier

## Custom KNN algorithm

In [5]:
class KNN:
    def __init__(self, k):
        self.k = k
        self.x_train, self.y_train = None, None

    @staticmethod
    def calculate_distance(x, y):
        return np.linalg.norm(x - y)  # Euclidean distance

    def find_k_nearest_neighbors(self, x, version):
        distances = [self.calculate_distance(x, x_train) for x_train in self.x_train]
        sorted_indices = np.argsort(distances)
        k_nearest_neighbors = sorted_indices[:self.k]
        return k_nearest_neighbors

    def fit(self, x, y):
        self.x_train = x
        self.y_train = y

    def predict(self, X, k, version, distance_matrix):
        n_samples = X.shape[0]
        predictions = np.zeros(n_samples, dtype=int)

        for i in range(n_samples):
            # Find the k nearest neighbors for the current data point
            neighbors = self.find_k_nearest_neighbors(X[i], version)

            # Get the class labels of the k nearest neighbors
            neighbor_labels = np.array([self.y_train[idx] for idx in neighbors])

            # Assign the most common class label as the prediction
            predictions[i] = np.argmax(np.bincount(neighbor_labels))

            # Print the message in each iteration
            # print("kNN for k = {}, data point # {}, predicted class = {} ".format(k, i + 1, predictions[i]))

        return predictions

# Load dataset

In [6]:
def load_digits_data():
    digits = load_digits()
    data = pd.DataFrame(data=digits.data, columns=[f'pixel_{i}' for i in range(digits.data.shape[1])])
    data['Target'] = digits.target
    return data

def load_data_subset():
    digits_data = load_digits_data()

    features = digits_data.columns[:-1].tolist()  # Exclude the target column
    target = 'Target'

    X = digits_data[features].values
    y = digits_data[target].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # print("Train data array size: ", X_train.shape)
    # print("Train truth array size: ", y_train.shape)
    # print("Test data array size: ", X_test.shape)
    # print("Test truth array size: ", y_test.shape)

    return X_train, X_test, y_train, y_test

# Load data using the Digits dataset
X_train, X_test, y_train, y_test = load_data_subset()

## Fit and Predict using the Custom KNN algorithm

In [7]:
best_dev_acc = 0.0
best_k = 0
best_clf = None

# Iterate over different values of k
for k in tqdm(range(1, 11)):
    knn = KNN(k=k)  # knn object with current k
    knn.fit(X_train, y_train)  # fit with current k in the TRAINING data...

    # Use X_test and y_test instead of X_dev and y_dev for prediction
    preds = knn.predict(X_test, k, 0, np.zeros((10, len(X_test))))  # predict with current k using the TEST data...
    acc = accuracy_score(y_test, preds)  # calculate accuracy

    # if the current accuracy is higher than the previously best...
    if best_dev_acc == 0.0 or acc > best_dev_acc:
        best_dev_acc = acc
        best_k = k
        best_clf = knn

print('\nBest dev accuracy:', best_dev_acc)
print('Best K:', best_k)

# Predict on the actual test data using the best classifier
test_preds = best_clf.predict(X_test, best_k, 0, np.zeros((10, len(X_test))))
print('Test accuracy:', accuracy_score(y_test, test_preds))

# Print the classification report
print('\nClassification Report:')
print(classification_report(y_test, test_preds))


100%|██████████| 10/10 [00:07<00:00,  1.37it/s]



Best dev accuracy: 0.9888888888888889
Best K: 6
Test accuracy: 0.9888888888888889

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        33
           1       1.00      1.00      1.00        28
           2       1.00      1.00      1.00        33
           3       1.00      1.00      1.00        34
           4       0.98      1.00      0.99        46
           5       0.98      0.98      0.98        47
           6       0.97      1.00      0.99        35
           7       1.00      0.97      0.99        34
           8       1.00      1.00      1.00        30
           9       0.97      0.95      0.96        40

    accuracy                           0.99       360
   macro avg       0.99      0.99      0.99       360
weighted avg       0.99      0.99      0.99       360



## Fit and Predict using KNeighborsClassifier from scikit-learn

In [2]:
# Initialize KNNClassifier
knn_classifier = KNeighborsClassifier()

# Fit the model
knn_classifier.fit(X_train, y_train)

# Predict on the test set
test_preds = knn_classifier.predict(X_test)

# Calculate accuracy
test_accuracy = accuracy_score(y_test, test_preds)
print('Test accuracy:', test_accuracy)

# Generate classification report
print('\nClassification Report:')
print(classification_report(y_test, test_preds))

Test accuracy: 0.9861111111111112

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        33
           1       1.00      1.00      1.00        28
           2       1.00      1.00      1.00        33
           3       1.00      1.00      1.00        34
           4       0.98      1.00      0.99        46
           5       0.98      0.96      0.97        47
           6       0.97      1.00      0.99        35
           7       1.00      0.97      0.99        34
           8       1.00      1.00      1.00        30
           9       0.95      0.95      0.95        40

    accuracy                           0.99       360
   macro avg       0.99      0.99      0.99       360
weighted avg       0.99      0.99      0.99       360

