In [26]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

# Read the saved MNIST subset from disk into a single NumPy array.
data = np.load('../datasets/mnist_train_small.npy')

# Column 0 of every row holds the digit label; the remaining columns hold that image's pixel values.
y = data[:, 0]
X = data[:, 1:]

# Hold out roughly one third of the rows for testing; the fixed random_state makes the split repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Implementation of KNN algorithm
class CustomKNN:
    """k-nearest-neighbours classifier using squared Euclidean distance.

    The training data is standardised with one global (scalar) mean and
    standard deviation. Those two training statistics are stored by ``fit``
    and re-applied to every query in ``predict`` so that training and query
    points always live in the same coordinate system.
    """

    def __init__(self, n_neighbours=5):
        """Create a classifier that votes among the ``n_neighbours`` closest points.

        Args:
            n_neighbours: number of nearest training samples that take part
                in the majority vote.

        Raises:
            ValueError: if ``n_neighbours`` is smaller than 1.
        """
        if n_neighbours < 1:
            raise ValueError("n_neighbours must be at least 1")
        self.n_neighbours = n_neighbours

    def fit(self, X, y):
        """Memorise the standardised training samples and their labels.

        Args:
            X: array-like whose first axis indexes the training samples.
            y: array-like with one label per training sample.

        Raises:
            ValueError: if ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) == 0:
            raise ValueError("cannot fit on an empty training set")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")

        # Keep the training statistics: predict() must reuse exactly these
        # values instead of recomputing them from the query batch.
        self._mean = X.mean()
        spread = X.std()
        # Constant training data has zero spread; fall back to 1 so the
        # division below cannot produce NaN/inf.
        self._std = spread if spread > 0 else 1.0

        self._X = (X - self._mean) / self._std
        self._y = y

    def predict_point(self, point):
        """Return the majority label among the nearest training samples.

        Args:
            point: a single sample that is already expressed in the
                standardised space used by ``fit`` (``predict`` takes care of
                that conversion).

        Returns:
            The most frequent label among the ``n_neighbours`` closest
            training samples, in the dtype of the stored labels. When several
            labels are equally frequent the smallest one wins.
        """
        # Squared Euclidean distance to every training sample at once;
        # every axis except the sample axis is summed away.
        diff = self._X - point
        sq_dist = (diff ** 2).sum(axis=tuple(range(1, diff.ndim)))

        # Order by distance and break distance ties by label, which is the
        # same ordering that sorting [distance, label] pairs would give.
        nearest = np.lexsort((self._y, sq_dist))[:self.n_neighbours]

        labels, counts = np.unique(self._y[nearest], return_counts=True)
        return labels[np.argmax(counts)]

    def predict(self, X):
        """Predict a label for every sample in ``X``.

        Args:
            X: array-like of raw (unstandardised) samples, first axis indexes
                the samples.

        Returns:
            Integer NumPy array with one predicted label per sample.
        """
        # Apply the *training* mean/std. Standardising with the statistics of
        # the query batch would make the result depend on which samples happen
        # to be predicted together (and breaks for a single/constant query).
        X = (np.asarray(X) - self._mean) / self._std
        return np.array([self.predict_point(point) for point in X], dtype=int)

    def score(self, X, y):
        """Return the classification accuracy on ``(X, y)`` as a percentage."""
        hits = np.count_nonzero(self.predict(X) == np.asarray(y))
        return hits / len(y) * 100

In [27]:
# Build the classifier with its default setting (n_neighbours=5).
m2 = CustomKNN()

In [28]:
# "Training" a KNN model only stores data: fit() keeps a standardised copy of X_train together with y_train.
m2.fit(X_train, y_train)

In [29]:
# Predict the labels of the first ten held-out test samples.
m2.predict(X_test[:10])

array([1, 7, 0, 9, 4, 5, 4, 6, 9, 2])

In [30]:
# True labels of the same ten test samples, shown for a side-by-side comparison with the predictions.
y_test[:10]

array([7, 7, 0, 9, 4, 5, 4, 6, 9, 2], dtype=uint8)

In [31]:
# score() returns accuracy as a percentage; only the first 100 test samples are evaluated here.
m2.score(X_test[:100], y_test[:100])

95.0