In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import gzip
import matplotlib
import matplotlib.pyplot as plt
import struct as st
import random

def _read_idx_payload(path, expected_magic, n_dims):
    '''
    Helper that reads the header and the raw payload of a gzip-compressed IDX file.
    Param: path: location of the .gz file.
    Param: expected_magic: magic number the file must start with.
    Param: n_dims: number of big-endian uint32 dimension fields following the magic number.
    Return: tuple (dims, payload) - dims is a tuple of ints, payload is a bytes object
            holding exactly prod(dims) unsigned bytes.
    Raise: ValueError when the header is truncated, the magic number does not match
           or the payload is shorter than the header announces.
    '''
    header_size = 4 * (1 + n_dims)
    # The context manager guarantees the handle is closed, even when parsing fails.
    with gzip.open(path, 'rb') as stream:
        header = stream.read(header_size)
        if len(header) != header_size:
            raise ValueError(f"{path}: truncated IDX header")
        magic, *dims = st.unpack(f'>{1 + n_dims}I', header)
        if magic != expected_magic:
            raise ValueError(f"{path}: unexpected magic number {magic}, expected {expected_magic}")
        total = 1
        for size in dims:
            total *= size
        payload = stream.read(total)
    if len(payload) != total:
        raise ValueError(f"{path}: expected {total} data bytes, found {len(payload)}")
    return tuple(dims), payload


def fetch_mnist_data(data, data_dir='.'):
    '''
    Function that convert mnist gz file to numpy array.
    Param: data: Can be: 'Train', 'Train_labels', 'Test', 'Test_labels'
    Param: data_dir: directory that contains the four MNIST .gz files
           (defaults to the current working directory).
    Return: numpy array with data from file. Images come back as a 2-D array of shape
            (number of items, rows * cols), labels as a 1-D array; both use numpy's
            default integer dtype.
    Raise: ValueError when `data` is not one of the four names or the file is malformed.
    '''
    import os

    filenames = {'Train':'train-images-idx3-ubyte.gz', 'Train_labels': 'train-labels-idx1-ubyte.gz',
                'Test': 't10k-images-idx3-ubyte.gz', 'Test_labels': 't10k-labels-idx1-ubyte.gz'}

    if data == 'Train' or data == 'Test':
        # Image files: magic number 2051, then item count, rows and cols.
        path = os.path.join(data_dir, filenames[data])
        (quantity, nR, nC), payload = _read_idx_payload(path, 2051, 3)
        # frombuffer reads the bytes directly instead of building a huge Python tuple;
        # astype(int) keeps the default integer dtype and returns a writable copy.
        return np.frombuffer(payload, dtype=np.uint8).astype(int).reshape((quantity, nR*nC))

    elif data == 'Train_labels' or data == 'Test_labels':
        # Label files: magic number 2049, then item count.
        path = os.path.join(data_dir, filenames[data])
        (quantity,), payload = _read_idx_payload(path, 2049, 1)
        return np.frombuffer(payload, dtype=np.uint8).astype(int)

    else:
        raise ValueError("Wrong data")

In [2]:
# Load the four MNIST arrays through fetch_mnist_data.
Train = fetch_mnist_data('Train')
Train_labels = fetch_mnist_data('Train_labels')
Test = fetch_mnist_data('Test')
Test_labels = fetch_mnist_data('Test_labels')

#Shuffle data
# One shared permutation keeps every image aligned with its label. Its length is
# taken from the data instead of a literal 60000, and the generator is seeded so
# that Restart & Run All reproduces the same ordering.
assert len(Train) == len(Train_labels), "images and labels must have the same length"
shuffle_index = np.random.default_rng(42).permutation(len(Train))
Train, Train_labels = Train[shuffle_index], Train_labels[shuffle_index]

In [3]:
from sklearn.metrics import pairwise_distances

class knn_classyfier:
    '''
    Brute-force k-nearest-neighbours classifier.

    Every query row is compared with every stored training row, the
    n_neighbors closest rows vote, and the most frequent label wins.
    Labels must be non-negative integers because the vote is counted
    with np.bincount.
    '''

    def __init__(self, n_neighbors):
        '''
        Param: n_neighbors: number of neighbours that vote (at least 1).
        Raise: ValueError when n_neighbors is smaller than 1.
        '''
        if n_neighbors < 1:
            raise ValueError("n_neighbors must be at least 1")
        self.n_neighbors = n_neighbors

    def fit(self, train, train_target):
        '''
        Store the training rows and their labels; no computation happens here.
        Raise: ValueError when the two inputs differ in length.
        '''
        if len(train) != len(train_target):
            raise ValueError("train and train_target must have the same length")
        self.train = train
        self.train_target = train_target

    def distances(self, test_data):
        '''
        Return: matrix of distances, one row per query row and one column
                per training row, as computed by pairwise_distances.
        '''
        return pairwise_distances(test_data, self.train)

    def classify(self, data, exclude_self=False):
        '''
        Collect the labels of the nearest training rows for every query row.
        Param: data: 2-D array-like of query rows.
        Param: exclude_self: when True the single closest training row is
               dropped before voting. Use it when `data` is the training set
               itself, so a row cannot vote for its own label.
        Return: list with one inner list of n_neighbors labels per query row,
                ordered from nearest to farthest.
        Raise: ValueError when there are not enough training rows.
        '''
        distance = np.asarray(self.distances(data))
        targets = np.asarray(self.train_target)
        skipped = 1 if exclude_self else 0
        needed = self.n_neighbors + skipped
        if needed > distance.shape[1]:
            raise ValueError(
                f"need at least {needed} training rows, got {distance.shape[1]}")
        # argsort yields neighbour *indices*, so tied distances map to distinct
        # training rows; the stable kind breaks ties by the lower row index.
        order = np.argsort(distance, axis=1, kind='stable')[:, skipped:needed]
        return targets[order].tolist()

    def predict(self, data, exclude_self=False):
        '''
        Return: numpy array with the majority label for every query row.
                On a tied vote the smallest label wins (np.argmax picks the
                first maximum of the bincount).
        '''
        votes = self.classify(data, exclude_self)
        return np.array([np.argmax(np.bincount(row)) for row in votes])

    def score(self, test, labels, exclude_self=False):
        '''
        Return: fraction of rows in `test` whose predicted label equals the
                corresponding entry of `labels`.
        '''
        predicted = self.predict(test, exclude_self)
        return np.sum(predicted == np.asarray(labels)) / len(predicted)
        

In [4]:
# Disabled experiment: score the hand-written knn_classyfier on the training set.
# NOTE(review): presumably left commented out because of its cost - score(Train, ...)
# ends up in distances(), which builds the full Train-versus-Train distance matrix.
# Confirm before re-enabling, or score a small slice of Train instead.
# knn_clf = knn_classyfier(3)
# knn_clf.fit(Train, Train_labels)
# number = random.randint(0,1000)
# knn_clf.score(Train, Train_labels)
# print(Train_labels[number])


In [5]:
# Sanity check: row 300 is itself part of Train, so the smallest distance from
# it to the training set is expected to be zero.
pairwise_distances(Train[300].reshape(1, -1), Train).min()

0.0

In [6]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Reference run with scikit-learn's KNeighborsClassifier (default settings):
# fit on the training set, then report accuracy on that same set.
knn = KNeighborsClassifier().fit(Train, Train_labels)
knn.score(Train, Train_labels)