# Exercise 3
## Part 1 - Loading MNIST dataset

In [4]:
# Imports
import os
import gzip
import numpy as np

# Credits = https://mattpetersen.github.io/load-mnist-with-numpy
"""Load from /home/USER/data/mnist or elsewhere; download if missing."""

from urllib.request import urlretrieve


def mnist(path=None):
    r"""Return (train_images, train_labels, test_images, test_labels).

    Args:
        path (str): Directory containing MNIST. Default is
            /home/USER/data/mnist or C:\Users\USER\data\mnist.
            Created if nonexistent. Any missing files are downloaded.

    Returns:
        Tuple of (train_images, train_labels, test_images, test_labels), each
            a matrix. Rows are examples. Columns of images are the 784 pixel
            values as float32 scaled to [0, 1]. Columns of labels are a
            uint8 onehot encoding of the correct class; the number of
            columns is the largest label found in that file plus one.

    Errors raised while downloading, moving or decompressing a file
    propagate to the caller unchanged.
    """
    url = 'http://yann.lecun.com/exdb/mnist/'
    files = ['train-images-idx3-ubyte.gz',
             'train-labels-idx1-ubyte.gz',
             't10k-images-idx3-ubyte.gz',
             't10k-labels-idx1-ubyte.gz']

    if path is None:
        # Set path to /home/USER/data/mnist or C:\Users\USER\data\mnist
        path = os.path.join(os.path.expanduser('~'), 'data', 'mnist')

    # Create path if it doesn't exist
    os.makedirs(path, exist_ok=True)

    # Download any missing files
    for name in files:
        target = os.path.join(path, name)
        if os.path.exists(target):
            continue
        # Fetch under a temporary name and rename only once the transfer has
        # finished, so an interrupted download is never mistaken for a
        # complete file on the next run. A leftover '.part' file is simply
        # overwritten by the next attempt.
        partial = target + '.part'
        urlretrieve(url + name, partial)
        os.replace(partial, target)
        print("Downloaded %s to %s" % (name, path))

    def _images(file_path):
        """Return images loaded locally, one flattened image per row."""
        with gzip.open(file_path) as f:
            # First 16 bytes are magic_number, n_imgs, n_rows, n_cols
            pixels = np.frombuffer(f.read(), 'B', offset=16)
        return pixels.reshape(-1, 784).astype('float32') / 255

    def _labels(file_path):
        """Return labels loaded locally as a onehot matrix."""
        with gzip.open(file_path) as f:
            # First 8 bytes are magic_number, n_labels
            integer_labels = np.frombuffer(f.read(), 'B', offset=8)

        n_rows = len(integer_labels)
        # Convert to a Python int before adding one so the width is not
        # computed in uint8 arithmetic.
        n_cols = int(integer_labels.max()) + 1
        onehot = np.zeros((n_rows, n_cols), dtype='uint8')
        onehot[np.arange(n_rows), integer_labels] = 1
        return onehot

    train_images = _images(os.path.join(path, files[0]))
    train_labels = _labels(os.path.join(path, files[1]))
    test_images = _images(os.path.join(path, files[2]))
    test_labels = _labels(os.path.join(path, files[3]))

    return train_images, train_labels, test_images, test_labels

# Load MNIST (downloading any missing files into the default data directory).
# Images come back as rows of 784 pixel values and labels as one-hot rows.
train_X, train_y, test_X, test_y = mnist()

## Part 2

In [8]:
# For some odd reason this runs much faster when you restart the 
# kernel and run all cells instead of just running this cell, at least on my pc.

from scipy import stats

# Keep only the leading samples so experiments run quickly.
# Comment these two lines out to use the full-size dataset.
train_X, train_y = train_X[:10000], train_y[:10000]
test_X, test_y = test_X[:1000], test_y[:1000]

# Adapt knn tester to work with n dimensions - TODO: make this more readable.
def test_knn(k: int, test, X, y): # Takes k, test point and model
    if k % 2 == 0:
        print("k must be odd.")

    # Get distance of each point in model from test point.
    test_dist = np.sqrt(np.sum((test - X)**2, axis=1))
    
    # Sort by distance and return k rows of y - (Results can be processed by user in various ways).
    return np.copy(y)[test_dist.argsort()][:k] # Copy so that the actual y array isn't sorted.

# Test the knn solution against model for several odd values of k.
for k in range(1,12,2):
    correct = 0
    wrong = 0

    # Iterate over each test sample together with its one-hot label row.
    for sample, target in zip(test_X, test_y):

        # Guess and parse the guess: each neighbour's one-hot row becomes a
        # class index (argmax is the position of the single 1 in the row).
        guesses_raw = test_knn(k, sample, train_X, train_y)
        guesses = guesses_raw.argmax(axis=1)

        # Majority vote among the neighbours. bincount + argmax picks the
        # most frequent class, preferring the smallest class index on ties,
        # and does not depend on the return shape of scipy.stats.mode, which
        # differs between SciPy versions.
        guess = np.bincount(guesses).argmax()
        response = target.argmax()

        # Record performance.
        if guess == response:
            correct += 1
        else:
            wrong += 1

    print(f"k = {k}, test success rate: ", correct/(correct + wrong)*100, "%", sep="")
    

k = 1, test success rate: 92.0%
k = 3, test success rate: 91.9%
k = 5, test success rate: 91.60000000000001%
k = 7, test success rate: 91.60000000000001%
k = 9, test success rate: 91.3%
k = 11, test success rate: 91.10000000000001%
lol
k = 1, test success rate: 92.0%
k = 3, test success rate: 91.9%
k = 5, test success rate: 91.60000000000001%
k = 7, test success rate: 91.60000000000001%
k = 9, test success rate: 91.3%
k = 11, test success rate: 91.10000000000001%


Using "only" 10,000 samples for training and 1,000 for testing, the test success rate is about 91-92% for every k in {1, 3, 5, 7, 9, 11}. The differences between k values are small, but accuracy decreases slightly as k increases, from 92.0% at k = 1 to 91.1% at k = 11.