In [56]:
import csv
import math
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.io as sio
from mpl_toolkits.mplot3d import axes3d
from numpy import mean
from numpy.linalg import inv, norm
from scipy.stats import mode
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances
from sklearn.metrics.pairwise import polynomial_kernel
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils.extmath import weighted_mode
%matplotlib inline

### Import data from csv
Store the data as a numpy array. \
Shape of the full dataset: 
9298 observations (rows) and 257 columns: column 1 holds the true label of the digit, and columns 2 to 257 hold the greyscale pixel values.


In [57]:
# Read the raw CSV into a DataFrame
data_df = pd.read_csv("zipcombo2.csv")

# Continue with a plain ndarray so later steps can be vectorised:
# the first column is taken as the label, every other column as a feature
data = data_df.to_numpy()
y, X = data[:, 0], data[:, 1:]

In [58]:
# Hold out 20% of the samples for testing. The fixed seed makes the split,
# and every number computed from it, reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=0)

In [59]:
class KNN_classifier(object):
    """Brute-force k-nearest-neighbours classifier over precomputed distances.

    The constructor computes the full train/train and train/test distance
    matrices once. ``train`` and ``test`` then label every point by a vote
    among its ``k`` closest training points and return the percentage of
    points whose predicted label differs from the true one.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to the pairwise-distance function.
    y_train, y_test : numpy.ndarray
        Label vectors (indexed with neighbour indices and compared
        element-wise against the predictions).
    n_neighbours : int, default 3
        Number of neighbours taking part in each vote.
    prediction : {'mode', 'weighted'}, default 'mode'
        'mode' is a plain majority vote; 'weighted' weights each neighbour
        by the inverse of its distance.
    metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Distance used to rank neighbours.

    Raises
    ------
    ValueError
        If ``n_neighbours`` is smaller than 1, or ``prediction`` / ``metric``
        is not one of the supported values.
    """

    def __init__(self, X_train, y_train, X_test, y_test, n_neighbours=3, prediction = 'mode', metric ='euclidean'):
        if n_neighbours < 1:
            raise ValueError("n_neighbours must be at least 1, got %r" % (n_neighbours,))
        if prediction not in ('mode', 'weighted'):
            raise ValueError("prediction must be 'mode' or 'weighted', got %r" % (prediction,))

        # Resolve the metric before doing any work so that an unsupported
        # value fails here instead of as a missing attribute in train()/test().
        if metric == 'euclidean':
            pairwise = euclidean_distances
        elif metric == 'manhattan':
            pairwise = manhattan_distances
        else:
            raise ValueError("metric must be 'euclidean' or 'manhattan', got %r" % (metric,))

        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.k = n_neighbours
        self.prediction = prediction
        self.metric = metric

        # Number of training / test points
        self.n_Data = self.y_train.shape[0]
        self.n_Data_test = self.y_test.shape[0]

        # Column i of each matrix holds the distances from every training
        # point to the i-th train (resp. test) point.
        self.distances = pairwise(X_train, X_train)
        self.test_distances = pairwise(X_train, X_test)

    def predict(self, neighbours, weights = None):
        """Return the winning label among ``neighbours``.

        Parameters
        ----------
        neighbours : array-like
            Labels of the selected neighbours.
        weights : array-like, optional
            One vote weight per neighbour; required (and only used) when
            ``self.prediction == 'weighted'``.

        Returns
        -------
        The label with the largest (weighted) vote. Ties go to the smallest
        label, because the candidates are scanned in sorted order.

        Raises
        ------
        ValueError
            If ``self.prediction`` is unsupported, or weights are missing in
            'weighted' mode.
        """
        neighbours = np.asarray(neighbours).ravel()

        if self.prediction == 'mode':
            votes = np.ones(neighbours.shape[0])
        elif self.prediction == 'weighted':
            if weights is None:
                raise ValueError("prediction='weighted' requires one weight per neighbour")
            votes = np.asarray(weights, dtype=float).ravel()
        else:
            raise ValueError("prediction must be 'mode' or 'weighted', got %r" % (self.prediction,))

        # np.unique returns the labels sorted, and argmax keeps the first
        # maximum, so the smallest label wins a tie.
        labels, inverse = np.unique(neighbours, return_inverse=True)
        totals = np.bincount(inverse.ravel(), weights=votes)
        return labels[np.argmax(totals)]

    @staticmethod
    def _inverse_distance_weights(dists):
        """Turn neighbour distances into vote weights (1 / distance).

        If any neighbour sits at distance zero, only the zero-distance
        neighbours vote (weight 1, all others 0). This avoids dividing by
        zero and lets an exact match decide the label.
        """
        coincident = dists == 0
        if coincident.any():
            return coincident.astype(float)
        return 1.0 / dists

    def _classify(self, distances, n_points):
        """Predict a label for each of the first ``n_points`` columns of ``distances``."""
        y_hat = np.zeros(n_points)

        for i in range(n_points):
            column = distances[:, i]

            # Indexes of the k closest training points, nearest first
            neigh_ind = np.argsort(column)[:self.k]
            neighbours = self.y_train[neigh_ind]

            # Weights are only needed, and only computed, for weighted voting
            weights = None
            if self.prediction == 'weighted':
                weights = self._inverse_distance_weights(column[neigh_ind])

            y_hat[i] = self.predict(neighbours, weights)

        return y_hat

    def train(self):
        """Classify the training points and return the training error in percent.

        The train/train distance matrix includes each point's distance to
        itself, so the point being classified is among its own candidate
        neighbours. Predictions are stored in ``self.y_hat``.
        """
        self.y_hat = self._classify(self.distances, self.n_Data)
        return np.sum(self.y_hat!=self.y_train)/self.n_Data*100

    def test(self):
        """Classify the test points and return the test error in percent.

        Predictions are stored in ``self.y_hat_test``.
        """
        self.y_hat_test = self._classify(self.test_distances, self.n_Data_test)
        return np.sum(self.y_hat_test!=self.y_test)/self.n_Data_test*100

In [77]:
# Time 20 complete build / train / test cycles of the classifier with k = 3
times = []
for _ in range(20):
    start_time = time.perf_counter()

    clf = KNN_classifier(X_train, y_train, X_test, y_test, n_neighbours=3, prediction='mode')
    train_error = clf.train()
    test_error = clf.test()

    # Elapsed wall-clock seconds for this cycle
    times.append(time.perf_counter() - start_time)

In [78]:
# Report mean and standard deviation of the recorded cycle times
print(f"Mean computational time: {np.mean(times)!s}+/-{np.std(times)!s} seconds")

Mean computational time: 19.03642466000456+/-4.036722710598951 seconds


## Verify our algorithm is equivalent to the library implementation

In [62]:
# Train/test error of the hand-written classifier for k = 1..5
for k in range(1, 6):
    clf = KNN_classifier(X_train, y_train, X_test, y_test, n_neighbours=k, prediction='mode')
    train_error = clf.train()
    test_error = clf.test()
    print(f"k = {k!s} Train Error : {train_error!s} Test Error : {test_error!s}")

k = 1 Train Error : 0.0 Test Error : 3.387096774193549
k = 2 Train Error : 1.8822264049475665 Test Error : 4.67741935483871
k = 3 Train Error : 1.774670610379134 Test Error : 3.870967741935484
k = 4 Train Error : 2.339338531863404 Test Error : 4.086021505376344
k = 5 Train Error : 2.3662274805055126 Test Error : 4.193548387096775


In [63]:
# Same sweep with scikit-learn's classifier, for comparison with the cell above
for k in range(1, 6):
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train, y_train)
    # score() returns accuracy as a fraction; convert to an error percentage
    train_error = 100 - 100 * neigh.score(X_train, y_train)
    test_error = 100 - 100 * neigh.score(X_test, y_test)
    print(f"k = {k!s} Test Error : {test_error!s} Train Error : {train_error!s}")

k = 1 Test Error : 3.3870967741935374 Train Error : 0.0
k = 2 Test Error : 4.677419354838705 Train Error : 1.8822264049475592
k = 3 Test Error : 3.8709677419354875 Train Error : 1.7746706103791325
k = 4 Test Error : 4.086021505376351 Train Error : 2.3393385318634046
k = 5 Test Error : 4.193548387096783 Train Error : 2.366227480505515


## Preliminary tests to find good range of k

In [64]:
# Coarse sweep over small and larger k to see where the error starts to grow
for k in (*range(1, 6), 10, 15, 20):
    clf = KNN_classifier(X_train, y_train, X_test, y_test, n_neighbours=k, prediction='mode')
    train_error = clf.train()
    test_error = clf.test()
    print(f"k = {k!s} Train Error : {train_error!s} Test Error : {test_error!s}")

k = 1 Train Error : 0.0 Test Error : 3.387096774193549
k = 2 Train Error : 1.8822264049475665 Test Error : 4.67741935483871
k = 3 Train Error : 1.774670610379134 Test Error : 3.870967741935484
k = 4 Train Error : 2.339338531863404 Test Error : 4.086021505376344
k = 5 Train Error : 2.3662274805055126 Test Error : 4.193548387096775
k = 10 Train Error : 3.5762301694003766 Test Error : 4.78494623655914
k = 15 Train Error : 4.235009411132025 Test Error : 5.591397849462366
k = 20 Train Error : 4.961011024468943 Test Error : 6.236559139784946


In [65]:
# scikit-learn reference with distance-weighted voting, k = 1..8
for k in range(1, 9):
    neigh = KNeighborsClassifier(n_neighbors=k, weights='distance')
    neigh.fit(X_train, y_train)
    # score() returns accuracy as a fraction; convert to an error percentage
    train_error = 100 - 100 * neigh.score(X_train, y_train)
    test_error = 100 - 100 * neigh.score(X_test, y_test)
    print(f"k = {k!s} Test Error : {test_error!s} Train Error : {train_error!s}")

k = 1 Test Error : 3.3870967741935374 Train Error : 0.0
k = 2 Test Error : 3.3870967741935374 Train Error : 0.0
k = 3 Test Error : 3.7634408602150557 Train Error : 0.0
k = 4 Test Error : 3.225806451612897 Train Error : 0.0
k = 5 Test Error : 3.8709677419354875 Train Error : 0.0
k = 6 Test Error : 3.709677419354847 Train Error : 0.0
k = 7 Test Error : 4.247311827956992 Train Error : 0.0
k = 8 Test Error : 4.086021505376351 Train Error : 0.0


### 1.1 Basic Results

In [66]:
## 20 runs for each k in {1,..,8}

# Per-k mean and standard deviation of the error percentages over the runs
train_errors = []
train_sd = []
test_errors = []
test_sd = []

for k in range(1,9):

    run_train_errors = []
    run_test_errors = []

    for run in range(20):

        # Seeding the split with the run index gives 20 different splits that
        # are identical on every re-execution. Every k is also evaluated on
        # the same 20 splits, so the per-k averages are directly comparable.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=run)

        clf = KNN_classifier(X_train, y_train, X_test, y_test, n_neighbours=k, prediction = 'mode')
        train_error = clf.train()
        test_error = clf.test()
        run_train_errors.append(train_error)
        run_test_errors.append(test_error)

    train_errors.append(np.mean(run_train_errors))
    print("k: " + str(k) + " Average % train error: "+ str(np.mean(run_train_errors))+ " Average % test error: "+ str(np.mean(run_test_errors)))
    test_errors.append(np.mean(run_test_errors))
    test_sd.append(np.std(run_test_errors))
    train_sd.append(np.std(run_train_errors))

k: 1 Average % train error: 0.0 Average % test error: 3.198924731182796
k: 2 Average % train error: 1.8761763915030922 Average % test error: 4.223118279569892
k: 3 Average % train error: 1.840548534552299 Average % test error: 3.53494623655914
k: 4 Average % train error: 2.4005108900242 Average % test error: 3.6962365591397854
k: 5 Average % train error: 2.5732723850497448 Average % test error: 3.7284946236559136
k: 6 Average % train error: 2.945012100026889 Average % test error: 4.126344086021506
k: 7 Average % train error: 3.0162678139284758 Average % test error: 4.309139784946236
k: 8 Average % train error: 3.3759075020166707 Average % test error: 4.295698924731183


In [67]:
# Collect the error statistics into one table, adding the columns in order
errors_df = pd.DataFrame()
for column, values in (
    ('d', np.array(range(1,9))),
    ('Train Error %', train_errors),
    ('+/- Train %', train_sd),
    ('Test Error %', test_errors),
    ('+/- Test %', test_sd),
):
    errors_df[column] = values

In [68]:
# Bare expression so the notebook renders the error table as cell output
errors_df

Unnamed: 0,d,Train Error %,+/- Train %,Test Error %,+/- Test %
0,1,0.0,0.0,3.198925,0.478011
1,2,1.876176,0.066339,4.223118,0.389729
2,3,1.840549,0.080768,3.534946,0.396129
3,4,2.400511,0.094826,3.696237,0.367356
4,5,2.573272,0.096987,3.728495,0.431207
5,6,2.945012,0.118317,4.126344,0.486812
6,7,3.016268,0.147906,4.30914,0.470323
7,8,3.375908,0.129298,4.295699,0.452668


In [69]:
# Export the error table to csv; index=False leaves the row index out of the file
errors_df.to_csv('KNN-errors_df.csv',index=False)

### 1.2 Cross-Validation

In [70]:
k_stars = []
test_errors = []

# Number of cross-validation folds used to pick k on each training split
n_folds = 5

# The splitter is the same for every run and every k, so build it once.
# shuffle and random_state must be passed by keyword: current scikit-learn
# releases no longer accept them positionally. With an integer seed, every
# call to split() on data of the same length yields the same folds.
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=1)

for run in range(20):

    # Split the data into 80% training, 20% test; seeding with the run index
    # keeps the 20 splits distinct but reproducible across re-executions
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=run)

    # Initialise: any real mean error beats infinity
    best_error = np.inf
    best_k = 0

    for k in range(1,9):

        error = 0

        # Accumulate the validation error of this k over the folds
        for train_index, test_index in kfold.split(X_train):
            Xtrain, Xtest = X_train[train_index], X_train[test_index]
            ytrain, ytest = y_train[train_index], y_train[test_index]
            clf = KNN_classifier(Xtrain, ytrain, Xtest, ytest, n_neighbours=k, prediction = 'mode')
            # Only the held-out fold error drives model selection, so the
            # training error is not computed here
            error += clf.test()

        mean_error = error/n_folds
        if mean_error < best_error:
            best_error = mean_error
            best_k = k

    # Once every k has been considered, refit on the full 80% using k*
    clf = KNN_classifier(X_train, y_train, X_test, y_test, n_neighbours=best_k, prediction = 'mode')
    train_error = clf.train()
    test_error = clf.test()
    print("Run: " + str(run) + " Test Error: " + str(test_error) + " k*:  " + str(best_k))
    test_errors.append(test_error)
    k_stars.append(best_k)

Run: 0 Test Error: 3.1720430107526885 k*:  1
Run: 1 Test Error: 4.46236559139785 k*:  1
Run: 2 Test Error: 3.763440860215054 k*:  1
Run: 3 Test Error: 2.849462365591398 k*:  1
Run: 4 Test Error: 2.741935483870968 k*:  1
Run: 5 Test Error: 3.2795698924731185 k*:  1
Run: 6 Test Error: 3.1720430107526885 k*:  1
Run: 7 Test Error: 3.7096774193548385 k*:  1
Run: 8 Test Error: 3.064516129032258 k*:  1
Run: 9 Test Error: 3.3333333333333335 k*:  3
Run: 10 Test Error: 3.3333333333333335 k*:  3
Run: 11 Test Error: 3.7096774193548385 k*:  1
Run: 12 Test Error: 3.3333333333333335 k*:  1
Run: 13 Test Error: 3.6021505376344085 k*:  1
Run: 14 Test Error: 3.7096774193548385 k*:  1
Run: 15 Test Error: 2.903225806451613 k*:  1
Run: 16 Test Error: 3.655913978494624 k*:  1
Run: 17 Test Error: 2.849462365591398 k*:  3
Run: 18 Test Error: 2.5268817204301075 k*:  1
Run: 19 Test Error: 3.118279569892473 k*:  1


In [71]:
# Summarise the cross-validated runs: mean and standard deviation of both quantities
print(f"Mean test error: {np.mean(test_errors)!s} +/- {np.std(test_errors)!s}")
print(f"Mean k*: {np.mean(k_stars)!s} +/- {np.std(k_stars)!s}")

Mean test error: 3.3145161290322585 +/- 0.4404912565707445
Mean k*: 1.3 +/- 0.714142842854285
