In [1]:
# Assignment1
# Author: David Bui
# DSC-540
# Date: 11/3/2021
# Data Pulled From: https://pjreddie.com/projects/mnist-in-csv/
# Description: Implementation of the K-Nearest Neighbor Algorithm or K-NN. Each observation within the
#  dataset represents an image that depicts a number ranging from 0 to 9. The algorithm learns the 
#  patterns of each number and then predicts against actual measures.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

# Pulling split data, Number and Image column names added.
train = pd.read_csv(r'C:\Users\David\OneDrive\Desktop\GCU Studies\DSC-540\Topic 2\Assignment\Data\mnist_train.csv')
test = pd.read_csv(r'C:\Users\David\OneDrive\Desktop\GCU Studies\DSC-540\Topic 2\Assignment\Data\mnist_test.csv')
mnist = pd.concat([train, test],ignore_index=True)
mnist.head(1)

Unnamed: 0,Number,Image1,Image2,Image3,Image4,Image5,Image6,Image7,Image8,Image9,...,Image775,Image776,Image777,Image778,Image779,Image780,Image781,Image782,Image783,Image784
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Dimensional Reduction
for col in mnist.columns:
    if len(mnist[col].unique()) == 1:
        mnist.drop(col,inplace=True,axis=1)
mnist.head(1)

Unnamed: 0,Number,Image13,Image14,Image15,Image16,Image33,Image34,Image35,Image36,Image37,...,Image771,Image772,Image773,Image774,Image775,Image776,Image777,Image778,Image779,Image780
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
from sklearn.model_selection import train_test_split
# Instantiate vector variables
X_mnist = mnist.iloc[: , 1:]
y_mnist = mnist.iloc[: , :1]

# Splitting data into train and test sets, its a 80% to 20% split
# From researching I learned that a 7:3 or 8:2 ratio is a good split for prediction testing.
X, X_test, y, y_test = train_test_split(X_mnist, y_mnist, test_size=0.2, random_state=1)

In [4]:
# Implementing the KNN algorithm
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 10)

# Fit the classifier to the data
knn.fit(X,y.values.ravel())

KNeighborsClassifier(n_neighbors=10)

In [19]:
# Define the Euclidean distance between an element in the test set and the training set.
from scipy.spatial import distance
# Euclidean Distance: the distance between two elements.
train_element = X.loc[[2]] # Number 4
test_element = X_test.loc[[59772]] # Also Number 4

distance = distance.euclidean(X.loc[[2]],test_element)
print("Distance between same elements within training and test sets:",distance)

Distance between same elements within training and test sets: 2109.4515874985136


In [20]:
# Calculate the distance between the test element and each of if its k nearest neighbors.
from sklearn.neighbors import NearestNeighbors
neighbors = NearestNeighbors(n_neighbors=10)
neighbors.fit(X,y)
NearestNeighbors(algorithm='auto', leaf_size=30)
ndist,indexN = neighbors.kneighbors(test_element)
print("Distances to nearest neighbors:",ndist)

Distances to nearest neighbors: [[ 843.50222288  905.22870038  961.80039509  989.68479831  990.4584797
  1022.27882694 1047.9460864  1069.49614305 1075.32320723 1076.81660463]]


In [81]:
# Count the occurrence of each digit within the k nearest neighbors and identify the most popular digit.
from collections import Counter
array = []
number = indexN.tolist()
for i in range(len(number)):
    array.append(y_mnist.loc[number[i]])
array # This array reflects the distance of each neighbor in decending order, the next step is to measure frequency.
# The frequency has higher priority over the distance, hence popular vote. In case of a draw, distance is the deciding factor.

[       Number
 26111       4
 6361        7
 3390        2
 27389       3
 37370       6
 38938       4
 54139       9
 43486       2
 33919       6
 37423       7]

In [85]:
# Identify the test element as the digit voted as most popular in the set of the k nearest neighbors.
Counter([4,7,2,3,6,4,9,2,6,7]) # I did this because the method I used involving ndist,indexN broke the array and made it immutable
# the nearestneighbor function ordered the distancing first, and the counter ordered the frequency 2nd.
# This leaves number 4 as the most frequent and nearest neighbor.

Counter({4: 2, 7: 2, 2: 2, 3: 1, 6: 2, 9: 1})

In [107]:
# Classify the test element accordingly (i.e. based on the popular vote).
predict = 4
actual = y_test.loc[[59772]]
actual
# Prediciton correct.

Unnamed: 0,Number
59772,4


In [109]:
# Calculate the error.
# check accuracy of our model on the test data
y_pred = knn.predict(X_test)

accur = knn.score(X_test, y_test) # Above 95% accuracy!
error = 1-accur
print(error)

0.03192857142857142


In [112]:
# K-Fold Cross-Validation Method
from sklearn.model_selection import cross_val_score

#train model with cv of 5 
cv_scores = cross_val_score(knn, X, y.values.ravel(), cv=5)

#print each cv score (accuracy) and average them
print("Mean Cross-Validation Accuracy:",np.mean(cv_scores))# Also above 95%!

Mean Cross-Validation Accuracy: 0.9650892857142856


In [125]:
# Confusion Matrix
from sklearn import metrics

#labels=["0","1","2","3","4","5","6","7","8","9"]
print(" Actual:  0     1     2     3     4     5     6     7     8     9")
metrics.confusion_matrix(y_test, y_pred)

 Actual:  0     1     2     3     4     5     6     7     8     9


array([[1370,    1,    3,    1,    0,    1,    2,    1,    0,    1],
       [   1, 1622,    5,    2,    0,    0,    1,    1,    0,    0],
       [  13,   14, 1372,    3,    2,    0,    1,   19,    6,    3],
       [   2,    6,    7, 1371,    0,    8,    3,   14,   10,   10],
       [   0,   11,    1,    0, 1284,    0,    1,    6,    0,   25],
       [   1,    4,    1,   13,    2, 1254,   11,    0,    3,    8],
       [   5,    1,    0,    0,    2,    8, 1315,    0,    0,    0],
       [   2,   21,    2,    0,    2,    0,    0, 1404,    1,   12],
       [   8,   23,    1,    8,    6,   25,    7,    3, 1250,   20],
       [   2,    8,    3,    8,   12,    2,    1,   22,    4, 1311]],
      dtype=int64)