# Imported Libraries

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from scipy.spatial import distance
from statistics import mode
import time

In [2]:
# Read text file into np array
training_data = np.loadtxt('./data/pa1train.txt')
training_labels = training_data[:,784]
print(training_data.shape)

(2000, 785)


In [3]:
# Read text file into np array
testing_data = np.loadtxt('./data/pa1test.txt')
testing_labels = testing_data[:,784]
print(testing_data.shape)

(1000, 785)


In [4]:
# Read text files into np array
validation_data = np.loadtxt('./data/pa1validate.txt')
validation_labels = validation_data[:,784]
print(validation_data.shape)

(1000, 785)


# Define KNN Classifier

In [5]:
# Define Function for Gathering K Nearest Points
def getNeighbors(X_train, X_test, K): 
    # Dictionary for storing nearest K data points
    rows = X_train.shape[0]
    cols = (X_train.shape[1] - 1)
    labels = []
    nearest_neighbors = {}
    # Iterate through rows and take euclidean distance
    for i in range(rows): 
        # Take Euclidean Distance of Test Data Point and Each Row Vector
        dist = distance.euclidean(X_test[:cols], X_train[i,:cols])
        label = X_train[i,cols]
        nearest_neighbors[dist] = label
    # Sort Dictionary Based on Euclidean Distances
    counter = 0
    for key in sorted(nearest_neighbors.keys()):
        if counter >= K: 
            # Labels corresponding to nearest K neighbors found, exit
            break
        labels.append(nearest_neighbors[key])
        counter += 1
    # Return List of Labels for K nearest data points
    return labels

In [6]:
# Define Function for Predicting Label 
def K_nearest_neighbor(X_train, X_test, K): 
    # Create list for storing predicted labels
    Y_labels = getNeighbors(X_train, X_test, K)
    # Check if there is a tie between labels
    # Take most common label of the KNN
    Y_pred = max(Y_labels,key = Y_labels.count)
    # Return predicted label 
    return Y_pred

In [7]:
# Define Function for Calculating Error Rate 
def calc_error(Y_pred, Y_label): 
    # Calculate Error Rate for Predicted Labels
    error = [0 for x,y in zip(Y_pred,Y_label) if x != y]
    error_rate = len(error)/len(Y_pred)
    return error_rate

# Question 1: Calculate Training Error

In [8]:
training_data.shape

(2000, 785)

In [9]:
testing_data.shape

(1000, 785)

In [10]:
validation_data.shape

(1000, 785)

In [11]:
# Training Data with K = 1,5,9,15
rows = training_data.shape[0]
KNN = [1,5,9,15]
error = {}
start_time = time.time()
for K in KNN: 
    Y_pred = []
    #print("Nearest Neighbor: ", K)
    for i in range(rows): 
        # Predict label for each feature vector in training data
        #print("Currently on row: ", i)
        prediction = K_nearest_neighbor(training_data, training_data[i,:], K)
        Y_pred.append(prediction)
    # Save Error Rate for Each K
    error[K] = calc_error(Y_pred, training_labels)
end_time = time.time()
print("Time Elapsed: ",abs(end_time - start_time))
print(error)

Time Elapsed:  288.2392728328705
{1: 0.0, 5: 0.0485, 9: 0.0625, 15: 0.0845}


## Check if K = 3 Training Accuracy is Correct

In [12]:
Y_pred_train = []
for i in range(rows): 
    # Predict label for each feature vector in training data
    #print("Currently on row: ", i)
    prediction = K_nearest_neighbor(training_data, training_data[i,:], 3)
    Y_pred_train.append(prediction)
    # Save Error Rate for Each K
training_3_error = calc_error(Y_pred_train, training_labels)
print(training_3_error)

0.034


# Calculate Validation Error

In [13]:
# Validation Data with K = 1,5,9,15
rows = validation_data.shape[0]
KNN = [1,5,9,15]
validation_error = {}
start_time = time.time()
for K in KNN: 
    Y_pred = []
    #print("Nearest Neighbor: ", K)
    for i in range(rows): 
        # Predict label for each feature vector in training data
        #print("Currently on row: ", i)
        prediction = K_nearest_neighbor(training_data, validation_data[i,:], K)
        Y_pred.append(prediction)
    # Save Error Rate for Each K
    validation_error[K] = calc_error(Y_pred, validation_labels)
end_time = time.time()
print("Time Elapsed: ",abs(end_time - start_time))
print(validation_error)

Time Elapsed:  140.82688188552856
{1: 0.082, 5: 0.097, 9: 0.1, 15: 0.102}


# Calculate Test Error w/ Optimal K

In [14]:
Y_pred_test = []
for i in range(rows): 
    # Predict label for each feature vector in training data
    #print("Currently on row: ", i)
    prediction = K_nearest_neighbor(training_data, testing_data[i,:], 1)
    Y_pred.append(prediction)
    # Save Error Rate for Each K
testing_error = calc_error(Y_pred, testing_labels)
print(testing_error)

0.051


# Question 2: Projection Matrix

In [15]:
# Read text file into np array
projection_matrix = np.loadtxt('./data/projection.txt')
training_data = training_data[:,:training_data.shape[1]-1]
validation_data = validation_data[:,:validation_data.shape[1]-1]
testing_data = testing_data[:,:testing_data.shape[1]-1]

In [16]:
# Get Projected Data in 2D Array
proj_training = np.dot(training_data , projection_matrix)
proj_validation = np.dot(validation_data , projection_matrix)
proj_testing = np.dot(testing_data , projection_matrix)

In [17]:
# Reshape Label Vectors
training_labels = training_labels.reshape(2000,1)
testing_labels = testing_labels.reshape(1000,1)
validation_labels = validation_labels.reshape(1000,1)

In [18]:
# Append Labels onto Projected Data
proj_training = np.hstack((proj_training, training_labels))
proj_testing = np.hstack((proj_testing, testing_labels))
proj_validation = np.hstack((proj_validation, validation_labels))

# Training Error with Projected Data

In [19]:
# Projected Training Data with K = 1,5,9,15
rows = proj_training.shape[0]
KNN = [1,3,5,9,15]
error = {}
start_time = time.time()
for K in KNN: 
    Y_pred = []
    #print("Nearest Neighbor: ", K)
    for i in range(rows): 
        # Predict label for each feature vector in training data
        #print("Currently on row: ", i)
        prediction = K_nearest_neighbor(proj_training, proj_training[i,:], K)
        Y_pred.append(prediction)
    # Save Error Rate for Each K
    error[K] = calc_error(Y_pred, training_labels)
end_time = time.time()
print("Time Elapsed: ",abs(end_time - start_time))
print(error)

Time Elapsed:  300.9265892505646
{1: 0.0, 3: 0.0785, 5: 0.1495, 9: 0.198, 15: 0.2325}


# Validation Error with Projected Data

In [20]:
# Validation Data with K = 1,5,9,15
rows = proj_validation.shape[0]
KNN = [1,5,9,15]
proj_validation_error = {}
start_time = time.time()
for K in KNN: 
    Y_pred_validation = []
    #print("Nearest Neighbor: ", K)
    for i in range(rows): 
        # Predict label for each feature vector in training data
        #print("Currently on row: ", i)
        prediction = K_nearest_neighbor(proj_training, proj_validation[i,:], K)
        Y_pred_validation.append(prediction)
    # Save Error Rate for Each K
    validation_error[K] = calc_error(Y_pred_validation, validation_labels)
end_time = time.time()
print("Time Elapsed: ",abs(end_time - start_time))
print(validation_error)

Time Elapsed:  116.72657108306885
{1: 0.32, 5: 0.285, 9: 0.281, 15: 0.285}


# Testing Error with Optimal K 

In [21]:
Y_pred_test = []
for i in range(rows): 
    # Predict label for each feature vector in training data
    #print("Currently on row: ", i)
    prediction = K_nearest_neighbor(proj_training, proj_testing[i,:], 9)
    Y_pred_test.append(prediction)
    # Save Error Rate for Each K
testing_error = calc_error(Y_pred_test, testing_labels)
print(testing_error)

0.282
