In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
class kNN(object):
    """k-nearest-neighbour binary classifier with uniform (unweighted) voting.

    The prediction is the sign of the sum of the neighbours' labels, so labels
    are expected to be +1 / -1.  A tied vote (sum == 0) is resolved to +1.
    """

    def __init__(self, k):
        """Store ``k``, the number of neighbours consulted for each prediction."""
        self.k = k

    def train(self, X, y):
        """Memorise the training set; all computation is deferred to ``predict``.

        X -- training samples, one row per sample.
        y -- labels, where ``y[i]`` belongs to ``X[i]``.
        """
        self.Xtrain = X
        self.ytrain = y

    def predict(self, Xtest):
        """Classify every row of ``Xtest`` by a vote among its k nearest training rows.

        Nearness is the squared Euclidean distance.  If the training set holds
        fewer than ``k`` samples, only the samples that exist take part in the
        vote.

        Returns a numpy array with one entry (1 or -1) per row of ``Xtest``.
        Raises ValueError if a test row ends up with no usable neighbour
        (e.g. the training set is empty).
        """
        # Work on positional numpy views so that ``ytrain[ind]`` is always a
        # positional lookup, whatever container was handed to ``train``.
        Xtrain = np.asarray(self.Xtrain)
        ytrain = np.asarray(self.ytrain)

        prediction = []
        for xtest in np.asarray(Xtest):
            # k placeholder slots, kept sorted by distance.  Index -1 marks a
            # slot that has not been filled by a real training sample yet.
            neighbors = [(float("inf"), -1)] * self.k

            for ind, xtrain in enumerate(Xtrain):
                dist = np.sum((xtest - xtrain) ** 2)

                # The last slot always holds the current worst candidate:
                # replace it when this sample is at least as close, then
                # restore the ordering.
                if dist <= neighbors[-1][0]:
                    neighbors[-1] = (dist, ind)
                    neighbors.sort()

            # Unfilled placeholders must not vote; previously each of them
            # silently cast an extra vote with the label of training sample 0.
            votes = [ytrain[ind] for _, ind in neighbors if ind >= 0]
            if not votes:
                raise ValueError("no training samples available to vote")

            voted_result = sum(votes)
            prediction.append(1 if voted_result >= 0 else -1)

        return np.array(prediction)


In [3]:
# Training split: space-separated values with no header row, so pandas
# labels the columns with integers starting at 0.
train_data = pd.read_csv(
    'Data/hw4_knn_train.dat',
    header=None,
    sep=' ',
)
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.8105,-0.35,0.4769,0.4541,-0.9829,0.5252,0.3838,-0.3408,-0.4824,-1
1,-0.6273,-0.2097,0.9404,0.1143,0.3487,-0.5206,0.0061,0.5024,-0.6687,1
2,0.1624,-0.1173,0.426,-0.3607,-0.6632,0.4431,-0.8355,0.7206,-0.8977,1
3,-1.0,0.7758,-0.267,-0.888,-0.1099,-0.9183,-0.4086,0.8962,0.5841,1
4,0.8464,0.1762,0.2729,0.2724,0.8155,0.6096,-0.2844,0.98,0.3302,-1


In [4]:
# Test split: read with the same settings as the training file
# (space-separated, no header row, integer column labels).
test_data = pd.read_csv(
    'Data/hw4_knn_test.dat',
    header=None,
    sep=' ',
)
test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0531,-0.1884,-0.0351,-0.1796,-0.9891,0.612,0.2486,0.844,-0.5123,-1
1,0.5123,0.5047,0.5404,-0.1742,-0.0317,0.9585,-0.4016,-0.18,-0.5633,1
2,0.3286,0.4251,-0.4837,-0.7065,-0.7546,-0.4727,0.9055,0.4941,-0.6287,1
3,-0.0795,-0.1617,-0.8414,-0.5391,0.6641,0.1269,-0.5806,-0.7375,0.9469,1
4,-0.4362,0.149,-0.7232,0.0802,0.4424,-0.4777,0.6075,0.348,-0.9837,1


In [5]:
# time.clock() was removed in Python 3.8; perf_counter() is the documented
# replacement for measuring elapsed time.
start = time.perf_counter()

K = 5
# Columns 0-8 are used as features, column 9 as the label.
feature_cols = list(range(9))

one_neighbor = kNN(K)
one_neighbor.train(train_data[feature_cols].values, train_data[9].values)
print('Using k-nearest-neighbor with k = %d' % K)

# In-sample error: fraction of training rows whose prediction differs from the label.
train_pred = one_neighbor.predict(train_data[feature_cols].values)
Ein = np.mean(train_pred != train_data[9].values)
print('\tError rate on training set: %.2f %%' % (100 * Ein))

# Out-of-sample error: the same measure on the held-out test rows.
test_pred = one_neighbor.predict(test_data[feature_cols].values)
Eout = np.mean(test_pred != test_data[9].values)
print('\tError rate on test set: %.2f %%' % (100 * Eout))

print('\nUsing %.2f seconds' % (time.perf_counter() - start))

Using k-nearest-neighbor with k = 5
	Error rate on training set: 16.00 %
	Error rate on test set: 31.60 %

Using 0.66 seconds
