In [1]:
import pandas as pd
import numpy as np
import sklearn as sk

# Column names applied to both files; they are read with header=None, so no
# row of the file is interpreted as a header.
header = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'salary',
]

# The literal ' ?' (with its leading space) is parsed as a missing value.
# NOTE(review): the paths are absolute and machine-specific; the notebook will
# not run elsewhere without editing them.
data_df = pd.read_csv(
    r'F:\COMP551\p1\Adult Dataset\adult.data',
    names=header,
    header=None,
    na_values=' ?',
)
# NOTE(review): the stock UCI adult.test is commonly distributed with a
# non-data first line -- confirm whether this copy needs skiprows=1.
test_df = pd.read_csv(
    r'F:\COMP551\p1\Adult Dataset\adult.test',
    names=header,
    header=None,
    na_values=' ?',
)

In [2]:
def euclidean(x1, x2):
    """Euclidean (L2) distance between ``x1`` and ``x2``, reduced over the last axis."""
    return np.sqrt(np.sum((x1 - x2)**2, axis=-1))


def manhattan(x1, x2):
    """Manhattan (L1) distance between ``x1`` and ``x2``, reduced over the last axis."""
    return np.sum(np.abs(x1 - x2), axis=-1)


class KNN:
    """K-nearest-neighbours classifier (lazy learner).

    Parameters
    ----------
    K : int
        Number of neighbours consulted per prediction.
    dist_fn : callable
        ``dist_fn(a, b)`` must broadcast ``a`` and ``b`` and reduce the last
        (feature) axis, like ``euclidean`` and ``manhattan`` above.
    """

    def __init__(self, K=1, dist_fn= euclidean):
        self.dist_fn = dist_fn
        self.K = K

    def fit(self, x, y):
        '''Store the training data; a lazy learner does no work until predict().

        Parameters
        ----------
        x : array-like, shape [num_train, num_features]
        y : array-like, shape [num_train]
            Non-negative integer class ids (integer-valued floats/bools are
            accepted and converted).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the shapes are inconsistent or ``y`` is not made of
            non-negative integer values.
        '''
        features = np.asarray(x)
        labels = np.asarray(y)
        if features.ndim != 2:
            raise ValueError("x must be 2-D: [num_train, num_features]")
        if labels.ndim != 1 or labels.shape[0] != features.shape[0]:
            raise ValueError("y must be 1-D with one label per row of x")
        if labels.shape[0] == 0:
            raise ValueError("cannot fit on an empty training set")
        # np.bincount only accepts integer input, so normalise the dtype here
        # instead of failing later inside predict().
        if not np.issubdtype(labels.dtype, np.integer):
            as_int = labels.astype(int)
            if not np.array_equal(as_int, labels):
                raise ValueError("y must contain integer class ids")
            labels = as_int
        if labels.min() < 0:
            raise ValueError("y must contain non-negative class ids")
        self.x = features
        self.y = labels
        # Classes are assumed to be labelled 0..C-1.
        self.C = int(labels.max()) + 1
        return self

    def predict(self, x_test):
        '''Predict class probabilities for ``x_test`` from the stored training data.

        Parameters
        ----------
        x_test : array-like, shape [num_test, num_features]

        Returns
        -------
        y_prob : ndarray, shape [num_test, C]
            Fraction of the K nearest training samples belonging to each class.
        knns : ndarray of int, shape [num_test, K]
            Row i holds the indices of the K closest training samples to the
            i-th test sample, nearest first.

        Raises
        ------
        ValueError
            If ``x_test`` has the wrong shape or K is outside [1, num_train].
        '''
        x_test = np.asarray(x_test)
        if x_test.ndim != 2:
            raise ValueError("x_test must be 2-D: [num_test, num_features]")
        num_train, num_features = self.x.shape
        if x_test.shape[1] != num_features:
            raise ValueError("x_test must have the same number of features as the training data")
        if not 1 <= self.K <= num_train:
            raise ValueError("K must be between 1 and the number of training samples")

        num_test = x_test.shape[0]
        #ith-row of knns stores the indices of k closest training samples to the ith-test sample
        knns = np.zeros((num_test, self.K), dtype=int)
        #ith-row of y_prob has the probability distribution over C classes
        y_prob = np.zeros((num_test, self.C))
        train = self.x[None, :, :]
        for i in range(num_test):
            # Distances are computed one test row at a time: materialising the
            # full [num_test, num_train, num_features] difference tensor does
            # not fit in memory for realistically sized datasets. The operands
            # keep the same ranks dist_fn would see in the all-at-once form.
            distances = self.dist_fn(train, x_test[i:i + 1, None, :])[0]
            knns[i, :] = np.argsort(distances)[:self.K]
            #counts the number of instances of each class in the K-closest training samples
            y_prob[i, :] = np.bincount(self.y[knns[i, :]], minlength=self.C)
        #each row sums to K, so dividing by K yields a probability distribution
        y_prob /= self.K
        return y_prob, knns

In [3]:
def split(dataset, folds):
    """Partition ``dataset`` into ``folds`` consecutive chunks.

    Thin wrapper around ``np.array_split``: the split is along the first axis
    and ``folds`` does not need to divide the number of rows evenly.

    Returns
    -------
    list
        The ``folds`` chunks, in their original order.
    """
    chunks = np.array_split(dataset, folds)
    return chunks

# Partition the loaded training frame into 5 chunks.
# NOTE(review): `test` is not referenced by any of the cells visible here --
# possibly leftover scratch; confirm before relying on it.
test = split(dataset=data_df, folds=5)

In [4]:
def cross_validation(dataset, folds, K, label_col=14):
    """Choose the number of neighbours for ``KNN`` by k-fold cross-validation.

    Every candidate ``k`` in ``1..K`` (inclusive) is scored by its mean
    validation accuracy over ``folds`` consecutive, unshuffled folds.

    Parameters
    ----------
    dataset : array-like, shape [num_samples, num_columns]
        Numeric data holding the features and one label column. Labels must
        be non-negative integer class ids.
    folds : int
        Number of folds; must satisfy ``2 <= folds <= num_samples``.
    K : int
        Largest neighbour count to evaluate.
    label_col : int, optional
        Index of the label column (default 14). Negative indices count from
        the end.

    Returns
    -------
    best_acc : float
        Highest mean validation accuracy found (0 if nothing beat 0).
    best_config : int
        The ``k`` that achieved ``best_acc`` (0 if nothing beat 0).
    best_model : KNN
        The model built with ``best_config``. It is left fitted on the
        training portion of the final fold; it is not refit on all the data.
        When no candidate scored above 0 this is the ``KNN(K=0)`` placeholder.

    Raises
    ------
    ValueError
        If ``dataset`` is not 2-D, ``folds`` is out of range, or ``K`` exceeds
        the size of the smallest training partition.
    """
    data = np.asarray(dataset)
    if data.ndim != 2:
        raise ValueError("dataset must be 2-D: [num_samples, num_columns]")
    n_samples = data.shape[0]
    if not 2 <= folds <= n_samples:
        raise ValueError("folds must be between 2 and the number of samples")

    # The largest validation fold has ceil(n / folds) rows, which leaves the
    # smallest training partition; k cannot exceed that many neighbours.
    largest_fold = -(-n_samples // folds)
    if K > n_samples - largest_fold:
        raise ValueError("K exceeds the size of the smallest training partition")

    # Separate labels from features so the target never contributes to the
    # distance computation (that would leak the answer into the neighbours).
    labels = data[:, label_col].astype(int)
    features = np.delete(data, label_col, axis=1).astype(float)

    # The partition does not depend on k, so build it once up front.
    x_parts = split(features, folds)
    y_parts = split(labels, folds)

    best_acc = 0
    best_config = 0
    best_model = KNN(K=0)  # placeholder returned when no candidate beats 0

    for k in range(1, K + 1):

        model = KNN(K=k)
        acc_total = 0.0

        for i in range(folds):
            # Train on every fold except the i-th, validate on the i-th.
            x_train = np.concatenate(
                [part for j, part in enumerate(x_parts) if j != i], axis=0)
            y_train = np.concatenate(
                [part for j, part in enumerate(y_parts) if j != i], axis=0)

            y_prob, _ = model.fit(x_train, y_train).predict(x_parts[i])
            y_pred = np.argmax(y_prob, axis=-1)
            acc_total += np.mean(y_pred == y_parts[i])

        acc_mean = acc_total / folds

        if best_acc < acc_mean:
            best_acc = acc_mean
            best_config = k
            best_model = model

    return best_acc, best_config, best_model

    
        
        