# Part 1 - With builtin Python Data structures

In [1]:
import random
import math
import operator
import profile

train = []
test = []
computedClassesforTrain = []

def loadDataAndSplit():
    w, h = 5, 150
    Matrix = [[0 for x in range(w)] for y in range(h)]
    file = open('iris.csv','r')
    i=0
    for line in file:
        listt = line.split(',')
        Matrix[i][0] = float(listt[0])
        Matrix[i][1] = float(listt[1])
        Matrix[i][2] = float(listt[2])
        Matrix[i][3] = float(listt[3])
        Matrix[i][4] = listt[4].rstrip()
        i+=1
    x = 0
    while x!= 51:
        x+=1
        ind = random.randint(0,149)
        if Matrix[ind] in test:
            x-=1
            continue
        else:
            test.append(Matrix[ind])
    
    for x in range(150):
        if Matrix[x] not in test:
            train.append(Matrix[x])

def computeEucledianDistance(tuple1,tuple2):
    distance = 0
    
    distance += pow(tuple1[0] - tuple2[0],2)
    distance += pow(tuple1[1] - tuple2[1],2)
    distance += pow(tuple1[2] - tuple2[2],2)
    distance += pow(tuple1[3] - tuple2[3],2)
    
    return math.sqrt(distance)

def getNeighbours(tuple):
    distances = []
    neighbours = []
    for i in range(len(train)):
        dist = computeEucledianDistance(tuple, train[i])
        distances.append((train[i],dist))
    distances.sort(key=operator.itemgetter(1))
    
    neighbours.append(distances[0][0])
    neighbours.append(distances[1][0])
    neighbours.append(distances[2][0])
    
    return neighbours

def getBelongingClass(neighbours):
    frequencies = dict()
    for tuple in neighbours:
        c = tuple[4]
        if c in frequencies:
            frequencies[c] += 1
        else:
            frequencies[c] = 1
    
    frequencies = sorted(frequencies.items(), key=lambda kv: kv[1], reverse=True)
    return frequencies[0][0]
    
def computeAccuracy(computed):
    correct = 0
    for tuple in train:
        if tuple[4] == computed[train.index(tuple)]:
            correct += 1
    accuracy = "{:.4f}".format((correct/len(train))*100)
    print("Accuracy : ",accuracy,"%")
    
if __name__ == '__main__':
    loadDataAndSplit()
    neighbours = []
    for tuple in train:
        neighbours = getNeighbours(tuple)
        computedClassesforTrain.append(getBelongingClass(neighbours))
    computeAccuracy(computedClassesforTrain)
                  
    #%prun getBelongingClass(neighbours)
    #%prun loadDataAndSplit()
    #%prun getNeighbours(train[0])
    

Accuracy :  94.8454 %


# Part 2 - With Numpy and Pandas

In [2]:
import random
import math
import operator
import pandas as pd
import numpy as np

train = []
test = []
computedClassesforTrain = []

def loadDataAndSplit():
    df = pd.read_csv('./IRIS.csv', header=None)
    tr=df.sample(frac=0.66,random_state=150)
    tt=df.drop(tr.index)
    
    train = np.array(tr)
    test = np.array(tt)
    
    return train,test
    

def computeEucledianDistance(tuple1,tuple2):
    distance = 0
    
    distance += pow(tuple1[0] - tuple2[0],2)
    distance += pow(tuple1[1] - tuple2[1],2)
    distance += pow(tuple1[2] - tuple2[2],2)
    distance += pow(tuple1[3] - tuple2[3],2)
    
    return math.sqrt(distance)

def getNeighbours(tuple):
    distances = []
    neighbours = []
    for i in range(len(train)):
        dist = computeEucledianDistance(tuple, train[i])
        distances.append((train[i],dist))
    distances.sort(key=operator.itemgetter(1))
    
    neighbours.append(distances[0][0])
    neighbours.append(distances[1][0])
    neighbours.append(distances[2][0])
    
    return neighbours

def getBelongingClass(neighbours):
    frequencies = dict()
    for tuple in neighbours:
        c = tuple[4]
        if c in frequencies:
            frequencies[c] += 1
        else:
            frequencies[c] = 1
    
    frequencies = sorted(frequencies.items(), key=lambda kv: kv[1], reverse=True)
    return frequencies[0][0]
    
def computeAccuracy(computed):
    correct = 0
    ind = 0
    for tuple in train:
        if tuple[4] == computed[ind]:
            correct += 1
        ind += 1
    accuracy = "{:.4f}".format((correct/len(train))*100)
    print("Accuracy : ",accuracy,"%")
    
if __name__ == '__main__':
    train,test = loadDataAndSplit()
    neighbours = []
    for tuple in train:
        neighbours = getNeighbours(tuple)
        computedClassesforTrain.append(getBelongingClass(neighbours))
    computeAccuracy(computedClassesforTrain)
    
    #%prun getBelongingClass(neighbours)
    #%prun loadDataAndSplit()
    #%prun getNeighbours(train[0])

Accuracy :  97.9798 %


# Part 3 - Performance Profiling

           Data Load and Split    Get Neighbors    Get Class     Total time
Part 1      0.010s                  0.003s          0.00s          0.013s  
Part 2      0.002s                  0.003s          0.00s          0.005s

The major portion where the numpy implementation got ahead of the basic implementation, was in the data load and split function. Turns out that the built in methods to load and process data are much more robust than when wrote using existing data structures.