In [21]:
import math
import operator
import pandas as pd

from sklearn.model_selection import train_test_split

In [22]:
################
#
# Handling data
#
################

iris_df = pd.read_csv("iris_data.csv")


In [23]:
iris_df.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [24]:
train_df, test_df = train_test_split(iris_df, test_size = 0.25, random_state = 42)

In [25]:
train_df.iloc[0][1]

3.6

In [26]:
def euclideanDistance(instance1, instance2, length):

    distance_squared = 0
    for idx in range(length):
        distance_squared += math.pow( instance2[idx] - instance1[idx], 2)
        
    return math.sqrt( distance_squared )


## No need to use Euclidean distance, since minkowski is a general case
    # for p=1/2 minkowski distance = euclidean distance
def minkowskiDistance(instance1,instance2, length, p=1/2):

    distance_squared = 0
    for idx in range(length):
        distance_squared += math.pow( abs(instance2[idx] - instance1[idx]), int(1/p))
        
    return math.pow( distance_squared, p )


In [27]:
trainInstance = [[2, 2, 2, 'a'], [4, 4, 4, 'b']]
testInstance = [5, 5, 5]


min_d = minkowskiDistance([2, 2, 2, 'a'], [5, 5, 5], len(testInstance)-1, 1/2)
euc_d = euclideanDistance([2, 7, 2, 'a'], [5, 5, 5], len(testInstance)-1)
# Are both distances equal for p=1/2?
print( math.pow( math.pow( abs(7-2), 2.0), 1/2 ) )
print( math.sqrt( math.pow( 7-2, 2) ) ) 
print(min_d == euc_d)

5.0
5.0
False


In [28]:

def getNeighbors(trainingSet, testInstance, k, p = 1/2):

    distances = []

    length = len(testInstance)-1

    for x in range(len(trainingSet)):

        dist = minkowskiDistance(testInstance, trainingSet[x], length, p)

        distances.append((trainingSet[x], dist))

        distances.sort(key=operator.itemgetter(1))

        neighbors = []

    for x in range(k):

        neighbors.append(distances[x][0])

    return neighbors





In [29]:
trainSet = [[2, 2, 2, 'a'], [4, 4, 4, 'b']]
testInstance = [5, 5, 5]
k = 1
neighbors = getNeighbors(trainSet, testInstance, 1)
print(neighbors)


[[4, 4, 4, 'b']]


In [30]:
def getResponse(neighbors):

    classVotes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1] #complete with appropriate number

        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    

    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    # This is inaccurate in case of ties
    return sortedVotes[0][0]




In [31]:
neighbors = [[1,1,1,'a'], [2,2,2,'a'], [3,3,3,'b']]

response = getResponse(neighbors)

print(response)

a


In [32]:
def getAccuracy(testSet, predictions):

    assert len(testSet) == len(predictions)

    correct = 0
    for idx in range(len(testSet)):
        if testSet[idx][-1] == predictions[idx]:
            correct += 1


    return (correct/float(len(testSet))) * 100.0

In [33]:
testSet = [[1,1,1,'a'], [2,2,2,'a'], [3,3,3,'b']]

predictions = ['a', 'a', 'a']

accuracy = getAccuracy(testSet, predictions)

print(accuracy)



66.66666666666666


In [48]:
def main_knn_scorer(train_df, test_df,p):

    predictions = []
    test_df_resultless = test_df.drop(columns="species")
    for idx in range(len(test_df)):
        instance_neighbours = getNeighbors(train_df.values.tolist(), test_df_resultless.iloc[idx], k=4, p=4)
        prediction = getResponse(instance_neighbours)
        predictions.append(prediction)

   
    return getAccuracy(test_df.values.tolist(),predictions )

In [49]:
accuracy_euclidean = main_knn_scorer(train_df, test_df, p=1/2)
accuracy_manhattan = main_knn_scorer(train_df, test_df, p=1)
accurracy_minkowski = main_knn_scorer(train_df, test_df, p=6)

print("Accuracy using Euclidean distance: " + str(accuracy_euclidean))
print("Accuracy using Manhattan distance: " + str(accuracy_manhattan))
print("Accuracy using Minkowski distance: " + str(accurracy_minkowski))

Accuracy using Euclidean distance: 39.473684210526315
Accuracy using Manhattan distance: 39.473684210526315
Accuracy using Minkowski distance: 39.473684210526315
