In [3]:
import csv
import math
import operator
import random


def loadDataset(filename, split, trainingSet=[], testSet=[]):
    with open(filename, newline='') as csvfile:
        lines = csv.reader(csvfile, delimiter=',')
        dataset = list(lines)
        for x in range(1, len(dataset) - 1):  # skip header
            for y in range(0, 5):
                dataset[x][y] = float(dataset[x][y])
            if random.random() < split:
                trainingSet.append(dataset[x])
            else:
                testSet.append(dataset[x])


def euclideanDistance(instance1, instance2):
    sum = 0
    for i in range(0, 4):
        sum += ((instance1[i] - instance2[i]) ** 2)
    return math.sqrt(sum)


def getNeighbors(trainingSet, testInstance, k):
    distances = []
    length = len(testInstance) - 1
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet[x])
        distances.append((trainingSet[x], dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors


def getResponse(neighbors):
    classVotes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return sortedVotes[0][0]


def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct / float(len(testSet))) * 100.0


# prepare data
trainingSet = []
testSet = []
split = 0.2
loadDataset('Iris (1).csv', split, trainingSet, testSet)
print('Train set: ' + repr(len(trainingSet)))
print('Test set: ' + repr(len(testSet)))
# generate predictions
for y in range(1, 18):
    predictions = []
    for x in range(len(testSet)):
        neighbors = getNeighbors(trainingSet, testSet[x], y)
        result = getResponse(neighbors)
        predictions.append(result)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: ' + repr(accuracy) + '% for count of neighbors ' + str(y))

neighbors = getNeighbors(trainingSet, [1, 5.1, 3.5, 1.4, 0.2], 1)  # Iris-setosa
result = getResponse(neighbors)
print(result)
neighbors = getNeighbors(trainingSet, [73, 6.3, 2.5, 4.9, 1.5], 1)  # Iris-versicolor
result = getResponse(neighbors)
print(result)
neighbors = getNeighbors(trainingSet, [122, 5.6, 2.8, 4.9, 2], 1)  # Iris-virginica
result = getResponse(neighbors)
print(result)


Train set: 24
Test set: 125
Accuracy: 96.0% for count of neighbors 1
Accuracy: 96.0% for count of neighbors 2
Accuracy: 95.19999999999999% for count of neighbors 3
Accuracy: 95.19999999999999% for count of neighbors 4
Accuracy: 92.80000000000001% for count of neighbors 5
Accuracy: 95.19999999999999% for count of neighbors 6
Accuracy: 88.8% for count of neighbors 7
Accuracy: 92.0% for count of neighbors 8
Accuracy: 86.4% for count of neighbors 9
Accuracy: 87.2% for count of neighbors 10
Accuracy: 80.0% for count of neighbors 11
Accuracy: 76.0% for count of neighbors 12
Accuracy: 71.2% for count of neighbors 13
Accuracy: 68.0% for count of neighbors 14
Accuracy: 64.8% for count of neighbors 15
Accuracy: 64.0% for count of neighbors 16
Accuracy: 64.0% for count of neighbors 17
Iris-setosa
Iris-versicolor
Iris-virginica
