In [1]:
# Import all functions necessary for the notebook
import csv
from random import randrange
import math
import operator

In [2]:
##### Loading, and conversion of CSV file #####

## Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = csv.reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

## Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

## Convert string column to integer
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup


##### Normalize Data ###########

# Find the min and max values for each column

def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        colvalues = [row[i] for row in dataset]
        min_value = min(colvalues) 
        max_value = max(colvalues)
        minmax.append([min_value, max_value])
    return minmax

# Normalize the dataset except last row for classification values
def Normalize_Dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)-1):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

In [3]:
### Splitting dataset methods ###

# Split a dataset into a train and test set
def train_test_split(dataset, split):
    train = list()
    train_size = split * len(dataset)
    dataset_copy = list(dataset)
    while len(train) < train_size:
        index = randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
    return train, dataset_copy

# Split a dataset into $k$ folds
def cross_validation_split(dataset, folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / folds)
    for i in range(folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

In [4]:
####### Accuracy for classification problems ######

# Get accuracy of prediction #
def getAccuracy(actual,predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i][-1] == predicted[i]:
            correct += 1
    return (correct / float(len(actual))) * 100.00

# Calculate a Confusion Matrix #
def confusion_matrix(actual, predicted):
    unique = set([row[-1] for row in actual])
    matrix = [list() for x in range(len(unique))]
    for i in range(len(unique)):
        matrix[i] = [0 for x in range(len(unique))]
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for i in range(len(actual)):
        x = lookup[actual[i][-1]]
        y = lookup[predicted[i]]
        matrix[x][y] += 1
    return unique, matrix

# Printing a confusion matrix
def print_confusion_matrix(unique, matrix):
    print('Unique prediction values:')
    print('(P)' + ' '.join(str(x) for x in unique))
    print('(A)---')
    print("Confusion Matrix:")
    for i, x in enumerate(unique):
        print("%s| %s" % (x, ' '.join(str(x) for x in matrix[i])))

# Recall classification estimator #
def recall_precision_calc(matrix):
    for i in range(len(matrix[0])):
        row_values = matrix[i] # row values of matrix
        col_values = [row[i] for row in matrix] # column values of matrix
        tp = col_values[i]
        fp = sum(row_values)-row_values[i] # sum all row values - ones in diagonal
        fn = sum(col_values)-col_values[i] # sum all col values - ones in diagonal
    
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    
    F1_score = 2 * (precision * recall) / (precision + recall)
    
    return recall, precision, F1_score

In [5]:
##### Distances definition ######

#Euclidean Distance
def EuclideanDistance(instance1, instance2, length):
    distance = 0
    for i in range(length):
        distance += pow(instance2[i]-instance1[i],2)
    return math.sqrt(distance)

#Manhattan Distance
def ManhattanDistance(instance1, instance2, length):
    distance = 0
    for i in range(length):
        distance += abs(instance2[i]-instance1[i])
    return distance

#Minkowski distance with parameter p for power 
def MinkowskiDistance(instance1, instance2, length, p):
    distance = 0
    for i in range(length):
        distance += pow(abs(instance2[i]-instance1[i]), p)
    return pow(distance, 1/p)

In [6]:
#Get neighbors
def getNeighbors(trainingSet, testInstance, num_neighbors, distancetype, *args):
    distances = []
    length = len(testInstance)-1
    for i in range(len(trainingSet)):
        if distancetype == "Euclidean":
            dist = EuclideanDistance(testInstance, trainingSet[i], length)
        elif distancetype == "Manhattan":
            dist = ManhattanDistance(testInstance, trainingSet[i], length)
        else:
            dist = MinkowskiDistance(testInstance, trainingSet[i], length, *args)
        distances.append((trainingSet[i],dist))
    distances.sort(key=operator.itemgetter(1))
    #return distances
    neighbors = []
    for x in range(num_neighbors):
        neighbors.append(distances[x][0])
    return neighbors

#Classification from neighbors (Classification problem)
def getResponse(neighbors):
    classVotes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return sortedVotes[0][0]

In [7]:
def main():

    # Load iris dataset
    filename = 'iris.csv'
    dataset = load_csv(filename)
    print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset), len(dataset[0])))
    print('First line of dataset: ', dataset[0])

    # convert string columns to float
    for i in range(4):
        str_column_to_float(dataset, i)
    # convert class column to int
    lookup = str_column_to_int(dataset, 4)
    print('First line of dataset with class defined by integer: ', dataset[0])
    print('')
    print('Dictionary of lookup classes: ', lookup)
    print('\n')
    
    # normalization of dataset
    minmax = dataset_minmax(dataset)
    Normalize_Dataset(dataset, minmax)

    # Splitting dataset between Training and Testing Set
    split = 0.6
    trainingSet, testSet = train_test_split(dataset, split)

    #generate predictions
    print('Algorithm solving:')
    predictions = []
    num_neighbors = 3
    for i in range(len(testSet)):
        neighbors = getNeighbors(trainingSet, testSet[i], num_neighbors, "Euclidean")
        classify = getResponse(neighbors)
        predictions.append(classify)
        print('> predicted=' + repr(classify) + ', actual=' + repr(testSet[i][-1]))

    #Accuracy Assessment
    accuracy = getAccuracy(testSet,predictions)
    print('Accuracy :' + repr(accuracy) + '%')
    unique, matrix = confusion_matrix(testSet, predictions)

    print('\n')
    print_confusion_matrix(unique, matrix)
    print('\n')

    #Calculate properties for recall and precision
    Recall, Precision, F1_score = recall_precision_calc(matrix)
    print('Recall:', Recall)
    print('Precision:', Precision)
    print('F1 score:', F1_score)

main()

Loaded data file iris.csv with 150 rows and 5 columns
First line of dataset:  ['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']
First line of dataset with class defined by integer:  [5.1, 3.5, 1.4, 0.2, 2]

Dictionary of lookup classes:  {'Iris-versicolor': 0, 'Iris-virginica': 1, 'Iris-setosa': 2}


Algorithm solving:
> predicted=2, actual=2
> predicted=2, actual=2
> predicted=2, actual=2
> predicted=2, actual=2
> predicted=2, actual=2
> predicted=2, actual=2
> predicted=2, actual=2
> predicted=2, actual=2
> predicted=2, actual=2
> predicted=2, actual=2
> predicted=2, actual=2
> predicted=2, actual=2
> predicted=2, actual=2
> predicted=2, actual=2
> predicted=2, actual=2
> predicted=2, actual=2
> predicted=2, actual=2
> predicted=0, actual=0
> predicted=0, actual=0
> predicted=0, actual=0
> predicted=0, actual=0
> predicted=0, actual=0
> predicted=0, actual=0
> predicted=0, actual=0
> predicted=0, actual=0
> predicted=0, actual=0
> predicted=0, actual=0
> predicted=0, actual=0
> predicted=0