In [1]:
# Import all functions necessary for the notebook
import csv
from random import randrange
import math
import operator

In [2]:
##### Loading, and conversion of CSV file #####

## Load a CSV file
def load_csv(filename):
    """Read a CSV file into a list of rows, skipping empty rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty record; each entry is the list
        of string fields produced by ``csv.reader``.
    """
    # newline='' hands newline handling to the csv module, as its
    # documentation asks; without it, line breaks embedded in quoted fields
    # are translated by the text layer before the reader sees them.
    with open(filename, 'r', newline='') as file:
        return [row for row in csv.reader(file) if row]

## Convert string column to float
def str_column_to_float(dataset, column):
    """Parse one column of every row as a float, in place.

    For each row, the value at index ``column`` has its surrounding
    whitespace stripped and is then replaced by its ``float`` conversion.
    Nothing is returned; the rows themselves are modified.
    """
    for record in dataset:
        text = record[column]
        record[column] = float(text.strip())

## Convert string column to integer
def str_column_to_int(dataset, column):
    """Replace the distinct values of one column with integer codes, in place.

    Codes are assigned in order of first appearance in ``dataset`` (the
    first distinct value becomes 0, the next one 1, ...). Iterating a
    ``set`` instead would make the mapping depend on hash ordering, which
    for strings can change from one interpreter run to the next.

    Args:
        dataset: List of rows (mutable sequences); modified in place.
        column: Index of the column to encode.

    Returns:
        dict mapping each original value to the integer code that replaced it.
    """
    lookup = dict()
    for row in dataset:
        # setdefault only inserts unseen values, so len(lookup) is always
        # the next unused code.
        lookup.setdefault(row[column], len(lookup))
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup


##### Normalize Data ###########

# Find the min and max values for each column

def dataset_minmax(dataset):
    """Collect the ``[min, max]`` pair of every column.

    The number of columns is taken from the first row, and every row is
    indexed up to that width.

    Returns:
        A list of two-element lists, one per column, in column order.
    """
    def column(idx):
        # All values found at position idx, one per row.
        return [record[idx] for record in dataset]

    width = len(dataset[0])
    return [[min(values), max(values)] for values in map(column, range(width))]

# Normalize every column of the dataset except the last one (the target/class value)
def Normalize_Dataset(dataset, minmax):
    """Min-max scale every column except the last one, in place.

    Each value ``v`` in column ``i`` becomes ``(v - min_i) / (max_i - min_i)``
    using the bounds in ``minmax[i]``. The last element of every row is
    left untouched.

    Args:
        dataset: List of rows of numbers; modified in place.
        minmax: Per-column ``[min, max]`` pairs, indexed like the columns.
    """
    for row in dataset:
        for i in range(len(row) - 1):
            low = minmax[i][0]
            span = minmax[i][1] - low
            # A constant column has zero span; map it to 0.0 rather than
            # raising ZeroDivisionError.
            row[i] = (row[i] - low) / span if span else 0.0

In [3]:
### Splitting dataset methods ###

# Split a dataset into a train and test set
def train_test_split(dataset, split):
    """Randomly partition a dataset into a training set and a test set.

    Args:
        dataset: Sequence of rows; it is copied, the original is not modified.
        split: Fraction of the rows to put in the training set (e.g. 0.6).

    Returns:
        A ``(train, test)`` tuple of lists. ``train`` holds
        ``split * len(dataset)`` rows (rounded up) drawn at random without
        replacement; ``test`` holds the remaining rows in their original
        relative order.

    Raises:
        ValueError: raised by ``randrange`` when ``split`` asks for more
            rows than the dataset contains.
    """
    remaining = list(dataset)
    # split * len(dataset) can land a hair above an exact integer through
    # float rounding (0.07 * 100 == 7.000000000000001), which used to add a
    # spurious extra training row. The tolerance removes that noise while
    # genuine fractions are still rounded up.
    train_size = math.ceil(split * len(remaining) - 1e-9)
    train = list()
    while len(train) < train_size:
        train.append(remaining.pop(randrange(len(remaining))))
    return train, remaining

# Split a dataset into $k$ folds
def cross_validation_split(dataset, folds):
    """Deal the dataset into ``folds`` random, equally sized folds.

    Every fold receives ``int(len(dataset) / folds)`` rows drawn at random
    without replacement from a copy of the dataset; rows left over after
    the last fold is filled are not returned.

    Returns:
        A list containing ``folds`` lists of rows.
    """
    pool = list(dataset)
    per_fold = int(len(dataset) / folds)
    partitions = []
    for _ in range(folds):
        # Pull per_fold rows, one random pick at a time, from what is left.
        partitions.append(
            [pool.pop(randrange(len(pool))) for _ in range(per_fold)]
        )
    return partitions

In [4]:
###### Error metrics for regression problems ########

# Calculate mean absolute error (MAE) #
def mae_metric(actual, predicted):
    """Mean absolute error of the predictions.

    Args:
        actual: Rows whose last element is the true value.
        predicted: Predicted values, index-aligned with ``actual``.

    Returns:
        The average of ``|predicted[i] - actual[i][-1]|`` as a float.
    """
    total = 0.0
    for position, row in enumerate(actual):
        total += abs(predicted[position] - row[-1])
    count = float(len(actual))
    return total / count

# Calculate root mean squared error #
def rmse_metric(actual, predicted):
    """Root mean squared error of the predictions.

    Args:
        actual: Rows whose last element is the true value.
        predicted: Predicted values, index-aligned with ``actual``.

    Returns:
        The square root of the mean of ``(predicted[i] - actual[i][-1]) ** 2``.
    """
    squared_total = 0.0
    for position, row in enumerate(actual):
        residual = predicted[position] - row[-1]
        squared_total += residual ** 2
    return math.sqrt(squared_total / float(len(actual)))

In [5]:
##### Distances definition ######

#Euclidean Distance
def EuclideanDistance(instance1, instance2, length):
    """Euclidean distance over the first ``length`` components.

    Only indices ``0 .. length - 1`` of the two instances are compared;
    anything after that is ignored.
    """
    squared_gaps = (
        pow(instance2[idx] - instance1[idx], 2) for idx in range(length)
    )
    total = 0
    for term in squared_gaps:
        total += term
    return math.sqrt(total)

#Manhattan Distance
def ManhattanDistance(instance1, instance2, length):
    """Manhattan (city-block) distance over the first ``length`` components.

    Sums the absolute differences at indices ``0 .. length - 1``; any
    further elements of the instances are ignored.
    """
    total = 0
    for idx in range(length):
        gap = instance2[idx] - instance1[idx]
        total += abs(gap)
    return total

#Minkowski distance with parameter p for power 
def MinkowskiDistance(instance1, instance2, length, p):
    """Minkowski distance of order ``p`` over the first ``length`` components.

    Computes ``(sum(|b_i - a_i| ** p)) ** (1 / p)`` for indices
    ``0 .. length - 1``.
    """
    powered_sum = 0
    for idx in range(length):
        gap = abs(instance2[idx] - instance1[idx])
        powered_sum += pow(gap, p)
    root = 1 / p
    return pow(powered_sum, root)

In [6]:
#Get neighbors
def getNeighbors(trainingSet, testInstance, num_neighbors, distancetype, *args):
    """Return the ``num_neighbors`` training rows closest to ``testInstance``.

    Distances are measured over all but the last element of
    ``testInstance``. ``distancetype`` selects the metric: "Euclidean" or
    "Manhattan"; any other value uses the Minkowski distance, with ``*args``
    forwarded to it (its ``p`` parameter).

    Returns:
        A list of training rows ordered from nearest to farthest; rows at
        equal distance keep their training-set order.

    Raises:
        IndexError: if ``num_neighbors`` exceeds the number of training rows.
    """
    n_features = len(testInstance) - 1

    # Pick the metric once instead of re-testing the name for every row.
    if distancetype == "Euclidean":
        def measure(candidate):
            return EuclideanDistance(testInstance, candidate, n_features)
    elif distancetype == "Manhattan":
        def measure(candidate):
            return ManhattanDistance(testInstance, candidate, n_features)
    else:
        def measure(candidate):
            return MinkowskiDistance(testInstance, candidate, n_features, *args)

    scored = [(candidate, measure(candidate)) for candidate in trainingSet]
    ranked = sorted(scored, key=lambda pair: pair[1])
    return [ranked[rank][0] for rank in range(num_neighbors)]

#Regression by taking mean from neighbors (Regression problem)
def getRegression(neighbors):
    """Predict a value as the mean of the neighbours' last elements.

    Args:
        neighbors: Rows whose last element is the known output value.

    Returns:
        The arithmetic mean of those output values as a float.
    """
    last_of = operator.itemgetter(-1)
    targets = list(map(last_of, neighbors))
    return sum(targets) / float(len(targets))

In [7]:
def main(filename='abalone.csv', split=0.6, num_neighbors=3, seed=None):
    """Run k-nearest-neighbour regression on a CSV dataset and print the RMSE.

    The file is loaded, columns 1 onwards are converted to floats, the first
    column is encoded as integer codes, and the rows are split at random
    into a training and a test set. Every test row is predicted as the mean
    output of its nearest training rows (Euclidean distance); each
    prediction and the final RMSE are printed.

    Args:
        filename: Path of the CSV file to load.
        split: Fraction of rows used for training.
        num_neighbors: Number of neighbours averaged for each prediction.
        seed: Optional seed for the ``random`` module. When given, the
            train/test split (and therefore the output) is reproducible;
            ``None`` keeps the unseeded behaviour.
    """
    if seed is not None:
        # randrange draws from the random module's shared generator, so
        # seeding the module makes the split repeatable.
        import random
        random.seed(seed)

    # Load the abalone dataset (or whichever file was requested)
    dataset = load_csv(filename)

    # convert string columns to float
    for i in range(1, len(dataset[0])):
        str_column_to_float(dataset, i)
    # convert first column to int
    str_column_to_int(dataset, 0)

    # Splitting dataset between Training and Testing Set
    trainingSet, testSet = train_test_split(dataset, split)

    # generate predictions
    predictions = []
    for test_row in testSet:
        neighbors = getNeighbors(trainingSet, test_row, num_neighbors, "Euclidean")
        output = getRegression(neighbors)
        predictions.append(output)
        print('> predicted = %.2f, actual = %.1f' % (output, test_row[-1]))

    print('')
    RMSE = rmse_metric(testSet, predictions)
    print('RMSE: %.3f' % RMSE)

main()

> predicted = 10.67, actual = 9.0
> predicted = 9.67, actual = 10.0
> predicted = 8.00, actual = 8.0
> predicted = 10.33, actual = 16.0
> predicted = 8.33, actual = 9.0
> predicted = 13.00, actual = 19.0
> predicted = 12.67, actual = 14.0
> predicted = 10.33, actual = 10.0
> predicted = 9.67, actual = 7.0
> predicted = 8.00, actual = 11.0
> predicted = 9.67, actual = 12.0
> predicted = 9.00, actual = 9.0
> predicted = 12.00, actual = 10.0
> predicted = 10.00, actual = 11.0
> predicted = 12.67, actual = 12.0
> predicted = 11.00, actual = 15.0
> predicted = 8.67, actual = 11.0
> predicted = 12.67, actual = 10.0
> predicted = 15.00, actual = 15.0
> predicted = 11.67, actual = 18.0
> predicted = 11.33, actual = 19.0
> predicted = 9.00, actual = 8.0
> predicted = 10.33, actual = 11.0
> predicted = 7.33, actual = 9.0
> predicted = 8.33, actual = 9.0
> predicted = 10.00, actual = 7.0
> predicted = 8.33, actual = 6.0
> predicted = 9.67, actual = 7.0
> predicted = 7.33, actual = 10.0
> predicte

> predicted = 11.33, actual = 11.0
> predicted = 8.00, actual = 10.0
> predicted = 8.67, actual = 12.0
> predicted = 10.67, actual = 12.0
> predicted = 9.33, actual = 13.0
> predicted = 9.00, actual = 7.0
> predicted = 3.67, actual = 7.0
> predicted = 9.67, actual = 10.0
> predicted = 10.67, actual = 12.0
> predicted = 8.00, actual = 15.0
> predicted = 8.67, actual = 9.0
> predicted = 9.33, actual = 9.0
> predicted = 9.33, actual = 8.0
> predicted = 9.33, actual = 7.0
> predicted = 9.33, actual = 6.0
> predicted = 8.00, actual = 7.0
> predicted = 15.00, actual = 13.0
> predicted = 12.00, actual = 19.0
> predicted = 6.67, actual = 11.0
> predicted = 9.00, actual = 6.0
> predicted = 11.67, actual = 11.0
> predicted = 8.67, actual = 19.0
> predicted = 10.33, actual = 11.0
> predicted = 13.00, actual = 15.0
> predicted = 14.00, actual = 14.0
> predicted = 15.67, actual = 12.0
> predicted = 10.33, actual = 11.0
> predicted = 18.33, actual = 23.0
> predicted = 8.00, actual = 7.0
> predicted 

> predicted = 7.00, actual = 6.0
> predicted = 5.67, actual = 7.0
> predicted = 6.33, actual = 7.0
> predicted = 6.00, actual = 6.0
> predicted = 6.67, actual = 7.0
> predicted = 7.00, actual = 6.0
> predicted = 6.67, actual = 6.0
> predicted = 6.00, actual = 8.0
> predicted = 7.00, actual = 7.0
> predicted = 6.67, actual = 8.0
> predicted = 7.00, actual = 7.0
> predicted = 7.00, actual = 7.0
> predicted = 8.00, actual = 8.0
> predicted = 6.67, actual = 5.0
> predicted = 8.33, actual = 7.0
> predicted = 6.00, actual = 7.0
> predicted = 8.33, actual = 8.0
> predicted = 6.67, actual = 7.0
> predicted = 6.00, actual = 8.0
> predicted = 8.00, actual = 8.0
> predicted = 8.67, actual = 8.0
> predicted = 7.67, actual = 8.0
> predicted = 8.67, actual = 9.0
> predicted = 8.33, actual = 9.0
> predicted = 8.67, actual = 8.0
> predicted = 7.67, actual = 7.0
> predicted = 8.67, actual = 9.0
> predicted = 9.33, actual = 10.0
> predicted = 9.00, actual = 9.0
> predicted = 8.67, actual = 10.0
> predic

> predicted = 10.67, actual = 10.0
> predicted = 13.00, actual = 9.0
> predicted = 9.33, actual = 10.0
> predicted = 11.33, actual = 9.0
> predicted = 10.33, actual = 9.0
> predicted = 12.33, actual = 10.0
> predicted = 9.33, actual = 9.0
> predicted = 11.00, actual = 10.0
> predicted = 9.33, actual = 8.0
> predicted = 9.33, actual = 9.0
> predicted = 9.67, actual = 9.0
> predicted = 11.00, actual = 15.0
> predicted = 9.67, actual = 9.0
> predicted = 10.00, actual = 10.0
> predicted = 9.33, actual = 11.0
> predicted = 10.00, actual = 9.0
> predicted = 11.33, actual = 12.0
> predicted = 11.00, actual = 12.0
> predicted = 9.67, actual = 11.0
> predicted = 13.67, actual = 10.0
> predicted = 9.67, actual = 10.0
> predicted = 9.33, actual = 11.0
> predicted = 11.00, actual = 16.0
> predicted = 14.33, actual = 11.0
> predicted = 9.67, actual = 9.0
> predicted = 10.00, actual = 12.0
> predicted = 11.33, actual = 10.0
> predicted = 12.00, actual = 11.0
> predicted = 11.00, actual = 13.0
> pred

> predicted = 6.33, actual = 6.0
> predicted = 6.67, actual = 8.0
> predicted = 8.67, actual = 7.0
> predicted = 7.67, actual = 8.0
> predicted = 8.00, actual = 8.0
> predicted = 8.33, actual = 9.0
> predicted = 8.33, actual = 9.0
> predicted = 7.33, actual = 7.0
> predicted = 10.00, actual = 8.0
> predicted = 8.67, actual = 9.0
> predicted = 9.67, actual = 8.0
> predicted = 9.33, actual = 7.0
> predicted = 9.00, actual = 8.0
> predicted = 8.67, actual = 9.0
> predicted = 10.00, actual = 8.0
> predicted = 12.00, actual = 9.0
> predicted = 10.00, actual = 9.0
> predicted = 10.33, actual = 9.0
> predicted = 10.33, actual = 10.0
> predicted = 11.00, actual = 10.0
> predicted = 11.33, actual = 9.0
> predicted = 10.00, actual = 9.0
> predicted = 11.00, actual = 9.0
> predicted = 10.00, actual = 9.0
> predicted = 10.00, actual = 10.0
> predicted = 11.00, actual = 11.0
> predicted = 11.33, actual = 13.0
> predicted = 10.00, actual = 11.0
> predicted = 11.00, actual = 13.0
> predicted = 10.33,

> predicted = 11.67, actual = 11.0
> predicted = 10.33, actual = 8.0
> predicted = 14.00, actual = 19.0
> predicted = 10.00, actual = 9.0
> predicted = 13.67, actual = 17.0
> predicted = 12.00, actual = 10.0
> predicted = 5.67, actual = 5.0
> predicted = 4.33, actual = 5.0
> predicted = 9.00, actual = 15.0
> predicted = 10.00, actual = 14.0
> predicted = 6.33, actual = 7.0
> predicted = 11.00, actual = 16.0
> predicted = 14.33, actual = 13.0
> predicted = 13.33, actual = 19.0
> predicted = 13.33, actual = 12.0
> predicted = 8.33, actual = 8.0
> predicted = 5.67, actual = 8.0
> predicted = 6.00, actual = 6.0
> predicted = 6.00, actual = 7.0
> predicted = 7.00, actual = 7.0
> predicted = 8.67, actual = 8.0
> predicted = 8.67, actual = 9.0
> predicted = 9.00, actual = 7.0
> predicted = 8.67, actual = 9.0
> predicted = 5.00, actual = 4.0
> predicted = 9.00, actual = 6.0
> predicted = 6.33, actual = 8.0
> predicted = 7.00, actual = 8.0
> predicted = 7.00, actual = 7.0
> predicted = 8.00, ac

> predicted = 10.33, actual = 9.0
> predicted = 8.00, actual = 6.0
> predicted = 7.33, actual = 7.0
> predicted = 7.67, actual = 7.0
> predicted = 8.00, actual = 8.0
> predicted = 9.00, actual = 8.0
> predicted = 8.33, actual = 8.0
> predicted = 9.00, actual = 11.0
> predicted = 10.00, actual = 9.0
> predicted = 9.67, actual = 11.0
> predicted = 9.33, actual = 13.0
> predicted = 10.67, actual = 9.0
> predicted = 11.33, actual = 9.0
> predicted = 10.33, actual = 8.0
> predicted = 11.00, actual = 9.0
> predicted = 10.33, actual = 8.0
> predicted = 8.67, actual = 9.0
> predicted = 10.00, actual = 9.0
> predicted = 10.33, actual = 9.0
> predicted = 5.33, actual = 4.0
> predicted = 9.00, actual = 7.0
> predicted = 8.33, actual = 9.0
> predicted = 9.33, actual = 10.0
> predicted = 8.67, actual = 8.0
> predicted = 8.33, actual = 10.0
> predicted = 10.67, actual = 11.0
> predicted = 17.33, actual = 11.0
> predicted = 10.33, actual = 11.0
> predicted = 6.00, actual = 6.0
> predicted = 7.33, act