In [1]:
# Introduction to Python Reassessment Assignment.

In [2]:
# Import .csv file and store in data (a list of lists).
# Note: quoting=csv.QUOTE_NONNUMERIC ensures the elements of data are floating point numbers.

import csv
def load_from_csv(datafile):
    """Read a numeric CSV file into a list of rows.

    Args:
        datafile: Path of the CSV file to read.

    Returns:
        A list of lists, one inner list per non-blank line of the file.
        The reader uses csv.QUOTE_NONNUMERIC, so every unquoted field is
        converted to a float.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If an unquoted field cannot be converted to a float.
    """
    # newline='' leaves line-ending handling to the csv module, which is what
    # its documentation asks for when a file object is handed to csv.reader.
    with open(datafile, 'r', newline='') as infile:
        reader = csv.reader(infile, quoting=csv.QUOTE_NONNUMERIC)
        # csv.reader yields an empty list for a blank line; an empty row
        # cannot be indexed by column, so such rows are dropped.
        return [rec for rec in reader if rec]


In [3]:
# Calculates the Euclidean distance between two data points given by the lists a and b.

def get_distance(a, b):
    """Return the Euclidean distance between the data points a and b.

    a and b are sequences of numbers. Every index of a is also read from b,
    so b must be at least as long as a.
    """
    total = 0
    for idx in range(len(a)):
        difference = a[idx] - b[idx]
        total = total + difference ** 2

    # The distance is the square root of the summed squared differences.
    return total ** 0.5



In [4]:
# The variable matrix is in the form of a list of lists.
# Each of the interior lists can be thought of as a row in a matrix.
# This function simply calculates the average of a specified column of the matrix.

def get_average(matrix, column):
    """Return the arithmetic mean of one column of a matrix.

    matrix is a list of rows (each row a list) and column is the index of
    the entry to average across all rows.
    """
    row_count = len(matrix)

    total = 0
    for row in matrix:
        total += row[column]

    return total / row_count


In [5]:
# Similar comments apply here as to the above function get_average.
# In this case the standard deviation of a specified column is calculated.

def get_standard_deviation(matrix, column):
    """Return the sample standard deviation of one column of a matrix.

    matrix is a list of rows (each row a list) and column is the index of
    the entry to examine. The sum of squared deviations from the column mean
    is divided by (number of rows - 1) before the square root is taken.
    """
    values = [row[column] for row in matrix]
    count = len(values)

    # Column mean.
    total = 0
    for value in values:
        total += value
    mean = total / count

    # Sum of squared deviations from that mean.
    squared_deviations = 0
    for value in values:
        squared_deviations += (value - mean) ** 2

    return (squared_deviations / (count - 1)) ** 0.5


In [6]:
# This function takes in a matrix and then standardises each column of the matrix.


def get_standardised_matrix(matrix):
    """Return a new matrix in which every column of matrix is standardised.

    Each entry becomes (entry - column mean) / column standard deviation,
    with the column statistics supplied by get_average and
    get_standard_deviation. A column whose standard deviation is 0 is
    divided by 0.01 instead.
    """
    column_count = len(matrix[0])

    means = []
    deviations = []
    for col in range(column_count):
        means.append(get_average(matrix, col))
        deviation = get_standard_deviation(matrix, col)
        # Avoid divide by zero (it will generally be nonzero).
        deviations.append(0.01 if deviation == 0 else deviation)

    return [
        [(row[col] - means[col]) / deviations[col] for col in range(column_count)]
        for row in matrix
    ]


In [7]:
# Finds the minimum value in a list, and its associated index.

def find_min(a):
    """Return the smallest value in the list a and the index where it occurs.

    When the smallest value appears more than once, the index of its first
    occurrence is returned. The result is the tuple (value, index).
    """
    # Start from the first element; a one-element list simply skips the loop.
    smallest, index_smallest = a[0], 0
    for position in range(1, len(a)):
        candidate = a[position]
        if candidate < smallest:
            smallest, index_smallest = candidate, position

    return smallest, index_smallest

# Finds the maximum value in a list

def find_max(a):
    """Return the largest value in the list a."""
    # Start from the first element; a one-element list simply skips the loop.
    largest = a[0]
    for position in range(1, len(a)):
        candidate = a[position]
        if candidate > largest:
            largest = candidate

    return largest



In [8]:
# Determines indices of the k nearest neighbours of a data point (row_of_data).
# The rows from Learning_Data_Labels corresponding to these indices are returned.

def get_k_nearest_labels(row_of_data, matrix_learning, matrix_learning_labels, k):
    """Return the label rows of the k learning points nearest to row_of_data.

    Args:
        row_of_data: The data point (list of numbers) whose neighbours are
            wanted.
        matrix_learning: List of learning data points, one list per row.
        matrix_learning_labels: List of label rows; row i is returned when
            learning point i is selected.
        k: Number of neighbours wanted.

    Returns:
        A list of rows from matrix_learning_labels, ordered from the nearest
        learning point to the farthest. Equidistant points keep their
        original row order. No row is returned twice, so at most
        len(matrix_learning) rows come back even if k is larger.
    """
    distances = [get_distance(row_of_data, learning_row)
                 for learning_row in matrix_learning]

    # Rank the row indices by distance instead of repeatedly extracting the
    # minimum and overwriting it with the largest distance: after such an
    # overwrite the genuinely farthest point cannot be told apart from points
    # already chosen, so an earlier index could be selected a second time.
    # sorted() is stable, so ties are resolved in favour of the lower index.
    ranked_indices = sorted(range(len(distances)), key=distances.__getitem__)

    # max(k, 0) keeps a negative k from slicing from the end of the list.
    nearest_indices = ranked_indices[:max(k, 0)]

    return [matrix_learning_labels[index] for index in nearest_indices]


In [9]:
# The mode of a set of numbers is determined by using a Python dictionary to keep a
# count of the number of times each distinct number appears.

def get_mode(column_matrix):
    """Return the most frequent value in a single-column matrix.

    Args:
        column_matrix: List of rows; only the first entry of each row is
            counted.

    Returns:
        The value that first reaches the highest count while the rows are
        scanned in order, or None when column_matrix is empty. If every
        value occurs exactly once, the value in the first row is returned.
    """
    counts = {}
    best_count = 0
    mode = None

    for row in column_matrix:
        value = row[0]
        # A first sighting counts as one occurrence. Starting at zero would
        # leave the mode unset whenever no value is repeated.
        counts[value] = counts.get(value, 0) + 1

        # Strict comparison: on a tie the earlier leader is kept.
        if counts[value] > best_count:
            best_count = counts[value]
            mode = value

    return mode


In [10]:
# This function predicts a label for each row of the data by taking the most frequent label
# among that row's k nearest neighbours in the learning data.

def classify(matrix_data_stand, matrix_learning_data_stand, matrix_learning_data_labels, k):
    """Predict one label per row of matrix_data_stand.

    For every data row, the labels of its k nearest learning points are
    obtained from get_k_nearest_labels and reduced to a single label with
    get_mode. The result is a list of one-element lists, one per data row.
    """
    return [
        [
            get_mode(
                get_k_nearest_labels(
                    data_row,
                    matrix_learning_data_stand,
                    matrix_learning_data_labels,
                    k,
                )
            )
        ]
        for data_row in matrix_data_stand
    ]
        

In [11]:
# The accuracy is determined by comparing the predicted data labels with the correct data labels.

def get_accuracy(matrix_correct_data_labels, data_labels):
    """Return the percentage of predicted labels that match the correct ones.

    Both arguments are lists of rows; the first entry of row i in
    data_labels is compared with the first entry of row i in
    matrix_correct_data_labels.
    """
    total = len(matrix_correct_data_labels)
    hits = sum(
        1
        for row in range(total)
        if matrix_correct_data_labels[row][0] == data_labels[row][0]
    )

    # Fraction of matches, scaled to a percentage.
    return (hits / total) * 100
        

In [12]:
# Run the classification problem using values of k from 3 to 15.
# Print out the accuracy for each value of k.

def run_test(data_dir=None, suffix="", k_values=range(3, 16)):
    """Classify the data for several values of k and print each accuracy.

    Args:
        data_dir: Folder holding the four CSV files. When None, the folder
            hard-coded in the original notebook is used.
        suffix: Text inserted before ".csv" in each file name. Pass "_TOY"
            to run on the toy files (Data_TOY.csv, Learning_Data_TOY.csv,
            Learning_Data_Labels_TOY.csv, Correct_Data_Labels_TOY.csv).
        k_values: Iterable of neighbour counts to try; defaults to 3..15.

    Returns:
        None. One line per value of k is printed.
    """
    import os

    if data_dir is None:
        # Raw string: the backslashes are path separators, not escapes.
        data_dir = r"D:\My Documents\Essex Mdrive\Intro to Python\My Files\Data Files"

    def load(stem):
        # Build "<data_dir>/<stem><suffix>.csv" and read it.
        return load_from_csv(os.path.join(data_dir, stem + suffix + ".csv"))

    matrix_data = load("Data")
    matrix_learning_data = load("Learning_Data")
    matrix_learning_data_labels = load("Learning_Data_Labels")
    matrix_correct_data_labels = load("Correct_Data_Labels")

    # Each matrix is standardised with its own column means and deviations.
    standardised_matrix_data = get_standardised_matrix(matrix_data)
    standardised_matrix_learning_data = get_standardised_matrix(matrix_learning_data)

    for k in k_values:
        data_labels = classify(standardised_matrix_data, standardised_matrix_learning_data, matrix_learning_data_labels, k)
        accuracy = get_accuracy(matrix_correct_data_labels, data_labels)
        print('k =',k,' Accuracy is %.2f' % accuracy, '%')


In [13]:
# Run the classification problem.

run_test()

k = 3  Accuracy is 95.00 %
k = 4  Accuracy is 95.00 %
k = 5  Accuracy is 95.71 %
k = 6  Accuracy is 95.71 %
k = 7  Accuracy is 94.29 %
k = 8  Accuracy is 94.29 %
k = 9  Accuracy is 95.71 %
k = 10  Accuracy is 95.71 %
k = 11  Accuracy is 95.71 %
k = 12  Accuracy is 95.71 %
k = 13  Accuracy is 95.71 %
k = 14  Accuracy is 95.71 %
k = 15  Accuracy is 95.00 %
