In [39]:
# Going to work through code that doesn't overuse libraries (scikit-learn) and code K Nearest Neighbours from scratch
# The purpose of this coding practice is to become more knowledgeable in Machine Learning
# Most of my projects / lessons (I teach, believe it or not) rely way too much on libraries; this is even normalized in my school's
# curriculum. The problem I have with this is that it kind of dumbs us down as programmers
# All we need to do is write a few lines of code, import an algorithm from scikit-learn, fit the data, tune the hyperparameters
# Then boom, within 20 minutes we suddenly become "Machine Learning Engineers"
# Yet we don't even know about the mathematical side, the equations, the loss functions, etc...
# Of course as a teacher I do teach the theory and math behind these algorithms
# But then again I think, what's even the point? Just import everything, edit it a tiny bit, then boom, ML algorithm working.
# I think it's time to start a new trend in machine learning: try to code everything ourselves to truly get a better understanding
# from a technical standpoint, and be more in control of our Machine Learning algorithms
# Yes, I understand how nerdy I am coming across, especially doing this on a Saturday night lol. But getting more technical is fun
# 
#
# It's like driving, sure, we can drive an automatic, or we can drive stick and have more control of the car (and then brag about it lol)

In [40]:
# Let's get to it: code a K Nearest Neighbours algorithm from scratch. I will comment along and document what I learned,
# how it's different from relying on libraries, and of course rate the difficulty of the process. There's a reason we use libraries after all

In [41]:
# Importing libraries (LOL) that we absolutely need
from random import randrange
from csv import reader
from math import sqrt

In [44]:
def str_column_to_float(dataset, column):
    """Convert the string values of one column to floats, in place.

    Args:
        dataset: Iterable of rows; each row is a mutable sequence whose
            entry at ``column`` is a string.
        column: Index of the column to convert.

    Returns:
        None. The rows of ``dataset`` are modified directly.

    Raises:
        ValueError: If a value cannot be parsed by ``float``.
    """
    for record in dataset:
        # Strip surrounding whitespace before parsing the number.
        cleaned = record[column].strip()
        record[column] = float(cleaned)

In [45]:
def str_column_to_int(dataset, column):
    """Replace the values of one column with integer codes, in place.

    Codes are assigned in order of first appearance (the first distinct
    value seen gets 0, the next gets 1, ...). This keeps the encoding
    reproducible from run to run; iterating over a ``set`` of string labels
    would give an order that can change between interpreter sessions.

    Args:
        dataset: List of rows; each row is a mutable sequence. The entry at
            ``column`` must be hashable.
        column: Index of the column to encode.

    Returns:
        dict mapping each original value to the integer code that replaced it.
    """
    lookup = dict()
    # First pass: build the full mapping before touching any row.
    for row in dataset:
        # setdefault only assigns a new code the first time a value is seen;
        # len(lookup) is evaluated before insertion, so codes start at 0.
        lookup.setdefault(row[column], len(lookup))
    # Second pass: overwrite each value with its code.
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

In [46]:
# Finds the minimum and maximum value of every column in the dataset (one [min, max] pair per column)
def dataset_minmax(dataset):
    """Compute the minimum and maximum of every column.

    Args:
        dataset: Non-empty list of rows. The number of columns is taken from
            the first row.

    Returns:
        A list with one ``[min, max]`` pair per column, in column order.
    """
    width = len(dataset[0])
    # Lazily build each column as a list, then reduce it to its two extremes.
    columns = ([record[idx] for record in dataset] for idx in range(width))
    return [[min(values), max(values)] for values in columns]

In [47]:
# Normalizes the dataset (min-max scaling) with a simple for loop: subtract the column minimum, then divide by the column range
# Surprisingly little code is needed to normalize a dataset; it's just as easy as using sklearn.preprocessing and numpy
# So maybe don't sleep on defining a normalizing function yourself! A simple for loop with somewhat basic math
def normalize_dataset(dataset, minmax):
    """Min-max scale every value of the dataset, in place.

    Each value becomes ``(value - min) / (max - min)`` using the ``[min, max]``
    pair stored for its column in ``minmax``.

    Args:
        dataset: Iterable of rows; each row is a mutable sequence of numbers.
        minmax: Sequence indexed by column; ``minmax[i][0]`` is the column
            minimum and ``minmax[i][1]`` the column maximum.

    Returns:
        None. The rows of ``dataset`` are modified directly.
    """
    for row in dataset:
        for i in range(len(row)):
            low = minmax[i][0]
            span = minmax[i][1] - low
            # A constant column has zero range; map it to 0.0 instead of
            # raising ZeroDivisionError.
            row[i] = (row[i] - low) / span if span else 0.0

In [48]:
# Splitting the data into k folds
# Randomly splits the dataset into n folds (k folds) of equal size, which we will use in our evaluation
# Think of this as a from-scratch counterpart to sklearn.model_selection's KFold (a repeated version of train_test_split())
def cross_validation_split(dataset, n_folds):
    """Randomly partition the dataset into ``n_folds`` equally sized folds.

    Rows are drawn without replacement from a shallow copy of ``dataset``, so
    the input list itself is left untouched. Every fold holds
    ``int(len(dataset) / n_folds)`` rows; rows left over after filling all
    folds are not placed in any fold.

    Args:
        dataset: List of rows.
        n_folds: Number of folds to create.

    Returns:
        A list of ``n_folds`` folds, each a list of rows.
    """
    remaining = list(dataset)
    rows_per_fold = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        # Pop a random remaining row for each slot of the current fold.
        folds.append(
            [remaining.pop(randrange(len(remaining))) for _ in range(rows_per_fold)]
        )
    return folds

In [49]:
# Using actual and predicted values to return an accuracy metric
def accuracy_metric(actual, predicted):
    """Return the percentage of predictions that match the actual values.

    Args:
        actual: Sequence of true values.
        predicted: Sequence of predicted values, indexed in parallel with
            ``actual``.

    Returns:
        Accuracy as a float between 0.0 and 100.0.

    Raises:
        ZeroDivisionError: If ``actual`` is empty.
    """
    total = len(actual)
    # Count the positions at which prediction and truth agree.
    hits = sum(1 for idx in range(total) if actual[idx] == predicted[idx])
    return hits / float(total) * 100.0

In [50]:
# Evaluating algorithm, splitting the folds in train and test sets
# Once dataset is split, we use the predicted and actual set within an accuracy metric
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score an algorithm with k-fold cross-validation.

    The dataset is split into ``n_folds`` folds. Each fold in turn is held
    out as the test set while the rows of all remaining folds form the
    training set; the algorithm's predictions for the held-out rows are
    compared with their true labels (the last element of each row).

    Args:
        dataset: List of rows; the last element of each row is the label.
        algorithm: Callable invoked as ``algorithm(train_set, test_set, *args)``
            that returns one prediction per test row.
        n_folds: Number of cross-validation folds.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for held_out, fold in enumerate(folds):
        # Training data is the rows of every fold except the held-out one,
        # flattened into a single list of rows: the algorithm expects rows,
        # not a list of folds. The fold is excluded by position so that two
        # folds with equal contents cannot be confused.
        train_set = [
            row
            for position, other_fold in enumerate(folds)
            if position != held_out
            for row in other_fold
        ]
        # Copy each test row and blank its label so the algorithm cannot
        # peek at the answer (and the original rows stay intact).
        test_set = list()
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None
            test_set.append(row_copy)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
    return scores

In [51]:
# Calculate the Euclidean distance between 2 vectors
# Formula in 2D: d = √[ (x2 - x1)^2 + (y2 - y1)^2 ]; in general d = √[ Σ (a_i - b_i)^2 ] summed over every feature i
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two rows.

    The last element of ``row1`` is excluded from the calculation; every
    other position is compared with the same position in ``row2``.

    Args:
        row1: First row; its final element is ignored.
        row2: Second row, at least as long as the compared part of ``row1``.

    Returns:
        The square root of the sum of squared differences, as a float.
    """
    squared_sum = 0.0
    # row1[:-1] drops the trailing element, leaving only the compared values.
    for position, value in enumerate(row1[:-1]):
        squared_sum += (value - row2[position]) ** 2
    return sqrt(squared_sum)

In [52]:
# Locating the most similar neighbours via their distance
# Calculating the distance between the test row and every row in the training data
# The neighbours are the training rows with the smallest Euclidean distances
def get_neighbours(train, test_row, num_neighbours):
    """Return the ``num_neighbours`` training rows closest to ``test_row``.

    Distances come from ``euclidean_distance(test_row, train_row)``. Rows are
    ordered from nearest to farthest; rows at equal distance keep their
    relative order from ``train``.

    Args:
        train: Iterable of training rows.
        test_row: The row to find neighbours for.
        num_neighbours: How many neighbours to return.

    Returns:
        A list of training rows, nearest first.

    Raises:
        IndexError: If ``num_neighbours`` exceeds the number of training rows.
    """
    # Pair each training row with its distance, then rank by that distance.
    ranked = sorted(
        ((candidate, euclidean_distance(test_row, candidate)) for candidate in train),
        key=lambda pair: pair[1],
    )
    return [ranked[rank][0] for rank in range(num_neighbours)]

In [53]:
# Predicting the classification with the neighbours
# Returning the most common output (class) among the neighbours as our prediction
def predict_classification(train, test_row, num_neighbours):
    """Predict the class of ``test_row`` by majority vote of its neighbours.

    The label of each neighbour is its last element. When several classes
    receive the same number of votes, the class that appears first in the
    neighbour list wins; with neighbours ordered nearest first, this is the
    tied class with the closest neighbour. This makes the tie-break
    deterministic rather than dependent on ``set`` iteration order.

    Args:
        train: Training rows; the last element of each row is the label.
        test_row: The row to classify.
        num_neighbours: Number of neighbours that vote.

    Returns:
        The predicted label.

    Raises:
        ValueError: If no neighbours are returned (e.g. ``num_neighbours`` is 0).
    """
    neighbours = get_neighbours(train, test_row, num_neighbours)
    output_values = [row[-1] for row in neighbours]
    # max() keeps the first item with the highest count, so iterating over the
    # list (not a set) resolves ties in neighbour order.
    prediction = max(output_values, key=output_values.count)
    return prediction

In [54]:
# Finally, the KNN algorithm
# Define the KNN with train, test, and number of neighbours as our parameters
# Using those parameters to predict the classification of every test row
# Appending each output to the predictions list, which is returned as our result.
def KNN(train, test, num_neighbours):
    """Classify every row of ``test`` with k-nearest-neighbours.

    Args:
        train: Training rows passed to ``predict_classification``.
        test: Iterable of rows to classify.
        num_neighbours: Number of neighbours used for each prediction.

    Returns:
        A list of predictions, one per test row, in the order of ``test``.
    """
    return [predict_classification(train, query, num_neighbours) for query in test]

In [None]:
# Okay, instantly way more code compared to using a bunch of libraries
# A lot more math too. This should be expected in Machine Learning
# But forcing myself to create a bunch of functions to combine together at the end gave me a much better understanding
# Of how the KNN works and more importantly the formula of the euclidean distance.
# If I was to do another KNN project with the usual sklearn functions I would have never coded out the euclidean distance myself
# Nor normalize the data myself, nor split the data myself, nor code everything else myself etc...

In [None]:
# It is a good idea to try and get into a habit of coding like this (especially if you want to become a machine learning engineer)
# BUT, coding like this is more time consuming, and has a lot more lines of code, which can increase the chance of errors and debugging
# This results in lost time compared to someone who uses the appropriate libraries.
# But once again, it never hurts to get a better understanding of the math, and doing it in a practical way helps a lot more
# than reading the formula on Google and pretending to know what it means.

In [None]:
# This was honestly a little challenging and quite fun.
# I would like to do this more and even dip my feet into deep learning without the use of TensorFlow / Keras
# However, I would rather do deep learning in C++ as I am sick of doing it in Python, where I have to wait 12 hours for the epochs
# I'm still learning C++, so give me some time before I do the deep learning type of "from scratch" projects.