In [None]:
import torch
from collections import Counter

def _loadDataset(path):
    """Read a comma-separated numeric text file into a float32 tensor.

    Every non-blank line becomes one row; each comma-separated field is
    parsed with ``float``. The functions further down in this notebook treat
    the last column of each row as the class label.

    Args:
        path: Path of the text file to read.

    Returns:
        A ``torch.float32`` tensor with one row per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float, or the rows do
            not all have the same number of fields.
    """
    # `with` closes the handle as soon as parsing finishes instead of leaving
    # it open until the garbage collector gets around to it.
    with open(path) as handle:
        rows = [
            [float(field) for field in line.strip().split(',')]
            for line in handle
            if line.strip()  # a blank/trailing empty line would crash float('')
        ]
    return torch.tensor(rows, dtype=torch.float32)


trainingSet = _loadDataset("yeast_train.txt")
testSet = _loadDataset("yeast_test.txt")

In [8]:
def getDistance(x, y):
    """Return the Euclidean distance between two tensors.

    The element-wise difference ``x - y`` is squared, summed over every
    element, and square-rooted, so the result is a 0-dim tensor.
    """
    delta = x - y
    squared = torch.pow(delta, 2)
    return squared.sum().sqrt()

In [None]:
def getNeighbors(trainingSet, tester, k):
    """Return the labels of the ``k`` training rows closest to ``tester``.

    Args:
        trainingSet: 2-D tensor whose rows are feature values followed by the
            class label in the last column.
        tester: 1-D tensor laid out like a training row. Its last element is
            dropped before distances are computed, so pass the full row
            (label slot included), not a pre-stripped feature vector.
        k: Number of neighbours requested.

    Returns:
        A list of 0-dim label tensors ordered nearest first; equal distances
        are ordered by training-row index. The list holds at most ``k``
        entries: fewer when ``trainingSet`` has fewer than ``k`` rows, and
        none when ``k <= 0``.
    """
    # Loop-invariant: slice the query's features once, not once per row.
    testerFeatures = tester[:-1]

    distances = []
    for i in range(trainingSet.shape[0]):
        row = trainingSet[i]
        dist = getDistance(testerFeatures, row[:-1])
        distances.append((dist, row[-1], i))

    # Row index is the secondary key so ties resolve deterministically.
    distances.sort(key=lambda entry: (entry[0], entry[2]))

    # Slicing (rather than indexing 0..k-1) cannot run past the end when k
    # exceeds the number of rows; max() keeps a negative k from being read as
    # "all but the last |k|".
    return [label for _, label, _ in distances[:max(k, 0)]]

In [None]:
def guessClass(neighbors, classOrder):
    """Pick the majority class among ``neighbors``.

    Args:
        neighbors: Iterable of neighbour entries. Each entry is either a bare
            label or a ``(distance, label, index)`` sequence, in which case
            the element at position 1 is used. Labels may be plain Python
            values or 0-dim tensors.
        classOrder: Iterable of class labels in priority order; when several
            classes tie for the most votes, the one that appears first here
            wins.

    Returns:
        The winning class label as a plain Python value, or ``None`` if none
        of the top-voted labels appears in ``classOrder``.

    Raises:
        ValueError: If ``neighbors`` is empty.
    """
    labels = []
    for entry in neighbors:
        if isinstance(entry, (tuple, list)):
            entry = entry[1]
        # Tensors hash by identity, so two tensors holding the same label
        # would be counted as different keys; compare plain scalars instead.
        if hasattr(entry, "item"):
            entry = entry.item()
        labels.append(entry)

    if not labels:
        raise ValueError("guessClass needs at least one neighbor")

    votes = Counter(labels)
    mostVotes = max(votes.values())
    options = {label for label, total in votes.items() if total == mostVotes}

    for cls in classOrder:
        if hasattr(cls, "item"):
            cls = cls.item()
        if cls in options:
            return cls
    return None

In [None]:
def mykNN(trainingSet, testingSet, k):
    """Classify every row of ``testingSet`` with k-nearest-neighbours.

    Args:
        trainingSet: 2-D tensor; each row is feature values followed by the
            class label in the last column.
        testingSet: 2-D tensor with the same column layout as ``trainingSet``.
        k: Number of neighbours that vote on each prediction.

    Returns:
        A list with one ``(guess, actual)`` tuple per test row, where
        ``actual`` is the row's last column as a Python float and ``guess``
        is whatever ``guessClass`` returns for that row's neighbours.
    """
    # Tie-break priority = order in which labels first appear in the training
    # data. dict.fromkeys de-duplicates while preserving that order.
    classOrder = list(dict.fromkeys(trainingSet[:, -1].tolist()))

    predictions = []
    for i in range(testingSet.shape[0]):
        # Hand over the whole row: getNeighbors drops the trailing label
        # column itself, so stripping it here as well would discard a real
        # feature from the distance computation.
        tester = testingSet[i]
        actual = tester[-1].item()
        neighbors = getNeighbors(trainingSet, tester, k)
        guess = guessClass(neighbors, classOrder)
        predictions.append((guess, actual))

    return predictions

In [None]:
def loocv(trainingSet, k):
    """Leave-one-out cross-validation error of the kNN classifier.

    Each row is held out in turn, classified by ``mykNN`` trained on all the
    other rows, and the absolute difference between the predicted and actual
    label values is recorded.

    Args:
        trainingSet: 2-D tensor; each row is feature values followed by the
            class label in the last column.
        k: Number of neighbours passed to ``mykNN``.

    Returns:
        The mean of the per-row absolute label differences.

    Raises:
        ValueError: If ``trainingSet`` has fewer than two rows, since holding
            one out would leave nothing to train on.
    """
    rowCount = trainingSet.shape[0]
    if rowCount < 2:
        raise ValueError("loocv needs at least two rows in trainingSet")

    errors = []
    for i in range(rowCount):
        trainer = torch.cat((trainingSet[:i], trainingSet[i + 1:]))
        # Slice instead of index so the held-out sample stays a 2-D,
        # one-row test set that still carries its label column, which is the
        # layout mykNN iterates over.
        heldOut = trainingSet[i:i + 1]

        predicted, actual = mykNN(trainer, heldOut, k)[0]
        errors.append(abs(predicted - actual))

    return sum(errors) / len(errors)

In [None]:
def runkNN(trainingSet, testingSet, kValues=range(1, 4)):
    """Choose k by leave-one-out error, then evaluate on ``testingSet``.

    Prints the chosen k, then ``<correct> <total>``, then
    ``<mean absolute label difference> <total>``.

    Args:
        trainingSet: 2-D tensor; each row is feature values followed by the
            class label in the last column.
        testingSet: 2-D tensor with the same column layout.
        kValues: Candidate values of k to compare. Defaults to 1, 2 and 3.

    Raises:
        ValueError: If ``kValues`` is empty.
    """
    candidates = list(kValues)
    if not candidates:
        raise ValueError("kValues must contain at least one candidate k")

    # Tuples compare by error first and k second, so the lowest error wins
    # and equal errors resolve to the smaller k.
    _, bestK = min((loocv(trainingSet, k), k) for k in candidates)

    print(f"K chosen to be: {bestK}")

    predictions = mykNN(trainingSet, testingSet, bestK)

    correct = 0
    totalError = 0
    for guess, actual in predictions:
        if guess == actual:
            correct += 1
        totalError += abs(guess - actual)

    total = len(predictions)
    # An empty test set has no defined mean error; report NaN rather than
    # dividing by zero.
    meanError = totalError / total if total else float("nan")

    print(correct, total)
    print(meanError, total)