In [1]:
import torch
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np

def loadData(fileName):
    data = []
    with open(fileName, 'r') as f:
        for line in f:
            row = list(map(float, line.strip().split(',')))
            data.append(row)
    return torch.tensor(data, dtype=torch.float32)

trainingSet = loadData("yeast_train.txt")
testingSet = loadData("yeast_test.txt")

In [2]:
def getDistance(x, y):
    return torch.sqrt(torch.sum((x - y) ** 2))

In [3]:
def getNeighbors(trainingSet, tester, k):
    distances = []
    for i in range(trainingSet.shape[0]):
        dist = getDistance(tester, trainingSet[i][:-1])
        distances.append((dist, trainingSet[i][-1], i))
    distances.sort(key=lambda x: (x[0], x[2]))

    neighbors = [distances[i][1].item() for i in range(k)]
    return neighbors

In [4]:
def guessClass(neighbors, classOrder):
    
    if(len(neighbors) > 1):
        
        labels = [label for label in neighbors]

        mostVotes = max(Counter(labels).values())
        options = [label for label, total in Counter(labels).items() if total == mostVotes]

        for cls in classOrder:
            if cls in options:
                return cls
    else:
        return neighbors[0]

In [5]:
def mykNN(trainingSet, testingSet, k):
    classOrder = []
    for label in trainingSet[:, -1]:
        if label not in classOrder:
            classOrder.append(label)

    predictions = []

    for i in range(testingSet.shape[0]):
        tester = testingSet[i, :-1]
        actual = testingSet[i, -1].item()
        neighbors = getNeighbors(trainingSet, tester[0], k)
        guess = guessClass(neighbors, classOrder)
        predictions.append((guess, actual))

    return predictions

In [6]:
def loocv(trainingSet, k):
    errors = []

    for i in range(trainingSet.shape[0]):
        trainer = torch.cat((trainingSet[:i], trainingSet[i+1:]))
        tester = trainingSet[i, :-1]

        predicted = mykNN(trainer, tester.unsqueeze(0), k)[0][0]
        actual = trainingSet[i, -1].item()

        errors.append(abs(predicted - actual))
    return sum(errors) / len(errors)

In [None]:
def runkNN(trainingSet, testingSet):
    
    errors = []
    
    for k in range(1,4):
        mae = loocv(trainingSet, k+1)
        errors.append((mae, k+1))
        
    errors.sort()
    bestK = errors[0][1]

    print(f"K chosen to be: {bestK}")

    predictions = mykNN(trainingSet, testingSet, bestK)

    correct = 0
    totalError = 0
    for guess, actual in predictions:
        if guess == actual:
            correct += 1
        totalError += abs(guess - actual)
    
    print(f"Predicted Class Label: {guess}, Actual Class Label: {actual}")
    print(f"Correctly Classified Instances: {correct}, Total Instances Predicted: {len(predictions)}")
    print(f"Mean Absolute Error: {totalError / len(predictions)}, Total Instances Predicted: {len(predictions)}")

In [None]:
runkNN(trainingSet, testingSet)

In [13]:
def confMatrix(predictions, numClasses):
    matrix = np.zeros((numClasses, numClasses), dtype=int)

    for guess, actual in predictions:
        matrix[int(actual)][int(guess)] += 1
    return matrix

In [11]:
def plotkNN(trainingSet, testingSet):
    accuracies = []

    numClasses = len(set(trainingSet[:, -1].tolist() + testingSet[:, -1].tolist()))

    kVals = [1,5,10,20,30]
    for k in kVals:
        predictions = mykNN(trainingSet, testingSet, k)
        if k == 1 or k == 30:
            matrix = confMatrix(predictions, numClasses)
            print(f"Confusion Matrix for k={k}:\n{matrix}\n")
        correct = sum(1 for guess, actual in predictions if guess == actual)
        acc = correct / len(predictions)
        accuracies.append(acc)
    
    plt.plot(kVals, accuracies, marker="o")
    plt.xlabel("k")
    plt.ylabel("Test Accuracy")
    plt.title("Accuracy vs k on Yeast Dataset")
    plt.show()

In [14]:
plotkNN(trainingSet, testingSet)

IndexError: index 11 is out of bounds for axis 0 with size 11