In [1]:
# Import definition
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelBinarizer
from sklearn.neighbors import KNeighborsClassifier
from scipy.io import arff
import pandas as pd
import numpy as np
import math

# Constants definition
GROUP_NUMBER = 16  # Our group number; also passed as random_state to the k-fold splitter so shuffling is reproducible
NEIGHBOURS = [3, 5, 7]  # Values of k (number of neighbours); one KNN classifier is evaluated per value

In [2]:
# Reads the ARFF file; element 0 of the result holds the records used below
data = arff.loadarff('breast.w.arff')

# Builds the data frame from the records and keeps only the rows that have
# no missing (NaN) value in any column
df = pd.DataFrame(data[0]).dropna(axis=0, how="any")

In [3]:
# Gets X (data matrix): every column except the target
X = df.drop("Class", axis=1).to_numpy()

# Builds the string labels in a NEW array instead of writing into the array
# returned by to_numpy(). That array can share memory with df (or be
# read-only under pandas copy-on-write), so assigning into it either
# overwrites df["Class"] -- after which re-running this cell would label
# every row "no" -- or raises. The class values are compared against the
# bytes literal b"benign"; every other value becomes "no".
labels = np.where(df["Class"].to_numpy() == b"benign", "yes", "no")

# Performs the "double conversion" to binary: LabelBinarizer orders the
# classes, so "no" -> 0 and "yes" -> 1 (benign is 1).
# y is the target values column matrix.
lb = LabelBinarizer()
y = lb.fit_transform(labels)

In [10]:
# Number of cross-validation folds (single source of truth for the splitter)
N_SPLITS = 10

# We need to create a classifier for each number of neighbours
for n in NEIGHBOURS:

    # Holds per-fold training and testing accuracy, later used to determine
    # which K is more susceptible to over fit
    train_acc = []
    test_acc = []

    print(f"Classifying n = {n}:")

    # Creates a stratified k fold cross validator; the fixed random_state
    # makes the shuffled folds identical for every value of n
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=GROUP_NUMBER)

    # Creates KNN classifier for n neighbours (uniform weights, Euclidean distance)
    clf = KNeighborsClassifier(n, weights="uniform", p=2, metric="minkowski")

    # For each train/test set, we use a KNN classifier
    for train_index, test_index in skf.split(X, y):

        # Uses indexes to fetch which values are going to be used to train and test.
        # Targets are flattened once here so that fit and score both receive
        # the same 1-D shape (previously only fit got the ravelled target).
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index].ravel(), y[test_index].ravel()

        # Trains knn classifier
        clf.fit(X_train, y_train)

        # Uses testing data and gets model accuracy
        acc = clf.score(X_test, y_test)
        test_acc.append(acc)
        print("Acc using test data {:.3f}".format(acc))

        # Uses training data and gets model accuracy to determine over fitting
        acc = clf.score(X_train, y_train)
        train_acc.append(acc)
        print("Acc using training data {:.3f}".format(acc))

    # Calculates means for train and test to determine which one is over fitting less.
    # Dividing by the number of recorded folds (not a literal 10) keeps the
    # means correct if the fold count is ever changed.
    train_mean = sum(train_acc) / len(train_acc)
    test_mean = sum(test_acc) / len(test_acc)

    # Root mean squared difference between per-fold train and test accuracy
    error = math.sqrt(np.square(np.subtract(train_acc, test_acc)).mean())
    print("Training acc: {:.3f}".format(train_mean))
    print("Test acc: {:.3f}".format(test_mean))
    print("Diff: {:.3f}".format(train_mean - test_mean))
    print("RMSE: {:.3f}".format(error))

    print("\n")


Classifying n = 3:
Acc using test data 0.957
Acc using training data 0.985
Acc using test data 0.971
Acc using training data 0.982
Acc using test data 0.971
Acc using training data 0.980
Acc using test data 0.971
Acc using training data 0.979
Acc using test data 0.956
Acc using training data 0.980
Acc using test data 0.971
Acc using training data 0.977
Acc using test data 0.971
Acc using training data 0.980
Acc using test data 0.971
Acc using training data 0.979
Acc using test data 0.985
Acc using training data 0.977
Acc using test data 0.985
Acc using training data 0.979
Training acc: 0.980
Test acc: 0.971
Diff: 0.009
RMSE: 0.014


Classifying n = 5:
Acc using test data 0.957
Acc using training data 0.985
Acc using test data 0.971
Acc using training data 0.982
Acc using test data 0.971
Acc using training data 0.980
Acc using test data 0.985
Acc using training data 0.980
Acc using test data 0.985
Acc using training data 0.980
Acc using test data 0.971
Acc using training data 0.984
Acc 