In [7]:
# Import definition
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelBinarizer
from sklearn.neighbors import KNeighborsClassifier
from scipy.io import arff
import pandas as pd

# Constants definition
GROUP_NUMBER = 16  # Our group number; also passed as random_state to the k-fold splitter so the folds are reproducible
NEIGHBOURS = [3, 5, 7]  # Values of k (number of neighbours) for which a kNN classifier is evaluated

In [21]:
# Read the ARFF file; loadarff returns a (records, metadata) pair
data = arff.loadarff('breast.w.arff')

# Build the frame from the records and discard every row containing a missing value
df = pd.DataFrame(data[0]).dropna(axis="index", how="any")

In [9]:
# Gets X (feature matrix) and y (binary target column matrix)
X = df.drop(columns="Class").to_numpy()

# Map the raw byte-string classes to "yes"/"no" in a fresh list instead of
# writing into df["Class"].to_numpy(): that array can share memory with df,
# so assigning into it may silently rewrite df["Class"] (re-running this cell
# would then label every row "no"), and it is read-only when pandas
# Copy-on-Write is enabled.
labels = ["yes" if value == b"benign" else "no" for value in df["Class"]]

# LabelBinarizer sorts its classes ("no" < "yes"), so benign is encoded as 1.
# With two classes the result is a single column, i.e. shape (n_samples, 1).
lb = LabelBinarizer()
y = lb.fit_transform(labels)

In [11]:
# Evaluate one kNN model per candidate number of neighbours
for n in NEIGHBOURS:

    print(f"Classifying n = {n}:")

    # Stratified 10-fold splitter; shuffling is seeded with GROUP_NUMBER
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=GROUP_NUMBER)

    # kNN with uniform neighbour weights and the Minkowski metric at p=2
    clf = KNeighborsClassifier(
        n_neighbors=n,
        weights="uniform",
        p=2,
        metric="minkowski",
    )

    # Fit and score the classifier once per fold
    for train_index, test_index in skf.split(X, y):

        # Slice out this fold's training and held-out partitions
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]

        # fit() receives a flat label vector; y itself is a column matrix
        clf.fit(X_train, y_train.ravel())

        # Report held-out accuracy first, then training accuracy
        # (the latter helps to spot over fitting)
        for label, features, targets in (
            ("test", X_test, y_test),
            ("training", X_train, y_train),
        ):
            acc = clf.score(features, targets)
            print(f"Acc using {label} data {acc}")

    print("\n")


Classifying n = 3:
Acc using test data 0.9565217391304348
Acc using training data 0.9853420195439739


Acc using test data 0.9710144927536232
Acc using training data 0.9820846905537459


Acc using test data 0.9710144927536232
Acc using training data 0.9804560260586319


Acc using test data 0.9705882352941176
Acc using training data 0.9788617886178862


Acc using test data 0.9558823529411765
Acc using training data 0.9804878048780488


Acc using test data 0.9705882352941176
Acc using training data 0.9772357723577236


Acc using test data 0.9705882352941176
Acc using training data 0.9804878048780488


Acc using test data 0.9705882352941176
Acc using training data 0.9788617886178862


Acc using test data 0.9852941176470589
Acc using training data 0.9772357723577236


Acc using test data 0.9852941176470589
Acc using training data 0.9788617886178862




Classifying n = 5:
Acc using test data 0.9565217391304348
Acc using training data 0.9853420195439739


Acc using test data 0.97101449275362