## Setup

In [0]:
import os
import pickle

from scipy.io import loadmat
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

# Ensure the output directory for the pickled models exists.
# exist_ok=True makes the call idempotent without a separate existence check
# (no check-then-create race), and it raises FileExistsError right here if
# "model" is an existing regular file instead of failing later when a model
# is written.
os.makedirs("model", exist_ok=True)

## Load data

In [0]:
# Map each dataset name to the contents of its .mat files.
# Each sub-directory data/<name>/ is read as one dataset made of
# <name>_Train.mat and <name>_Test.mat.
train_data = dict()
test_data = dict()

for dataset in os.listdir("data"):
    data_dir = os.path.join("data", dataset)
    # Skip stray non-directory entries (e.g. .DS_Store, README files);
    # treating them as datasets would make loadmat fail on a bogus path.
    if not os.path.isdir(data_dir):
        continue
    train_data[dataset] = loadmat(os.path.join(data_dir, f"{dataset}_Train.mat"))
    test_data[dataset] = loadmat(os.path.join(data_dir, f"{dataset}_Test.mat"))

# Every dataset must have both a train and a test split loaded.
assert train_data.keys() == test_data.keys(), "train/test dataset names differ"

## Train

In [0]:
folds = 4
candidates_n = [1, 3, 5, 7, 9, 11, 13, 15]

# For every dataset: pick n_neighbors by stratified k-fold cross-validation,
# refit on the full training split, report accuracies and pickle the model.
cross_validator = StratifiedKFold(n_splits=folds)
for dataset, mat in train_data.items():
    print(f"dataset: {dataset:<20}", end='')

    data = mat['Data']
    labels = mat['Label'].flatten()

    # mean validation accuracy for each candidate n_neighbors
    validation_accuracies = dict()
    for n in candidates_n:
        fold_scores = []
        for fit_idx, held_out_idx in cross_validator.split(data, labels):
            model = KNeighborsClassifier(n_neighbors=n, algorithm='brute')
            model.fit(data[fit_idx], labels[fit_idx])
            fold_scores.append(model.score(data[held_out_idx], labels[held_out_idx]))
        validation_accuracies[n] = sum(fold_scores) / folds

    # max() returns the first best entry, so ties go to the candidate that
    # appears earliest in candidates_n.
    best_n, validation_acc = max(validation_accuracies.items(), key=lambda item: item[1])

    # final model: best n_neighbors, fitted on the whole training split
    model = KNeighborsClassifier(n_neighbors=best_n, algorithm='brute')
    model.fit(data, labels)

    print(
        f"n_neighbors: {best_n:<5}"
        f"Validation accuracy: {validation_acc:<8.3f}",
        end='',
    )
    print(f"Train accuracy: {model.score(data, labels):.3f}")

    # persist the fitted model for the test stage
    with open(f"model/knn_{dataset}", 'wb') as model_file:
        pickle.dump(model, model_file)

dataset: bank                n_neighbors: 13   Validation accuracy: 0.890   Train accuracy: 0.897
dataset: car                 n_neighbors: 11   Validation accuracy: 0.941   Train accuracy: 0.968
dataset: abalone             n_neighbors: 11   Validation accuracy: 0.641   Train accuracy: 0.694
dataset: image-segmentation  n_neighbors: 1    Validation accuracy: 0.953   Train accuracy: 1.000
dataset: plant-shape         n_neighbors: 1    Validation accuracy: 0.612   Train accuracy: 1.000
dataset: wine-quality-red    n_neighbors: 1    Validation accuracy: 0.613   Train accuracy: 1.000
dataset: titanic             n_neighbors: 11   Validation accuracy: 0.793   Train accuracy: 0.792
dataset: ozone               n_neighbors: 15   Validation accuracy: 0.969   Train accuracy: 0.970
dataset: led-display         n_neighbors: 3    Validation accuracy: 0.708   Train accuracy: 0.743
dataset: yeast               n_neighbors: 15   Validation accuracy: 0.593   Train accuracy: 0.630


## Test

In [0]:
# Score each saved model on the test split of its dataset.
for dataset, mat in test_data.items():
    print(f"dataset: {dataset:<20}", end='')

    # restore the model pickled by the training stage
    with open(f"model/knn_{dataset}", 'rb') as model_file:
        model = pickle.load(model_file)

    print(f"n_neighbors: {model.n_neighbors:<5}", end='')

    features = mat['Data']
    targets = mat['Label'].flatten()
    test_accuracy = model.score(features, targets)
    print(f"Test accuracy: {test_accuracy}")

dataset: bank                n_neighbors: 13   Test accuracy: 0.8983425414364641
dataset: car                 n_neighbors: 11   Test accuracy: 0.9508670520231214
dataset: abalone             n_neighbors: 11   Test accuracy: 0.6279904306220095
dataset: image-segmentation  n_neighbors: 1    Test accuracy: 0.9632034632034632
dataset: plant-shape         n_neighbors: 1    Test accuracy: 0.63125
dataset: wine-quality-red    n_neighbors: 1    Test accuracy: 0.61875
dataset: titanic             n_neighbors: 11   Test accuracy: 0.7777777777777778
dataset: ozone               n_neighbors: 15   Test accuracy: 0.9822834645669292
dataset: led-display         n_neighbors: 3    Test accuracy: 0.68
dataset: yeast               n_neighbors: 15   Test accuracy: 0.5993265993265994
