#### Configure project

In [None]:
!cd .. && mkdir build
!cd ../build/ && rm -rf *
!rm -f *.so
!cd ../build && cmake -DCMAKE_BUILD_TYPE=Release ..

#### Compile and install

In [None]:
# Build the project inside ../build and run its `install` target.
!cd ../build && make install

#### Imports

In [None]:
import kNN
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from tqdm.autonotebook import tqdm
import timeit
from sklearn.metrics import accuracy_score

#### Graph style

In [None]:
# Global plotting configuration shared by every figure in this notebook.
sns.set(style='ticks')
sns.set_palette("Set2")
# NOTE(review): calling sns.set() again re-applies seaborn's default style and
# palette, so the 'ticks' / 'Set2' choices above are likely overridden here —
# confirm the intended look before merging these calls.
sns.set(rc={'figure.figsize':(10,5)})
plt.style.use('fivethirtyeight')
# No plt.tight_layout() here: no figure exists yet, so the call would only
# create (and display) an empty one.

#### Util

In [None]:
def data_labels(dataset):
    """Split a DataFrame into a feature matrix and a label column vector.

    Every column except the first one is taken as a feature; the labels are
    read from the "label" column.

    Returns:
        tuple: ``(features, labels)`` where ``features`` holds the values of
        all columns after the first and ``labels`` holds the "label" values
        reshaped to ``(n_rows, 1)``.
    """
    feature_columns = dataset.columns[1:]
    features = dataset[feature_columns].values
    labels = dataset["label"].values.reshape(-1, 1)
    return features, labels

def save_res(cols, rows, name):
    """Write experiment results to ``res/<name>.csv``.

    Args:
        cols: Column names used as the CSV header.
        rows: Iterable of row tuples (e.g. a ``zip`` of result sequences).
        name: Base file name, without the ``.csv`` extension.
    """
    import os  # local import keeps this cell self-contained

    # to_csv does not create missing directories, so make sure res/ exists
    # before writing (a fresh checkout would otherwise fail here).
    os.makedirs("res", exist_ok=True)
    res = pd.DataFrame(rows, columns=cols)
    res.to_csv("res/{}.csv".format(name), index=False, header=True)

#### Load dataset

In [None]:
# Read both Fashion-MNIST CSV splits from ../data.
df_train = pd.read_csv("../data/fashion-mnist_train.csv")
df_test = pd.read_csv("../data/fashion-mnist_test.csv")

# Separate each split into its feature matrix and label column vector.
df_train_data, df_train_labels = data_labels(df_train)
df_test_data, df_test_labels = data_labels(df_test)

### kNN analysis

#### Training dataset size

k grows with the training dataset size (k is set equal to the number of training samples)

In [None]:
# Time a single-vector prediction while the training set and k grow together.
test_vector = df_test_data[0]
sizes = np.arange(1, 1001, 1)

# Constructor arguments are presumably (k, number of classes) — TODO confirm.
clf = kNN.KNNClassifier(1, 10)

times = []
for i in tqdm(sizes):
    # Train on the first i rows and use all of them as neighbours (k == i).
    clf.fit(df_test_data[:i], df_test_labels[:i])
    clf.setneighbors(i)
    # Average over 20 calls to smooth out timer noise.
    elapsed = timeit.timeit(lambda: clf.predict(test_vector), number=20)
    times.append(elapsed / 20)

save_res(["size", "time"], zip(sizes, times), "kNN-time-kprop")

In [None]:
# Plot the saved mean prediction times; the x label is "k" because this run
# sets k equal to the training-set size.
df = pd.read_csv("res/kNN-time-kprop.csv")
g = sns.lineplot(x='size', y='time', data=df)
g.set_xlabel("k")
g.set_ylabel("Time(s)")

Fixed k, training dataset size variable

In [None]:
# Time a single-vector prediction while only the training set grows; the
# neighbour count stays at whatever the constructor set it to.
test_vector = df_test_data[0]
sizes = np.arange(1, 1001, 1)

# Constructor arguments are presumably (k, number of classes) — TODO confirm.
clf = kNN.KNNClassifier(1, 10)

times = []
for i in tqdm(sizes):
    # Train on the first i rows only.
    clf.fit(df_test_data[:i], df_test_labels[:i])
    # Average over 100 calls to smooth out timer noise.
    elapsed = timeit.timeit(lambda: clf.predict(test_vector), number=100)
    times.append(elapsed / 100)

save_res(["size", "time"], zip(sizes, times), "kNN-time-kfixed")

In [None]:
# Plot the saved mean prediction times for the fixed-k experiment.
df = pd.read_csv("res/kNN-time-kfixed.csv")
g = sns.lineplot(data=df, x='size', y='time')
# The x axis is the training-set size here (k is fixed in this experiment),
# so it must not be labelled "k".
plt.xlabel("Training set size")
plt.ylabel("Time(s)")

#### Accuracy

Fixed k, training dataset size variable

In [None]:
# One seeded generator shared by every sampling call: successive draws stay
# different from each other, but the whole experiment is reproducible.
rng = np.random.RandomState(0)

# Evaluation set: 5 randomly chosen test rows per label.
test_vectors, test_labels = data_labels(
    df_test.groupby("label").sample(5, random_state=rng)
)

# Constructor arguments are presumably (k, number of classes) — TODO confirm.
clf = kNN.KNNClassifier(10, 10)
sizes = np.arange(10, 10001, 10)
accs = []
for size in tqdm(sizes):
    # Draw a fresh random training subset of the requested size.
    data_sample,label_sample = data_labels(df_train.sample(size, random_state=rng))
    clf.fit(data_sample, label_sample)
    accs.append(accuracy_score(test_labels, clf.predict(test_vectors)))
save_res(["size","acc"], zip(sizes, accs), "kNN-acc-kfixed")

In [None]:
# Scatter the saved accuracy against the training-set size.
df = pd.read_csv("res/kNN-acc-kfixed.csv")
g = sns.scatterplot(x='size', y='acc', data=df)
g.set_xlabel("size")
g.set_ylabel("Accuracy")

Variable k, fixed dataset size

In [None]:
# One seeded generator shared by both sampling calls so the evaluation and
# training subsets (and therefore the accuracy curve) are reproducible.
rng = np.random.RandomState(0)

# Evaluation set: 5 randomly chosen test rows per label.
test_vectors, test_labels = data_labels(
    df_test.groupby("label").sample(5, random_state=rng)
)
# Fixed training subset of 6000 rows, reused for every k.
data_sample,label_sample = data_labels(df_train.sample(6000, random_state=rng))

# Constructor arguments are presumably (k, number of classes) — TODO confirm.
clf = kNN.KNNClassifier(1, 10)
ks = np.arange(1, 1001, 1)
accs = []
for k in tqdm(ks):
    clf.setneighbors(k)
    # NOTE(review): the training data never changes inside this loop; the fit
    # could probably be hoisted if setneighbors does not invalidate it — confirm.
    clf.fit(data_sample, label_sample)
    accs.append(accuracy_score(test_labels, clf.predict(test_vectors)))
save_res(["k","acc"], zip(ks, accs), "kNN-acc-kvariable")

In [None]:
# Accuracy as a function of k: first the full sweep, then a zoom on the
# first 200 rows of the results.
df = pd.read_csv("res/kNN-acc-kvariable.csv")
for subset in (df, df.iloc[:200]):
    g = sns.lineplot(x='k', y='acc', data=subset)
    plt.xlabel("k")
    plt.ylabel("Accuracy")
    plt.show()

### PCA analysis

#### Principal component number

In [None]:
# Fit the PCA on the full training matrix. The constructor receives the
# number of input columns (presumably the number of components to keep —
# TODO confirm against the kNN module).
n_features = df_train_data.shape[1]
pca = kNN.PCA(n_features)
pca.fit(df_train_data)

##### Scree plot

In [None]:
values = pca.pc_values()

# Full spectrum of the values reported by the fitted PCA.
g = sns.lineplot(data=values, legend='full')
plt.xlabel("Components")
plt.ylabel("Eigenvalue")
plt.show()

# Zoom on the first 50 entries, with dashed reference lines at the heights
# of entries 5 and 25.
g = sns.lineplot(data=values[:50], legend='full')
for idx, colour in ((5, 'orange'), (25, 'r')):
    plt.axhline(y=values[idx], color=colour, linestyle='--')
plt.xlabel("Components")
plt.ylabel("Eigenvalue")
plt.show()

##### Accumulated explained variance

In [None]:
# Fetch the values once and reuse them; the total is computed with np.sum
# instead of the built-in sum, which iterates element by element.
values = pca.pc_values()
ratios = values / np.sum(values)
# Running total of the per-component ratios.
var_accum = ratios.cumsum()
g = sns.lineplot(data=var_accum)
# Dashed reference lines at the accumulated ratio of entries 5 and 25,
# matching the indices marked on the scree plot.
plt.axhline(y=var_accum[5], color='orange', linestyle='--')
plt.axhline(y=var_accum[25], color='r', linestyle='--')
plt.xlabel("Components")
plt.ylabel("Variance explained")