#### Configure project

In [None]:
# Optional clean-up steps, kept commented out: create ../build, empty it, and
# delete any *.so files from the current directory.
# !cd .. && mkdir build
# !cd ../build/ && rm -rf *
# !rm -f *.so
# Run CMake from inside ../build against the parent directory, requesting a
# Release build.
!cd ../build && cmake -DCMAKE_BUILD_TYPE=Release ..

#### Compile and install

In [None]:
# Build the configured project and run its `install` target from ../build.
!cd ../build && make install

#### Imports

In [None]:
import kNN
from utils import *
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import timeit

# Fix the seed of NumPy's global random generator so that random draws made
# through it are repeatable between runs of the notebook.
np.random.seed(1998)

#### Graph style

In [None]:
# Configure seaborn in ONE call. Every sns.set() call re-applies all theme
# defaults, so setting the style, the palette and the rc overrides in separate
# calls makes the later call silently undo the earlier ones.
sns.set(style='ticks', palette='Set2', rc={'figure.figsize': (10, 5)})
# Layer matplotlib's "fivethirtyeight" style sheet on top; it overrides the
# rcParams it defines and leaves the others (e.g. the figure size) untouched.
plt.style.use('fivethirtyeight')
# Have matplotlib apply tight_layout automatically to every figure it draws.
# Calling plt.tight_layout() here would only act on a freshly created, empty
# figure and would not affect any of the plots made later.
plt.rcParams['figure.autolayout'] = True

#### Load dataset

In [None]:
# Training split of the dataset; rows are read straight from the CSV file.
df_train = pd.read_csv("../data/fashion-mnist_train.csv")
# df_test = pd.read_csv("../data/fashion-mnist_test.csv")

# Human-readable name for each numeric label; the list position is the label.
label_description = dict(enumerate([
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]))

In [None]:
# Relative frequency of every label in the training set (normalize=True
# returns fractions instead of raw counts).
df_train["label"].value_counts(normalize=True)

In [None]:
# Draw one random row per label and display the ten images on a 2x5 grid.
examples = df_train.groupby("label").sample(1)
fig = plt.figure()
for i in range(10):
    example = examples.iloc[i].to_numpy()
    # First entry of the row is the label, the remaining entries are pixels.
    row_label, pixels = example[0], example[1:]
    ax = fig.add_subplot(2, 5, i + 1)
    ax.imshow(pixels.reshape(28, 28))
    ax.axis('off')
    ax.set_title(label_description[row_label])


Check label distribution

In [None]:
# Reduced, class-balanced working set: 300 random rows per label, then
# shuffled so the rows are no longer grouped by label.
df_train_small = df_train.groupby("label").sample(300)
df_train_small = df_train_small.sample(frac=1)

# Arrays used by the kNN, PCA and t-SNE cells further down. They were
# previously only present as commented-out code (referring to a non-existent
# `df_train_3000`), so a fresh top-to-bottom run failed with NameError.
train_vectors, train_labels = data_labels(df_train_small)
# Balanced validation sample: 50 random rows per label from the test CSV.
df_test = pd.read_csv("../data/fashion-mnist_test.csv")
val_vectors, val_labels = data_labels(df_test.groupby("label").sample(50))

# Last expression of the cell: shows the label proportions of the reduced set.
df_train_small["label"].value_counts(normalize=True)

### Cross validation

#### Leave-One-Out CV

In [None]:
# Leave-one-out: use as many folds as there are rows in the reduced dataset.
K = df_train_small.shape[0]
splits = Kfold_gen(df_train_small, K)
ks = [10]
# One kNN_predict result per fold, collected in fold order.
fold_results = [
    kNN_predict(Kfold_split(df_train_small, split), ks)
    for split in tqdm(splits)
]
# Summing over folds collapses (K, k, metric_count) into (k, metric_count);
# scaling by 1/K then turns the totals into per-fold means.
res = sum(fold_results)
res *= 1/K

save_res(['Accuracy', 'Precision', 'Recall', 'F1'], res, 'LOOCV')

#### KFold CV

In [None]:
# Evaluate K-fold cross validation for every fold count from 2 to 30.
Ks = np.arange(2,30+1,1)
result_columns = ['K', 'Acc_min', 'Acc_max', 'Accuracy', 'Precision', 'Recall', 'F1']
results = pd.DataFrame(columns=result_columns)
results['K'] = Ks
for K in tqdm(Ks):
    splits = Kfold_gen(df_train_small, K)
    ks = [10]
    fold_results = [
        kNN_predict(Kfold_split(df_train_small, split), ks)
        for split in splits
    ]
    # Entry [0][0] of a fold result is read as that fold's accuracy; keep the
    # per-fold values so the best and worst fold can be reported.
    accs = [fold[0][0] for fold in fold_results]
    # Sum over folds, (K, k, metric_count) -> (k, metric_count), then divide
    # by the number of folds to obtain the mean of every metric.
    res = sum(fold_results)
    res *= 1/K
    summary_row = np.concatenate(([min(accs), max(accs)], res[0]))
    results.loc[results['K'] == K, 'Acc_min':] = summary_row

save_res(result_columns, results, "KFold_K")

In [None]:
# Scratch check: prepend the value 1 to the first row of `res`, the averaged
# metrics left behind by the last iteration of the cross-validation loop.
np.concatenate([[1],res[0]])

### kNN analysis

#### Performance

k proportional to training dataset size

In [None]:
# Time a single-sample prediction while the training set grows from 1 to 1000
# rows and the neighbour count is kept equal to the training-set size.
clf = kNN.KNNClassifier(1, 10)
sizes = np.arange(1, 1001, 1)
n_runs = 20
times = []
for n_train in tqdm(sizes):
    clf.fit(train_vectors[:n_train], train_labels[:n_train])
    clf.setneighbors(n_train)
    total = timeit.timeit(lambda: clf.predict(val_vectors[0]), number=n_runs)
    # Store the mean time of one call rather than the total of all runs.
    times.append(total / n_runs)
save_res(["size","time"], zip(sizes, times), "kNN-time-kprop")

Fixed k, training dataset size variable

In [None]:
# Time a single-sample prediction while the training set grows from 1 to 1000
# rows; the classifier keeps the neighbour count it was constructed with.
clf = kNN.KNNClassifier(1, 10)
sizes = np.arange(1, 1001, 1)
n_runs = 100
times = []
for n_train in tqdm(sizes):
    clf.fit(train_vectors[:n_train], train_labels[:n_train])
    total = timeit.timeit(lambda: clf.predict(val_vectors[0]), number=n_runs)
    # Store the mean time of one call rather than the total of all runs.
    times.append(total / n_runs)
save_res(["size","time"], zip(sizes, times), "kNN-time-kfixed")

In [None]:
# Overlay both timing experiments (read back from their CSV files) on one axes.
df_kfixed = pd.read_csv("res/kNN-time-kfixed.csv")
df_kprop = pd.read_csv("res/kNN-time-kprop.csv")
g = sns.lineplot(x='size', y='time', data=df_kfixed, label='k = 1')
sns.lineplot(x='size', y='time', data=df_kprop, label='k = Dataset size', ax=g)
g.set_xlabel("Train dataset size")
g.set_ylabel("Time(s)")

#### Accuracy

Fixed k, training dataset size variable

In [None]:
# Accuracy on the validation set for random training samples of growing size
# (10, 20, ..., 5000 rows) with a classifier built as KNNClassifier(10, 10).
clf = kNN.KNNClassifier(10, 10)
sizes = np.arange(10, 5001, 10)
accs = []
for size in tqdm(sizes):
    training_sample = df_train.sample(size)
    data_sample, label_sample = data_labels(training_sample)
    clf.fit(data_sample, label_sample)
    pred_labels = clf.predict(val_vectors)
    score = accuracy_score(val_labels, pred_labels)
    accs.append(score)
save_res(["size","acc"], zip(sizes, accs), "kNN-acc-kfixed")

In [None]:
# Scatter plot of accuracy against training-set size, read from the saved CSV.
df = pd.read_csv("res/kNN-acc-kfixed.csv")
g = sns.scatterplot(x='size', y='acc', data=df, label='accuracy')
g.set_xlabel("Training dataset size")
g.set_ylabel("Score")

Variable k, fixed dataset size

In [None]:
# Fit once on the full training arrays, then sweep the neighbour count from 1
# to 200 and record the validation accuracy for each value.
ks = np.arange(1, 201, 1)
clf = kNN.KNNClassifier(1, 10)
clf.fit(train_vectors, train_labels)
# NOTE: f1s is initialised but never filled in by this cell.
accs, f1s = [], []
for k in tqdm(ks):
    clf.setneighbors(k)
    predicted = clf.predict(val_vectors)
    accs.append(accuracy_score(val_labels, predicted))
save_res(["k","acc"], zip(ks, accs), "kNN-acc-kvariable")

In [None]:
# Line plot of accuracy against k, read from the saved CSV.
df = pd.read_csv("res/kNN-acc-kvariable.csv")
g = sns.lineplot(x='k', y='acc', data=df)
g.set_xlabel("k")
g.set_ylabel("Accuracy")
plt.show()

### PCA analysis

In [None]:
# Build the PCA object with the width of the training matrix as its argument
# (presumably the number of components to keep -- confirm against kNN.PCA),
# then fit it on the training vectors.
n_features = train_vectors.shape[1]
pca = kNN.PCA(n_features)
pca.fit(train_vectors)

#### Principal component number

##### Scree plot

In [None]:
# Scree plot: the first 100 entries returned by pca.pc_values(), with dashed
# horizontal guides at the values found at positions 2 and 50.
values = pca.pc_values()
g = sns.lineplot(data=values[:100], legend='full')
for position, colour in ((2, 'orange'), (50, 'r')):
    plt.axhline(y=values[position], color=colour, linestyle='--')
plt.xlabel("Components")
plt.ylabel("Eigenvalue")
plt.show()

##### Accumulated explained variance

In [None]:
# Cumulative share of the total variance captured by the leading components.
# The values are fetched here so the cell does not depend on a variable
# defined by the scree-plot cell.
values = pca.pc_values()
ratios = values / values.sum()
var_accum = ratios.cumsum()
# var_accum[i] covers the first i + 1 components, so plot against a 1-based
# component count instead of the 0-based array position.
component_counts = np.arange(1, len(var_accum) + 1)
g = sns.lineplot(x=component_counts, y=var_accum, label='Variance explained')
# Same marked values as before (positions 2 and 50); the legend text is now
# derived from the position so it states the real number of components
# (the old labels claimed 1 and 50 for what are 3 and 51 components).
for position, colour in ((2, 'orange'), (50, 'r')):
    plt.axhline(y=var_accum[position], color=colour, linestyle='--',
                label=f'{position + 1} Components')
plt.legend()
plt.xlabel("Components")
plt.ylabel("Variance explained")

##### kNN + PCA

Now let's see how alpha (the number of principal components kept) affects kNN accuracy

In [None]:
# Grid search over the PCA setting alpha (25..74) and the neighbour count k
# (5..19), recording the validation accuracy of every combination.
ks = np.arange(5, 20, 1)
alphas = np.arange(25, 75, 1)
clf = kNN.KNNClassifier(1, 10)
grid_scores = []
for alpha in tqdm(alphas):
    pca.setalpha(alpha)
    train_vectors_t = pca.transform(train_vectors)
    # The validation vectors must be projected into the same reduced space as
    # the training vectors before they are handed to the classifier.
    val_vectors_t = pca.transform(val_vectors)
    # The training data only depends on alpha, so fit once per alpha and just
    # change the neighbour count inside the inner loop.
    clf.fit(train_vectors_t, train_labels)
    for k in ks:
        clf.setneighbors(k)
        pred_labels = clf.predict(val_vectors_t)
        grid_scores.append((alpha, k, accuracy_score(val_labels, pred_labels)))
# Save once, after the whole grid, with alpha as an explicit column and under
# its own name. Saving inside the loop as "kNN-acc-kvariable" overwrote the
# results of the variable-k experiment and, because zip() stops at the shorter
# input, only ever stored the scores of the first alpha.
save_res(["alpha", "k", "acc"], grid_scores, "kNN-PCA-acc-alpha-k")

### PCA & t-SNE

In [None]:
# Project the training vectors with alpha = 2 and scatter the two resulting
# columns, coloured by label.
pca.setalpha(2)
train_vectors_t = pca.transform(train_vectors)
df = pd.DataFrame(train_vectors_t, columns=["x","y"]).assign(label=train_labels)
label_palette = sns.color_palette("hls",10)
g = sns.scatterplot(x='x', y='y', hue='label', data=df, palette=label_palette)
g.set_xlabel('Component 1')
g.set_ylabel('Component 2')

In [None]:
from sklearn.manifold import TSNE
# Two-dimensional t-SNE embedding computed directly on the untransformed
# training vectors, scattered and coloured by label.
tsne = TSNE(
    n_components=2,
    perplexity=50,
    early_exaggeration=20,
    learning_rate=500,
    init='random',
)
X_embedded = tsne.fit_transform(train_vectors)
df = pd.DataFrame(X_embedded, columns=["x", "y"]).assign(label=train_labels)
label_palette = sns.color_palette("hls",10)
g = sns.scatterplot(x='x', y='y', hue='label', data=df, palette=label_palette)

In [None]:
# Reduce the training vectors with the PCA at alpha = 25 first, then compute a
# two-dimensional t-SNE embedding of the reduced vectors and scatter it.
pca.setalpha(25)
train_vectors_t = pca.transform(train_vectors)
tsne = TSNE(
    n_components=2,
    perplexity=50,
    learning_rate='auto',
    init='random',
)
X_embedded = tsne.fit_transform(train_vectors_t)
df = pd.DataFrame(X_embedded, columns=["x", "y"]).assign(label=train_labels)
label_palette = sns.color_palette("hls",10)
g = sns.scatterplot(x='x', y='y', hue='label', data=df, palette=label_palette)