#### Configure project

In [1]:
# !cd .. && mkdir build
# !cd ../build/ && rm -rf *
# !rm -f *.so
# !cd ../build && cmake -DCMAKE_BUILD_TYPE=Release ..

#### Compile and install

In [2]:
# !cd ../build && make install

#### Imports

In [3]:
import kNN
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm.notebook import tqdm
from IPython.display import clear_output
import timeit
np.random.seed(1998)

In [4]:
import matplotlib.pyplot as plt

# importing the style package
from matplotlib import style
plt.style.use('seaborn-whitegrid')
plt.rcParams["figure.figsize"] = (10,5)

#### Utils

In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

labels_ = [0,1,2,3,4,5,6,7,8,9]
label_count_ = 10

def data_labels(dataset):
    return (dataset[dataset.columns[1:]].values, dataset["label"].values.reshape(-1,1))

def save_df(df, name):
    df.to_csv("res/{}.csv".format(name), index=False, header=True)

# Returns k splits of dataset (indexes)
def Kfold_gen_splits(dataset, K):
    return KFold(n_splits=K, shuffle=False, random_state=None).split(dataset)

def Kfold_get_split(dataset, split):
    train, test = split
    return (data_labels(dataset.iloc[train]), data_labels(dataset.iloc[test]))

def metrics(true_labels, pred_labels):
    accuracy = accuracy_score(y_true=true_labels, y_pred=pred_labels)
    precision = precision_score(y_true=true_labels, y_pred=pred_labels, labels=labels_, average='weighted', zero_division=0)
    recall = recall_score(y_true=true_labels, y_pred=pred_labels, labels=labels_, average='weighted', zero_division=0)
    f1 = f1_score(y_true=true_labels, y_pred=pred_labels, labels=labels_, average='weighted', zero_division=0)
    return [accuracy, precision, recall, f1]

def kNN_Kfold(dataset, K, ks, results):
    splits = Kfold_gen_splits(dataset, K)
    clf = kNN.KNNClassifier(0, label_count_)
    dataset_size = dataset.shape[0]
    for split in splits:
        train_folds, test_fold = Kfold_get_split(dataset, split)
        clf.fit(train_folds[0], train_folds[1])
        for k in ks:
            clf.setneighbors(k)
            pred_labels = clf.predict(test_fold[0])
            results.loc[len(results)] = [K, k, dataset_size] + metrics(test_fold[1], pred_labels)
    return results

#### Load dataset

In [6]:
df_train = pd.read_csv("../data/fashion-mnist_train.csv")
df_test = pd.read_csv("../data/fashion-mnist_test.csv")
label_description = {0:"T-shirt/top", 1:"Trouser", 2:"Pullover", 3:"Dress", 4:"Coat", 5:"Sandal", 6:"Shirt", 7:"Sneaker", 8:"Bag", 9:"Ankle boot"}

In [7]:
# # Check label distribution
# print(df_train["label"].value_counts(normalize=True))

# # Visualize
# examples = df_train.groupby("label").sample(1)
# fig = plt.figure()
# for i in range(0,10):
#     fig.add_subplot(2, 5, i+1)
#     example = examples.iloc[i].to_numpy()
#     plt.imshow(example[1:].reshape(28,28))
#     plt.axis('off')
#     plt.title(label_description[example[0]])

In [8]:
# Reduced dataset to test
df_train_small = df_train.groupby("label").sample(300)
df_train_small = df_train_small.sample(frac=1)

## Experimentation

In [9]:
columns = ["K", "k", "size", "accuracy", "precision", "recall", "f1"]

### Cross validation

#### Leave-One-Out CV

In [10]:
# K = df_train_small.shape[0]
# k = 10
# results = pd.DataFrame(columns=columns)
# kNN_Kfold(df_train_small, K, [k], results)
# save_df(results, 'LOOCV')

#### KFold CV

In [11]:
# Ks = np.arange(2, 30+2, 1)
# k = 10
# results = pd.DataFrame(columns=columns)
# for K in tqdm(Ks, position=0, leave=True):
#     kNN_Kfold(df_train_small, K, [k], results)
# save_df(results, "KFold_K")

In [12]:
# df = pd.read_csv('res/KFold_K.csv')
# df = df.groupby('K')
# means = df['accuracy'].mean().to_numpy()
# mins = means - df['accuracy'].min().to_numpy()
# maxs = df['accuracy'].max().to_numpy() - means
# ideal = pd.read_csv('res/LOOCV.csv')['accuracy'].mean()
# plt.errorbar(df['K'].mean().to_numpy(), means, yerr=[mins, maxs], fmt='o')
# plt.axhline(y=ideal, color='r', label='LOOCV estimation')
# plt.legend()
# plt.xlabel('K-Folds')
# plt.ylabel('Accuracy')

K=9 seems to produce good estimations without too much variance

In [13]:
K_ = 10

### kNN analysis

#### Accuracy

Fixed k, training dataset size variable

In [14]:
ks = [10]
sizes = np.arange(10, df_train_small.shape[0], 10)
results = pd.DataFrame(columns=columns)
for size in tqdm(sizes):
    df = df_train_small.groupby('label').apply(lambda x: x.sample(size))
    kNN_Kfold(df, K_, ks, results)
save_df(results, "kNN_k_fixed")

  0%|          | 0/299 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
df = pd.read_csv("res/kNN_k_fixed.csv")
df = 
g = sns.scatterplot(data=df, x='size', y='acc', label='accuracy')
plt.xlabel("Training dataset size")
plt.ylabel("Score")

Variable k, fixed dataset size

In [None]:
clf = kNN.KNNClassifier(1, 10)
clf.fit(train_vectors, train_labels)
ks = np.arange(1, 201, 1)
accs = []
f1s = []
for k in tqdm(ks):
    clf.setneighbors(k)
    accs.append(accuracy_score(val_labels, clf.predict(val_vectors)))
save_res(["k","acc"], zip(ks, accs), "kNN-acc-kvariable")

In [None]:
df = pd.read_csv("res/kNN-acc-kvariable.csv")
g = sns.lineplot(data=df, x='k', y='acc')
plt.xlabel("k")
plt.ylabel("Accuracy")
plt.show()

#### Performance

k proportional to training dataset size

In [None]:
clf = kNN.KNNClassifier(1, 10)
sizes = np.arange(1, 1001, 1)
times = []
results = pd.DataFrame(columns=['size', 'time', 'k'])
for i in tqdm(sizes):
    clf.fit(train_data_small[0:i], train_labels_small[0:i])
    clf.setneighbors(i)
    results.loc[len(results)] = [i, (timeit.timeit(lambda: clf.predict(val_data_small[0]), number = 20)/20),i]
save_df(results, 'kNN_time_k_prop')

Fixed k, training dataset size variable

In [None]:
clf = kNN.KNNClassifier(1, 10)
sizes = np.arange(1, 1001, 1)
times = []
results = pd.DataFrame(columns=['size', 'time', 'k'])
for i in tqdm(sizes):
    clf.fit(train_data_small[0:i], train_labels_small[0:i])
    results.loc[len(results)] = [i, (timeit.timeit(lambda: clf.predict(val_data_small[0]), number = 100)/100), 1]
save_df(results, 'kNN_time_k_fixed')

In [None]:
df_kfixed = pd.read_csv("res/kNN_time_k_fixed.csv")
df_kprop = pd.read_csv("res/kNN_time_k_prop.csv")
g = sns.lineplot(data=df_kfixed, x='size', y='time', label='k = 1')
sns.lineplot(data=df_kprop, x='size', y='time', label='k = Dataset size')
plt.xlabel("Train dataset size")
plt.ylabel("Time(s)")

### PCA analysis

In [None]:
# Fit PCA
pca = kNN.PCA(train_vectors.shape[1])
pca.fit(train_vectors)

#### Principal component number

##### Scree plot

In [None]:
values = pca.pc_values()
g = sns.lineplot(data=values[0:100], legend='full')
plt.axhline(y=values[2], color='orange', linestyle='--')
plt.axhline(y=values[50], color='r', linestyle='--')
plt.xlabel("Components")
plt.ylabel("Eigenvalue")
plt.show()

##### Accumulated explained variance

In [None]:
ratios = pca.pc_values()/sum(values)
var_accum = ratios.cumsum()
g = sns.lineplot(data=var_accum, label='Variance explained')
plt.axhline(y=var_accum[2], color='orange', linestyle='--', label='1 Component')
plt.axhline(y=var_accum[50], color='r', linestyle='--', label='50 Components')
plt.legend()
plt.xlabel("Components")
plt.ylabel("Variance explained")

##### kNN + PCA

Now lets see how alpha affects kNN accuracy

In [None]:
ks = np.arange(5, 20, 1)
alphas = np.arange(25, 75,1)
accs = []
clf = kNN.KNNClassifier(1, 10)
for alpha in tqdm(alphas):
    pca.setalpha(alpha)
    train_vectors_t = pca.transform(train_vectors)
    for k in ks:
        clf.setneighbors(k)
        clf.fit(train_vectors_t, train_labels)
        pred_labels = clf.predict(val_vectors)
        accs.append(accuracy_score(val_labels, pred_labels))
    save_res(["k","acc"], zip(ks, accs), "kNN-acc-kvariable")

### PCA & t-sne

In [None]:
pca.setalpha(2)
train_vectors_t = pca.transform(train_vectors)
df = pd.DataFrame(train_vectors_t, columns=["x","y"])
df["label"] = train_labels
g = sns.scatterplot(data=df, x='x', y='y', hue='label', palette=sns.color_palette("hls",10))
plt.xlabel('Component 1')
plt.ylabel('Component 2')

In [None]:
from sklearn.manifold import TSNE
X_embedded = TSNE(n_components=2, perplexity=50, early_exaggeration=20, learning_rate=500, init='random').fit_transform(train_vectors)
df = pd.DataFrame(X_embedded, columns=["x", "y"])
df["label"] = train_labels
g = sns.scatterplot(data=df, x='x', y='y', hue='label', palette=sns.color_palette("hls",10))

In [None]:
pca.setalpha(25)
train_vectors_t = pca.transform(train_vectors)
X_embedded = TSNE(n_components=2, perplexity=50, learning_rate='auto', init='random').fit_transform(train_vectors_t)
df = pd.DataFrame(X_embedded, columns=["x", "y"])
df["label"] = train_labels
g = sns.scatterplot(data=df, x='x', y='y', hue='label', palette=sns.color_palette("hls",10))