#### Configure project

In [None]:
# !cd .. && mkdir build
# !cd ../build/ && rm -rf *
# !rm -f *.so
# !cd ../build && cmake -DCMAKE_BUILD_TYPE=Release ..

#### Compile and install

In [None]:
# !cd ../build && make install

#### Imports

In [None]:
import kNN
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import timeit
import time
# Seed NumPy's global random number generator so that code drawing from it
# produces the same values on every run.
np.random.seed(1998)

#### Utils

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

# Class ids handed to the scikit-learn scorers in `metrics`, and the number of
# classes passed to kNN.KNNClassifier.
labels_ = list(range(10))
label_count_ = len(labels_)

def data_labels(dataset):
    """Split a frame into its feature matrix and its label column.

    The first column is treated as the label; every remaining column is a
    feature.

    Returns:
        A ``(features, labels)`` tuple where ``features`` is the NumPy array of
        all columns but the first and ``labels`` is the first column as a
        pandas Series (original index preserved).
    """
    column_names = dataset.columns
    features = dataset[column_names[1:]].values
    labels = dataset[column_names[0]]
    return (features, labels)

def save_df(df, name):
    """Write *df* to ``res/<name>.csv`` with a header row and without the index.

    The ``res`` directory is created when it is missing, so the first save in
    a fresh working directory does not fail.
    """
    import os  # local import keeps this helper self-contained

    os.makedirs("res", exist_ok=True)
    df.to_csv("res/{}.csv".format(name), index=False, header=True)

def Kfold_gen_splits(dataset, K):
    """Return an iterator of (train_indexes, test_indexes) pairs for K folds.

    Rows are shuffled before splitting; no explicit random_state is given to
    the splitter.
    """
    splitter = KFold(n_splits=K, shuffle=True, random_state=None)
    return splitter.split(dataset)

def Kfold_get_split(dataset, split):
    """Materialise one fold from a pair of positional index arrays.

    Returns:
        ``((train_data, train_labels), (val_data, val_labels))``, each inner
        pair produced by ``data_labels`` on the selected rows.
    """
    train_idx, val_idx = split
    train_part = data_labels(dataset.iloc[train_idx])
    val_part = data_labels(dataset.iloc[val_idx])
    return (train_part, val_part)

def metrics(true_labels, pred_labels):
    """Score predictions against the true labels.

    Returns:
        ``[accuracy, precision, recall, f1]``. Precision, recall and F1 are
        weighted averages over ``labels_``; undefined scores count as 0.
    """
    # Options shared by the three per-class scorers.
    weighted = dict(labels=labels_, average='weighted', zero_division=0)
    scores = [accuracy_score(y_true=true_labels, y_pred=pred_labels)]
    for scorer in (precision_score, recall_score, f1_score):
        scores.append(scorer(y_true=true_labels, y_pred=pred_labels, **weighted))
    return scores

def kNN_Kfold(dataset, K, ks, results, pbar_splits=False, pbar_ks=False):
    """Run K-fold cross-validation of the kNN classifier for every k in *ks*.

    For each fold the classifier is fitted once on the training folds and then
    evaluated on the held-out fold for each neighbour count. One row
    ``[K, k, dataset_size, accuracy, precision, recall, f1]`` is appended to
    *results* per (fold, k) pair.

    Args:
        dataset: Frame whose first column is the label (see ``data_labels``).
        K: Number of folds.
        ks: Iterable of neighbour counts to evaluate.
        results: DataFrame that receives the rows; modified in place.
        pbar_splits: Show a progress bar over the folds.
        pbar_ks: Show a progress bar over the neighbour counts.

    Returns:
        The same *results* frame.
    """
    fold_indexes = Kfold_gen_splits(dataset, K)
    # First argument is a placeholder; presumably the neighbour count, which
    # is set per iteration through setneighbors below.
    classifier = kNN.KNNClassifier(0, label_count_)
    n_rows = dataset.shape[0]
    inner_bar = tqdm(total=len(ks), disable=not pbar_ks)
    for fold in tqdm(list(fold_indexes), disable=not pbar_splits):
        (train_x, train_y), (val_x, val_y) = Kfold_get_split(dataset, fold)
        classifier.fit(train_x, train_y)
        for n_neighbors in ks:
            classifier.setneighbors(n_neighbors)
            predicted = classifier.predict(val_x)
            row = [K, n_neighbors, n_rows] + metrics(val_y, predicted)
            results.loc[len(results)] = row
            inner_bar.update()
        # Rewind the inner bar so it can be reused for the next fold.
        inner_bar.refresh()
        inner_bar.reset()
    return results

def kNN_PCA_Kfold(dataset, K, ks, alphas, results, pbar_splits=False, pbar_ks=False, pbar_alphas=False):
    """Cross-validate PCA + kNN over every (alpha, k) pair and log the metrics.

    For each fold the PCA is fitted once on the training folds. For each alpha
    the training and held-out data are transformed and the classifier refitted;
    for each k the held-out fold is predicted. One row
    ``[K, k, alpha, dataset_size, accuracy, precision, recall, f1]`` is
    appended to *results* per (fold, alpha, k) triple.

    Args:
        dataset: Frame whose first column is the label (see ``data_labels``).
        K: Number of folds.
        ks: Iterable of neighbour counts to evaluate.
        alphas: Iterable of PCA alpha values to evaluate.
        results: DataFrame that receives the rows; modified in place.
        pbar_splits: Show a progress bar over the folds.
        pbar_ks: Show a progress bar over the neighbour counts.
        pbar_alphas: Show a progress bar over the alpha values.

    Returns:
        The same *results* frame, matching what ``kNN_Kfold`` returns.
    """
    # Progress Bars
    ks_pbar = tqdm(total=len(ks), disable=not(pbar_ks))
    ks_pbar.set_description("k loop")
    alphas_pbar = tqdm(total=len(alphas), disable=not(pbar_alphas))
    alphas_pbar.set_description("alpha loop")

    splits = Kfold_gen_splits(dataset, K)
    # Constructor arguments of 0 are placeholders; the actual k and alpha are
    # set inside the loops via setneighbors / setalpha.
    clf = kNN.KNNClassifier(0, label_count_)
    dataset_size = dataset.shape[0]
    pca = kNN.PCA(0)
    for split in tqdm(list(splits), disable=not(pbar_splits)):
        train, test = Kfold_get_split(dataset, split)
        # Fit on the training folds only, so the held-out fold does not
        # influence the projection.
        pca.fit(train[0])
        for alpha in alphas:
            pca.setalpha(alpha)
            train_t = pca.transform(train[0])
            test_t = pca.transform(test[0])
            clf.fit(train_t, train[1])
            for k in ks:
                clf.setneighbors(k)
                pred_labels = clf.predict(test_t)
                results.loc[len(results)] = [K, k, alpha, dataset_size] + metrics(test[1], pred_labels)
                ks_pbar.update()
            alphas_pbar.update()
            # Rewind the k bar for the next alpha.
            ks_pbar.refresh()
            ks_pbar.reset()
        # Rewind the alpha bar for the next fold.
        alphas_pbar.refresh()
        alphas_pbar.reset()
    return results

# Result-frame schemas. The trailing four names follow the order of the list
# returned by `metrics`.
_metric_columns = ["accuracy", "precision", "recall", "f1"]
columns_kNN = ["K", "k", "size"] + _metric_columns
columns_PCA = ["K", "k", "alpha", "size"] + _metric_columns

#### Load dataset

In [None]:
# Human-readable name of each class id.
label_description = {
    0: "T-shirt/top",
    1: "Trouser",
    2: "Pullover",
    3: "Dress",
    4: "Coat",
    5: "Sandal",
    6: "Shirt",
    7: "Sneaker",
    8: "Bag",
    9: "Ankle boot",
}
df_train = pd.read_csv("../data/fashion-mnist_train.csv")
df_test = pd.read_csv("../data/fashion-mnist_test.csv")

# Reduced dataset to test: 300 randomly drawn rows for each label
df_train_small = df_train.groupby("label").sample(n=300)

## Experimentation

### Cross validation

#### Leave-One-Out CV

In [None]:
# Leave-one-out: as many folds as there are rows in the reduced dataset.
K = len(df_train_small)
k = 1
results = pd.DataFrame(columns=columns_kNN)
kNN_Kfold(df_train_small, K, [k], results, pbar_splits=True)
save_df(results, 'LOOCV_kNN')

In [None]:
# Leave-one-out with PCA: one fold per row, a single (k, alpha) pair.
K = len(df_train_small)
k = 5
alpha = 20
results = pd.DataFrame(columns=columns_PCA)
kNN_PCA_Kfold(df_train_small, K, [k], [alpha], results, pbar_splits=True)
save_df(results, 'LOOCV_PCA')

#### KFold CV

In [None]:
# Sweep the number of folds from 2 to 30 with k fixed at 1.
Ks = np.arange(2, 31)
k = 1
results = pd.DataFrame(columns=columns_kNN)
for K in tqdm(Ks, leave=True, position=0):
    kNN_Kfold(df_train_small, K, [k], results)
save_df(results, "KFold_K_kNN")

In [None]:
# Sweep the number of folds from 2 to 30 with k and alpha fixed.
Ks = np.arange(2, 31)
k = 5
alpha = 20
results = pd.DataFrame(columns=columns_PCA)
for K in tqdm(Ks, leave=True, position=0):
    kNN_PCA_Kfold(df_train_small, K, [k], [alpha], results, pbar_splits=True)
save_df(results, "KFold_K_PCA")

K=10 seems to be a reasonable value

In [None]:
# Fold count used by the remaining cross-validation experiments.
K_ = 10

### kNN analysis

#### Accuracy

**Dataset size variable**
1. Fixed k (1)
2. Variable k proportional to dataset size (0.1)
3. Variable k proportional to dataset size (size - size/K)

In [None]:
# Total sample counts to evaluate: 10, 20, ... up to the reduced dataset size.
sizes = np.arange(10, len(df_train_small) + 1, 10)
# Rows grouped by label; sampling from this object draws the same number of
# rows from every label.
df = df_train_small.groupby(by="label")

In [None]:
# 1 - k stays at 1 while the dataset grows
k = 1
results = pd.DataFrame(columns=columns_kNN)
for size in tqdm(sizes):
    # size // 10 rows from each label group
    balanced_sample = df.sample(n=size // 10)
    kNN_Kfold(balanced_sample, K_, [k], results)
save_df(results, "kNN_k_fixed")

In [None]:
# 2 - k is one tenth of the dataset size
results = pd.DataFrame(columns=columns_kNN)
for size in tqdm(sizes):
    k = size // 10
    # size // 10 rows from each label group
    balanced_sample = df.sample(n=size // 10)
    kNN_Kfold(balanced_sample, K_, [k], results)
save_df(results, "kNN_k_proportional")

In [None]:
# 3 - k equals the number of rows left for training in each fold
results = pd.DataFrame(columns=columns_kNN)
for size in tqdm(sizes):
    # One of the K_ folds is held out, so size - size // K_ rows remain for
    # training. Using K_ (not a literal 10) keeps k valid if the fold count
    # changes.
    k = size - size // K_
    kNN_Kfold(df.sample(size//10), K_, [k], results)
save_df(results, "kNN_k_size")

**Fixed dataset size**
1. k variable (linear)
2. k variable (log)

In [None]:
# 1 - every k from 1 to 199
ks = np.arange(1, 200)
results = pd.DataFrame(columns=columns_kNN)
kNN_Kfold(df_train_small, K_, ks, results, pbar_splits=True, pbar_ks=True)
save_df(results, "kNN_k_linspace")

In [None]:
# 2 - geometrically spaced k values from 1 up to the training-fold size
train_rows = df_train_small.shape[0] - df_train_small.shape[0] // K_
# Truncating to int creates repeats at the low end; np.unique removes them.
ks = np.unique(np.geomspace(1, train_rows).astype(int))
results = pd.DataFrame(columns=columns_kNN)
kNN_Kfold(df_train_small, K_, ks, results, pbar_splits=True, pbar_ks=True)
save_df(results, "kNN_k_logspace")

#### Performance

k proportional to training dataset size

In [None]:
# Feature matrix and label column of the reduced training set, reused by the
# timing cells below.
perf_data, perf_labels = data_labels(df_train_small)

In [None]:
# Time predicting the first sample while the training-set size and k grow
# together (k is set equal to the number of training rows).
clf = kNN.KNNClassifier(1, label_count_)
sizes = np.arange(1, 1001, 1)
times = []  # NOTE(review): never read in this cell
results = pd.DataFrame(columns=['size', 'time', 'k'])
for size in tqdm(sizes):
    clf.fit(perf_data[0:size], perf_labels[0:size])
    clf.setneighbors(size)
    # Mean duration of one predict call, averaged over 20 repetitions.
    results.loc[len(results)] = [size, (timeit.timeit(lambda: clf.predict(perf_data[0]), number = 20)/20),size]
save_df(results, "kNN_perf_k_proportional")

Fixed k, training dataset size variable

In [None]:
# Time predicting the first sample with k fixed at 1 while the training-set
# size grows. The class count comes from label_count_, like every other
# classifier in this notebook, instead of a hard-coded 10.
clf = kNN.KNNClassifier(1, label_count_)
sizes = np.arange(1, 1001, 1)
times = []  # NOTE(review): never read in this cell
results = pd.DataFrame(columns=['size', 'time', 'k'])
for size in tqdm(sizes):
    clf.fit(perf_data[0:size], perf_labels[0:size])
    # Mean duration of one predict call, averaged over 100 repetitions.
    results.loc[len(results)] = [size, (timeit.timeit(lambda: clf.predict(perf_data[0]), number = 100)/100), 1]
save_df(results, 'kNN_perf_k_fixed')

### PCA & t-SNE

In [None]:
from sklearn.manifold import TSNE
# Split the full train and test frames into feature arrays and label columns.
train_data, train_labels = data_labels(df_train)
test_data, test_labels = data_labels(df_test)

In [None]:
# 2-D t-SNE embedding of the raw training data.
# learning_rate='auto' is given explicitly, as in the PCA + t-SNE cells, so
# the embedding settings do not depend on the installed scikit-learn default.
X_embedded = TSNE(n_components=2, perplexity=6, early_exaggeration=20, learning_rate='auto', init='random').fit_transform(train_data)
save_df(pd.DataFrame(X_embedded, columns=["x", "y"]), "tsne_n2")

In [None]:
# Fit PCA
pca = kNN.PCA(train_data.shape[1])
pca.fit(train_data)
pca.setalpha(2)

train_data_t = pca.transform(train_data)
# Pass the frame straight to save_df: binding it to `df` would overwrite the
# per-label groupby object that the dataset-size experiments sample from.
save_df(pd.DataFrame(train_data_t, columns=['x','y']), "PCA_alpha2")

In [None]:
# Fit PCA on the training data and transform it with alpha = 5
# (presumably the number of components kept - confirm against kNN.PCA).
pca = kNN.PCA(train_data.shape[1])
pca.fit(train_data)
pca.setalpha(5)
train_data_t = pca.transform(train_data)

# Embed the transformed data in 2-D with t-SNE and store the coordinates.
X_embedded = TSNE(
    n_components=2,
    perplexity=6,
    early_exaggeration=20,
    learning_rate='auto',
    init='random',
).fit_transform(train_data_t)
save_df(pd.DataFrame(X_embedded, columns=["x", "y"]), "tsne_PCA_alpha5")

In [None]:
# Fit PCA on the training data and transform it with alpha = 50
# (presumably the number of components kept - confirm against kNN.PCA).
pca = kNN.PCA(train_data.shape[1])
pca.fit(train_data)
pca.setalpha(50)
train_data_t = pca.transform(train_data)

# Embed the transformed data in 2-D with t-SNE and store the coordinates.
X_embedded = TSNE(
    n_components=2,
    perplexity=6,
    early_exaggeration=20,
    learning_rate='auto',
    init='random',
).fit_transform(train_data_t)
save_df(pd.DataFrame(X_embedded, columns=["x", "y"]), "tsne_PCA_50")

In [None]:
# Fit PCA on the training data and transform it with alpha = 100
# (presumably the number of components kept - confirm against kNN.PCA).
pca = kNN.PCA(train_data.shape[1])
pca.fit(train_data)
pca.setalpha(100)
train_data_t = pca.transform(train_data)

# Embed the transformed data in 2-D with t-SNE and store the coordinates.
X_embedded = TSNE(
    n_components=2,
    perplexity=6,
    early_exaggeration=20,
    learning_rate='auto',
    init='random',
).fit_transform(train_data_t)
save_df(pd.DataFrame(X_embedded, columns=["x", "y"]), "tsne_PCA_100")

### PCA analysis

#### Principal component number

##### Scree plot

In [None]:
# Fit PCA
pca = kNN.PCA(train_data.shape[1])
pca.fit(train_data)
# NOTE(review): presumably one value per principal component (e.g. the
# eigenvalues) for the scree plot - confirm against the kNN.PCA bindings.
values = pca.pc_values()

save_df(pd.DataFrame(values, columns=['values']), "PCA_component_values")

##### Cumulative explained variance

##### kNN + PCA

Now let's see how alpha affects kNN accuracy

In [None]:
# Dense grid: every k and every alpha from 1 to 100.
ks = np.arange(1, 101)
alphas = np.arange(1, 101)
results = pd.DataFrame(columns=columns_PCA)
kNN_PCA_Kfold(
    df_train_small, K_, ks, alphas, results,
    pbar_splits=True, pbar_ks=True, pbar_alphas=True,
)
save_df(results, 'kNN_PCA_k_alpha_variable_small')

In [None]:
# Coarse grid: 28 k values up to the training-fold size and 28 alpha values
# up to the number of features.
ks = np.linspace(1, df_train_small.shape[0] - df_train_small.shape[0]//K_, num=28,dtype=int)
# The first column of df_train_small is the label, so the number of features
# (the largest valid alpha) is shape[1] - 1, not shape[1].
alphas = np.linspace(1, df_train_small.shape[1] - 1, num=28, dtype=int)
results = pd.DataFrame(columns=columns_PCA)
kNN_PCA_Kfold(df_train_small, K_, ks, alphas, results, True, True, True)
save_df(results, 'kNN_PCA_k_alpha_variable')

Test optimized parameters against train and test datasets

In [None]:
# Predict labels with kNN
# perf_counter is a monotonic, high-resolution clock, so the measured interval
# cannot be distorted by system clock adjustments (unlike time.time()).
knn_start = time.perf_counter()

clf = kNN.KNNClassifier(6, label_count_)
clf.fit(train_data, train_labels)
pred_labels = clf.predict(test_data)

knn_time = time.perf_counter() - knn_start
print("kNN fit/predict time: %.3fs" % knn_time)

# Store true and predicted labels side by side for later analysis.
results = pd.DataFrame(columns=['true_labels', 'pred_labels'])
results['true_labels'] = test_labels
results['pred_labels'] = pred_labels
save_df(results, "kNN_k6")

In [None]:
# Dimensionality reduction with PCA
# perf_counter is a monotonic, high-resolution clock, so the measured
# intervals cannot be distorted by system clock adjustments (unlike
# time.time()).
pca_start = time.perf_counter()

pca = kNN.PCA(43)
pca.fit(train_data)
train_data_t = pca.transform(train_data)
test_data_t = pca.transform(test_data)

pca_time = time.perf_counter() - pca_start
print("PCA fit/transform time: %.3fs" % pca_time)

# Predict labels with kNN
knn_start = time.perf_counter()

clf = kNN.KNNClassifier(6, label_count_)
clf.fit(train_data_t, train_labels)
pred_labels = clf.predict(test_data_t)

knn_time = time.perf_counter() - knn_start
print("kNN fit/predict time: %.3fs" % knn_time)
print("Total time: %.3fs" % (pca_time + knn_time))
# Store true and predicted labels side by side for later analysis.
results = pd.DataFrame(columns=['true_labels', 'pred_labels'])
results['true_labels'] = test_labels
results['pred_labels'] = pred_labels
save_df(results, "kNN_PCA_k6_alpha43")