#### Configure project

In [None]:
!cd .. && mkdir build
!cd ../build/ && rm -rf *
!rm -f *.so
!cd ../build && cmake -DCMAKE_BUILD_TYPE=Release ..

#### Compile and install

In [None]:
# Build the project and run its `install` target from the build directory.
!cd ../build && make install

In [None]:
import kNN
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.autonotebook import tqdm
from timeit import default_timer as timer
from sklearn.metrics import accuracy_score
import numpy as np
# Fix the seed of NumPy's global RNG so repeated runs see the same numbers.
# NOTE(review): presumably this is what makes the DataFrame.sample() draws
# further down reproducible - confirm.
np.random.seed(1998)

### Util

In [None]:
def data_labels(dataset):
    """Split a labelled DataFrame into a feature matrix and a label column.

    Features are every column except ``"label"``. The label column is
    excluded by name rather than by position, so the split stays correct
    wherever that column sits in the frame.

    Args:
        dataset: pandas DataFrame containing a ``"label"`` column.

    Returns:
        A ``(data, labels)`` tuple of NumPy arrays. ``data`` holds the
        non-label columns, one row per sample; ``labels`` is reshaped to a
        single column, i.e. shape ``(n_samples, 1)``.

    Raises:
        KeyError: if ``dataset`` has no ``"label"`` column.
    """
    data = dataset.drop(columns="label").values
    labels = dataset["label"].values.reshape(-1, 1)
    return data, labels

def save_res(cols, rows, name):
    """Write experiment results to ``res/<name>.csv``.

    The destination directory is created when it does not exist yet, so the
    function also works on a fresh checkout without a ``res`` folder.

    Args:
        cols: column names, one per field of each row.
        rows: result rows, each with as many fields as ``cols``.
        name: output file name without the ``.csv`` extension.
    """
    import os  # local import so the helper does not rely on other cells

    path = "res/{}.csv".format(name)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    res = pd.DataFrame(rows, columns=cols)
    # No index column; the header row carries `cols`.
    res.to_csv(path, index=False, header=True)

def run_KNN(train_data, train_labels, val_data, val_labels, klist, reps):
    """Score the kNN classifier on a validation set for several values of k.

    One ``kNN.KNNClassifier(0, 10)`` is fitted on the training data and then
    reused for every k; only its neighbour count is changed between runs.

    Args:
        train_data: training samples passed to ``fit``.
        train_labels: labels for ``train_data``.
        val_data: samples to classify.
        val_labels: expected labels for ``val_data``.
        klist: iterable of neighbour counts to evaluate.
        reps: how many times each prediction is repeated for timing.

    Returns:
        A list with one ``[k, accuracy, mean_predict_time]`` row per entry of
        ``klist``. The accuracy comes from the last repetition's predictions;
        the time is the average of the ``reps`` timed ``predict`` calls.
    """
    classifier = kNN.KNNClassifier(0, 10)
    classifier.fit(train_data, train_labels)

    rows = []
    for k in klist:
        classifier.setneighbors(k)

        elapsed = 0
        for _ in range(reps):
            # Only the predict call itself is inside the timed window.
            t0 = timer()
            pred_labels = classifier.predict(val_data)
            elapsed += timer() - t0

        accuracy = accuracy_score(val_labels, pred_labels)
        rows.append([k, accuracy, elapsed / reps])

    return rows

def run_PCA_KNN(train, validate, klist, alphalist):
    """Run the kNN k sweep on PCA-transformed data for several alpha values.

    A ``kNN.PCA(50)`` object is created and fitted once on the training
    features. For every alpha it is reconfigured through ``setalpha``, both
    feature sets are transformed, and ``run_KNN`` is run on the result with
    a single repetition.

    Args:
        train: DataFrame with the training samples (split by ``data_labels``).
        validate: DataFrame with the validation samples.
        klist: iterable of neighbour counts forwarded to ``run_KNN``.
        alphalist: iterable of values handed to ``pca.setalpha``
            (NOTE(review): presumably the number of components kept - confirm).

    Returns:
        A list of rows ``[alpha, k, accuracy, knn_time, fit_time,
        transform_train_time, transform_val_time]``, one per (alpha, k) pair.
        ``fit_time`` is the same in every row.
    """
    train_data, train_labels = data_labels(train)
    val_data, val_labels = data_labels(validate)

    # The timed window covers both constructing the PCA object and fitting it.
    fit_started = timer()
    pca = kNN.PCA(50)
    pca.fit(train_data)
    fit_time = timer() - fit_started

    rows = []
    for alpha in alphalist:
        pca.setalpha(alpha)

        # Change of basis for both sets, each timed separately.
        started = timer()
        projected_train = pca.transform(train_data)
        transform_train_time = timer() - started

        started = timer()
        projected_val = pca.transform(val_data)
        transform_val_time = timer() - started

        knn_rows = run_KNN(
            projected_train, train_labels, projected_val, val_labels, klist, 1
        )

        # Prefix each kNN row with alpha and append the PCA timings.
        timings = [fit_time, transform_train_time, transform_val_time]
        rows.extend([alpha] + knn_row + timings for knn_row in knn_rows)

    return rows




# Experiments

In [None]:
# Load the Fashion-MNIST train and test CSV files into DataFrames.
# The cells below sample from these frames and split them with data_labels(),
# which reads a "label" column.
df_train = pd.read_csv("../data/fashion-mnist_train.csv")
df_test = pd.read_csv("../data/fashion-mnist_test.csv")

### k size test

In [None]:
# Work on a random subset of each split: `percent` is the fraction of rows
# kept (row count truncated to an integer).
percent = 0.1
train = df_train.sample(n=int(len(df_train) * percent))
test = df_test.sample(n=int(len(df_test) * percent))

# Separate features from labels; the test subset acts as the validation set.
train_data, train_labels = data_labels(train)
val_data, val_labels = data_labels(test)

In [None]:
# Sweep k = 1..100 with one timed repetition per k and store the rows
# in res/k_size.csv.
cols = ["k", "acc", "time"]
klist = np.arange(1, 101)
results = run_KNN(train_data, train_labels, val_data, val_labels, klist, reps=1)
save_res(cols, results, "k_size")

In [None]:
# Plot the saved k sweep: accuracy against k as a line, then mean prediction
# time against k as a scatter plot.
results = pd.read_csv("res/k_size.csv")
g = sns.lineplot(data=results, x="k", y="acc", linewidth=2, label='accuracy')
# Show the accuracy figure on its own before the timing plot is drawn.
plt.show()

g = sns.scatterplot(data=results, x='k', y='time')

### alpha test

In [None]:
# Draw a fresh random subset of each split for the alpha experiment:
# `percent` is the fraction of rows kept (row count truncated to an integer).
percent = 0.1
train = df_train.sample(n=int(len(df_train) * percent))
test = df_test.sample(n=int(len(df_test) * percent))

In [None]:
# Sweep alpha = 1..100 (wrapped in tqdm for a progress bar) against
# k = 1..100 and store the rows in res/PCA_KNN.csv.
cols = ["alpha","k", "acc", "kNN_time", "fit_time", "transform_train_time", "transform_val_time"]
klist = np.arange(1, 101)
alphalist = tqdm(np.arange(1, 101))
results = run_PCA_KNN(train, test, klist, alphalist)
save_res(cols, results, "PCA_KNN")

In [None]:
# Plot accuracy against alpha from the saved PCA + kNN sweep, with the
# line colour (hue) taken from the "k" column.
df = pd.read_csv("res/PCA_KNN.csv")
g = sns.lineplot(data=df, x="alpha", y="acc", hue="k")