# Compilación
---

In [11]:
!cd .. && mkdir build
!cd ../build/ && rm -rf *
!rm -f *.so
!cd ../build && cmake \
  -DPYTHON_EXECUTABLE="$(which python)" \
  -DCMAKE_BUILD_TYPE=Release ..
!cd ../build && make install

mkdir: no se puede crear el directorio «build»: El archivo ya existe
-- The C compiler identification is GNU 9.4.0
-- The CXX compiler identification is GNU 9.4.0
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
Release mode
-- Found PythonInterp: /home/eitancho/.pyenv/shims/python (found version "3.6.5") 
-- Found PythonLibs: /home/eitancho/.pyenv/versions/3.6.5/lib/libpython3.6m.a
-- pybind11 v2.2.4
-- Performing Test HAS_FLTO
-- Performing Test HAS_FLTO - Success
-- LTO enabled
CMAKE_INSTALL_PREFIX=/home/eitancho/Escritorio/tp2_eigen+pybind

In [23]:
# Check that the installation worked: if `import metnum` does not fail, the build is OK.
# The working directory and Python version are printed to help diagnose a failed import.
!pwd
!python --version
import metnum
import pandas as pd
import numpy as np
# Fix the seed of NumPy's global random number generator.
np.random.seed(1998)

/home/eitancho/Escritorio/tp2_eigen+pybind/tp2/notebooks
Python 3.6.5


ModuleNotFoundError: No module named 'metnum'

# K fold configuration y split simple
---

In [None]:
from sklearn.model_selection import KFold
# Returns the (data, labels) pair extracted from a dataframe
def Xypair(dataset):
    """Split a dataframe into a feature matrix and a label column vector.

    Every column except the first one is taken as a feature; the labels are
    read from the column named "label" and reshaped to a single column.

    Returns
    -------
    tuple
        (X, y) where X is the 2-D array of feature values and y has shape (n_rows, 1).
    """
    feature_columns = dataset.columns[1:]
    features = dataset[feature_columns].values
    labels = dataset["label"].values
    return (features, labels.reshape(-1, 1))

# Returns K (train, test) splits of the dataset, each side given as an (X, y) pair
def Kfoldconfig(dataset, K):
    """Build K shuffled cross-validation splits of `dataset`.

    Parameters
    ----------
    dataset : dataframe accepted by Xypair (indexed positionally with .iloc).
    K : number of folds handed to sklearn's KFold.

    Returns
    -------
    list
        K tuples ((X_train, y_train), (X_test, y_test)).
    """
    # Keyword arguments are required here: recent scikit-learn releases make
    # `shuffle` and `random_state` keyword-only, so KFold(K, True, 7) raises a TypeError.
    # The fixed random_state keeps the fold assignment identical between runs.
    kfold = KFold(n_splits=K, shuffle=True, random_state=7)
    return [
        (Xypair(dataset.iloc[train]), Xypair(dataset.iloc[test]))
        for train, test in kfold.split(dataset)
    ]

def singlesplit(dataset, limit):
    """Cut `dataset` in two at row `limit` without shuffling.

    Rows before `limit` (truncated to int) form the training part and the
    remaining rows form the validation part.

    Returns
    -------
    tuple
        ((X_train, y_train), (X_val, y_val)) as produced by Xypair.
    """
    cut = int(limit)
    head, tail = dataset[:cut], dataset[cut:]
    return (Xypair(head), Xypair(tail))

# Funciones para correr los experimentos
---

In [None]:
# When the splits are very small or the accuracy is very poor, not every digit shows up
# and the F-score becomes undefined for those labels.
# The filter below suppresses the warning emitted when the F-score is computed in that case.
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def run_KNN(dataset, K, k):
    """Train and evaluate metnum's KNN classifier on `dataset`.

    Parameters
    ----------
    dataset : dataframe understood by singlesplit / Kfoldconfig.
    K : number of cross-validation folds. K == 0 selects a single split where
        the first 80% of the rows are used for training and the rest for validation.
    k : value passed to metnum.KNNClassifier.

    Returns
    -------
    tuple
        (mean elapsed seconds, mean accuracy, mean F1-score, y_pred), where the
        means are taken over the splits and y_pred is the prediction of the
        LAST split only.
    """
    # Build the train/validation splits
    if K == 0:
        splits = [singlesplit(dataset, int(dataset.shape[0] * 0.8))]
    else:
        splits = Kfoldconfig(dataset, K)

    acc = []
    f1 = []
    times = []
    y_pred = []
    # Evaluate each split independently
    for (X, y), (X_val, y_val) in splits:
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # supported high-resolution timer for measuring elapsed (wall-clock) time.
        start = time.perf_counter()

        # Fit and predict with KNN
        clf = metnum.KNNClassifier(k)
        clf.fit(X, y)
        y_pred = clf.predict(X_val)

        end = time.perf_counter()

        acc.append(accuracy_score(y_val, y_pred))
        # Only the labels that were actually predicted take part in the F-score.
        # NOTE(review): this uses average="macro" while run_KNN_PCA uses
        # average="weighted" — confirm whether the difference is intended.
        f1.append(f1_score(y_val, y_pred, average="macro", labels=np.unique(y_pred)))
        times.append(end - start)

    return (np.mean(times), np.mean(acc), np.mean(f1), y_pred)

def run_KNN_PCA(dataset, K, k, alpha):
    """Train and evaluate metnum's PCA + KNN pipeline on `dataset`.

    Parameters
    ----------
    dataset : dataframe understood by singlesplit / Kfoldconfig.
    K : number of cross-validation folds. K == 0 selects a single split where
        the first 80% of the rows are used for training and the rest for validation.
    k : value passed to metnum.KNNClassifier.
    alpha : value passed to metnum.PCA.

    Returns
    -------
    tuple
        (mean elapsed seconds, mean accuracy, mean F1-score, y_pred), where the
        means are taken over the splits and y_pred is the prediction of the
        LAST split only. The measured time covers the PCA fit/transform as
        well as the KNN fit/predict.
    """
    # Build the train/validation splits
    if K == 0:
        splits = [singlesplit(dataset, int(dataset.shape[0] * 0.8))]
    else:
        splits = Kfoldconfig(dataset, K)

    acc = []
    f1 = []
    times = []
    y_pred = []
    # Evaluate each split independently
    for (X, y), (X_val, y_val) in splits:
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # supported high-resolution timer for measuring elapsed (wall-clock) time.
        start = time.perf_counter()

        # Fit PCA on the training data only, then transform both the training
        # and the validation data with it
        pca = metnum.PCA(alpha)
        pca.fit(X)
        X = pca.transform(X)
        X_val = pca.transform(X_val)

        # Fit and predict with KNN on the reduced data
        clf = metnum.KNNClassifier(k)
        clf.fit(X, y)
        y_pred = clf.predict(X_val)

        end = time.perf_counter()

        acc.append(accuracy_score(y_val, y_pred))
        # Only the labels that were actually predicted take part in the F-score
        f1.append(f1_score(y_val, y_pred, average="weighted", labels=np.unique(y_pred)))
        times.append(end - start)

    return (np.mean(times), np.mean(acc), np.mean(f1), y_pred)



In [None]:
def guardar_resultado(columnas, filas):
    """Save the experiment rows to ../results/<name>.csv.

    Parameters
    ----------
    columnas : column names for the resulting table.
    filas : list of rows; the first field of the first row is used as the
        file name (without extension).

    Raises
    ------
    IndexError
        If `filas` is empty, since the file name cannot be determined.
    """
    import os  # local import: only needed to make sure the output directory exists

    res = pd.DataFrame(filas, columns=columnas)
    destino = "../results/{}.csv".format(filas[0][0])
    # Create the output directory on demand so a finished experiment is not
    # lost just because ../results is missing.
    os.makedirs("../results", exist_ok=True)
    res.to_csv(destino, index=False, header=True)

# Cargamos los datos
---

# Buscamos valores óptimos con un subconjunto de los datos

In [None]:
# Load the full training set and draw random subsets of decreasing size from it.
# No random_state is passed to sample(), so the subsets change when this cell is re-run.
df_train = pd.read_csv("../data/train.csv")
(df_30000, df_20000, df_10000, df_5000,
 df_1000, df_500, df_100) = [
    df_train.sample(size)
    for size in (30000, 20000, 10000, 5000, 1000, 500, 100)
]

# Number of columns of the training set
m = df_train.shape[1]
df_train.shape

# Corremos los experimentos
---

In [None]:
from tqdm.notebook import tqdm
import seaborn as sns
# Column names of the result tables; each experiment row appended to `filas`
# below lists its fields in this order.
columnas = ["experimento", "metodo", "K-folds", "k", "alpha", "tiempo", "accuracy", "F1-score", "prediccion"]

## KNN con Kfold

In [None]:
# KNN on the 1000-sample subset: K-fold counts 2, 4, ..., 10 and neighbours k = 1..50
filas = []
for K in tqdm(range(2, 11, 2), desc='Fold loop'):
    for k in tqdm(range(1, 51), desc='k loop'):
        res = run_KNN(df_1000, K, k)
        # res is (time, accuracy, F1-score, prediction); alpha is 0 because PCA is not used
        filas.append(["KNN_1000_KFold", "KNN", K, k, 0, *res])

guardar_resultado(columnas, filas)

## KNN + PCA sin Kfold

In [None]:
filas = []
# Test different alpha values for the best k found.
# NOTE(review): the original comment mentions the 3 best k, but only k = 12 is run here.
k = 12

# alpha takes the values 1..49 on the 1000-sample subset, single 80/20 split (K = 0)
for alpha in tqdm(range(1, 50), desc='Alpha loop'):
    res = run_KNN_PCA(df_1000, 0, k, alpha)
    # res is (time, accuracy, F1-score, prediction)
    filas.append(["KNN_k:12_PCA_1000", "KNN_PCA", 0, k, alpha, *res])

guardar_resultado(columnas, filas)

## KNN + PCA con Kfold

In [None]:
filas = []
k = 1
Ks = (5, 10)
# KNN + PCA on the 100-sample subset for each fold count in Ks and alpha = 1, 3, ..., 29
for K in tqdm(range(len(Ks)), desc='Fold loop'):
    n_folds = Ks[K]
    for alpha in tqdm(range(1, 31, 2), desc='Alpha loop'):
        res = run_KNN_PCA(df_100, n_folds, k, alpha)
        # NOTE(review): the experiment name says "k:5" while k = 1 is what is
        # actually run and stored in the "k" column — confirm which one is intended.
        filas.append(["KNN_k:5_PCA_KFold", "KNN_PCA", n_folds, k, alpha, *res])

guardar_resultado(columnas, filas)

## Dataset entero k=12 alpha=25

In [49]:
# Single 80/20 split (K = 0) over the whole training set with the chosen k and alpha
k = 12
alpha = 25
res = run_KNN_PCA(df_train, 0, k, alpha)
# res is (time, accuracy, F1-score, prediction)
filas = [["KNN_k:12_PCA_alpha:25", "KNN_PCA", 0, k, alpha, *res]]

guardar_resultado(columnas, filas)