# Compilación
---

In [None]:
!cd .. && mkdir build
!cd ../build/ && rm -rf *
!rm -f *.so
!cd ../build/ && cmake \
  -DPYTHON_EXECUTABLE="$(which python)" \
  -DCMAKE_BUILD_TYPE=Release ..
!cd ../build/ && make install

In [1]:
# Verifico la correcta instalación. Si no falla el import está OK
!pwd
!python --version
import metnum
import numpy as np
from sklearn.model_selection import KFold

# Cuando los splits son muy chicos o el accuracy es muy malo y no aparecen todos digitos se invalida el F-score para esos labels
# Lo siguiente es para evitar el menssudo apt-get install libbz2-devaje de warning al calcular el F-score
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression

import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm
import matplotlib.pyplot as plt


from sklearn.metrics import ConfusionMatrixDisplay
np.random.seed(1998)


/home/eitancho/Escritorio/tp2-nuevo/notebooks
Python 3.8.0


  from .autonotebook import tqdm as notebook_tqdm


# K fold configuration y split simple
---

In [2]:

# Retorna el par data/labels a partir de un dataframe
def Xypair(dataset):
    X = dataset[dataset.columns[1:]].values
    y = dataset["label"].values.reshape(-1,1)
    return (X,y)

# Retorna k splits a partir del dataset (train/test) en pares (data/labels) o (X,y)
def Kfoldconfig(dataset, K):
    kfold = KFold(K, shuffle= True, random_state= 7)
    splits = []
    for train, test in kfold.split(dataset):
        splits.append((Xypair(dataset.iloc[train]), Xypair(dataset.iloc[test])))
    return splits

def singlesplit(dataset, limit):
    train_data = Xypair(dataset[:int(limit)])
    val_data = Xypair(dataset[int(limit):])
    return (train_data, val_data)

# Funciones para correr los experimentos
---

In [3]:


def run_KNN(dataset, K, k):
    # Definimos los splits del dataset
    splits = []
    if K == 1:
        splits.append(singlesplit(dataset, int(dataset.shape[0] * 0.8)))
    else:
        splits = Kfoldconfig(dataset, K)
    # Para cada split del dataset
    m = len(splits)
    acc = []
    f1 = []
    prec = []
    rec = []
    times = []
    confusion = np.zeros((10, 10))
    for split in splits:
        X,y = split[0]
        X_val, y_val = split[1]

        start = time.time()

        # Predecimos con KNN
        clf = metnum.KNNClassifier(k)
        clf.fit(X, y)
        y_pred = clf.predict(X_val)

        end = time.time()

        acc.append(accuracy_score(y_val, y_pred))
        f1.append(f1_score(y_val, y_pred, average="macro", labels=np.unique(y_pred)))
        prec.append(precision_score(y_val, y_pred, average="macro", labels=np.unique(y_pred)))
        rec.append(recall_score(y_val, y_pred, average="macro", labels=np.unique(y_pred)))
        confusion = confusion + confusion_matrix(y_val, y_pred, labels=np.arange(0,10))
        times.append(end - start)

    confusion = (confusion/ K).astype(int)
    return (np.mean(times), np.mean(acc), np.mean(prec), np.mean(rec), np.mean(f1), confusion)

def run_KNN_PCA(dataset, K, k, alpha):
    # Definimos los splits del dataset
    splits = []
    if K == 1:
        splits.append(singlesplit(dataset, int(dataset.shape[0] * 0.8)))
    else:
        splits = Kfoldconfig(dataset, K)
    # Para cada split del dataset
    m = len(splits)
    acc = []
    f1 = []
    prec = []
    rec = []
    confusion = np.zeros((10, 10))
    times = []
    for split in splits:
        X,y = split[0]
        X_val, y_val = split[1]
        
        start = time.time()

        # Generamos un nuevos datasets de dimension alpha de train y de validacion con PCA
        pca = metnum.PCA(alpha)
        pca.fit(X)
        X = pca.transform(X)
        X_val = pca.transform(X_val)
        
        # Predecimos con KNN
        clf = metnum.KNNClassifier(k)
        clf.fit(X, y)
        y_pred = clf.predict(X_val)
        
        end = time.time()

        acc.append(accuracy_score(y_val, y_pred))
        f1.append(f1_score(y_val, y_pred, average="macro", labels=np.unique(y_pred)))
        prec.append(precision_score(y_val, y_pred, average="macro", labels=np.unique(y_pred)))
        rec.append(recall_score(y_val, y_pred, average="macro", labels=np.unique(y_pred)))
        confusion = confusion + confusion_matrix(y_val, y_pred, labels=np.arange(0,10))
        times.append(end - start)

    confusion = (confusion).astype(int)
    return ([np.sum(times),np.std(times)], [np.mean(acc),np.std(acc)], [np.mean(prec),np.std(prec)], [np.mean(rec),np.std(rec)], [np.mean(f1),np.std(f1)], confusion)



# Cargamos los datos


In [4]:
def guardar_resultado(columnas, filas,guardarComo):
    res = pd.DataFrame(filas, columns=columnas)
    res.to_csv(guardarComo.format(filas[0][0]), index=False, header=True)

In [1]:
df_train = pd.read_csv("../data/train.csv.gz")
df_30000 = df_train.sample(30000)
df_20000 = df_train.sample(20000)
df_10000 = df_train.sample(10000)
df_5000 = df_train.sample(5000)
df_1000 = df_train.sample(1000)
df_500 = df_train.sample(500)
df_100 = df_train.sample(100)




NameError: name 'pd' is not defined

# Corremos los experimentos
---

In [6]:

columnas = ["experimento", "metodo", "K-folds", "k", "alpha", "tiempo", "accuracy", "precision", "recall", "F1-score",  "confusion"]

## KNN sin Kfold

In [6]:
filas = []
K = 1
alpha = 36
for k in tqdm(range(1, 50 , 1), desc='k loop'):
    res = run_KNN_PCA(df_train, K, k, alpha)
    filas.append(["Encontramos_Mejor_k_Con_PCA_en_df_Train", "KNN_PCA", K, k, alpha, res[0], res[1], res[2], res[3], res[4]])



k loop: 100%|██████████| 49/49 [35:25<00:00, 43.38s/it]


In [7]:
columnas = ["experimento", "metodo", "K-folds", "k", "alpha", "tiempo", "accuracy", "precision", "recall", "F1-score"]
guardar_resultado(columnas, filas, guardarComo="../results2/KNN/{}.csv")

## KNN con Kfold

In [None]:
filas = []
for K in tqdm(range(2, 15 + 1, 1), desc='Fold low loop '):
    #for k in tqdm(range(1, 25 + 1, 5), desc='k loop'):
    res = run_KNN(df_10000, K, 6)
    filas.append(["KNN_10000_KFold", "KNN", K, 6, 0, res[0], res[1], res[2], res[3], res[4], res[5]])

guardar_resultado(columnas, filas)

## KNN + PCA sin Kfold

In [8]:

filas = []
# Testeamos distintos valores de alpha para los 3 mejores k encontrados
k = 3
K = 1
for alpha in tqdm(range(1, 50, 1), desc='Alpha loop'):
    res = run_KNN_PCA(df_train, K, k, alpha)
    filas.append(["Bucamos_Mejor_alpha_en_df", "KNN_PCA", K, k, alpha, res[0], res[1], res[2], res[3], res[4]])
for alpha in tqdm(range(50, 100, 10), desc='Alpha loop'):
    res = run_KNN_PCA(df_train, K, k, alpha)
    filas.append(["Bucamos_Mejor_alpha_en_df", "KNN_PCA", K, k, alpha, res[0], res[1], res[2], res[3], res[4]])



Alpha loop: 100%|██████████| 49/49 [33:38<00:00, 41.20s/it]
Alpha loop: 100%|██████████| 5/5 [04:27<00:00, 53.44s/it]


In [9]:
columnas = ["experimento", "metodo", "K-folds", "k", "alpha", "tiempo", "accuracy", "precision", "recall", "F1-score"]
guardar_resultado(columnas, filas,guardarComo="../results2/PCA/{}.csv")

## KNN + PCA con Kfold

In [6]:
filas_mean = []
filas_std = []
filas_confusion = []
k = 3
alpha = 36
for K in tqdm(range(1, 20 + 1, 1), desc='k loop'):
    res = run_KNN_PCA(df_train, K, k, alpha)
    filas_mean.append(["Kfold_mean", "KNN_PCA", K, k, alpha, res[0][0], res[1][0], res[2][0], res[3][0], res[4][0]])
    filas_std.append(["Kfold_std", "KNN_PCA", K, k, alpha, res[0][1], res[1][1], res[2][1], res[3][1], res[4][1]])
    filas_confusion.append(res[5])
        


k loop: 100%|██████████| 20/20 [1:26:15<00:00, 258.80s/it]


In [7]:
columnas2 = range(0,10)
filas_confusion = np.array(filas_confusion).reshape(10*20, 10)

In [8]:
guardar_resultado(columnas2, filas_confusion, guardarComo="../results2/K-Fold/Kfold_confusion_dftrain.csv")

In [9]:
columnas = ["experimento", "metodo", "K-folds", "k", "alpha", "tiempo", "accuracy", "precision", "recall", "F1-score"]

guardar_resultado(columnas, filas_mean,guardarComo="../results2/K-Fold/{}.csv")
guardar_resultado(columnas, filas_std,guardarComo="../results2/K-Fold/{}.csv")


## Variamos el tamaño de la base de datos para los mejores parametros encontrados: (k = 3, alpha = 36, K = 1,  20 y 5)

In [12]:
filas_mean = []
filas_std = []
filas_confusion = []
k = 3
alpha = 36
#K = 1
#K = 20
K = 5
bases = {"df_100": df_100, "df_500":df_500, "df_1000":df_1000, "df_5000": df_5000, "df_10000": df_10000, 
        "df_20000": df_20000, "df_30000": df_30000, "df_42000": df_train}
for df in tqdm( bases, desc='Alpha loop'):
    res = run_KNN_PCA(bases[df], K, k, alpha)
    filas_mean.append(["Df_mean", "KNN_PCA", df, K, k, alpha, res[0][0], res[1][0], res[2][0], res[3][0], res[4][0]])
    filas_std.append(["Df_std", "KNN_PCA", df, K, k, alpha, res[0][1], res[1][1], res[2][1], res[3][1], res[4][1]])   
    filas_confusion.append(res[5])


Alpha loop: 100%|██████████| 8/8 [07:24<00:00, 55.60s/it] 


In [14]:
columnas2 = range(0,10)
filas_confusion = np.array(filas_confusion).reshape(10*8, 10)
guardar_resultado(columnas2, filas_confusion, guardarComo="../results2/VariandoDf/Confusion_ConKfold5.csv")

In [15]:
columnas = ["experimento", "metodo", "Dataset", "K-folds", "k", "alpha", "tiempo", "accuracy", "precision", "recall", "F1-score"]

guardar_resultado(columnas, filas_mean,guardarComo="../results2/VariandoDf/MeanConKfold5.csv")
guardar_resultado(columnas, filas_std,guardarComo="../results2/VariandoDf/StdConKfold5.csv")
