## Do
- Utilizar el dataset (DONE)
- Reducir dimensionalidad
    - PCA
    - Haar wavelet
- Implementar o **usar librerías** de SVM, KNN y Árboles de Decisión
- Realizar el proceso de entrenamiento mediante K-fold cross validation y Bootstrap para estimar el error
- Valores de Precisión
    - Recall
    - F1-score
    - AUC
- Conclusión

In [41]:
import numpy as np
import pandas as pd

#from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, recall_score, roc_auc_score
from sklearn.model_selection import KFold
from sklearn.utils import resample

# Revisar OPENCV 
# Probar con la data actual sin reducir dimensionalidad

# Seed

In [2]:
from numpy.random import MT19937, RandomState, SeedSequence

# Shared pseudo-random stream for the notebook: a legacy RandomState wrapper
# around an MT19937 bit generator seeded from a fixed SeedSequence.
# Re-run this cell (or rebuild `rs` the same way) to restart the stream.
rs = RandomState(
    MT19937(
        SeedSequence(421413123)
    )
)

## Procesos de Entrenamiento

In [68]:
def show_results(K, f1_arr, r_score_arr, auc_arr, model_name):
    """Print a short text report with the mean of each metric array.

    Parameters
    ----------
    K : value shown after ``AVERAGE k =`` (the number of folds / rounds).
    f1_arr, r_score_arr, auc_arr : objects exposing ``.mean()`` (e.g. NumPy
        arrays) holding the F1, recall and AUC scores to average.
    model_name : label printed in the report header.
    """
    banner = "============================="
    report = (
        banner,
        f"Model: {model_name}",
        banner,
        f"AVERAGE k = {K}",
        f"F1 score: {f1_arr.mean()}",
        f"Recall score: {r_score_arr.mean()}",
        f"AUC: {auc_arr.mean()}",
    )
    for line in report:
        print(line)

In [117]:
def K_fold_cross_validation(X_train, y_train, X_test, y_test ,model, random = None, k = 3, model_name = None):
    """Fit ``model`` on the training part of each of ``k`` folds and average its scores.

    NOTE(review): the held-out part of every fold is *not* used for scoring.
    Each fitted model is evaluated on the fixed ``X_test`` / ``y_test`` pair,
    so the result is an average over ``k`` models trained on overlapping
    subsets rather than a classic cross-validation error estimate.

    Parameters
    ----------
    X_train, y_train : arrays indexable by an integer index array; the data
        the folds are drawn from.
    X_test, y_test : data every fitted model is scored on.
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        It is re-fitted in place on every fold.
    random : int, RandomState instance or None, default None
        Seed for shuffling rows before splitting. ``None`` keeps the previous
        behaviour (consecutive, unshuffled folds).
    k : int, default 3
        Number of folds.
    model_name : unused; kept so existing callers keep working.

    Returns
    -------
    numpy.ndarray of shape (3,)
        ``[mean F1, mean recall, mean one-vs-rest AUC]``. F1 and recall are
        computed per class (``average=None``) and averaged over all classes
        and folds together.
    """
    # random_state is only meaningful for KFold when shuffling is enabled, so
    # shuffling is switched on exactly when a seed is supplied.
    splitter = KFold(n_splits = k, shuffle = random is not None, random_state = random)

    f1_per_fold = []
    recall_per_fold = []
    auc_per_fold = []
    for train_index, _ in splitter.split(X_train):
        model.fit(X_train[train_index], y_train[train_index])
        y_pred = model.predict(X_test)

        f1_per_fold.append(f1_score(y_test, y_pred, average=None))
        recall_per_fold.append(recall_score(y_test, y_pred, average=None))
        auc_per_fold.append(
            roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
        )

    # Flatten the per-class arrays of all folds before averaging (same result
    # as growing one flat array fold by fold).
    empty = np.array([])
    return np.array([
        np.concatenate([empty, *f1_per_fold]).mean(),
        np.concatenate([empty, *recall_per_fold]).mean(),
        np.asarray(auc_per_fold, dtype=float).mean(),
    ])


In [118]:
def bs(n, n_bootstrap= 3, n_train = 0.5, random_state = None):
    """Yield ``n_bootstrap`` bootstrap splits of the indices ``0 .. n-1``.

    Parameters
    ----------
    n : int
        Number of samples to index.
    n_bootstrap : int, default 3
        Number of (train, test) pairs to generate.
    n_train : float, default 0.5
        Fraction of ``n`` drawn (with replacement) for each training sample;
        the sample size is ``int(n * n_train)``.
    random_state : int, numpy RandomState instance or None, default None
        ``None`` draws from NumPy's global legacy stream. An integer seeds one
        generator that is then advanced across all replicates, so a seeded
        run is reproducible *and* the replicates differ from one another
        (previously an integer seed produced the same sample every time).
        A RandomState instance is used, and advanced, as given.

    Yields
    ------
    (train_index, test_index) : tuple of numpy.ndarray
        ``train_index`` holds the sampled indices (duplicates possible);
        ``test_index`` holds the sorted indices that were never drawn.
    """
    sample_size = int(n * n_train)
    all_indices = np.arange(n)

    # Resolve the source of randomness once, outside the loop, so successive
    # replicates continue the same stream instead of restarting it.
    if random_state is None:
        draw = np.random.randint
    elif isinstance(random_state, (int, np.integer)):
        draw = np.random.RandomState(random_state).randint
    else:
        draw = random_state.randint

    for _ in range(n_bootstrap):
        train_index = draw(0, n, size=sample_size)
        # Out-of-bag indices: everything that was not sampled.
        test_index = np.setdiff1d(all_indices, train_index)
        yield train_index, test_index

def Bootstrap(X, y, X_t, y_t,model, random, k, model_name):
    """Fit ``model`` on ``k`` bootstrap samples of ``(X, y)`` and average its scores.

    Each sample is produced by ``bs(len(X), k, random_state=random)``. The
    out-of-bag indices it yields are ignored: every fitted model is scored on
    the fixed ``X_t`` / ``y_t`` pair. ``model_name`` is not used.

    Returns a NumPy array ``[mean F1, mean recall, mean one-vs-rest AUC]``;
    F1 and recall are per-class scores averaged over all classes and rounds.
    """
    per_class_f1 = []
    per_class_recall = []
    auc_values = []

    for sampled_index, _out_of_bag in bs(len(X), k, random_state = random):
        model.fit(X[sampled_index], y[sampled_index])
        predictions = model.predict(X_t)

        per_class_f1.append(f1_score(y_t, predictions, average=None))
        per_class_recall.append(recall_score(y_t, predictions, average=None))
        auc_values.append(
            roc_auc_score(y_t, model.predict_proba(X_t), multi_class='ovr')
        )

    # Flatten everything collected across rounds, then take a single mean per metric.
    empty = np.array([])
    mean_f1 = np.concatenate([empty, *per_class_f1]).mean()
    mean_recall = np.concatenate([empty, *per_class_recall]).mean()
    mean_auc = np.asarray(auc_values, dtype=float).mean()
    return np.array([mean_f1, mean_recall, mean_auc])

## Data

In [74]:
# Locations of the train / test CSV files, relative to the working directory.
train_path, test_path = (
    "dataset/sign_mnist_train.csv",
    "dataset/sign_mnist_test.csv",
)

In [75]:
# Read both CSV splits into DataFrames (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

In [76]:
# Column 0 is taken as the label; every remaining column is a feature.
# TRAIN
y_train, X_train = (
    np.array(df_train.iloc[:, 0]),
    np.array(df_train.iloc[:, 1:]),
)
# TEST
y_test, X_test = (
    np.array(df_test.iloc[:, 0]),
    np.array(df_test.iloc[:, 1:]),
)

## Support Vector Machine

In [106]:
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.svm import SVC

# probability=True is required for predict_proba (used for the AUC). The
# probability estimates involve randomised internal shuffling, so the seed is
# pinned to keep the reported AUC reproducible across runs.
svm = SVC(decision_function_shape='ovr', probability=True, random_state=0)


In [107]:
# Sweep the number of folds over [begin, end) and keep one score vector per k.
begin, end = 3, 13

resultados = {}
for n_folds in range(begin, end):
    resultados[n_folds] = K_fold_cross_validation(
        X_train, y_train, X_test, y_test,
        model=svm, random=None, k=n_folds, model_name="Support Vector Machine",
    )

# One column per k, one row per averaged metric.
svm_kFold = pd.DataFrame(resultados, index=["f1 score", "recall score", "AUC"])
svm_kFold.head()

Unnamed: 0,3
f1 score,0.821017
recall score,0.825507
AUC,0.989594


In [None]:
# Same sweep as the k-fold cell, but with k bootstrap rounds per setting.
# `begin` / `end` come from the previous cell; `rs` is the shared RandomState.
resultados = {}

for n_rounds in range(begin, end):
    resultados[n_rounds] = Bootstrap(
        X_train, y_train, X_test, y_test,
        model=svm, random=rs, k=n_rounds, model_name="Support Vector Machine",
    )

# One column per k, one row per averaged metric.
svm_Bootstrap = pd.DataFrame(resultados, index=["f1 score", "recall score", "AUC"])
svm_Bootstrap.head()

Model: Support Vector Machine
AVERAGE k = 3
F1 score: 0.997177948745308
Recall score: 0.997177948745308
AUC: 1.0


## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# References:
#   https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
#   https://scikit-learn.org/stable/modules/tree.html
# Default hyper-parameters; only the estimator's seed is fixed.
tree = DecisionTreeClassifier(random_state=0)

In [103]:
# Sweep the number of folds over [begin, end). `end` is 13 here so the tree is
# evaluated on the same k range (3..12) as the SVM and KNN sweeps; it was 12,
# which silently dropped k = 12 for this model and for the bootstrap cell that
# reuses these bounds.
begin = 3
end = 13

resultados = {}

for i in range(begin, end, 1):
    resultados[i] = K_fold_cross_validation(X_train, y_train, X_test, y_test, tree, None, k = i, model_name="Decision Tree")

# One column per k, one row per averaged metric.
DTree_kFold = pd.DataFrame(resultados, index=["f1 score", "recall score", "AUC"])
DTree_kFold.head()

Unnamed: 0,3
f1 score,0.423212
recall score,0.431181
AUC,0.703356


In [104]:
# Bootstrap sweep for the decision tree: k rounds per setting, k in [begin, end).
# `begin` / `end` come from the previous cell; `rs` is the shared RandomState.
resultados = {}
for n_rounds in range(begin, end):
    resultados[n_rounds] = Bootstrap(
        X_train, y_train, X_test, y_test,
        model=tree, random=rs, k=n_rounds, model_name="Decision Tree",
    )

# One column per k, one row per averaged metric.
DTree_bootstrap = pd.DataFrame(resultados, index=["f1 score", "recall score", "AUC"])
DTree_bootstrap.head()

Unnamed: 0,3
f1 score,0.384303
recall score,0.389273
AUC,0.681508


## KNN

In [108]:
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier; n_jobs=-1 asks for all available cores.
knn = KNeighborsClassifier(
    n_neighbors=5,
    n_jobs=-1,
)


In [110]:
# Sweep the number of folds over [begin, end) for KNN.
begin, end = 3, 13

resultados = {}
for n_folds in range(begin, end):
    resultados[n_folds] = K_fold_cross_validation(
        X_train, y_train, X_test, y_test,
        model=knn, random=None, k=n_folds, model_name="KNN",
    )

# Transposed for display: one row per k, one column per averaged metric.
knn_kfold = pd.DataFrame(resultados, index=["F1 Score", "Recall Score", "AUC"])
knn_kfold.T.head()


Unnamed: 0,f1 score,recall score,AUC
3,0.783984,0.788955,0.936947
4,0.786921,0.791664,0.935527


In [116]:
# Bootstrap sweep for KNN: k rounds per setting, k in [begin, end).
# `begin` / `end` come from the previous cell; `rs` is the shared RandomState.
resultados = {}
for n_rounds in range(begin, end):
    resultados[n_rounds] = Bootstrap(
        X_train, y_train, X_test, y_test,
        model=knn, random=rs, k=n_rounds, model_name="KNN",
    )

# Transposed for display: one row per k, one column per averaged metric.
knn_bootstrap = pd.DataFrame(resultados, index=["F1 score", "Recall score", "AUC"])
knn_bootstrap.T.head()

Unnamed: 0,f1 score,recall score,AUC
3,0.762687,0.76749,0.93205
4,0.757351,0.761681,0.93246
