## Do
- Utilizar el dataset (DONE)
- Reducir dimensionalidad
    - PCA
    - Haar wavelet
- Implementar o **usar librerías** de SVM, KNN y Árboles de Decisión
- Realizar el proceso de entrenamiento mediante K-fold cross validation y Bootstrap para estimar el error
- Valores de Precisión
    - Recall
    - F1 - Score
    - AUC
- Conclusión

In [34]:
import numpy as np
import pandas as pd

#from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, recall_score, roc_auc_score, precision_score
from sklearn.model_selection import KFold
from sklearn.utils import resample

# Revisar OPENCV 
# Probar con la data actual sin reducir dimensionalidad

# Seed

In [2]:
# Shared random stream for the notebook: a RandomState wrapping an MT19937
# bit generator that is initialised from a fixed SeedSequence.
from numpy.random import MT19937, RandomState, SeedSequence

rs = RandomState(MT19937(SeedSequence(421413123)))

# Restart the stream
#rs = RandomState(MT19937(SeedSequence(987654321)))

## Procesos de Entrenamiento

In [3]:
def show_results(K, f1_arr, r_score_arr, auc_arr, model_name):
    """Print a short report with the mean F1, recall and AUC of one model.

    Args:
        K: Value shown on the "AVERAGE k" line, as supplied by the caller.
        f1_arr: Object exposing ``.mean()`` that holds the F1 scores.
        r_score_arr: Object exposing ``.mean()`` that holds the recall scores.
        auc_arr: Object exposing ``.mean()`` that holds the AUC values.
        model_name: Label printed in the report header.

    Returns:
        None. The report is written to standard output.
    """
    banner = "============================="
    for header_line in (banner, f"Model: {model_name}", banner, f"AVERAGE k = {K}"):
        print(header_line)

    # Each metric line has the form "<label>: <mean of its score array>".
    metrics = (
        ("F1 score", f1_arr),
        ("Recall score", r_score_arr),
        ("AUC", auc_arr),
    )
    for label, scores in metrics:
        print(f"{label}: {scores.mean()}")

In [36]:
def K_fold_cross_validation(X_train, y_train, X_test, y_test, model, random=None, k=3, model_name=None):
    """Refit ``model`` on every K-fold training split and score it on a fixed test set.

    The training data is partitioned into ``k`` folds. For each fold the model
    is fitted on the remaining ``k - 1`` folds and evaluated on
    ``X_test``/``y_test``. The fold left out of training is NOT used for
    scoring; all metrics refer to the externally supplied test set.

    Args:
        X_train: Training features, indexable with an integer index array.
        y_train: Training labels, indexable with an integer index array.
        X_test: Features every fitted model is evaluated on.
        y_test: True labels for ``X_test``.
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        random: Seed (int or ``RandomState``) used to shuffle the samples
            before splitting. ``None`` (default) keeps the folds unshuffled
            and contiguous.
        k: Number of folds.
        model_name: Unused; accepted so existing callers keep working.

    Returns:
        ``np.ndarray`` of four floats, in this order: mean F1, mean recall,
        mean one-vs-rest ROC AUC, mean micro-averaged precision. F1 and recall
        are averaged over the per-class scores of all folds.
    """
    # KFold only accepts a random_state together with shuffle=True, so
    # shuffling is switched on exactly when the caller provides a seed.
    splitter = KFold(n_splits=k, shuffle=random is not None, random_state=random)

    f1_scores = []
    recall_scores = []
    auc_scores = []
    precision_scores = []
    for train_index, _ in splitter.split(X_train):
        model.fit(X_train[train_index], y_train[train_index])
        y_pred = model.predict(X_test)

        precision_scores.append(precision_score(y_test, y_pred, average="micro"))
        # average=None yields one score per class; keep them all so the final
        # mean runs over every (fold, class) pair.
        f1_scores.extend(f1_score(y_test, y_pred, average=None))
        recall_scores.extend(recall_score(y_test, y_pred, average=None))
        auc_scores.append(
            roc_auc_score(y_test, model.predict_proba(X_test), multi_class="ovr")
        )

    return np.array([
        np.mean(f1_scores),
        np.mean(recall_scores),
        np.mean(auc_scores),
        np.mean(precision_scores),
    ])


In [37]:
def bs(n, n_bootstrap=3, n_train=0.5, random_state=None):
    """Yield bootstrap train / out-of-bag index splits over ``range(n)``.

    Args:
        n: Number of available samples; indices are drawn from ``[0, n)``.
        n_bootstrap: Number of splits to generate.
        n_train: Fraction of ``n`` drawn (with replacement) for each training
            sample; the draw size is ``int(n * n_train)``.
        random_state: Source of randomness.
            ``None`` uses NumPy's global random state; an int seeds a
            generator that is created once, so successive rounds draw
            different samples while the whole sequence stays reproducible;
            any other object is used as-is and must expose ``randint``
            (e.g. ``numpy.random.RandomState``).

    Yields:
        ``(train_index, test_index)`` tuples of integer arrays.
        ``train_index`` may contain repeats; ``test_index`` holds every index
        that was not drawn into ``train_index``.
    """
    n_train = int(n * n_train)

    # Resolve the generator ONCE. Re-seeding from the same int on every round
    # would make all bootstrap replicates identical.
    if random_state is None:
        rng = np.random  # module-level functions use the global RandomState
    elif isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    all_indices = np.arange(n)
    for _ in range(n_bootstrap):
        train_index = rng.randint(0, n, size=n_train)
        # all_indices[i] == i, so deleting the drawn positions leaves exactly
        # the out-of-bag indices (duplicates in train_index are harmless).
        test_index = np.delete(all_indices, train_index)
        yield train_index, test_index

def Bootstrap(X, y, X_t, y_t, model, random=None, k=3, model_name=None):
    """Refit ``model`` on bootstrap samples of the training data and score it on a fixed test set.

    For each of ``k`` rounds a training sample is drawn through ``bs`` and the
    model is fitted on it and evaluated on ``X_t``/``y_t``. The out-of-bag
    indices produced by ``bs`` are not used for scoring.

    Args:
        X: Training features, indexable with an integer index array.
        y: Training labels, indexable with an integer index array.
        X_t: Features every fitted model is evaluated on.
        y_t: True labels for ``X_t``.
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        random: Forwarded to ``bs`` as ``random_state``.
        k: Number of bootstrap rounds (forwarded to ``bs`` as ``n_bootstrap``).
        model_name: Unused; accepted so existing callers keep working.

    Returns:
        ``np.ndarray`` of four floats, in this order: mean F1, mean recall,
        mean one-vs-rest ROC AUC, mean micro-averaged precision. F1 and recall
        are averaged over the per-class scores of all rounds.
    """
    f1_scores = []
    recall_scores = []
    auc_scores = []
    precision_scores = []
    for train_index, _ in bs(len(X), k, random_state=random):
        model.fit(X[train_index], y[train_index])
        y_pred = model.predict(X_t)

        precision_scores.append(precision_score(y_t, y_pred, average="micro"))
        # average=None yields one score per class; keep them all so the final
        # mean runs over every (round, class) pair.
        f1_scores.extend(f1_score(y_t, y_pred, average=None))
        recall_scores.extend(recall_score(y_t, y_pred, average=None))
        auc_scores.append(
            roc_auc_score(y_t, model.predict_proba(X_t), multi_class="ovr")
        )

    return np.array([
        np.mean(f1_scores),
        np.mean(recall_scores),
        np.mean(auc_scores),
        np.mean(precision_scores),
    ])

## Data

In [6]:
# Locations of the Sign-MNIST train/test CSV files (relative paths).
train_path, test_path = (
    "dataset/sign_mnist_train.csv",
    "dataset/sign_mnist_test.csv",
)

In [7]:
# Read the train and test CSV files into DataFrames, in that order.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [8]:
# TRAIN
y_train = np.array(df_train.iloc[:,0])
X_train = np.array(df_train.iloc[:, 1:])
# TEST
y_test = np.array(df_test.iloc[:, 0])
X_test = np.array(df_test.iloc[:, 1:])

## Support Vector Machine

In [19]:
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.svm import SVC

# probability=True is required because the evaluation code calls
# predict_proba to compute the AUC. The probability estimates rely on an
# internal randomised procedure, so random_state is pinned to keep the
# reported scores repeatable between runs.
svm = SVC(
    decision_function_shape='ovr',
    probability=True,
    max_iter=1000,
    cache_size=600,
    random_state=0,
)

In [20]:
# Sweep the number of folds over [begin, end) and keep the averaged metrics
# returned for each setting, keyed by the number of folds.
begin, end = 3, 10

resultados = {
    n_folds: K_fold_cross_validation(
        X_train, y_train, X_test, y_test, svm, None,
        k=n_folds, model_name="Support Vector Machine",
    )
    for n_folds in range(begin, end)
}

# Metrics as rows, fold counts as columns; the transposed head is displayed.
svm_kFold = pd.DataFrame(resultados, index=["f1 score", "recall score", "AUC", "Precision"])
svm_kFold.T.head()

Unnamed: 0,f1 score,recall score,AUC
3,0.821017,0.825507,0.989678
4,0.82323,0.827853,0.98984
5,0.821848,0.826246,0.990007
6,0.823292,0.827778,0.990026
7,0.822957,0.827441,0.990052


In [23]:
# Display the full transposed SVM K-fold table.
svm_kFold.T

Unnamed: 0,f1 score,recall score,AUC
3,0.821017,0.825507,0.989678
4,0.82323,0.827853,0.98984
5,0.821848,0.826246,0.990007
6,0.823292,0.827778,0.990026
7,0.822957,0.827441,0.990052
8,0.823866,0.828468,0.990057
9,0.823123,0.827744,0.990065


In [21]:
# Sweep the number of bootstrap rounds over [begin, end), drawing from the
# shared random stream `rs`, and keep the averaged metrics for each setting.
resultados = {
    n_rounds: Bootstrap(
        X_train, y_train, X_test, y_test, svm, rs,
        k=n_rounds, model_name="Support Vector Machine",
    )
    for n_rounds in range(begin, end)
}

# Metrics as rows, round counts as columns; the transposed head is displayed.
svm_Bootstrap = pd.DataFrame(resultados, index=["f1 score", "recall score", "AUC", "Precision"])
svm_Bootstrap.T.head()

Unnamed: 0,f1 score,recall score,AUC
3,0.817613,0.821844,0.988514
4,0.817299,0.821981,0.989045
5,0.815042,0.820263,0.988993
6,0.81463,0.819086,0.988728
7,0.814619,0.819422,0.988664


In [24]:
# Display the full transposed SVM bootstrap table.
svm_Bootstrap.T

Unnamed: 0,f1 score,recall score,AUC
3,0.817613,0.821844,0.988514
4,0.817299,0.821981,0.989045
5,0.815042,0.820263,0.988993
6,0.81463,0.819086,0.988728
7,0.814619,0.819422,0.988664
8,0.814766,0.818628,0.988839
9,0.813735,0.818376,0.988729


## Decision Tree

In [12]:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# https://scikit-learn.org/stable/modules/tree.html
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed random_state; every other setting is left at
# the library default.
tree = DecisionTreeClassifier(
    random_state=0,
)

In [13]:
# Sweep the number of folds over [begin, end) and keep the averaged metrics
# returned for each setting, keyed by the number of folds.
begin, end = 3, 12

resultados = {
    n_folds: K_fold_cross_validation(
        X_train, y_train, X_test, y_test, tree, None,
        k=n_folds, model_name="Decision Tree",
    )
    for n_folds in range(begin, end)
}

# Metrics as rows, fold counts as columns; the transposed head is displayed.
DTree_kFold = pd.DataFrame(resultados, index=["f1 score", "recall score", "AUC", "Precision"])
DTree_kFold.T.head()

Unnamed: 0,f1 score,recall score,AUC
3,0.423212,0.431181,0.703356
4,0.420637,0.430606,0.702984
5,0.4066,0.41217,0.693506
6,0.422299,0.43023,0.702879
7,0.418374,0.426561,0.701016


In [25]:
# Display the full transposed decision-tree K-fold table.
DTree_kFold.T

Unnamed: 0,f1 score,recall score,AUC
3,0.423212,0.431181,0.703356
4,0.420637,0.430606,0.702984
5,0.4066,0.41217,0.693506
6,0.422299,0.43023,0.702879
7,0.418374,0.426561,0.701016
8,0.421341,0.428036,0.701769
9,0.418872,0.426671,0.701057
10,0.419613,0.428622,0.702029
11,0.417927,0.426318,0.700882


In [18]:
# Sweep the number of bootstrap rounds over [begin, end), drawing from the
# shared random stream `rs`, and keep the averaged metrics for each setting.
resultados = {
    n_rounds: Bootstrap(
        X_train, y_train, X_test, y_test, tree, rs,
        k=n_rounds, model_name="Decision Tree",
    )
    for n_rounds in range(begin, end)
}

# Metrics as rows, round counts as columns; the transposed head is displayed.
DTree_bootstrap = pd.DataFrame(resultados, index=["f1 score", "recall score", "AUC", "Precision"])
DTree_bootstrap.T.head()

Unnamed: 0,f1 score,recall score,AUC
3,0.410935,0.417398,0.696182
4,0.39297,0.400266,0.6872
5,0.400341,0.406529,0.690472
6,0.39173,0.400051,0.687086
7,0.405064,0.412825,0.693766


In [26]:
# Display the full transposed decision-tree bootstrap table.
DTree_bootstrap.T

Unnamed: 0,f1 score,recall score,AUC
3,0.410935,0.417398,0.696182
4,0.39297,0.400266,0.6872
5,0.400341,0.406529,0.690472
6,0.39173,0.400051,0.687086
7,0.405064,0.412825,0.693766
8,0.402369,0.410859,0.692758
9,0.409572,0.417799,0.696345
10,0.400116,0.408281,0.691355
11,0.398108,0.404577,0.68949


## KNN

In [9]:
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over 5 neighbours; n_jobs=-1
# requests all available processors for the neighbour search.
knn = KNeighborsClassifier(
    n_neighbors=5,
    n_jobs=-1,
)

In [10]:
# Sweep the number of folds over [begin, end) and keep the averaged metrics
# returned for each setting, keyed by the number of folds.
begin, end = 3, 13

resultados = {
    n_folds: K_fold_cross_validation(
        X_train, y_train, X_test, y_test, knn, None,
        k=n_folds, model_name="KNN",
    )
    for n_folds in range(begin, end)
}

# Metrics as rows, fold counts as columns; the transposed head is displayed.
knn_kfold = pd.DataFrame(resultados, index=["F1 Score", "Recall Score", "AUC", "Precision"])
knn_kfold.T.head()


Unnamed: 0,F1 Score,Recall Score,AUC
3,0.783984,0.788955,0.936947
4,0.786921,0.791664,0.935527
5,0.788452,0.793029,0.935876
6,0.790564,0.795229,0.935228
7,0.790697,0.795248,0.935199


In [27]:
# Display the full transposed KNN K-fold table.
knn_kfold.T

Unnamed: 0,F1 Score,Recall Score,AUC
3,0.783984,0.788955,0.936947
4,0.786921,0.791664,0.935527
5,0.788452,0.793029,0.935876
6,0.790564,0.795229,0.935228
7,0.790697,0.795248,0.935199
8,0.791836,0.796439,0.935037
9,0.792157,0.796802,0.934828
10,0.792429,0.797005,0.934799
11,0.792632,0.797263,0.93465
12,0.793168,0.797772,0.93456


In [38]:
# Sweep the number of bootstrap rounds over [begin, end), drawing from the
# shared random stream `rs`, and keep the averaged metrics for each setting.
resultados = {
    n_rounds: Bootstrap(
        X_train, y_train, X_test, y_test, knn, rs,
        k=n_rounds, model_name="KNN",
    )
    for n_rounds in range(begin, end)
}

# Metrics as rows, round counts as columns; the transposed head is displayed.
knn_bootstrap = pd.DataFrame(resultados, index=["F1 score", "Recall score", "AUC", "Precision"])
knn_bootstrap.T.head()

In [None]:
# Display the full transposed KNN bootstrap table.
knn_bootstrap.T

Unnamed: 0,F1 score,Recall score,AUC
3,0.752481,0.756043,0.930082
4,0.758741,0.762568,0.931098
5,0.753143,0.758396,0.932157
6,0.756431,0.760524,0.930713
7,0.754312,0.758804,0.931975
8,0.75723,0.760927,0.931749
9,0.758195,0.762814,0.932633
10,0.761261,0.766602,0.933813
11,0.760211,0.765654,0.932599
12,0.75633,0.761307,0.930773
