## Do
- Utilizar el dataset (DONE)
- Reducir dimensionalidad
    - PCA
    - Haar wavelet
- Implementar o **usar librerías** de SVM, KNN y Árboles de Decisión
- Realizar el proceso de entrenamiento mediante K-fold cross validation y Bootstrap para estimar el error
- Valores de Precisión
    - Recall
    - F1-Score
    - AUC
- Conclusión

In [3]:
import numpy as np
import pandas as pd

#from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, recall_score, roc_auc_score, precision_score
from sklearn.model_selection import KFold
from sklearn.utils import resample

# Revisar OPENCV 
# Probar con la data actual sin reducir dimensionalidad

# Seed

In [4]:
from numpy.random import MT19937
from numpy.random import RandomState, SeedSequence
# Notebook-wide legacy RandomState backed by a Mersenne Twister bit generator
# with a fixed seed. Because it is a single stateful object, every consumer
# advances the same stream, so results depend on the order cells are run in.
rs = RandomState(MT19937(SeedSequence(421413123)))

# Restart the stream
#rs = RandomState(MT19937(SeedSequence(987654321)))

## Procesos de Entrenamiento

In [5]:
def show_results(K, f1_arr, r_score_arr, auc_arr, model_name):
    """Print a banner naming the model, then the mean of each score array.

    Args:
        K: Value echoed on the ``AVERAGE k = ...`` line (not used in any
            computation).
        f1_arr: Object with a ``.mean()`` method holding the F1 scores.
        r_score_arr: Object with a ``.mean()`` method holding the recall scores.
        auc_arr: Object with a ``.mean()`` method holding the AUC values.
        model_name: Text shown on the ``Model:`` line.
    """
    banner = "============================="
    print(banner, f"Model: {model_name}", banner, f"AVERAGE k = {K}", sep="\n")

    # Each mean is computed right before its own line is printed.
    labelled_scores = (
        ("F1 score", f1_arr),
        ("Recall score", r_score_arr),
        ("AUC", auc_arr),
    )
    for label, scores in labelled_scores:
        print(f"{label}: {scores.mean()}")

In [6]:
def K_fold_cross_validation(X_train, y_train, X_test, y_test, model, random=None, k=3, model_name=None):
    """Refit ``model`` on the training part of every K-fold split and score it on ``X_test``.

    ``X_train`` is split with ``KFold``; for each split the model is fitted on
    the training indices only. Scoring is always done on the fixed
    ``X_test`` / ``y_test`` pair -- the held-out fold of each split is NOT
    used for evaluation.

    Args:
        X_train: Training features; must support indexing with an integer
            index array (e.g. a NumPy array).
        y_train: Training labels, indexable the same way as ``X_train``.
        X_test: Features every fitted model is evaluated on.
        y_test: True labels for ``X_test``.
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        random: Seed or ``RandomState`` for the fold assignment. ``None``
            (the default) keeps the unshuffled, contiguous folds; any other
            value turns shuffling on and is forwarded to ``KFold``.
        k: Number of folds.
        model_name: Unused; kept so existing call sites keep working.

    Returns:
        ``np.ndarray`` with four values, in this order: mean F1, mean recall,
        mean AUC, mean precision. F1 and recall are computed per class
        (``average=None``) and averaged over all classes and folds; precision
        is micro-averaged; AUC is one-vs-rest.
    """
    # random_state only has an effect in KFold when shuffle=True, so shuffling
    # is enabled exactly when the caller supplies a seed.
    folds = KFold(n_splits=k, shuffle=random is not None, random_state=random)

    f1_scores = []
    recall_scores = []
    auc_scores = []
    precision_scores = []
    for train_index, _ in folds.split(X_train):
        model.fit(X_train[train_index], y_train[train_index])
        y_pred = model.predict(X_test)

        precision_scores.append(precision_score(y_test, y_pred, average="micro"))
        f1_scores.append(f1_score(y_test, y_pred, average=None))
        recall_scores.append(recall_score(y_test, y_pred, average=None))
        auc_scores.append(
            roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
        )

    # The per-class arrays are flattened before averaging, so every
    # (fold, class) score carries the same weight.
    return np.array([
        np.concatenate(f1_scores).mean(),
        np.concatenate(recall_scores).mean(),
        np.mean(auc_scores),
        np.mean(precision_scores),
    ])


In [7]:
def bs(n, n_bootstrap= 3, n_train = 0.5, random_state = None):
    """Yield ``n_bootstrap`` pairs of (in-bag, out-of-bag) indices over ``range(n)``.

    Args:
        n: Number of samples; indices are drawn from ``0 .. n-1``.
        n_bootstrap: How many resamples to generate.
        n_train: Fraction of ``n`` drawn for each in-bag sample; the draw size
            is ``int(n * n_train)``.
        random_state: ``None`` (use NumPy's global stream), an integer seed,
            or a ``numpy.random.RandomState`` instance, which is advanced in
            place.

    Yields:
        ``(train_index, test_index)``: ``train_index`` holds the indices drawn
        with replacement (duplicates possible); ``test_index`` holds every
        index of ``range(n)`` that was not drawn, in ascending order.
    """
    n_train = int(n * n_train)
    arr = np.arange(n)

    # Resolve the seed ONCE. Re-seeding from the same integer on every
    # iteration would make all resamples identical.
    if random_state is None:
        rng = np.random  # module-level functions draw from the global stream
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    for _ in range(n_bootstrap):
        # Uniform integer draws are sampling with replacement from range(n).
        train_index = rng.randint(0, n, size=n_train)
        # arr[i] == i, so deleting by position removes exactly the drawn values.
        test_index = np.delete(arr, train_index)
        yield train_index, test_index

def Bootstrap(X, y, X_t, y_t,model, random, k, model_name):
    """Refit ``model`` on ``k`` bootstrap resamples of ``(X, y)`` and score it on ``(X_t, y_t)``.

    The resample indices come from ``bs(len(X), k, random_state=random)``.
    Only the in-bag indices are used (for fitting); the out-of-bag indices are
    ignored and every fitted model is evaluated on the fixed ``X_t`` / ``y_t``.

    Args:
        X: Features to resample from; must support integer-array indexing.
        y: Labels matching ``X``.
        X_t: Features used for evaluation.
        y_t: True labels for ``X_t``.
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        random: Forwarded to ``bs`` as ``random_state``.
        k: Number of bootstrap resamples.
        model_name: Unused.

    Returns:
        ``np.ndarray`` of four values: mean F1, mean recall, mean AUC and mean
        precision, in that order. F1 and recall are per-class scores
        (``average=None``) pooled over all resamples before averaging;
        precision is micro-averaged; AUC is one-vs-rest.
    """
    per_class_f1 = []
    per_class_recall = []
    auc_values = []
    precision_values = []

    for in_bag, _out_of_bag in bs(len(X), k, random_state = random):
        model.fit(X[in_bag], y[in_bag])
        predictions = model.predict(X_t)

        precision_values.append(precision_score(y_t, predictions, average="micro"))
        per_class_f1.append(f1_score(y_t, predictions, average=None))
        per_class_recall.append(recall_score(y_t, predictions, average =None))
        auc_values.append(
            roc_auc_score(y_t, model.predict_proba(X_t), multi_class='ovr')
        )

    def _pooled_mean(chunks):
        # Leading empty float array: the result is float64 and an empty
        # collection still averages to nan instead of raising.
        pieces = [np.empty(0)]
        pieces.extend(np.atleast_1d(chunk) for chunk in chunks)
        return np.concatenate(pieces).mean()

    return np.array([
        _pooled_mean(per_class_f1),
        _pooled_mean(per_class_recall),
        _pooled_mean(auc_values),
        _pooled_mean(precision_values),
    ])

## Data

In [8]:
# Input CSV locations. The commented-out pairs are alternative inputs kept for
# quick switching; only the last, uncommented pair is active.
# (presumably the "N_by_N" files are down-scaled versions of the original
# dataset -- confirm against the script that produced them)
#train_path = "dataset/sign_mnist_train.csv"
#test_path = "dataset/sign_mnist_test.csv"
#train_path = "data_redimensionada/train_14_by_14.csv"
#test_path = "data_redimensionada/test_14_by_14.csv"
train_path = "data_redimensionada/train_7_by_7.csv"
test_path = "data_redimensionada/test_7_by_7.csv"

In [9]:
# Load the train and test CSV files into DataFrames.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [10]:
# Column 0 of each frame is taken as the target and all remaining columns as
# the features; both are converted to NumPy arrays so they can be indexed
# with integer index arrays.
# TRAIN
y_train = np.array(df_train.iloc[:,0])
X_train = np.array(df_train.iloc[:, 1:])
# TEST
y_test = np.array(df_test.iloc[:, 0])
X_test = np.array(df_test.iloc[:, 1:])

In [11]:
X_test.shape

(7172, 49)

In [12]:
X_train.shape

(27455, 49)

## Support Vector Machine

In [32]:
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.svm import SVC
# probability=True is needed because the evaluation helpers call
# predict_proba to compute the AUC.
# NOTE(review): max_iter=1000 caps the solver iterations, so the fit may stop
# before converging -- confirm this cap is intentional.
svm = SVC(decision_function_shape='ovr', probability=True, max_iter=1000, cache_size=600)

In [37]:
# Run the K-fold routine once for every number of folds in [begin, end).
begin = 3
end = 10

# Maps number of folds -> array of the four averaged metrics.
resultados = {}

for i in range(begin, end, 1):
    resultados[i] = K_fold_cross_validation(X_train, y_train,X_test, y_test, svm, None, k = i, model_name="Support Vector Machine")

# One column per fold count, one row per metric (the index order must match
# the order of the array returned by K_fold_cross_validation).
svm_kFold = pd.DataFrame(resultados, index=["f1 score", "recall score", "AUC", "Precision"])



Unnamed: 0,f1 score,recall score,AUC,Precision
3,0.834617,0.838878,0.991171,0.850669
4,0.833107,0.837343,0.991308,0.849449
5,0.834429,0.838909,0.991389,0.850418
6,0.834785,0.839375,0.991426,0.850623
7,0.835029,0.839753,0.99153,0.850769


In [38]:
svm_kFold.T

Unnamed: 0,f1 score,recall score,AUC,Precision
3,0.834617,0.838878,0.991171,0.850669
4,0.833107,0.837343,0.991308,0.849449
5,0.834429,0.838909,0.991389,0.850418
6,0.834785,0.839375,0.991426,0.850623
7,0.835029,0.839753,0.99153,0.850769
8,0.83535,0.840036,0.991527,0.851053
9,0.835308,0.840005,0.991512,0.851057


In [10]:
# Run the bootstrap routine once for every number of resamples in [begin, end).
begin = 3
end = 10
# Maps number of resamples -> array of the four averaged metrics.
resultados = {}

# `rs` is a stateful RandomState shared by all calls, so each call continues
# the same random stream rather than restarting it.
for i in range(begin, end, 1):
    resultados[i] = Bootstrap(X_train, y_train,X_test, y_test, svm, rs, k = i, model_name="Support Vector Machine")

# One column per resample count, one row per metric (the index order must
# match the order of the array returned by Bootstrap).
svm_Bootstrap = pd.DataFrame(resultados, index=["f1 score", "recall score", "AUC", "Precision"])



Unnamed: 0,f1 score,recall score,AUC,Precision
3,0.823022,0.825913,0.990352,0.84063
4,0.828541,0.832018,0.990197,0.845475
5,0.82586,0.829837,0.990322,0.84169
6,0.829475,0.833252,0.990525,0.845603
7,0.827032,0.831498,0.990148,0.843379


In [11]:
svm_Bootstrap.T

Unnamed: 0,f1 score,recall score,AUC,Precision
3,0.823022,0.825913,0.990352,0.84063
4,0.828541,0.832018,0.990197,0.845475
5,0.82586,0.829837,0.990322,0.84169
6,0.829475,0.833252,0.990525,0.845603
7,0.827032,0.831498,0.990148,0.843379
8,0.826048,0.829917,0.990184,0.842757
9,0.828588,0.833001,0.989983,0.84472


## Decision Tree

In [13]:
from sklearn.tree import DecisionTreeClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# https://scikit-learn.org/stable/modules/tree.html
# Entropy-based splits, at least 4 samples per leaf, and a fixed random_state
# for the estimator.
tree = DecisionTreeClassifier(random_state = 0, min_samples_leaf=4, criterion="entropy")

In [14]:
# Run the K-fold routine once for every number of folds in [begin, end).
begin = 3
end = 13

# Maps number of folds -> array of the four averaged metrics.
resultados = {}

for i in range(begin, end, 1):
    resultados[i] = K_fold_cross_validation(X_train, y_train, X_test, y_test, tree, None, k = i, model_name="Decision Tree")

# One column per fold count, one row per metric (the index order must match
# the order of the array returned by K_fold_cross_validation).
DTree_kFold = pd.DataFrame(resultados, index=["f1 score", "recall score", "AUC", "Precision"])

In [15]:
DTree_kFold.T

Unnamed: 0,f1 score,recall score,AUC,Precision
3,0.478919,0.481665,0.763003,0.494888
4,0.484196,0.488581,0.764614,0.499407
5,0.490898,0.495188,0.768068,0.504071
6,0.487125,0.49223,0.767021,0.503765
7,0.486499,0.491956,0.765451,0.501992
8,0.484398,0.48902,0.764413,0.499041
9,0.490412,0.494469,0.76758,0.505345
10,0.493607,0.497358,0.769829,0.50969
11,0.487324,0.490929,0.766609,0.503042
12,0.493692,0.498193,0.771954,0.509853


In [16]:
# Bootstrap sweep for the decision tree; `begin` and `end` are not set here
# and are read from whichever cell assigned them last.
resultados = {}
for i in range(begin, end, 1):
    resultados[i] = Bootstrap(X_train, y_train, X_test, y_test, tree, rs, k = i, model_name="Decision Tree")

# One column per resample count, one row per metric (the index order must
# match the order of the array returned by Bootstrap).
DTree_bootstrap = pd.DataFrame(resultados, index=["f1 score", "recall score", "AUC", "Precision"])

In [17]:
DTree_bootstrap.T

Unnamed: 0,f1 score,recall score,AUC,Precision
3,0.44984,0.454619,0.749312,0.46333
4,0.444518,0.449224,0.742573,0.460158
5,0.444395,0.446648,0.744406,0.457585
6,0.446448,0.451424,0.747408,0.460866
7,0.43858,0.442476,0.743101,0.453071
8,0.439781,0.444775,0.741214,0.454981
9,0.444004,0.449587,0.744085,0.458759
10,0.443588,0.446508,0.744384,0.459927
11,0.443882,0.449239,0.745003,0.458373
12,0.440317,0.445011,0.745806,0.454313


## KNN

In [21]:
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
from sklearn.neighbors import KNeighborsClassifier
# 5 nearest neighbours; n_jobs=-1 asks scikit-learn to use all available
# processors for the neighbour search.
knn = KNeighborsClassifier(n_neighbors = 5, n_jobs = -1)

In [22]:
# Run the K-fold routine once for every number of folds in [begin, end).
begin = 3
end = 13

# Maps number of folds -> array of the four averaged metrics.
resultados = {}
for i in range(begin, end, 1):
    resultados[i] = K_fold_cross_validation(X_train, y_train, X_test, y_test, knn, None, k = i, model_name="KNN")
# NOTE(review): these row labels are capitalised differently from the ones
# used for the SVM and Decision Tree frames ("f1 score", "recall score");
# unify them before concatenating or comparing the frames by label.
knn_kfold = pd.DataFrame(resultados, index=["F1 Score", "Recall Score", "AUC", "Precision"])


In [23]:
knn_kfold.T

Unnamed: 0,F1 Score,Recall Score,AUC,Precision
3,0.767186,0.769752,0.93658,0.775795
4,0.772131,0.774812,0.93492,0.780954
5,0.77571,0.778311,0.934106,0.784189
6,0.777652,0.780444,0.933875,0.786159
7,0.778988,0.781877,0.933908,0.787407
8,0.779694,0.782596,0.933714,0.788117
9,0.780557,0.783557,0.93366,0.788932
10,0.781168,0.784149,0.933558,0.789557
11,0.781446,0.784475,0.933554,0.789789
12,0.781543,0.784557,0.933467,0.789866


In [24]:
# Bootstrap sweep for KNN; `begin` and `end` are not set here and are read
# from whichever cell assigned them last.
resultados = {}
for i in range(begin, end, 1):
    resultados[i]= Bootstrap(X_train, y_train, X_test, y_test, knn, rs, k = i, model_name="KNN")
# NOTE(review): these row labels are capitalised differently from the ones
# used for the SVM and Decision Tree frames ("f1 score", "recall score");
# unify them before concatenating or comparing the frames by label.
knn_bootstrap = pd.DataFrame(resultados, index=["F1 score", "Recall score", "AUC", "Precision"])

In [25]:
knn_bootstrap.T

Unnamed: 0,F1 score,Recall score,AUC,Precision
3,0.734199,0.737239,0.931671,0.742843
4,0.734903,0.73592,0.932603,0.74505
5,0.732846,0.735465,0.931751,0.741634
6,0.731203,0.733379,0.929177,0.738892
7,0.733157,0.734769,0.93044,0.742511
8,0.733793,0.735182,0.928761,0.743516
9,0.730106,0.731915,0.929395,0.739729
10,0.73468,0.736764,0.9301,0.744325
11,0.732135,0.733674,0.927877,0.741736
12,0.731386,0.733142,0.929423,0.740867
