In [23]:
import numpy as np
import pandas as pd

#from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, recall_score, roc_auc_score, precision_score
from sklearn.model_selection import KFold
from sklearn.utils import resample

# Seed
Se establece un semilla random para replicar el experimento.

In [24]:
from numpy.random import MT19937
from numpy.random import RandomState, SeedSequence
rs = RandomState(MT19937(SeedSequence(421413123)))

## Procesos de Entrenamiento

### K- Fold

In [25]:
def K_fold_cross_validation(X_train, y_train, X_test, y_test ,model, random = None, k = 3, model_name = None):
    KFOLD = KFold(n_splits = k, random_state=None)
    f1_arr = np.array([])
    r_score_arr = np.array([])
    auc_arr = np.array([])
    precision_arr = np.array([])
    for train_index, test_index in KFOLD.split(X_train):
        train_X, train_y = X_train[train_index], y_train[train_index]
                
        model.fit(train_X, train_y)
        y_pred = model.predict(X_test)

        # Precision score
        precision = precision_score(y_test, y_pred, average="micro")
        precision_arr = np.append(precision_arr, precision)

        # F1 SCORE
        f1 = f1_score(y_test, y_pred, average=None)
        f1_arr = np.append(f1_arr, [f1])

        # RECALL SCORE
        r_score = recall_score(y_test, y_pred, average = None)
        r_score_arr = np.append(r_score_arr, [r_score])

        # AUC
        AUC = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
        auc_arr = np.append(auc_arr, [AUC])
    
    return np.array([f1_arr.mean(), r_score_arr.mean(), auc_arr.mean(), precision_arr.mean()])


### Bootstrap

In [26]:
def bs(n, n_bootstrap= 3, n_train = 0.5, random_state = None):
    n_train = int(n * n_train)
    arr = np.array([i for i in range(n)])
    for i in range(n_bootstrap):
        train_index = resample(arr, n_samples = n_train, replace = True, random_state = random_state) 
        test_index = np.delete(arr, train_index)
        yield train_index, test_index

def Bootstrap(X, y, X_t, y_t,model, random, k, model_name):
    f1_arr = np.array([])
    r_score_arr = np.array([])
    auc_arr = np.array([])
    precision_arr = np.array([])
    for train_index, test_index in bs(len(X), k, random_state = random):
        X_train, y_train = X[train_index], y[train_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_t)
        # Precision score
        precision = precision_score(y_t, y_pred, average="macro")
        precision_arr = np.append(precision_arr, precision)
        # F1 SCORE
        f1 = f1_score(y_t, y_pred, average=None)
        f1_arr = np.append(f1_arr, [f1])
        # RECALL SCORE
        r_score = recall_score(y_t, y_pred, average =None)
        r_score_arr = np.append(r_score_arr, [r_score])
        # AUC
        AUC = roc_auc_score(y_t, model.predict_proba(X_t), multi_class='ovr')
        auc_arr = np.append(auc_arr, [AUC])
        #auc_arr = np.append(auc_arr, AUC)
    return np.array([f1_arr.mean(), r_score_arr.mean(), auc_arr.mean(), precision_arr.mean()])

## Data
El dataset de entrenamiento contiene 27456 observaciones y 785 features, y el dataset de testeo contiene 7173 y 785 features. Además, se realizo cortes a través de **Haar** para reducir la dimensionalidad de los datos originales. Se realizó 1 y 2 cortes a la data original y la información fue almacenada los siguientes csv's: 
- train_14_by_14 (1 corte)
- test_14_by_14 (1 corte)
- train_7_by_7 (2 cortes)
- test_7_by_7 (2 cortes)

In [38]:
#train_path = "dataset/sign_mnist_train.csv"
#test_path = "dataset/sign_mnist_test.csv"
#train_path = "data_redimensionada/train_14_by_14.csv"
#test_path = "data_redimensionada/test_14_by_14.csv"
train_path = "data_redimensionada/train_7_by_7.csv"
test_path = "data_redimensionada/test_7_by_7.csv"

In [31]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [32]:
# TRAIN
y_train = np.array(df_train.iloc[:,0])
X_train = np.array(df_train.iloc[:, 1:])
# TEST
y_test = np.array(df_test.iloc[:, 0])
X_test = np.array(df_test.iloc[:, 1:])

## Support Vector Machine

In [33]:
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.svm import SVC
svm = SVC( decision_function_shape='ovr', probability=True, cache_size=600, max_iter = 2000)

#### K-Fold
Se experimenta en el método K-Fold en el SVM, para un **K** que está en el rango de 3 a 10.

In [34]:
begin = 3
end = 10

resultados = {}

for i in range(begin, end, 1):
    resultados[i] = K_fold_cross_validation(X_train, y_train,X_test, y_test, svm, None, k = i, model_name="Support Vector Machine")

svm_kFold = pd.DataFrame(resultados, index=["f1 score", "recall score", "AUC", "Precision"])

In [35]:
svm_kFold.T

Unnamed: 0,f1 score,recall score,AUC,Precision
3,0.829857,0.834602,0.988947,0.84723
4,0.834679,0.839371,0.989121,0.851436
5,0.83593,0.841164,0.989176,0.852454
6,0.836365,0.841308,0.989126,0.852784
7,0.836397,0.841451,0.98922,0.852801
8,0.837383,0.842408,0.989248,0.853667
9,0.836856,0.841897,0.989193,0.853102
10,0.837251,0.842228,0.989191,0.853639
11,0.838131,0.843288,0.989239,0.854269
12,0.837832,0.842865,0.989173,0.853969


#### Bootstrap
Se experimenta en el método Bootstrap en el SVM, para un **K** que está en el rango de 3 a 10.

In [36]:
begin = 3
end = 10
resultados = {}

for i in range(begin, end, 1):
    resultados[i] = Bootstrap(X_train, y_train,X_test, y_test, svm, rs, k = i, model_name="Support Vector Machine")

svm_Bootstrap = pd.DataFrame(resultados, index=["f1 score", "recall score", "AUC", "Precision"])

In [37]:
svm_Bootstrap.T

Unnamed: 0,f1 score,recall score,AUC,Precision
3,0.809071,0.812073,0.987706,0.813718
4,0.815034,0.819525,0.988077,0.818381
5,0.811652,0.815482,0.987461,0.815269
6,0.81252,0.817264,0.987912,0.816273
7,0.815497,0.820436,0.987507,0.818223
8,0.811121,0.815768,0.988022,0.814289
9,0.813837,0.818878,0.988261,0.817692
10,0.815378,0.820731,0.98779,0.8187
11,0.81301,0.818224,0.987787,0.816383
12,0.811828,0.816301,0.9877,0.815904


## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state = 0, min_samples_leaf=4, criterion="entropy")

#### K-Fold
Se experimenta en el método K-Fold en el **Decision Tree**, para un **K** que está en el rango de 3 a 12.

In [None]:
begin = 3
end = 13

resultados = {}

for i in range(begin, end, 1):
    resultados[i] = K_fold_cross_validation(X_train, y_train, X_test, y_test, tree, None, k = i, model_name="Decision Tree")

DTree_kFold = pd.DataFrame(resultados, index=["f1 score", "recall score", "AUC", "Precision"])

In [None]:
DTree_kFold.T

Unnamed: 0,f1 score,recall score,AUC,Precision
3,0.478919,0.481665,0.763003,0.494888
4,0.484196,0.488581,0.764614,0.499407
5,0.490898,0.495188,0.768068,0.504071
6,0.487125,0.49223,0.767021,0.503765
7,0.486499,0.491956,0.765451,0.501992
8,0.484398,0.48902,0.764413,0.499041
9,0.490412,0.494469,0.76758,0.505345
10,0.493607,0.497358,0.769829,0.50969
11,0.487324,0.490929,0.766609,0.503042
12,0.493692,0.498193,0.771954,0.509853


#### Bootstrap
Se experimenta en el método Bootstrap en el **Decision Tree**, para un **K** que está en el rango de 3 a 12.

In [None]:
resultados = {}
for i in range(begin, end, 1):
    resultados[i] = Bootstrap(X_train, y_train, X_test, y_test, tree, rs, k = i, model_name="Decision Tree")

DTree_bootstrap = pd.DataFrame(resultados, index=["f1 score", "recall score", "AUC", "Precision"])

In [None]:
DTree_bootstrap.T

Unnamed: 0,f1 score,recall score,AUC,Precision
3,0.44984,0.454619,0.749312,0.46333
4,0.444518,0.449224,0.742573,0.460158
5,0.444395,0.446648,0.744406,0.457585
6,0.446448,0.451424,0.747408,0.460866
7,0.43858,0.442476,0.743101,0.453071
8,0.439781,0.444775,0.741214,0.454981
9,0.444004,0.449587,0.744085,0.458759
10,0.443588,0.446508,0.744384,0.459927
11,0.443882,0.449239,0.745003,0.458373
12,0.440317,0.445011,0.745806,0.454313


## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5, n_jobs = -1)

#### K-Fold
Se experimenta en el método K-Fold en el **KNN**, para un **K** que está en el rango de 3 a 12.

In [None]:
begin = 3
end = 13

resultados = {}
for i in range(begin, end, 1):
    resultados[i] = K_fold_cross_validation(X_train, y_train, X_test, y_test, knn, None, k = i, model_name="KNN")
knn_kfold = pd.DataFrame(resultados, index=["F1 Score", "Recall Score", "AUC", "Precision"])


In [None]:
knn_kfold.T

Unnamed: 0,F1 Score,Recall Score,AUC,Precision
3,0.767186,0.769752,0.93658,0.77904
4,0.772131,0.774812,0.93492,0.783083
5,0.77571,0.778311,0.934106,0.7866
6,0.777652,0.780444,0.933875,0.788228
7,0.778988,0.781877,0.933908,0.78947
8,0.779694,0.782596,0.933714,0.790135
9,0.780557,0.783557,0.93366,0.790897
10,0.781168,0.784149,0.933558,0.791568
11,0.781446,0.784475,0.933554,0.791702
12,0.781543,0.784557,0.933467,0.791823


#### Bootstrap
Se experimenta en el método Bootstrap en el **KNN**, para un **K** que está en el rango de 3 a 12.

In [None]:
resultados = {}
for i in range(begin, end, 1):
    resultados[i]= Bootstrap(X_train, y_train, X_test, y_test, knn, rs, k = i, model_name="KNN")
knn_bootstrap = pd.DataFrame(resultados, index=["F1 score", "Recall score", "AUC", "Precision"])

In [None]:
knn_bootstrap.T

Unnamed: 0,F1 score,Recall score,AUC,Precision
3,0.728989,0.731338,0.928148,0.741613
4,0.736419,0.737201,0.927908,0.748353
5,0.733,0.734168,0.930256,0.74576
6,0.732717,0.734352,0.930751,0.745502
7,0.73192,0.733775,0.929457,0.743817
8,0.734211,0.735306,0.929698,0.747524
9,0.737853,0.739634,0.932635,0.750477
10,0.737214,0.739518,0.931481,0.749138
11,0.737247,0.738998,0.93035,0.750633
12,0.732643,0.73438,0.928334,0.745711
