In [66]:
from sklearn import svm
from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, zero_one_loss
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Lectura y normalización de data

In [78]:
scaler = MinMaxScaler()

dataset = pd.read_csv('./data/Cardiotocographic-Training.csv')
y = dataset.CLASE.to_numpy()
X = dataset.drop('CLASE', axis=1).to_numpy()
X = scaler.fit_transform(X)

## Bootsrap

In [68]:
def bootstrap(X, y, model, k, c, g):
    indices = np.array([i for i in range (len(X))])
    precisions = []
    recalls = []
    f1s = []
    aucs = []
    errors = []
    for i in range(k):
        train_index = resample(indices, n_samples=k, replace=True)
        test_index = np.array([j for j in indices if j not in train_index])
        
        x_train, y_train = X[train_index], y[train_index]
        x_test, y_test = X[test_index], y[test_index]
        
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        y_pred_auc = model.predict_proba(x_test)

        errors.append(zero_one_loss(y_test, y_pred))

        precision = precision_score(y_test, y_pred, average='micro') # micro porque toma en cuenta el desbalanceaminto de clases
        precisions.append(precision)

        recall = recall_score(y_test, y_pred, average='micro')
        recalls.append(recall)

        f1 = f1_score(y_test, y_pred, average='micro')
        f1s.append(f1)

        auc = roc_auc_score(y_test, y_pred_auc, multi_class='ovr')
        aucs.append(auc)
    return [np.mean(errors), np.var(errors), c, g, precisions, recalls, f1s, aucs, errors]

## K-Fold Cross-Validation

In [69]:
def k_fold(X, y, model, k, c, g):
    skf = KFold(n_splits=k, shuffle=True, random_state=42)
    precisions = []
    recalls = []
    f1s = []
    aucs = []
    errors = []
    for train_index, test_index in skf.split(X, y):
        x_train, y_train = X[train_index], y[train_index]
        x_test, y_test = X[test_index], y[test_index]

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        y_pred_auc = model.predict_proba(x_test)

        errors.append(zero_one_loss(y_test, y_pred))

        precision = precision_score(y_test, y_pred, average='micro') # micro porque toma en cuenta el desbalanceaminto de clases
        precisions.append(precision)

        recall = recall_score(y_test, y_pred, average='micro')
        recalls.append(recall)

        f1 = f1_score(y_test, y_pred, average='micro')
        f1s.append(f1)

        auc = roc_auc_score(y_test, y_pred_auc, multi_class='ovr')
        aucs.append(auc)
    return [np.mean(errors), np.var(errors), c, g, precisions, recalls, f1s, aucs, errors]


### Tuneo de modelo SVM con bootstrap

In [70]:
Cs = [1, 10, 100]
gammas = [0.1, 0.01, 0.001]

results_svm_bootstrap = []

k = int(len(X)/10)

for c in Cs:
    for gama in gammas:
        results_svm_bootstrap.append(bootstrap(X, y, svm.SVC(kernel='linear', C=c, gamma=gama, decision_function_shape='ovr', probability=True), k, c, gama))

# Valores de bias y varianza del respectivo hiperparametro
for r in results_svm_bootstrap:
    print("c y g: ", r[2], r[3], "\nbias: ", r[0], "\nvarianza: ", r[1])

c y g:  1 0.1 
bias:  0.14375546484394902 
varianza:  0.00029009612314725205
c y g:  1 0.01 
bias:  0.14267079651904602 
varianza:  0.0003157299330377769
c y g:  1 0.001 
bias:  0.14306489196234964 
varianza:  0.00028258943762687585
c y g:  10 0.1 
bias:  0.12800548789799762 
varianza:  9.140604703949627e-05
c y g:  10 0.01 
bias:  0.12694915760005 
varianza:  8.798282132084974e-05
c y g:  10 0.001 
bias:  0.12792769176495347 
varianza:  9.924067590581299e-05
c y g:  100 0.1 
bias:  0.13410231926393607 
varianza:  0.00018139133816191558
c y g:  100 0.01 
bias:  0.13381204473455602 
varianza:  0.0001290412579859576
c y g:  100 0.001 
bias:  0.13167194804635496 
varianza:  0.00015124808884962784


### Valores de metricas del mejor modelo SVM con Bootstrap

In [71]:
print("precision: ", np.mean(results_svm_bootstrap[4][4]))
print("recall: ", np.mean(results_svm_bootstrap[4][5]))
print("f1: ", np.mean(results_svm_bootstrap[4][6]))
print("auc: ", np.mean(results_svm_bootstrap[4][7]))

precision:  0.8730508423999501
recall:  0.8730508423999501
f1:  0.8730508423999501
auc:  0.9452893756425058


In [72]:
#clf = svm.SVC(kernel='linear', C=10, gamma=0.001, decision_function_shape='ovr', probability=True) # C es penalización por los valores fuera de la clasificación
# gamma controla la distancia de influencia de un punto de entrenamiento, valor pequeño indica que el radio de distancia es más grande, por lo que mas puntos se agrupan correctamente.
# valora alto implica que el radio se reduce y los puntos deben estar más cerca entre ellos para ser considerados del mismo grupo
# precisions, recalls, f1s, aucs, errors = k_fold(X, y, clf, 10)

### Tuneo del modelo SVM con K-Fold Cross-Validation

In [73]:
Cs = [1, 10, 100]
gammas = [0.1, 0.01, 0.001]

results_svm_k = []

for c in Cs:
    for gama in gammas:
        results_svm_k.append(k_fold(X, y, svm.SVC(kernel='linear', C=c, gamma=gama, decision_function_shape='ovr', probability=True), 10, c, gama))

# Valores de bias y varianza del respectivo hiperparametro
for r in results_svm_k:
    print("c y g: ", r[2], r[3], "\nbias: ", r[0], "\nvarianza: ", r[1])

c y g:  1 0.1 
bias:  0.11063567839195979 
varianza:  0.00041917338577308693
c y g:  1 0.01 
bias:  0.11063567839195979 
varianza:  0.00041917338577308693
c y g:  1 0.001 
bias:  0.11063567839195979 
varianza:  0.00041917338577308693
c y g:  10 0.1 
bias:  0.1026256281407035 
varianza:  0.000527016773566324
c y g:  10 0.01 
bias:  0.1026256281407035 
varianza:  0.000527016773566324
c y g:  10 0.001 
bias:  0.1026256281407035 
varianza:  0.000527016773566324
c y g:  100 0.1 
bias:  0.10212311557788945 
varianza:  0.0005317254930431063
c y g:  100 0.01 
bias:  0.10212311557788945 
varianza:  0.0005317254930431063
c y g:  100 0.001 
bias:  0.10212311557788945 
varianza:  0.0005317254930431063


### Valores de metricas del mejor modelo SVM con K-Fold

In [74]:
print("precision: ", np.mean(results_svm_k[4][4]))
print("recall: ", np.mean(results_svm_k[4][5]))
print("f1: ", np.mean(results_svm_k[4][6]))
print("auc: ", np.mean(results_svm_k[4][7]))

precision:  0.8973743718592966
recall:  0.8973743718592966
f1:  0.8973743718592966
auc:  0.9695352376362546


### Tuneo del modelo logística con K-Fold Cross-Validation

In [75]:
results_log_k = []
penalty = ['l1', 'l2']
cs = [10, 1, 0.1]

for p in penalty:
    for c in cs:
        results_log_k.append(k_fold(X, y, LogisticRegression(penalty=p, C=c, multi_class='ovr', solver='liblinear', max_iter=1000), 10, p, c))

for r in results_log_k:
   print("penalty y c: ", r[2], r[3], "\nbias: ", r[0], "\nvarianza: ", r[1])

penalty y c:  l1 10 
bias:  0.10611557788944723 
varianza:  0.00022118421251988624
penalty y c:  l1 1 
bias:  0.11162060301507534 
varianza:  0.000407358431605263
penalty y c:  l1 0.1 
bias:  0.13915075376884423 
varianza:  0.0005178608873513305
penalty y c:  l2 10 
bias:  0.10712311557788942 
varianza:  0.0003048898070755793
penalty y c:  l2 1 
bias:  0.11212814070351759 
varianza:  0.0005600506363475667
penalty y c:  l2 0.1 
bias:  0.16668341708542714 
varianza:  0.000892777833893083


### Valores de métricas del mejor modelo logística con K-Fold

In [76]:

print("precision: ", np.mean(results_log_k[4][4]))
print("recall: ", np.mean(results_log_k[4][5]))
print("f1: ", np.mean(results_log_k[4][6]))
print("auc: ", np.mean(results_log_k[4][7]))

precision:  0.8878718592964825
recall:  0.8878718592964825
f1:  0.8878718592964825
auc:  0.9589489224375172


### Tuneo del modelo logística con Bootstrap

In [77]:
results_log_bootstrap = []
penalty = ['l1', 'l2']
cs = [10, 1, 0.1]

k = int(len(X)/10)

for p in penalty:
    for c in cs:
        results_log_bootstrap.append(bootstrap(X, y, LogisticRegression(penalty=p, C=c, multi_class='ovr', solver='liblinear', max_iter=1000), k, p, c))

for r in results_log_bootstrap:
   print("penalty y c: ", r[2], r[3], "\nbias: ", r[0], "\nvarianza: ", r[1])

penalty y c:  l1 10 
bias:  0.12648370788598004 
varianza:  7.843229993452602e-05
penalty y c:  l1 1 
bias:  0.1470087523670004 
varianza:  0.00024048535225050662
penalty y c:  l1 0.1 
bias:  0.22201310672806981 
varianza:  9.372234207256527e-06
penalty y c:  l2 10 
bias:  0.12890514405443643 
varianza:  8.421140059238825e-05
penalty y c:  l2 1 
bias:  0.16991727519998917 
varianza:  0.00027923283998021264
penalty y c:  l2 0.1 
bias:  0.2217022800104486 
varianza:  9.786547569525493e-06
