In [5]:
from sklearn import svm
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, zero_one_loss
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [10]:
# Load the training table; the 'CLASE' column is the target and every
# remaining column is used as a feature.
dataset = pd.read_csv('./data/Cardiotocographic-Training.csv')
X = dataset.drop(columns='CLASE').to_numpy()
y = dataset['CLASE'].to_numpy()

In [61]:
def bootstrap(X, y, model, k, c, g, n_samples=None, random_state=None):
    """Evaluate ``model`` with out-of-bag bootstrap resampling.

    On each of ``k`` rounds a training set is drawn from the row indices with
    replacement; the rows that were never drawn (out-of-bag) form the test
    set. ``model.fit`` is called on the same estimator object every round.

    Parameters
    ----------
    X, y : arrays that support indexing with an integer index array
        Feature matrix and target vector with the same number of rows.
    model : object exposing ``fit``, ``predict`` and ``predict_proba``
        Classifier to evaluate.
    k : int
        Number of bootstrap rounds.
    c, g : any
        Not used in the computation; copied into the returned list so the
        caller can tell which hyper-parameters produced the result.
    n_samples : int, optional
        Size of each bootstrap draw. ``None`` keeps the historical behaviour
        of drawing ``k`` rows per round.
        NOTE(review): the textbook bootstrap draws ``len(X)`` rows; confirm
        which size is intended and pass it explicitly if needed.
    random_state : int, numpy.random.RandomState or None, optional
        Seed or generator for the resampling. ``None`` keeps the historical
        behaviour (``resample`` is called with its default random state).

    Returns
    -------
    list
        ``[mean(errors), var(errors), c, g, precisions, recalls, f1s, aucs,
        errors]`` where the last five items hold one value per round.
    """
    indices = np.arange(len(X))
    draw_size = k if n_samples is None else n_samples

    # Build one generator up front so successive rounds draw different
    # samples; handing the same integer seed to every call would repeat the
    # identical draw k times.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    precisions = []
    recalls = []
    f1s = []
    aucs = []
    errors = []
    for _ in range(k):
        train_index = resample(indices, n_samples=draw_size, replace=True, random_state=rng)
        # Out-of-bag rows: sorted indices that never appear in the draw.
        test_index = np.setdiff1d(indices, train_index)

        x_train, y_train = X[train_index], y[train_index]
        x_test, y_test = X[test_index], y[test_index]

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        y_pred_auc = model.predict_proba(x_test)

        errors.append(zero_one_loss(y_test, y_pred))

        # Original author's rationale: micro averaging was chosen because it
        # takes class imbalance into account.
        precisions.append(precision_score(y_test, y_pred, average='micro'))
        recalls.append(recall_score(y_test, y_pred, average='micro'))
        f1s.append(f1_score(y_test, y_pred, average='micro'))

        # One-vs-rest AUC computed from the predicted class probabilities.
        aucs.append(roc_auc_score(y_test, y_pred_auc, multi_class='ovr'))
    return [np.mean(errors), np.var(errors), c, g, precisions, recalls, f1s, aucs, errors]

In [46]:
# NOTE(review): stale debugging call. `bootstrap` as defined in this notebook
# takes six required arguments (X, y, model, k, c, g) and calls model.fit(...),
# so this four-argument call with a string in the model slot cannot run
# against the current definition. The output shown for this cell presumably
# came from an earlier version of the function -- confirm and remove or fix.
bootstrap(X, y, "hola", 10) 

[2 8 0 4]
[0 9 4 1]
[9 6 4 8]
[9 3 4 8]


In [49]:
def k_fold(X, y, model, k, c, g):
    """Evaluate ``model`` with shuffled ``k``-fold cross-validation.

    The data is split with ``KFold(n_splits=k, shuffle=True, random_state=42)``
    and ``model.fit`` is called on the same estimator object for every fold.

    Parameters
    ----------
    X, y : arrays that support indexing with an integer index array
        Feature matrix and target vector.
    model : object exposing ``fit``, ``predict`` and ``predict_proba``
        Classifier to evaluate.
    k : int
        Number of folds.
    c, g : any
        Not used in the computation; copied into the returned list as labels
        for the configuration that was evaluated.

    Returns
    -------
    list
        ``[mean(errors), var(errors), c, g, precisions, recalls, f1s, aucs,
        errors]`` where the last five items hold one value per fold.
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=42)
    errors, precisions, recalls, f1s, aucs = [], [], [], [], []

    for fit_rows, eval_rows in splitter.split(X, y):
        x_fit, y_fit = X[fit_rows], y[fit_rows]
        x_eval, y_eval = X[eval_rows], y[eval_rows]

        model.fit(x_fit, y_fit)
        labels = model.predict(x_eval)
        probabilities = model.predict_proba(x_eval)

        # Fraction of misclassified rows in this fold.
        errors.append(zero_one_loss(y_eval, labels))

        # Original author's rationale: micro averaging was chosen because it
        # takes class imbalance into account.
        precisions.append(precision_score(y_eval, labels, average='micro'))
        recalls.append(recall_score(y_eval, labels, average='micro'))
        f1s.append(f1_score(y_eval, labels, average='micro'))

        # One-vs-rest AUC computed from the predicted class probabilities.
        aucs.append(roc_auc_score(y_eval, probabilities, multi_class='ovr'))

    summary = [np.mean(errors), np.var(errors), c, g]
    return summary + [precisions, recalls, f1s, aucs, errors]


In [62]:
# Hyper-parameter grid for the SVM: every (C, gamma) pair below is evaluated
# with bootstrap().
Cs = [1, 10, 100]
gammas = [0.1, 0.01, 0.001]

# One entry per (C, gamma) pair, appended in loop order (C outer, gamma inner).
results = []

# k is handed to bootstrap(), which uses it both as the number of rounds and
# as the size of each resampled training set.
k = int(len(X)/10)

for c in Cs:
    for gama in gammas:
        # NOTE(review): scikit-learn documents `gamma` as the kernel
        # coefficient for 'rbf', 'poly' and 'sigmoid'; with kernel='linear'
        # it presumably has no effect, so the three gamma values per C would
        # be redundant fits -- confirm the intended kernel.
        results.append(bootstrap(X, y, svm.SVC(kernel='linear', C=c, gamma=gama, decision_function_shape='ovr', probability=True), k, c, gama))

for r in results:
    # r[0] and r[1] are the mean and variance of the per-round zero-one loss
    # returned by bootstrap(); they are printed under the labels
    # "bias" and "varianza".
    print("bias: ", r[0], "\nvarianza: ", r[1])

KeyboardInterrupt: 

In [20]:
#clf = svm.SVC(kernel='linear', C=10, gamma=0.001, decision_function_shape='ovr', probability=True) # C is the penalty for points that fall outside their class
# gamma controls the distance of influence of a training point: a small value means a larger radius, so more points get grouped together correctly.
# a high value shrinks the radius, so points must be closer to each other to be considered part of the same group
# NOTE(review): the recorded output of this cell is identical for the three
# gamma values at each C, which suggests gamma is not used by the linear
# kernel configured below -- confirm the intended kernel.
#precisions, recalls, f1s, aucs, errors = k_fold(X, y, clf, 10)

# Hyper-parameter grid: every (C, gamma) pair is evaluated with 10-fold CV.
Cs = [1, 10, 100]
gammas = [0.1, 0.01, 0.001]

# One entry per (C, gamma) pair, appended in loop order (C outer, gamma inner).
results = []

for c in Cs:
    for gama in gammas:
        results.append(k_fold(X, y, svm.SVC(kernel='linear', C=c, gamma=gama, decision_function_shape='ovr', probability=True), 10, c, gama))

for r in results:
    # r[0] and r[1] are the mean and variance of the per-fold zero-one loss
    # returned by k_fold(); they are printed under the labels
    # "bias" and "varianza".
    print("bias: ", r[0], "\nvarianza: ", r[1])

# The lowest bias and variance will be the ones given by the model with the least overfitting

# clf = svm.SVC(kernel='linear') # 0.91166666
# clf.fit(X_train, y_train)
# print(clf.class_weight_)
# print(clf.score(X_test, y_test))

bias:  0.09811809045226129 
varianza:  0.0003309761685310984
bias:  0.09811809045226129 
varianza:  0.0003309761685310984
bias:  0.09811809045226129 
varianza:  0.0003309761685310984
bias:  0.09311055276381906 
varianza:  0.0002550091916870789
bias:  0.09311055276381906 
varianza:  0.0002550091916870789
bias:  0.09311055276381906 
varianza:  0.0002550091916870789
bias:  0.10111557788944721 
varianza:  0.00037274301154011335
bias:  0.10111557788944721 
varianza:  0.00037274301154011335
bias:  0.10111557788944721 
varianza:  0.00037274301154011335


In [27]:
#for r in results:
#    print("c y g: ", r[2], r[3], "\nbias: ", r[0], "\nvarianza: ", r[1])

# Report the metrics of the fifth grid entry. With the C-outer / gamma-inner
# loop order used to fill `results`, index 4 is Cs[1] with gammas[1].
# Slots 4-7 of an entry hold the per-split precision, recall, F1 and AUC
# lists, and slot 8 holds the per-split zero-one losses.
# NOTE(review): `results` is assigned by both the bootstrap cell and the
# k-fold cell, so this reads whichever of them ran last in the kernel.
print("precision: ", np.mean(results[4][4]))
print("recall: ", np.mean(results[4][5]))
print("f1: ", np.mean(results[4][6]))
print("auc: ", np.mean(results[4][7]))
print(results[4][8])

precision:  0.906889447236181
recall:  0.906889447236181
f1:  0.906889447236181
auc:  0.9686436003609835
[0.06999999999999995, 0.08499999999999996, 0.08999999999999997, 0.07499999999999996, 0.09499999999999997, 0.08499999999999996, 0.08999999999999997, 0.12, 0.12060301507537685, 0.10050251256281406]


In [32]:
# Grid search over logistic-regression penalty type and inverse
# regularisation strength C, evaluated with 10-fold cross-validation.
results2 = []
penalty = ['l1', 'l2']
cs = [10, 1, 0.1]

for p in penalty:
    for c in cs:
        # k_fold() only echoes its last two arguments back in slots 2 and 3
        # of the result, so here slot 2 carries the penalty name and slot 3
        # carries C.
        results2.append(k_fold(X, y, LogisticRegression(penalty=p, C=c, multi_class='ovr', solver='liblinear', max_iter=1000), 10, p, c))

for r in results2:
    # r[0] and r[1] are the mean and variance of the per-fold zero-one loss,
    # printed under the labels "bias" and "varianza".
    print("bias: ", r[0], "\nvarianza: ", r[1])

bias:  0.10661557788944723 
varianza:  0.00025731863463043905
bias:  0.10611306532663316 
varianza:  0.00020053357869750794
bias:  0.11512311557788943 
varianza:  0.00013202071286078657
bias:  0.10611557788944723 
varianza:  0.00023618421251988625
bias:  0.10762311557788942 
varianza:  0.00035001669149769003
bias:  0.11512814070351758 
varianza:  0.0001982817921264618


In [35]:
# Print every logistic-regression configuration: r[2] is the penalty, r[3] is
# C, r[0] / r[1] are the mean and variance of the per-fold zero-one loss.
for r in results2:
   print("penalty y c: ", r[2], r[3], "\nbias: ", r[0], "\nvarianza: ", r[1])
# Mean per-fold metrics of the fifth configuration. With the penalty-outer /
# C-inner loop order used to fill `results2`, index 4 is penalty[1] with cs[1].
print("precision: ", np.mean(results2[4][4]))
print("recall: ", np.mean(results2[4][5]))
print("f1: ", np.mean(results2[4][6]))
print("auc: ", np.mean(results2[4][7]))

penalty y c:  l1 10 
bias:  0.10661557788944723 
varianza:  0.00025731863463043905
penalty y c:  l1 1 
bias:  0.10611306532663316 
varianza:  0.00020053357869750794
penalty y c:  l1 0.1 
bias:  0.11512311557788943 
varianza:  0.00013202071286078657
penalty y c:  l2 10 
bias:  0.10611557788944723 
varianza:  0.00023618421251988625
penalty y c:  l2 1 
bias:  0.10762311557788942 
varianza:  0.00035001669149769003
penalty y c:  l2 0.1 
bias:  0.11512814070351758 
varianza:  0.0001982817921264618
precision:  0.8923768844221106
recall:  0.8923768844221106
f1:  0.8923768844221106
auc:  0.9624252699845126
