In [2]:
from sklearn import svm
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, zero_one_loss
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Lectura y normalización de data

In [3]:
# Load the training set: the CLASE column is the target, every other column is a feature.
dataset = pd.read_csv('./data/Cardiotocographic-Training.csv')
y = dataset['CLASE'].to_numpy()
feature_frame = dataset.drop(columns='CLASE')

# Fit the MinMaxScaler on the training features and rescale them in one step.
# The fitted scaler is kept at module level so later cells can reuse it.
scaler = MinMaxScaler()
X = scaler.fit_transform(feature_frame.to_numpy())

## Bootstrap

In [4]:
def bootstrap(X, y, model, k, c, g, n_samples=None, random_state=None):
    """Estimate a model's error with repeated bootstrap resampling.

    On each of the ``k`` rounds a training set is drawn with replacement from
    the row indices, the rows that were never drawn (out-of-bag) form the test
    set, and ``model`` is refitted on the former and scored on the latter.

    Parameters
    ----------
    X, y : numpy arrays
        Features and labels, both indexed by row along the first axis.
    model : estimator
        Object providing ``fit``, ``predict`` and ``predict_proba``.
    k : int
        Number of bootstrap rounds. When ``n_samples`` is omitted it is also
        the size of every bootstrap sample (the original behaviour).
    c, g : any
        Labels of the hyperparameters under test. They are not used in the
        computation and are returned unchanged so results can be identified.
    n_samples : int, optional
        Size of each bootstrap sample. Defaults to ``k``.
    random_state : int or numpy.random.RandomState, optional
        Seed (or generator) shared by all rounds so that a run can be
        reproduced. ``None`` leaves seeding to ``resample``'s default.

    Returns
    -------
    list
        ``[mean error, error variance, c, g, precisions, recalls, f1s, aucs,
        errors]`` where the last five entries hold one value per round and
        "error" is the zero-one loss.
    """
    indices = np.arange(len(X))
    sample_size = k if n_samples is None else n_samples

    # A single generator is shared across rounds: passing the same integer seed
    # to every resample() call would draw the identical sample each time.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    precisions = []
    recalls = []
    f1s = []
    aucs = []
    errors = []
    for _ in range(k):
        train_index = resample(indices, n_samples=sample_size, replace=True, random_state=rng)
        # Out-of-bag rows, in ascending order; setdiff1d replaces a per-element
        # Python membership scan over train_index.
        test_index = np.setdiff1d(indices, train_index)

        x_train, y_train = X[train_index], y[train_index]
        x_test, y_test = X[test_index], y[test_index]

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        y_pred_auc = model.predict_proba(x_test)

        errors.append(zero_one_loss(y_test, y_pred))

        # Micro averaging was chosen by the author with class imbalance in mind.
        precisions.append(precision_score(y_test, y_pred, average='micro'))
        recalls.append(recall_score(y_test, y_pred, average='micro'))
        f1s.append(f1_score(y_test, y_pred, average='micro'))

        # One-vs-rest AUC computed from the predicted class probabilities.
        aucs.append(roc_auc_score(y_test, y_pred_auc, multi_class='ovr'))
    return [np.mean(errors), np.var(errors), c, g, precisions, recalls, f1s, aucs, errors]

## K-Fold Cross-Validation

In [5]:
def k_fold(X, y, model, k, c, g):
    """Score ``model`` with shuffled k-fold cross-validation.

    The rows are split into ``k`` folds by ``KFold(shuffle=True,
    random_state=42)``; for every fold the model is refitted on the remaining
    rows and evaluated on the held-out ones.

    ``c`` and ``g`` are not used in the computation: they label the
    hyperparameters under test and are passed straight through to the result.

    Returns a list laid out as
    ``[mean error, error variance, c, g, precisions, recalls, f1s, aucs, errors]``
    where the last five entries hold one value per fold and "error" is the
    zero-one loss.
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_scores = {"precision": [], "recall": [], "f1": [], "auc": [], "error": []}

    for fit_rows, held_out_rows in splitter.split(X, y):
        fit_X, fit_y = X[fit_rows], y[fit_rows]
        held_out_X, held_out_y = X[held_out_rows], y[held_out_rows]

        model.fit(fit_X, fit_y)
        predicted = model.predict(held_out_X)
        class_probabilities = model.predict_proba(held_out_X)

        fold_scores["error"].append(zero_one_loss(held_out_y, predicted))

        # Micro averaging was chosen by the author with class imbalance in mind.
        fold_scores["precision"].append(
            precision_score(held_out_y, predicted, average='micro')
        )
        fold_scores["recall"].append(
            recall_score(held_out_y, predicted, average='micro')
        )
        fold_scores["f1"].append(
            f1_score(held_out_y, predicted, average='micro')
        )

        # One-vs-rest AUC computed from the predicted class probabilities.
        fold_scores["auc"].append(
            roc_auc_score(held_out_y, class_probabilities, multi_class='ovr')
        )

    return [
        np.mean(fold_scores["error"]),
        np.var(fold_scores["error"]),
        c,
        g,
        fold_scores["precision"],
        fold_scores["recall"],
        fold_scores["f1"],
        fold_scores["auc"],
        fold_scores["error"],
    ]


### Tuneo de modelo SVM con bootstrap

In [15]:
# Hyperparameter grid explored for the SVM.
Cs = [1, 10, 100]
gammas = [0.1, 0.01, 0.001]

# Value handed to bootstrap() as k: one tenth of the number of rows.
k = len(X) // 10

results_svm_bootstrap = []
for c in Cs:
    for gama in gammas:
        # NOTE(review): scikit-learn documents gamma as the coefficient of the
        # 'rbf', 'poly' and 'sigmoid' kernels, so with kernel='linear' the gamma
        # grid presumably has no effect on the model - confirm whether an RBF
        # kernel was intended.
        svc_candidate = svm.SVC(
            kernel='linear',
            C=c,
            gamma=gama,
            decision_function_shape='ovr',
            probability=True,
        )
        results_svm_bootstrap.append(bootstrap(X, y, svc_candidate, k, c, gama))

# Report the mean and variance of the zero-one loss for every (C, gamma) pair.
# The printed "bias" label is attached to the mean error.
for r in results_svm_bootstrap:
    mean_error, error_variance, c_value, gamma_value = r[:4]
    print("c y g: ", c_value, gamma_value, "\nbias: ", mean_error, "\nvarianza: ", error_variance)

c y g:  1 0.1 
bias:  0.1439361675105836 
varianza:  0.00030112760334526794
c y g:  1 0.01 
bias:  0.1423274739312672 
varianza:  0.0003072605206073213
c y g:  1 0.001 
bias:  0.14342937459374375 
varianza:  0.00026729563538479623
c y g:  10 0.1 
bias:  0.12781933462177456 
varianza:  7.859097853899336e-05
c y g:  10 0.01 
bias:  0.12791266926001227 
varianza:  9.324019726630779e-05
c y g:  10 0.001 
bias:  0.12736541890552758 
varianza:  9.475980371313346e-05
c y g:  100 0.1 
bias:  0.1331621553717318 
varianza:  0.0001572181068333326
c y g:  100 0.01 
bias:  0.13209832375199287 
varianza:  0.00016049068362795443
c y g:  100 0.001 
bias:  0.13148838596305595 
varianza:  0.00011209861605542345


### Valores de métricas del mejor modelo SVM con Bootstrap

In [16]:
# Average each per-round metric for the selected configuration: position 4 in
# the grid order, i.e. C=10 with gamma=0.01. Slots 4-7 of a result hold the
# precision, recall, F1 and AUC lists.
selected_svm_bootstrap = results_svm_bootstrap[4]
for metric_label, slot in (("precision: ", 4), ("recall: ", 5), ("f1: ", 6), ("auc: ", 7)):
    print(metric_label, np.mean(selected_svm_bootstrap[slot]))

precision:  0.8720873307399879
recall:  0.8720873307399879
f1:  0.8720873307399879
auc:  0.9450841632767766


In [91]:
#clf = svm.SVC(kernel='linear', C=10, gamma=0.001, decision_function_shape='ovr', probability=True) # C es penalización por los valores fuera de la clasificación
# gamma controla la distancia de influencia de un punto de entrenamiento; un valor pequeño indica que el radio de influencia es más grande, por lo que más puntos se agrupan correctamente.
# Un valor alto implica que el radio se reduce y los puntos deben estar más cerca entre ellos para ser considerados del mismo grupo
# precisions, recalls, f1s, aucs, errors = k_fold(X, y, clf, 10)

### Tuneo del modelo SVM con K-Fold Cross-Validation

In [11]:
# Hyperparameter grid explored for the SVM.
Cs = [1, 10, 100]
gammas = [0.1, 0.01, 0.001]

results_svm_k = []
for c in Cs:
    for gama in gammas:
        # NOTE(review): scikit-learn documents gamma as the coefficient of the
        # 'rbf', 'poly' and 'sigmoid' kernels, so with kernel='linear' the gamma
        # grid presumably has no effect on the model - confirm whether an RBF
        # kernel was intended.
        svc_candidate = svm.SVC(
            kernel='linear',
            C=c,
            gamma=gama,
            decision_function_shape='ovr',
            probability=True,
        )
        # 10-fold cross-validation for this (C, gamma) pair.
        results_svm_k.append(k_fold(X, y, svc_candidate, 10, c, gama))

# Report the mean and variance of the zero-one loss for every (C, gamma) pair.
# The printed "bias" label is attached to the mean error.
for r in results_svm_k:
    mean_error, error_variance, c_value, gamma_value = r[:4]
    print("c y g: ", c_value, gamma_value, "\nbias: ", mean_error, "\nvarianza: ", error_variance)

c y g:  1 0.1 
bias:  0.11063567839195979 
varianza:  0.00041917338577308693
c y g:  1 0.01 
bias:  0.11063567839195979 
varianza:  0.00041917338577308693
c y g:  1 0.001 
bias:  0.11063567839195979 
varianza:  0.00041917338577308693
c y g:  10 0.1 
bias:  0.1026256281407035 
varianza:  0.000527016773566324
c y g:  10 0.01 
bias:  0.1026256281407035 
varianza:  0.000527016773566324
c y g:  10 0.001 
bias:  0.1026256281407035 
varianza:  0.000527016773566324
c y g:  100 0.1 
bias:  0.10212311557788945 
varianza:  0.0005317254930431063
c y g:  100 0.01 
bias:  0.10212311557788945 
varianza:  0.0005317254930431063
c y g:  100 0.001 
bias:  0.10212311557788945 
varianza:  0.0005317254930431063


### Valores de métricas del mejor modelo SVM con K-Fold

In [14]:
# Average each per-fold metric for the selected configuration: position 7 in
# the grid order, i.e. C=100 with gamma=0.01. Slots 4-7 of a result hold the
# precision, recall, F1 and AUC lists.
selected_svm_k = results_svm_k[7]
for metric_label, slot in (("precision: ", 4), ("recall: ", 5), ("f1: ", 6), ("auc: ", 7)):
    print(metric_label, np.mean(selected_svm_k[slot]))

precision:  0.8978768844221106
recall:  0.8978768844221106
f1:  0.8978768844221106
auc:  0.9696444519793601


### Tuneo del modelo logística con K-Fold Cross-Validation

In [6]:
# Grid for logistic regression: regularisation type and inverse strength C.
penalty = ['l1', 'l2']
cs = [10, 1, 0.1]

results_log_k = []
for p in penalty:
    for c in cs:
        logistic_candidate = LogisticRegression(
            penalty=p,
            C=c,
            multi_class='ovr',
            solver='liblinear',
            max_iter=1000,
        )
        # 10-fold cross-validation; the penalty and C are stored as the two
        # label slots of the result.
        results_log_k.append(k_fold(X, y, logistic_candidate, 10, p, c))

# Report the mean and variance of the zero-one loss for every (penalty, C) pair.
# The printed "bias" label is attached to the mean error.
for r in results_log_k:
    mean_error, error_variance, penalty_name, c_value = r[:4]
    print("penalty y c: ", penalty_name, c_value, "\nbias: ", mean_error, "\nvarianza: ", error_variance)

penalty y c:  l1 10 
bias:  0.10611557788944723 
varianza:  0.00022118421251988624
penalty y c:  l1 1 
bias:  0.11162060301507534 
varianza:  0.000407358431605263
penalty y c:  l1 0.1 
bias:  0.13915075376884423 
varianza:  0.0005178608873513305
penalty y c:  l2 10 
bias:  0.10712311557788942 
varianza:  0.0003048898070755793
penalty y c:  l2 1 
bias:  0.11212814070351759 
varianza:  0.0005600506363475667
penalty y c:  l2 0.1 
bias:  0.16668341708542714 
varianza:  0.000892777833893083


### Valores de métricas del mejor modelo logística con K-Fold

In [7]:

# Average each per-fold metric for the selected configuration: position 0 in
# the grid order, i.e. the l1 penalty with C=10. Slots 4-7 of a result hold the
# precision, recall, F1 and AUC lists.
selected_log_k = results_log_k[0]
for metric_label, slot in (("precision: ", 4), ("recall: ", 5), ("f1: ", 6), ("auc: ", 7)):
    print(metric_label, np.mean(selected_log_k[slot]))

precision:  0.8938844221105529
recall:  0.8938844221105529
f1:  0.8938844221105529
auc:  0.9664880441174803


### Tuneo del modelo logística con Bootstrap

In [8]:
# Grid for logistic regression: regularisation type and inverse strength C.
penalty = ['l1', 'l2']
cs = [10, 1, 0.1]

# Value handed to bootstrap() as k: one tenth of the number of rows.
k = len(X) // 10

results_log_bootstrap = []
for p in penalty:
    for c in cs:
        logistic_candidate = LogisticRegression(
            penalty=p,
            C=c,
            multi_class='ovr',
            solver='liblinear',
            max_iter=1000,
        )
        # The penalty and C are stored as the two label slots of the result.
        results_log_bootstrap.append(bootstrap(X, y, logistic_candidate, k, p, c))

# Report the mean and variance of the zero-one loss for every (penalty, C) pair.
# The printed "bias" label is attached to the mean error.
for r in results_log_bootstrap:
    mean_error, error_variance, penalty_name, c_value = r[:4]
    print("penalty y c: ", penalty_name, c_value, "\nbias: ", mean_error, "\nvarianza: ", error_variance)

penalty y c:  l1 10 
bias:  0.12552573073566517 
varianza:  7.832044372455536e-05
penalty y c:  l1 1 
bias:  0.1488188010043481 
varianza:  0.0002502422351480533
penalty y c:  l1 0.1 
bias:  0.2218754836281879 
varianza:  1.0075781509611317e-05
penalty y c:  l2 10 
bias:  0.12922527920708887 
varianza:  8.063018991392728e-05
penalty y c:  l2 1 
bias:  0.16701004947655654 
varianza:  0.00020324513834105214
penalty y c:  l2 0.1 
bias:  0.22178458661514813 
varianza:  1.0450366203825303e-05


### Valores de métricas del mejor modelo logística con Bootstrap

In [10]:
# Average each per-round metric for the selected configuration: position 0 in
# the grid order, i.e. the l1 penalty with C=10. Slots 4-7 of a result hold the
# precision, recall, F1 and AUC lists.
selected_log_bootstrap = results_log_bootstrap[0]
for metric_label, slot in (("precision: ", 4), ("recall: ", 5), ("f1: ", 6), ("auc: ", 7)):
    print(metric_label, np.mean(selected_log_bootstrap[slot]))

precision:  0.8744742692643348
recall:  0.8744742692643348
f1:  0.8744742692643348
auc:  0.9461086742800177


### Tuneo del modelo DecisionTree con K-Fold

In [19]:
# Grid for the decision tree: split criterion and minimum samples per leaf.
criterion = ["gini", "entropy"]
min_leaf = [4, 6, 8]

results_tree_k = []
for c in criterion:
    for l in min_leaf:
        tree_candidate = DecisionTreeClassifier(criterion=c, min_samples_leaf=l)
        # 10-fold cross-validation; the criterion and leaf size are stored as
        # the two label slots of the result.
        results_tree_k.append(k_fold(X, y, tree_candidate, 10, c, l))

# Report the mean and variance of the zero-one loss for every combination.
# The printed "bias" label is attached to the mean error.
for r in results_tree_k:
    mean_error, error_variance, criterion_name, leaf_size = r[:4]
    print("criterio y min_leaf: ", criterion_name, leaf_size, "\nbias: ", mean_error, "\nvarianza: ", error_variance)

criterio y min_leaf:  gini 4 
bias:  0.07808542713567838 
varianza:  0.0007126785687230112
criterio y min_leaf:  gini 6 
bias:  0.07607035175879398 
varianza:  0.0006983770864372101
criterio y min_leaf:  gini 8 
bias:  0.07908793969849245 
varianza:  0.0005361430330042166
criterio y min_leaf:  entropy 4 
bias:  0.05056281407035178 
varianza:  0.00019378591449710833
criterio y min_leaf:  entropy 6 
bias:  0.06856281407035175 
varianza:  0.0004651291381530762
criterio y min_leaf:  entropy 8 
bias:  0.07557788944723617 
varianza:  0.0001875984192318368


### Valores de métricas para el modelo DecisionTree con K-Fold

In [21]:
# Average each per-fold metric for the selected configuration: position 3 in
# the grid order, i.e. the entropy criterion with min_samples_leaf=4. Slots 4-7
# of a result hold the precision, recall, F1 and AUC lists.
selected_tree_k = results_tree_k[3]
for metric_label, slot in (("precision: ", 4), ("recall: ", 5), ("f1: ", 6), ("auc: ", 7)):
    print(metric_label, np.mean(selected_tree_k[slot]))

precision:  0.949437185929648
recall:  0.949437185929648
f1:  0.949437185929648
auc:  0.9474398589743911


### Tuneo del modelo de DecisionTree con Bootstrap

In [23]:
# Grid for the decision tree: split criterion and minimum samples per leaf.
criterion = ["gini", "entropy"]
min_leaf = [4, 6, 8]

# Value handed to bootstrap() as k: one tenth of the number of rows.
k = len(X) // 10

results_tree_bootstrap = []
for c in criterion:
    for l in min_leaf:
        tree_candidate = DecisionTreeClassifier(criterion=c, min_samples_leaf=l)
        # The criterion and leaf size are stored as the two label slots of the result.
        results_tree_bootstrap.append(bootstrap(X, y, tree_candidate, k, c, l))

# Report the mean and variance of the zero-one loss for every combination.
# The printed "bias" label is attached to the mean error.
for r in results_tree_bootstrap:
    mean_error, error_variance, criterion_name, leaf_size = r[:4]
    print("criterio y min_leaf: ", criterion_name, leaf_size, "\nbias: ", mean_error, "\nvarianza: ", error_variance)

criterio y min_leaf:  gini 4 
bias:  0.1412028926025946 
varianza:  0.00040025944004300483
criterio y min_leaf:  gini 6 
bias:  0.14863084546150954 
varianza:  0.0004789413708422947
criterio y min_leaf:  gini 8 
bias:  0.15285316795027007 
varianza:  0.0003706799047221454
criterio y min_leaf:  entropy 4 
bias:  0.14235170108008824 
varianza:  0.00038567857018375874
criterio y min_leaf:  entropy 6 
bias:  0.14965142518474675 
varianza:  0.0004304999728206793
criterio y min_leaf:  entropy 8 
bias:  0.15743294367637262 
varianza:  0.00046421599506013366


### Valores de métricas para el modelo DecisionTree con Bootstrap

In [27]:
# Average each per-round metric for the selected configuration: position 3 in
# the grid order, i.e. the entropy criterion with min_samples_leaf=4. Slots 4-7
# of a result hold the precision, recall, F1 and AUC lists.
selected_tree_bootstrap = results_tree_bootstrap[3]
for metric_label, slot in (("precision: ", 4), ("recall: ", 5), ("f1: ", 6), ("auc: ", 7)):
    print(metric_label, np.mean(selected_tree_bootstrap[slot]))

precision:  0.8576482989199116
recall:  0.8576482989199116
f1:  0.8576482989199116
auc:  0.8409504122166064


### Predicción del modelo DecisionTree

In [9]:


# Load the test set. Every column is used as a feature.
# NOTE(review): assumes the file has the training feature columns in the same
# order and no CLASE column - confirm against the data.
dataset = pd.read_csv('./data/Cardiotocographic-Test.csv')
x = dataset.to_numpy()

# Apply the scaling already fitted on the training features. Refitting the
# scaler here would rescale the test rows with the test set's own min/max, so
# they would no longer be on the scale the tree is trained on below.
x = scaler.transform(x)

# Train the chosen configuration on the full training set and predict.
dt = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=4)
dt.fit(X, y)
y_pred = dt.predict(x)

# Write one predicted label per line; the context manager closes the file even
# if a write fails.
with open("predicts.txt", "w") as text_file:
    for l in y_pred:
        text_file.write(str(l)+"\n")