In [1]:
from sklearn import svm
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, zero_one_loss
from scipy.stats import mode
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Lectura y normalización de data

In [3]:
scaler = MinMaxScaler()

dataset = pd.read_csv('./data/Cardiotocographic-Training.csv')
#dataset = pd.read_csv("dataset.csv")

y = dataset.CLASE.to_numpy()
X = dataset.drop('CLASE', axis=1).to_numpy()
X = scaler.fit_transform(X)

# Bootstrap

In [8]:
def bootstrap(X, y, model, k, c, g):
    indices = np.array([i for i in range (len(X))])
    precisions = []
    recalls = []
    f1s = []
    errors = []
    for i in range(k):
        train_index = resample(indices, n_samples=k, replace=True)
        test_index = np.array([j for j in indices if j not in train_index])
        
        x_train, y_train = X[train_index], y[train_index]
        x_test, y_test = X[test_index], y[test_index]
        
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        errors.append(zero_one_loss(y_test, y_pred))

        precision = precision_score(y_test, y_pred, average='micro') # micro porque toma en cuenta el desbalanceaminto de clases
        precisions.append(precision)

        recall = recall_score(y_test, y_pred, average='micro')
        recalls.append(recall)

        f1 = f1_score(y_test, y_pred, average='micro')
        f1s.append(f1)

    return [np.mean(errors), np.var(errors), c, g, precisions, recalls, f1s, errors]

# K-Fold Cross Validation

In [9]:
def k_fold(X, y, model, k, c, g):
    skf = KFold(n_splits=k, shuffle=True, random_state=42)
    precisions = []
    recalls = []
    f1s = []
    errors = []
    for train_index, test_index in skf.split(X, y):
        x_train, y_train = X[train_index], y[train_index]
        x_test, y_test = X[test_index], y[test_index]

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        errors.append(zero_one_loss(y_test, y_pred))

        precision = precision_score(y_test, y_pred, average='micro') # micro porque toma en cuenta el desbalanceaminto de clases
        precisions.append(precision)

        recall = recall_score(y_test, y_pred, average='micro')
        recalls.append(recall)

        f1 = f1_score(y_test, y_pred, average='micro')
        f1s.append(f1)

    return [np.mean(errors), np.var(errors), c, g, precisions, recalls, f1s, errors]

# KNN

In [6]:
def distance(x1, x2):
    distance = np.sqrt(np.sum((x1-x2)**2))
    return distance


class KNN:
    def __init__(self, k):
        self.k = k


    def fit(self, X, y):
        self.X_train = X
        self.y_train = y


    def predict(self, X):
        predictions = [self._predict(x) for x in X]
        return predictions
    

    def _predict(self, x):

        distances = [distance(x, x_train) for x_train in self.X_train]


        indices = np.argsort(distances)[:self.k]
        labels = [self.y_train[i] for i in indices]

        most_common = mode(labels)

        return most_common.mode[0]

### Tuneo el modelo con Bootstrap

In [16]:
neighbors = [3, 5, 7, 9, 11]
g = []

results_knn_bootstrap = []

for k in neighbors:
    results_knn_bootstrap.append(bootstrap(X, y, KNN(k), 10, k, g))

for r in results_knn_bootstrap:
    print("Neighbors: ", r[2], "\nBias: ", r[0], "\nVarianza", r[1])

Neighbors:  3 
Bias:  0.23091808771179112 
Varianza 0.0015909344425265946
Neighbors:  5 
Bias:  0.224195171026157 
Varianza 5.050676291147304e-05
Neighbors:  7 
Bias:  0.2219315895372233 
Varianza 2.429061289264594e-07
Neighbors:  9 
Bias:  0.22198189134808857 
Varianza 1.5434660275534964e-07
Neighbors:  11 
Bias:  0.22157947686116702 
Varianza 2.6567857851332167e-07


### Valores de métricas del mejor modelo KNN con Bootstrap

In [22]:
print("Precision: ", np.mean(results_knn_bootstrap[3][4]))
print("Recall   : ", np.mean(results_knn_bootstrap[3][5]))
print("F1       : ", np.mean(results_knn_bootstrap[3][6]))


Precision:  0.7780181086519116
Recall   :  0.7780181086519116
F1       :  0.7780181086519115


### Tuneo el modelo con K-Fold

In [19]:
neighbors = [3, 5, 7, 9, 11]
g = []

results_knn_kfold = []

for k in neighbors:
    results_knn_kfold.append(k_fold(X, y, KNN(k), 10, k, g))

for r in results_knn_kfold:
    print("Neighbors: ", r[2], "\nBias: ", r[0], "\nVarianza", r[1])

Neighbors:  3 
Bias:  0.08210552763819094 
Varianza 0.00048635221332794527
Neighbors:  5 
Bias:  0.09110804020100502 
Varianza 0.000653632162066615
Neighbors:  7 
Bias:  0.10112060301507535 
Varianza 0.0003838426049847234
Neighbors:  9 
Bias:  0.10161306532663314 
Varianza 0.0003031034317315224
Neighbors:  11 
Bias:  0.10211557788944722 
Varianza 0.00034986449837125396


### Valores de métricas del mejor modelo KNN con K-Fold

In [20]:
print("Precision: ", np.mean(results_knn_kfold[3][4]))
print("Recall   : ", np.mean(results_knn_kfold[3][5]))
print("F1       : ", np.mean(results_knn_kfold[3][6]))

Precision:  0.8983869346733669
Recall   :  0.8983869346733669
F1       :  0.8983869346733669
