# Lectura del dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import joblib

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report

from sklearn.svm import SVC

In [None]:
# Read the labelled training CSV and the test CSV from the local data/ folder.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/diabetes_prediction_dataset_train-labeled.csv',
        'data/diabetes_prediction_dataset_test.csv',
    )
)

In [None]:
# Target column first, then the predictors: the 'patient' column and the
# target itself are excluded from the feature matrix.
y = df_train['diabetes']
X = df_train.drop(columns=['patient', 'diabetes'])

# 80% of the rows go to training, the remainder to validation; the seed is
# fixed so the split is the same on every run.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=8,
)

In [None]:
# Keep the patient identifiers aside and build the test features/labels with
# the same column selection used for the training frame.
# NOTE(review): this requires the test CSV to contain a 'diabetes' column —
# confirm, since the file name does not say it is labelled.
PatientId_test = df_test['patient']
y_test = df_test['diabetes']
x_test = df_test.drop(columns=['patient', 'diabetes'])

# Preprocesamiento

In [None]:
# Load the preprocessing object stored in 'preproc_pipeline.pkl'; it is used
# below through fit_transform()/transform().
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load this file if it comes from a trusted source.
pp_pipe = joblib.load('preproc_pipeline.pkl')

In [None]:
# Fit the preprocessing pipeline on the training split only, then apply the
# already-fitted transformation to the validation and test splits.
x_train_transformed = pp_pipe.fit_transform(x_train)
x_val_transformed, x_test_transformed = (
    pp_pipe.transform(split) for split in (x_val, x_test)
)

# SVM

## Prueba inicial

Se utiliza el clasificador con sus hiperparámetros por defecto; solo se fijan `random_state` (para reproducibilidad) y `verbose`.

In [None]:
# Baseline model: SVC with its default hyperparameters; only the random seed
# and verbose output are set explicitly.
name = 'SVM'
svm = SVC(random_state=10, verbose=True)
print(svm)
svm.fit(x_train_transformed, y_train)

# Accuracy on the same data the model was fitted on.
train_predictions = svm.predict(x_train_transformed)
accuracy = accuracy_score(y_train, train_predictions)
# The number is formatted inside the f-string. Previously the f-string result
# was fed to the % operator, so a '%' inside `name` would have raised or been
# misread as a format directive.
print(f"Accuracy train {name}: {accuracy * 100.0:.3f}%")

# Accuracy on the held-out validation split.
val_predictions = svm.predict(x_val_transformed)
accuracy = accuracy_score(y_val, val_predictions)
print(f"Accuracy val {name}: {accuracy * 100.0:.3f}%")

## Ajuste de hiperparámetros

In [None]:
# Hyperparameter grid: regularisation strength C, kernel type and class
# weighting; random_state is a single-value entry so every candidate uses
# the same seed.
params = dict(
    C=[0.001, 0.01, 0.1, 1, 10.0, 100.0],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    class_weight=[None, 'balanced'],
    random_state=[10],
)

In [None]:
# Exhaustive grid search over `params`, left disabled (presumably because of
# its run time).
# NOTE(review): the CSV files read in the following cell presumably hold the
# results of this search — confirm, and consider guarding the search behind a
# cache check instead of keeping it as commented-out code.
# grid = GridSearchCV(SVC(), params, verbose=10)
# grid.fit(x_train_transformed,y_train)
# grid.best_params_

In [None]:
# Load the stored grid-search result tables: one for the class-balanced runs
# and one for the non-balanced runs.
grid_bal, grid_notbal = (
    pd.read_csv(results_csv, sep=',')
    for results_csv in (
        'svm_gridsearch_balanced.csv',
        'svm_gridsearch_not-balanced.csv',
    )
)

Vemos que:
* El Kernel sigmoideo es el peor en ambos casos.
* El caso no balanceado tiene mejores scores y tiempos que el caso balanceado.

Centrándonos en los casos no balanceados:
* El que tiene el mejor score es con Kernel=poly cuando C=100 con un score de 0.9684. Sin embargo, para el caso con Kernel=poly cuando C=10 el score es de 0.9668, *i.e.* hay una diferencia menor al 0.2% entre ambos scores. El mayor impacto se tiene en el tiempo de la corrida: para C=100 tarda unos 11 minutos, mientras que para C=10 tarda menos de 3 minutos, en promedio.
* Tomando ahora C=10, vemos que la diferencia entre poly y rbf no es significativa, pero sí se diferencian por sus tiempos: con rbf tarda poco más de un minuto, en promedio.

Se toman como parámetros óptimos entonces:
* C=1E1
* kernel='rbf' (default)
* class_weight=None (default)
* random_state=10

In [None]:
plt.rcParams["font.size"] = 25

fig, axs = plt.subplots(2, 2, figsize=(25, 20))

# Rows: metric being plotted (score on top, run time below).
# Columns: balanced grid on the left, non-balanced grid on the right.
panels = (('Balanceado', grid_bal), ('No balanceado', grid_notbal))
for row, metric in enumerate(('score', 'time[s]')):
    for col, (panel_title, results) in enumerate(panels):
        ax = axs[row, col]
        sns.pointplot(results, x='C', y=metric, hue='Kernel', ax=ax, markers='o')
        ax.set_title(panel_title)

plt.show()


## Aplicando parámetros óptimos

In [None]:
# Hyperparameters chosen from the grid search; kernel and class_weight are
# not listed, so SVC falls back to its defaults for them.
popt = dict(C=10.0, random_state=10)

In [None]:
# Refit the classifier with the selected hyperparameters on the training split.
svm = SVC(**popt)
print(svm)
svm.fit(x_train_transformed, y_train)

# Accuracy on the same data the model was fitted on.
train_predictions = svm.predict(x_train_transformed)
accuracy = accuracy_score(y_train, train_predictions)
# The number is formatted inside the f-string. Previously the f-string result
# was fed to the % operator, so a '%' inside `name` would have raised or been
# misread as a format directive.
print(f"Accuracy train {name}: {accuracy * 100.0:.3f}%")

# Accuracy on the held-out validation split; `val_predictions` is reused by
# the confusion-matrix cell that follows.
val_predictions = svm.predict(x_val_transformed)
accuracy = accuracy_score(y_val, val_predictions)
print(f"Accuracy val {name}: {accuracy * 100.0:.3f}%")

In [None]:
# Confusion matrix of the validation labels against the predictions stored
# in `val_predictions`.
ConfusionMatrixDisplay.from_predictions(y_true=y_val, y_pred=val_predictions)
plt.show()

In [None]:
# Per-class precision/recall/F1 on the validation split, using fresh
# predictions from the current `svm` model.
val_report_predictions = svm.predict(x_val_transformed)
val_report = classification_report(y_val, val_report_predictions)
print(val_report)

## Cross validation

In [None]:
# Stratified k-fold cross-validation of the tuned SVC on the training split.
FOLDS = 5
cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=10)

# Positional indexing with the fold index arrays requires plain numpy arrays.
# NOTE(review): x_train_transformed comes from a pipeline fitted on the whole
# training split, so every validation fold has already influenced the
# preprocessing — confirm this is acceptable for the reported estimate.
x_train2 = np.array(x_train_transformed)
y_train2 = np.array(y_train)

# Use CV-local names so this cell does not replace the notebook-level `svm`
# (fitted on the full training split) or overwrite `val_predictions` and
# `accuracy`, which the evaluation cells above rely on when re-run.
svm_cv = SVC(**popt)
print(svm_cv)
fold_accuracies = []
for fold, (train_idx, val_idx) in enumerate(cv.split(x_train2, y_train2)):
    xi, yi = x_train2[train_idx], y_train2[train_idx]
    x_valid, y_valid = x_train2[val_idx], y_train2[val_idx]

    # fit() retrains the estimator on each call, so one instance is reused.
    svm_cv.fit(xi, yi)

    fold_accuracy = accuracy_score(y_valid, svm_cv.predict(x_valid))
    fold_accuracies.append(fold_accuracy)
    # The stray `% ()` after the f-string was a no-op and has been removed.
    print(f"Precisión val fold {fold}: {fold_accuracy * 100.0 :.2f}")

# Mean accuracy over the folds.
avg_accuracy = sum(fold_accuracies) / FOLDS
print(f'Avg. accuracy = {avg_accuracy * 100}')