# Lectura del dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import joblib

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report

from sklearn.svm import SVC

In [None]:
# Read the labelled training CSV and the test CSV (in that order) from data/.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/diabetes_prediction_dataset_train-labeled.csv',
        'data/diabetes_prediction_dataset_test.csv',
    )
)

In [None]:
# Target column, and the feature frame without the target and the 'patient' column.
y = df_train['diabetes']
X = df_train.drop(['patient', 'diabetes'], axis='columns')

# 80% of the labelled rows go to training, the rest to validation (fixed seed).
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=8,
)

In [None]:
# Keep the patient identifiers so predictions can be matched back to rows later.
PatientId_test = df_test['patient']

# NOTE(review): the test file name lacks the "-labeled" suffix, so it may not
# carry a 'diabetes' column — TODO confirm. The label is therefore optional here:
# y_test is None when the column is absent instead of raising KeyError.
y_test = df_test['diabetes'] if 'diabetes' in df_test.columns else None

# Same feature selection as the training frame; a missing 'diabetes' column is
# tolerated ('patient' is already known to exist from the lookup above).
x_test = df_test.drop(columns=['patient', 'diabetes'], errors='ignore')

# Preprocesamiento

In [None]:
# Load the preprocessing object stored in preproc_pipeline.pkl.
# NOTE(review): joblib.load unpickles the file, so it must come from a trusted source.
pp_pipe = joblib.load(filename='preproc_pipeline.pkl')

In [None]:
# Fit the pipeline on the training split only, then reuse the fitted
# pipeline to transform the validation and test splits.
x_train_transformed = pp_pipe.fit_transform(x_train)
x_val_transformed, x_test_transformed = (
    pp_pipe.transform(split) for split in (x_val, x_test)
)

# SVM

## Prueba inicial

Se utiliza el clasificador con todos sus valores por defecto.

In [None]:
# Baseline model: SVC with its default hyperparameters (only the seed is set).
name = 'SVM'
svm = SVC(random_state=10)
print(svm)
svm.fit(x_train_transformed, y_train)

# Accuracy on the same data the model was fitted on.
train_predictions = svm.predict(x_train_transformed)
accuracy = accuracy_score(y_train, train_predictions)
# Pure f-string: mixing f-string interpolation with %-formatting would break
# if `name` ever contained a '%' character.
print(f"Accuracy train {name}: {accuracy * 100.0:.3f}%")

# Accuracy on the held-out validation split.
val_predictions = svm.predict(x_val_transformed)
accuracy = accuracy_score(y_val, val_predictions)
print(f"Accuracy val {name}: {accuracy * 100.0:.3f}%")

## Ajuste de hiperparámetros

In [None]:
# Hyperparameter grid for the SVC search, expressed as one sub-grid per kernel
# family (GridSearchCV accepts a list of dicts as param_grid).
#
# A single Cartesian grid would multiply `degree` into every kernel and `gamma`
# into the linear kernel, although the scikit-learn SVC documentation states
# that `degree` is only used by 'poly' and `gamma` only by 'rbf', 'poly' and
# 'sigmoid'. Splitting the grid keeps the same set of distinct models while
# cutting the number of candidates from 4704 to 2156.
#
# NOTE(review): `decision_function_shape` is documented as ignored for binary
# classification; it is kept because the number of classes is not visible here.
shared_grid = {
    'C': [1E-3, 1E-2, 1E-1, 1, 1E1, 1E2, 1E3],
    'tol': [1E-3, 1E-2, 1E-1, 1, 1E1, 1E2, 1E3],
    'class_weight': [None, 'balanced'],
    'decision_function_shape': ['ovr', 'ovo'],
    'random_state': [10],
}

params = [
    # Linear kernel: neither `degree` nor `gamma` applies.
    {**shared_grid, 'kernel': ['linear']},
    # Polynomial kernel: the only one that uses `degree`.
    {
        **shared_grid,
        'kernel': ['poly'],
        'degree': [2, 3, 4],
        'gamma': ['scale', 'auto'],
    },
    # RBF and sigmoid kernels: use `gamma` but not `degree`.
    {
        **shared_grid,
        'kernel': ['rbf', 'sigmoid'],
        'gamma': ['scale', 'auto'],
    },
]

In [None]:
# Exhaustive search over `params` using GridSearchCV's default settings,
# fitted on the transformed training split.
grid = GridSearchCV(estimator=SVC(), param_grid=params)
grid.fit(x_train_transformed, y_train)

# Show the best combination found (displayed as the cell output).
grid.best_params_

## Aplicando parámetros óptimos

In [None]:
# Refit an SVC with the best hyperparameters found by the grid search.
svm = SVC(**grid.best_params_)
print(svm)
svm.fit(x_train_transformed, y_train)

# Accuracy on the same data the model was fitted on.
train_predictions = svm.predict(x_train_transformed)
accuracy = accuracy_score(y_train, train_predictions)
# Pure f-string: mixing f-string interpolation with %-formatting would break
# if `name` ever contained a '%' character.
print(f"Accuracy train {name}: {accuracy * 100.0:.3f}%")

# Accuracy on the held-out validation split.
val_predictions = svm.predict(x_val_transformed)
accuracy = accuracy_score(y_val, val_predictions)
print(f"Accuracy val {name}: {accuracy * 100.0:.3f}%")

In [None]:
# Plot the confusion matrix of the validation predictions against the true labels.
ConfusionMatrixDisplay.from_predictions(
    y_true=y_val,
    y_pred=val_predictions,
)
plt.show()

In [None]:
# Text report for the validation split, computed from fresh predictions of the
# current `svm` rather than from a cached array.
print(
    classification_report(
        y_true=y_val,
        y_pred=svm.predict(x_val_transformed),
    )
)

## Cross validation

In [None]:
# Stratified k-fold estimate of the tuned model's accuracy on the training split.
FOLDS = 5
cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=10)

# Plain arrays so the integer index arrays yielded by cv.split can be used directly.
x_train2 = np.array(x_train_transformed)
y_train2 = np.array(y_train)

print(SVC(**grid.best_params_))

fold_scores = []
for fold, (train_idx, val_idx) in enumerate(cv.split(x_train2, y_train2)):
    # A fresh estimator per fold: refitting the global `svm` here would leave
    # it trained on only the last fold's subset, and `svm` is the model used
    # afterwards to predict the submission. Fold-local names also keep
    # `val_predictions` and `accuracy` from the validation cells intact.
    fold_model = SVC(**grid.best_params_)
    fold_model.fit(x_train2[train_idx], y_train2[train_idx])

    fold_predictions = fold_model.predict(x_train2[val_idx])
    fold_accuracy = accuracy_score(y_train2[val_idx], fold_predictions)
    fold_scores.append(fold_accuracy)
    print(f"Precisión val fold {fold}: {fold_accuracy * 100.0:.2f}")

avg_accuracy = sum(fold_scores) / FOLDS
print(f'Avg. accuracy = {avg_accuracy * 100}')

## Generar la salida para entregar

Para poder evaluar nuestra predicción, los datos de prueba deben tener exactamente el mismo tratamiento que los datos de entrenamiento.

In [None]:
# To get the names of the columns created by the OneHotEncoder, the encoder has
# to be reached through the fitted preprocessor, like this.
# NOTE(review): `pp_pipe` is the preprocessor loaded in this notebook (no object
# named `pipeline` is defined). This assumes it is a fitted ColumnTransformer
# whose first transformer is the OneHotEncoder — TODO confirm.
pp_pipe.transformers_[0][1]

In [None]:
# The get_feature_names_out method returns the names of the columns the encoder created.
# NOTE(review): `pp_pipe` is the preprocessor loaded in this notebook (no object
# named `pipeline` is defined). This assumes it is a fitted ColumnTransformer
# whose first transformer is the OneHotEncoder — TODO confirm.
pp_pipe.transformers_[0][1].get_feature_names_out()

In [None]:
# Label the transformed test matrix with the output feature names reported by
# the fitted preprocessor. Asking the preprocessor for all names at once avoids
# relying on a separately maintained list of numeric columns and on the order
# in which the transformers emit their outputs.
# NOTE(review): assumes `pp_pipe` exposes get_feature_names_out() and that
# `x_test_transformed` is a dense 2-D array — TODO confirm. Depending on the
# preprocessor's configuration the names may carry a transformer prefix.
cols = pp_pipe.get_feature_names_out().tolist()
X_test_transformed = pd.DataFrame(x_test_transformed, columns=cols)

Generamos la salida

In [None]:
# Patient identifiers paired row-by-row with the predictions below.
test_id = PatientId_test

# Predict on the array produced by pp_pipe.transform(x_test), i.e. the same
# representation the model was fitted on, and store the labels as int64.
test_pred = svm.predict(x_test_transformed).astype(np.int64)

Con el resultado predicho tenemos que generar el archivo `.csv` para subir a la competencia de Kaggle:

In [None]:
# One row per test patient: the identifier and its predicted label.
submission = pd.DataFrame(
    data=list(zip(test_id, test_pred)),
    columns=["patient", "diabetes"],
)

# Write the submission file with a header row and without the DataFrame index.
submission.to_csv(
    "results/svm.csv",
    index=False,
    header=True,
)