In [20]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

Importación de dataframes

In [None]:
heart_df = pd.read_csv('./csv/heart_2020_cleaned.csv')

In [23]:
heart_df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [24]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Convertimos las variables categóricas en variables dummy
heart_df_encoded = pd.get_dummies(heart_df, drop_first=True)

# Preparamos los datos
X = heart_df_encoded.drop('HeartDisease_Yes', axis=1)
y = heart_df_encoded['HeartDisease_Yes']

# Dividimos los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
# Creamos el pipeline con StandardScaler y SVM
sigmoid = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='sigmoid'))
])

# Entrenamos el modelo
sigmoid.fit(X_train, y_train)

# Hacemos predicciones
y_pred = sigmoid.predict(X_test)

# Evaluamos el modelo
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8601804123711341
Classification Report:
               precision    recall  f1-score   support

       False       0.91      0.93      0.92      2802
        True       0.23      0.19      0.21       302

    accuracy                           0.86      3104
   macro avg       0.57      0.56      0.57      3104
weighted avg       0.85      0.86      0.85      3104



In [26]:
poly = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='poly'))
])

# Entrenamos el modelo
poly.fit(X_train, y_train)

# Hacemos predicciones
y_pred = poly.predict(X_test)

# Evaluamos el modelo
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.897229381443299
Classification Report:
               precision    recall  f1-score   support

       False       0.91      0.98      0.95      2802
        True       0.39      0.10      0.16       302

    accuracy                           0.90      3104
   macro avg       0.65      0.54      0.55      3104
weighted avg       0.86      0.90      0.87      3104



In [27]:
rbf = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf'))
])

# Entrenamos el modelo
rbf.fit(X_train, y_train)

# Hacemos predicciones
y_pred = rbf.predict(X_test)

# Evaluamos el modelo
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9030283505154639
Classification Report:
               precision    recall  f1-score   support

       False       0.91      0.99      0.95      2802
        True       0.51      0.06      0.11       302

    accuracy                           0.90      3104
   macro avg       0.71      0.53      0.53      3104
weighted avg       0.87      0.90      0.87      3104



Como podemos observar el kernel rbf es el que mejor resultado ofrece, ahora vamos a probar haciendo valores cruzados y añadiéndole algún hiperparámetro

In [28]:
from sklearn.model_selection import cross_val_score

# Realizamos la validación cruzada
scores = cross_val_score(rbf, X, y, cv=5)

# Mostramos los resultados
print("Cross-validation scores:", scores)
print("Mean cross-validation score:", scores.mean())

Cross-validation scores: [0.90947165 0.91108247 0.90850515 0.91043814 0.90879794]
Mean cross-validation score: 0.9096590720320542


In [34]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform, uniform

param_distrib = {
    "svm__gamma": loguniform(0.01, 0.1),
    "svm__C": uniform(1, 5)
}
rnd_search_cv = RandomizedSearchCV(rbf, param_distrib, n_iter=50, cv=5,
                                   random_state=42)
rnd_search_cv.fit(X_train, y_train)
rnd_search_cv.best_estimator_

In [35]:
rnd_search_cv.best_score_

np.float64(0.9127668143374951)

In [36]:
rnd_search_cv.score(X_test, y_test)

0.9039948453608248

In [37]:
from sklearn.ensemble import RandomForestClassifier

# Creamos el pipeline con StandardScaler y RandomForestClassifier
random_forest = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42))
])

# Entrenamos el modelo
random_forest.fit(X_train, y_train)

# Hacemos predicciones
y_pred_rf = random_forest.predict(X_test)

# Evaluamos el modelo
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

Accuracy: 0.9007731958762887
Classification Report:
               precision    recall  f1-score   support

       False       0.91      0.99      0.95      2802
        True       0.45      0.10      0.16       302

    accuracy                           0.90      3104
   macro avg       0.68      0.54      0.55      3104
weighted avg       0.87      0.90      0.87      3104



In [38]:
from sklearn.linear_model import LogisticRegression

# Creamos el pipeline con StandardScaler y LogisticRegression
logistic_regression = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(random_state=42))
])

# Entrenamos el modelo
logistic_regression.fit(X_train, y_train)

# Hacemos predicciones
y_pred_logreg = logistic_regression.predict(X_test)

# Evaluamos el modelo
print("Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Classification Report:\n", classification_report(y_test, y_pred_logreg))

Accuracy: 0.9056056701030928
Classification Report:
               precision    recall  f1-score   support

       False       0.91      0.99      0.95      2802
        True       0.56      0.15      0.23       302

    accuracy                           0.91      3104
   macro avg       0.74      0.57      0.59      3104
weighted avg       0.88      0.91      0.88      3104



In [39]:
from sklearn.ensemble import VotingClassifier

# Creamos el VotingClassifier con los tres modelos
voting_clf = VotingClassifier(
    estimators=[
        ('rbf_svm', rbf),
        ('random_forest', random_forest),
        ('logistic_regression', logistic_regression)
    ],
    voting='hard'
)

# Entrenamos el modelo
voting_clf.fit(X_train, y_train)

# Hacemos predicciones
y_pred_voting = voting_clf.predict(X_test)

# Evaluamos el modelo
print("Accuracy:", accuracy_score(y_test, y_pred_voting))
print("Classification Report:\n", classification_report(y_test, y_pred_voting))

Accuracy: 0.904639175257732
Classification Report:
               precision    recall  f1-score   support

       False       0.91      0.99      0.95      2802
        True       0.57      0.08      0.14       302

    accuracy                           0.90      3104
   macro avg       0.74      0.54      0.54      3104
weighted avg       0.88      0.90      0.87      3104



Después de todas las pruebas, Regresión Logística es el que da mejores resultados y el que menos tarda. Casualidades de la vida