In [25]:
# Importar las librerías necesarias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE  # Para el balanceo de clases
from imblearn.pipeline import Pipeline
import optuna  # Para la optimización de hiperparámetros
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Cargar el dataset
df = pd.read_csv('stroke_dataset.csv')

In [6]:
# Separar las características (X) y la variable objetivo (y)
X = df.drop('stroke', axis=1)  # Cambia 'target_column' por el nombre de tu columna objetivo
y = df['stroke']

In [7]:
# Definir características
cat_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
num_cols = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

In [13]:
# Preprocesamiento de las variables numéricas y categóricas
# Pipeline para variables numéricas
num_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())  # Estandarización de las variables numéricas
])

# Pipeline para variables categóricas
cat_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))  # Codificación OneHot
])

In [14]:
# Crear un ColumnTransformer para aplicar las transformaciones adecuadas a cada tipo de variable
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ])

In [19]:
# Balanceo de clases con SMOTE (oversampling)
smote = SMOTE(random_state=42)

In [21]:
# Crear el pipeline para aplicar el preprocesamiento y entrenar el modelo
def create_pipeline(model):
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', smote),  # Aplicar SMOTE dentro del pipeline
        ('classifier', model)
    ])
    return pipeline

In [22]:
# Definir el diccionario de modelos
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'LightGBM': LGBMClassifier(),
    'XGBoost': XGBClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)
}

In [23]:
# Definir la validación cruzada con StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [26]:
# Evaluar cada modelo utilizando cross_val_score con validación estratificada
model_results = {}
for name, model in models.items():
    print(f"Evaluando modelo: {name}")
    pipeline = create_pipeline(model)
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')
    model_results[name] = scores
    print(f"Accuracy medio para {name}: {np.mean(scores)}")

Evaluando modelo: Logistic Regression
Accuracy medio para Logistic Regression: 0.7400061630675159
Evaluando modelo: Random Forest
Accuracy medio para Random Forest: 0.9241106854700648
Evaluando modelo: LightGBM
[LightGBM] [Info] Number of positive: 3786, number of negative: 3786
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001288 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2819
[LightGBM] [Info] Number of data points in the train set: 7572, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3787, number of negative: 3787
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001452 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2824
[LightGBM] [Info] Number of data points in the train set: 7574, number of used features:

In [27]:
# Optuna para la optimización de Random Forest
def objective(trial):
    # Definir el espacio de búsqueda de hiperparámetros
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    
    # Definir el modelo
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42
    )
    
    # Crear pipeline con el modelo optimizado
    pipeline = create_pipeline(model)
    
    # Realizar validación cruzada con StratifiedKFold
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')
    
    # Devolver la media de los scores (Optuna maximiza esta métrica)
    return np.mean(scores)

In [28]:
# Ejecutar la optimización con Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[I 2024-10-10 23:44:21,784] A new study created in memory with name: no-name-5f79185a-a3da-4a0e-aec9-6d7061d18eb5
[I 2024-10-10 23:44:44,530] Trial 0 finished with value: 0.9208994453239235 and parameters: {'n_estimators': 448, 'max_depth': 33, 'min_samples_split': 10}. Best is trial 0 with value: 0.9208994453239235.
[I 2024-10-10 23:45:02,056] Trial 1 finished with value: 0.9249151067660814 and parameters: {'n_estimators': 327, 'max_depth': 48, 'min_samples_split': 6}. Best is trial 1 with value: 0.9249151067660814.
[I 2024-10-10 23:45:18,450] Trial 2 finished with value: 0.9229072760450023 and parameters: {'n_estimators': 312, 'max_depth': 44, 'min_samples_split': 7}. Best is trial 1 with value: 0.9249151067660814.
[I 2024-10-10 23:45:29,816] Trial 3 finished with value: 0.9223048664064482 and parameters: {'n_estimators': 221, 'max_depth': 22, 'min_samples_split': 9}. Best is trial 1 with value: 0.9249151067660814.
[I 2024-10-10 23:45:38,864] Trial 4 finished with value: 0.9255173149

In [29]:
# Obtener el mejor conjunto de hiperparámetros
print(f"Mejores hiperparámetros encontrados por Optuna: {study.best_params}")

Mejores hiperparámetros encontrados por Optuna: {'n_estimators': 422, 'max_depth': 29, 'min_samples_split': 4}


In [30]:
# Entrenar el mejor modelo de Random Forest con los mejores hiperparámetros
best_rf = RandomForestClassifier(**study.best_params, random_state=42)
pipeline = create_pipeline(best_rf)
pipeline.fit(X, y)

In [31]:
# Separar los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Evaluar el modelo en el conjunto de prueba
y_pred = pipeline.predict(X_test)

In [32]:
# Calcular métricas de evaluación
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Métricas en el conjunto de prueba:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Métricas en el conjunto de prueba:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [33]:
# Matriz de confusión y reporte de clasificación
print("Matriz de Confusión:")
print(confusion_matrix(y_test, y_pred))

print("Reporte de Clasificación:")
print(classification_report(y_test, y_pred))

Matriz de Confusión:
[[947   0]
 [  0  50]]
Reporte de Clasificación:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       947
           1       1.00      1.00      1.00        50

    accuracy                           1.00       997
   macro avg       1.00      1.00      1.00       997
weighted avg       1.00      1.00      1.00       997



In [None]:
# Guardar el mejor modelo si es necesario
import joblib
joblib.dump(pipeline, 'mejor_modelo_rf.pkl')