In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Cargar los datos balanceados y limpios
df = pd.read_csv(r'c:\Users\Administrator\OneDrive\Documentos\GitHub\G5_D.Scientist\dataset_balanceado.csv')

# Separar características y variable objetivo
X = df.drop('stroke', axis=1)
y = df['stroke']

# Identificar columnas numéricas y categóricas
numeric_features = ['age', 'avg_glucose_level', 'bmi']
categorical_features = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Crear preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
    ])

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocesar los datos
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Crear y entrenar el modelo
model = LogisticRegression(random_state=42)
model.fit(X_train_processed, y_train)

# Hacer predicciones
y_pred = model.predict(X_test_processed)
y_pred_proba = model.predict_proba(X_test_processed)[:, 1]

# Evaluar el modelo
print(classification_report(y_test, y_pred))
print("\nMatriz de confusión:")
print(confusion_matrix(y_test, y_pred))
print("\nAUC-ROC:", roc_auc_score(y_test, y_pred_proba))

# Validación cruzada
cv_scores = cross_val_score(model, X_train_processed, y_train, cv=5, scoring='roc_auc')
print(f"\nPuntuaciones de validación cruzada (AUC-ROC): {cv_scores}")
print(f"Media de AUC-ROC en validación cruzada: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

              precision    recall  f1-score   support

           0       0.78      0.74      0.76        39
           1       0.77      0.81      0.79        42

    accuracy                           0.78        81
   macro avg       0.78      0.78      0.78        81
weighted avg       0.78      0.78      0.78        81


Matriz de confusión:
[[29 10]
 [ 8 34]]

AUC-ROC: 0.8333333333333333

Puntuaciones de validación cruzada (AUC-ROC): [0.66193182 0.80156403 0.83105469 0.8359375  0.85742188]
Media de AUC-ROC en validación cruzada: 0.798 (+/- 0.140)
