In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, brier_score_loss, f1_score, recall_score, roc_curve
from sklearn.pipeline import Pipeline
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Cargar el dataset
df = pd.read_csv(r'C:\Users\Administrator\OneDrive\Documentos\GitHub\G5_D.Scientist\data\stroke_dataset_processed.csv')

# Separar características y variable objetivo
X = df.drop('stroke', axis=1)
y = df['stroke']

# Verificar la distribución de clases
print("Distribución de clases:")
print(y.value_counts(normalize=True))

# Ingeniería de características (si es necesario)
X['age_squared'] = X['age'] ** 2
X['glucose_age_interaction'] = X['age'] * X['avg_glucose_level']

# División en conjuntos de entrenamiento, validación y prueba
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Función para encontrar el umbral óptimo
def find_optimal_threshold(y_true, y_pred_proba):
    thresholds = np.linspace(0, 1, 100)
    f1_scores = [f1_score(y_true, y_pred_proba >= threshold) for threshold in thresholds]
    optimal_threshold = thresholds[np.argmax(f1_scores)]
    return optimal_threshold

# Definir el pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Definir los parámetros para la búsqueda
param_grid = {
    'classifier__C': np.logspace(-3, -1, 20),
    'classifier__penalty': ['l1', 'l2', 'elasticnet'],
    'classifier__solver': ['saga'],
    'classifier__l1_ratio': np.linspace(0, 1, 5)
}

# Configurar la búsqueda de hiperparámetros con validación cruzada
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)

# Realizar la búsqueda de hiperparámetros
grid_search.fit(X_train, y_train)

# Obtener el mejor modelo de regresión logística
best_lr = grid_search.best_estimator_

# Crear el ensemble
nb = GaussianNB()
svm = SVC(kernel='linear', probability=True, random_state=42, C=0.1)
rf = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=5, random_state=42)

ensemble = VotingClassifier(
    estimators=[
        ('lr', best_lr),
        ('nb', nb),
        ('svm', svm),
        ('rf', rf)
    ],
    voting='soft'
)

# Función para evaluar el modelo
def evaluate_model(model, X, y, dataset_name):
    y_pred_proba = model.predict_proba(X)[:, 1]
    optimal_threshold = find_optimal_threshold(y, y_pred_proba)
    y_pred = (y_pred_proba >= optimal_threshold).astype(int)
    
    accuracy = accuracy_score(y, y_pred)
    auc_roc = roc_auc_score(y, y_pred_proba)
    brier = brier_score_loss(y, y_pred_proba)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)

    print(f"\n{dataset_name} Set Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC-AUC: {auc_roc:.4f}")
    print(f"Brier Score: {brier:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    return accuracy, auc_roc, brier, recall, f1

# Entrenar y evaluar el ensemble
ensemble.fit(X_train, y_train)

# Evaluar en los conjuntos de entrenamiento, validación y prueba
train_accuracy, train_auc, train_brier, train_recall, train_f1 = evaluate_model(ensemble, X_train, y_train, "Training")
val_accuracy, val_auc, val_brier, val_recall, val_f1 = evaluate_model(ensemble, X_val, y_val, "Validation")
test_accuracy, test_auc, test_brier, test_recall, test_f1 = evaluate_model(ensemble, X_test, y_test, "Test")

# Calcular el overfitting (entre entrenamiento y prueba)
print("\nOverfitting Metrics:")
print(f"Accuracy Overfitting (Train - Test): {train_accuracy - test_accuracy:.4f}")
print(f"AUC Overfitting (Train - Test): {train_auc - test_auc:.4f}")
print(f"Brier Score Difference (Train - Test): {train_brier - test_brier:.4f}")
print(f"Recall Overfitting (Train - Test): {train_recall - test_recall:.4f}")
print(f"F1-Score Overfitting (Train - Test): {train_f1 - test_f1:.4f}")

# Validación cruzada del ensemble
cv_scores = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='roc_auc')
print(f"\nCross-validation ROC-AUC scores: {cv_scores}")
print(f"Mean CV ROC-AUC: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores) * 2:.4f})")

# Visualizar la importancia de las características usando Random Forest
rf_model = ensemble.named_estimators_['rf']
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# Visualizar la curva ROC
y_pred_proba = ensemble.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {test_auc:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.savefig('roc_curve.png')
plt.close()

# Nombre del archivo para el modelo
model_filename = 'best_ensemble_model.joblib'

# Guardar el modelo en el mismo directorio que el notebook
joblib.dump(ensemble, model_filename)

# Verificar que el modelo se ha guardado
if os.path.exists(model_filename):
    print(f"Modelo guardado exitosamente como '{model_filename}'")
    print(f"Tamaño del archivo: {os.path.getsize(model_filename)} bytes")
else:
    print("No se pudo guardar el modelo.")

Distribución de clases:
stroke
0    0.5
1    0.5
Name: proportion, dtype: float64





Training Set Performance:
Accuracy: 0.7631
ROC-AUC: 0.8545
Brier Score: 0.1537
Recall: 0.9208
F1-Score: 0.7965

Validation Set Performance:
Accuracy: 0.7635
ROC-AUC: 0.8444
Brier Score: 0.1587
Recall: 0.9235
F1-Score: 0.7968

Test Set Performance:
Accuracy: 0.7788
ROC-AUC: 0.8626
Brier Score: 0.1502
Recall: 0.8914
F1-Score: 0.7972

Overfitting Metrics:
Accuracy Overfitting (Train - Test): -0.0157
AUC Overfitting (Train - Test): -0.0080
Brier Score Difference (Train - Test): 0.0035
Recall Overfitting (Train - Test): 0.0294
F1-Score Overfitting (Train - Test): -0.0007





Cross-validation ROC-AUC scores: [0.85460476 0.83367329 0.83869737 0.86855199 0.84719787]
Mean CV ROC-AUC: 0.8485 (+/- 0.0246)
Modelo guardado exitosamente como 'best_ensemble_model.joblib'
Tamaño del archivo: 601359 bytes
