In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, brier_score_loss, f1_score
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

import mlflow
import mlflow.sklearn

# Configurar el experimento de MLflow
mlflow.set_experiment("Stroke Prediction Experiment")

# Cargar el dataset
df = pd.read_csv('../data/stroke_dataset_processed.csv')

# Separar características y variable objetivo
X = df.drop('stroke', axis=1)
y = df['stroke']

# Codificación de variables categóricas
le = LabelEncoder()
X['gender'] = le.fit_transform(X['gender'])
X['smoking_status'] = le.fit_transform(X['smoking_status'])

# Ingeniería de características
X['age_squared'] = X['age'] ** 2
X['glucose_age_interaction'] = X['age'] * X['avg_glucose_level']

# División en conjuntos de entrenamiento, validación y prueba
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Función para encontrar el umbral óptimo
def find_optimal_threshold(y_true, y_pred_proba):
    thresholds = np.linspace(0, 1, 100)
    f1_scores = [f1_score(y_true, y_pred_proba >= threshold) for threshold in thresholds]
    optimal_threshold = thresholds[np.argmax(f1_scores)]
    return optimal_threshold

# Definir el pipeline
pipeline = ImbPipeline([
    ('scaler', StandardScaler()),
    ('sampler', ADASYN(random_state=42)),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Definir los parámetros para la búsqueda
param_grid = {
    'classifier__C': np.logspace(-3, -1, 20),
    'classifier__penalty': ['l1', 'l2', 'elasticnet'],
    'classifier__solver': ['saga'],
    'classifier__l1_ratio': np.linspace(0, 1, 5)
}

# Configurar la búsqueda de hiperparámetros con validación cruzada
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)

# Iniciar una ejecución de MLflow
with mlflow.start_run():
    # Realizar la búsqueda de hiperparámetros
    grid_search.fit(X_train, y_train)

    # Obtener el mejor modelo de regresión logística
    best_lr = grid_search.best_estimator_

    # Crear el ensemble
    nb = GaussianNB()
    svm = SVC(kernel='linear', probability=True, random_state=42, C=0.1)
    rf = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=5, random_state=42)

    ensemble = VotingClassifier(
        estimators=[
            ('lr', best_lr),
            ('nb', nb),
            ('svm', svm),
            ('rf', rf)
        ],
        voting='soft'
    )

    # Función para evaluar el modelo
    def evaluate_model(model, X, y, dataset_name):
        y_pred_proba = model.predict_proba(X)[:, 1]
        optimal_threshold = find_optimal_threshold(y, y_pred_proba)
        y_pred = (y_pred_proba >= optimal_threshold).astype(int)
        
        accuracy = accuracy_score(y, y_pred)
        auc_roc = roc_auc_score(y, y_pred_proba)
        brier = brier_score_loss(y, y_pred_proba)
        
        print(f"\n{dataset_name} Set Performance:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"ROC-AUC: {auc_roc:.4f}")
        print(f"Brier Score: {brier:.4f}")
        
        # Registrar métricas en MLflow
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("auc_roc", auc_roc)
        mlflow.log_metric("brier_score", brier)
        
        return accuracy, auc_roc, brier

    # Entrenar y evaluar el ensemble
    ensemble.fit(X_train, y_train)

    # Evaluar en los conjuntos de entrenamiento, validación y prueba
    train_accuracy, train_auc, train_brier = evaluate_model(ensemble, X_train, y_train, "Training")
    val_accuracy, val_auc, val_brier = evaluate_model(ensemble, X_val, y_val, "Validation")
    test_accuracy, test_auc, test_brier = evaluate_model(ensemble, X_test, y_test, "Test")

    # Calcular el overfitting (entre entrenamiento y prueba)
    print("\nOverfitting Metrics:")
    print(f"Accuracy Overfitting (Train - Test): {train_accuracy - test_accuracy:.4f}")
    print(f"AUC Overfitting (Train - Test): {train_auc - test_auc:.4f}")
    print(f"Brier Score Difference (Train - Test): {train_brier - test_brier:.4f}")

    # Validación cruzada del ensemble
    cv_scores = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='roc_auc')
    print(f"\nCross-validation ROC-AUC scores: {cv_scores}")
    print(f"Mean CV ROC-AUC: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores) * 2:.4f})")



    # Registrar parámetros en MLflow
    mlflow.log_params(grid_search.best_params_)

    # Registrar el modelo en MLflow
    mlflow.sklearn.log_model(ensemble, "ensemble_model")
    
    # Visualizar la importancia de las características usando Random Forest
    rf_model = ensemble.named_estimators_['rf']
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': rf_model.feature_importances_
    })
    feature_importance = feature_importance.sort_values('importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=feature_importance)
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()

    # Visualizar la curva ROC
    from sklearn.metrics import roc_curve

    y_pred_proba = ensemble.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {test_auc:.2f})')
    plt.plot([0, 1], [0, 1], linestyle='--', label='Random Classifier')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.savefig('roc_curve.png')
    plt.close()
    
    # Registrar artefactos en MLflow
    mlflow.log_artifact("feature_importance.png")
    mlflow.log_artifact("roc_curve.png")

    from sklearn.metrics import classification_report, roc_auc_score

    # Suponiendo que ya tienes las predicciones y los valores reales
    y_pred = ensemble.predict(X_test)
    y_pred_proba = ensemble.predict_proba(X_test)[:, 1]

    # Generar el informe de clasificación como diccionario
    report = classification_report(y_test, y_pred, output_dict=True)
    auc_roc = roc_auc_score(y_test, y_pred_proba)

    # Crear el informe en formato texto
    report_text = f"Informe del modelo:\n\n"
    report_text += f"Precisión: {report['accuracy']:.2f}\n"
    report_text += f"Recall (clase 1): {report['1']['recall']:.2f}\n"
    report_text += f"F1-Score (clase 1): {report['1']['f1-score']:.2f}\n"
    report_text += f"AUC-ROC: {auc_roc:.2f}\n"

    # Guardar el informe en un archivo
    with open('../models/model_report.txt', 'w') as f:
        f.write(report_text)


    # Guardar el modelo ensemble y el label encoder
    joblib.dump(ensemble, '../models/best_ensemble_model.joblib')
    joblib.dump(le, '../models/label_encoder.joblib')
    
print("Experimento completado y registrado en MLflow.")

2024/10/20 18:15:01 INFO mlflow.tracking.fluent: Experiment with name 'Stroke Prediction Experiment' does not exist. Creating a new experiment.



Training Set Performance:
Accuracy: 0.7904
ROC-AUC: 0.8095
Brier Score: 0.0817

Validation Set Performance:
Accuracy: 0.7920
ROC-AUC: 0.8078
Brier Score: 0.0824

Test Set Performance:
Accuracy: 0.8811
ROC-AUC: 0.8182
Brier Score: 0.0764

Overfitting Metrics:
Accuracy Overfitting (Train - Test): -0.0907
AUC Overfitting (Train - Test): -0.0087
Brier Score Difference (Train - Test): 0.0053





Cross-validation ROC-AUC scores: [0.76452763 0.81390374 0.83716578 0.79199705 0.76369168]
Mean CV ROC-AUC: 0.7943 (+/- 0.0569)


