In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import json

ImportError: cannot import name 'CatBoostClassifier' from 'catboost' (c:\G5-DScience\G5_D.Scientist\models\catboost.py)

In [5]:
# Load the stroke dataset (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='../data/stroke_dataset.csv')

In [6]:
# Feature groups: numeric columns are standardised, categorical columns are one-hot encoded
numeric_features = ['age', 'avg_glucose_level', 'bmi']
categorical_features = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Build the preprocessor.
# - drop='first' drops one dummy column per categorical feature.
# - handle_unknown='ignore' keeps transform() from raising ValueError when a
#   split contains a category that was absent from the data the encoder was
#   fitted on (possible for very rare levels after train_test_split); such
#   rows are encoded as all zeros for that feature and a warning is issued.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_features)
    ])


In [7]:
# Define the data scenarios to compare; each entry is a (label, DataFrame) pair.
# The labels are reused verbatim as keys of the results dictionary.
scenarios = [
    # Full dataset, unfiltered
    (" *** Dataset Original *** ", df),
    # Only rows with age >= 14
    (" *** Sin menores de 14 años ***", df.loc[df['age'] >= 14]),
    # Rows whose smoking status is 'Unknown' are removed
    (" *** Sin valores desconocidos en Smoking Status", df.loc[df['smoking_status'].ne('Unknown')]),
    # All rows kept; 'Unknown' smoking status is relabelled as 'No Information'
    (" *** Nueva categoría par desconocidos en Smoking Status", df.replace({'smoking_status': {'Unknown': 'No Information'}}))
]

In [8]:
# Candidate classifiers as an ordered list of (display name, estimator) pairs,
# all seeded with the same random_state.
# NOTE(review): the recorded output of the import cell shows CatBoostClassifier
# failing to import because a local `catboost.py` shadows the installed
# package; until that is resolved the name will not be defined on a fresh kernel.
models = list({
    "Logistic Regression": LogisticRegression(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(random_state=42, verbose=False),
}.items())

In [9]:
# Helper that fits a model and collects its train/test metrics
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and score it on both splits.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``. It is
        fitted in place.
    X_train, y_train
        Data the model is fitted on; the train AUC is computed on this same
        data.
    X_test, y_test
        Held-out data used for the test AUC, F1 score, confusion matrix and
        classification report.

    Returns
    -------
    dict
        Keys ``train_auc``, ``test_auc``, ``f1_score``, ``confusion_matrix``
        (nested list) and ``classification_report`` (dict).
    """
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score for the AUC on each split.
    y_pred_proba_train = model.predict_proba(X_train)[:, 1]
    y_pred_proba_test = model.predict_proba(X_test)[:, 1]
    # Hard predictions are only needed for the test split; the original
    # train-set predict() call was never used and has been dropped.
    y_pred_test = model.predict(X_test)

    train_auc = roc_auc_score(y_train, y_pred_proba_train)
    test_auc = roc_auc_score(y_test, y_pred_proba_test)

    return {
        "train_auc": train_auc,
        "test_auc": test_auc,
        # zero_division=0 yields the same values as the default ("warn") but
        # without a warning each time a model predicts no positive samples.
        "f1_score": f1_score(y_test, y_pred_test, zero_division=0),
        "confusion_matrix": confusion_matrix(y_test, y_pred_test).tolist(),
        "classification_report": classification_report(y_test, y_pred_test, output_dict=True, zero_division=0)
    }

# Accumulator for all metrics, filled as results[scenario][model][variant]
results = {}

In [None]:
# Iterate over scenarios: evaluate every model with and without SMOTE oversampling
for scenario_name, scenario_df in scenarios:
    print(f"\n--- Scenario: {scenario_name} ---")
    results[scenario_name] = {}

    # Split features and target
    X = scenario_df.drop('stroke', axis=1)
    y = scenario_df['stroke']

    # Stratified train/test split with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Fit the preprocessor on the training split only, then apply it to the test split
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    # Oversample the training split once per scenario. The resampling is seeded
    # and depends only on the preprocessed training data, so it is identical
    # for every model and does not need to be recomputed inside the model loop.
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_preprocessed, y_train)

    # Iterate over models
    for model_name, model in models:
        print(f"\n{model_name}")
        results[scenario_name][model_name] = {}

        # The same pipeline object is evaluated twice; evaluate_model() refits it each time.
        pipeline = Pipeline([('classifier', model)])

        # Without SMOTE
        metrics_without_smote = evaluate_model(pipeline, X_train_preprocessed, X_test_preprocessed, y_train, y_test)
        results[scenario_name][model_name]["Without SMOTE"] = metrics_without_smote

        # With SMOTE (note: the train AUC is computed on the resampled training set)
        metrics_with_smote = evaluate_model(pipeline, X_train_smote, X_test_preprocessed, y_train_smote, y_test)
        results[scenario_name][model_name]["With SMOTE"] = metrics_with_smote

        # Print results
        print("Without SMOTE:")
        print(f"Train AUC: {metrics_without_smote['train_auc']:.4f}, Test AUC: {metrics_without_smote['test_auc']:.4f}")
        print(f"F1-Score: {metrics_without_smote['f1_score']:.4f}")
        print("\nWith SMOTE:")
        print(f"Train AUC: {metrics_with_smote['train_auc']:.4f}, Test AUC: {metrics_with_smote['test_auc']:.4f}")
        print(f"F1-Score: {metrics_with_smote['f1_score']:.4f}")

        # Overfitting indicator: gap between train AUC and test AUC
        overfitting_without_smote = metrics_without_smote['train_auc'] - metrics_without_smote['test_auc']
        overfitting_with_smote = metrics_with_smote['train_auc'] - metrics_with_smote['test_auc']
        print("\nOverfitting evaluation:")
        print(f"Without SMOTE: {overfitting_without_smote:.4f}")
        print(f"With SMOTE: {overfitting_with_smote:.4f}")

        results[scenario_name][model_name]["Overfitting"] = {
            "Without SMOTE": overfitting_without_smote,
            "With SMOTE": overfitting_with_smote
        }

In [None]:
# Persist the collected metrics to a JSON file in the working directory
with open('stroke_prediction_results.json', 'w') as results_file:
    json.dump(obj=results, fp=results_file, indent=4)

print("\nResults have been saved to 'stroke_prediction_results.json'")

# Feature-importance plot from a Random Forest fitted on the full dataset
X = df.drop(columns='stroke')
y = df['stroke']

pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])
pipeline_rf.fit(X, y)

# Column names in the transformer's output order: numeric features first,
# then the one-hot columns generated by the fitted categorical encoder.
fitted_encoder = pipeline_rf.named_steps['preprocessor'].named_transformers_['cat']
feature_names = numeric_features + fitted_encoder.get_feature_names_out(categorical_features).tolist()

# Sort features from most to least important
importances = pipeline_rf.named_steps['classifier'].feature_importances_
indices = np.argsort(importances)[::-1]

# Draw the bar chart, write it to disk, and close the figure
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Feature Importances")
ax.bar(range(len(importances)), importances[indices])
ax.set_xticks(range(len(importances)))
ax.set_xticklabels([feature_names[i] for i in indices], rotation=90)
fig.tight_layout()
fig.savefig('feature_importances.png')
plt.close(fig)

print("Feature importance plot has been saved as 'feature_importances.png'")

In [None]:
def _print_test_metrics(y_true, y_pred, y_proba):
    """Print AUC-ROC, F1 score, confusion matrix and classification report.

    ``y_proba`` is the score passed to ``roc_auc_score``; ``y_pred`` holds the
    hard class predictions used by the remaining metrics.
    """
    print(f"AUC-ROC: {roc_auc_score(y_true, y_proba):.4f}")
    print(f"F1-Score: {f1_score(y_true, y_pred):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))


# Iterate over scenarios: report each model without and with SMOTE,
# this time with preprocessing (and SMOTE) inside the pipeline
for scenario_name, scenario_df in scenarios:
    print(f"\n*** ESCENARIO: {scenario_name} *** ")

    # Split features and target
    X = scenario_df.drop('stroke', axis=1)
    y = scenario_df['stroke']

    # Stratified train/test split with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Iterate over models
    for model_name, model in models:
        print(f"\n{model_name} SIN técnicas de balanceo SMOTE:")

        # Build and fit the pipeline without SMOTE
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
        pipeline.fit(X_train, y_train)

        # Predict and report. This must happen before the SMOTE pipeline is
        # fitted: both pipelines are given the same `preprocessor` and `model`
        # objects, so the second fit refits the objects this pipeline uses.
        y_pred = pipeline.predict(X_test)
        y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
        _print_test_metrics(y_test, y_pred, y_pred_proba)

        print(f"\n{model_name} CON técnicas de balanceo SMOTE:")

        # Build and fit the pipeline with SMOTE between preprocessing and the classifier
        pipeline_smote = ImbPipeline([
            ('preprocessor', preprocessor),
            ('smote', SMOTE(random_state=42)),
            ('classifier', model)
        ])
        pipeline_smote.fit(X_train, y_train)

        # Predict and report
        y_pred_smote = pipeline_smote.predict(X_test)
        y_pred_proba_smote = pipeline_smote.predict_proba(X_test)[:, 1]
        _print_test_metrics(y_test, y_pred_smote, y_pred_proba_smote)