# In [None]:
# ============================================================
# PROYECTO ML (NIVEL MÁSTER) — HeartDisease (clasificación)
# Código completo: carga → EDA clave → benchmarking de modelos
# → GridSearchCV (pipelines) → RandomizedSearchCV → evaluación
# → selección final → guardado con pickle
#
# Nota: Yo trabajo SIEMPRE con train para decisiones (evito leakage)
# y reservo test solo para evaluación final comparable.
# ============================================================

import os
import pickle
import warnings

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    GridSearchCV,
    RandomizedSearchCV
)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    PrecisionRecallDisplay
)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# ============================================================
# 0) CONFIG
# ============================================================
# ROC-AUC is the primary metric for this health problem because it measures
# discrimination ability independently of the decision threshold.
PRIMARY_SCORING = "roc_auc"

RANDOM_STATE = 42             # single seed reused wherever a random_state is accepted
TEST_SIZE = 0.2               # fraction of rows reserved for the hold-out test set
DATA_PATH = "data/heart.csv"  # CSV location, relative to the working directory

# ============================================================
# 1) GLOBAL PLOT STYLE (shared by every figure)
# ============================================================
# Eight-colour reversed "Reds" palette, reused explicitly by several plots below.
palette = sns.color_palette("Reds_r", 8)

# Apply the theme first, then install the palette as the default colour cycle.
sns.set_theme(style="whitegrid")
sns.set_palette(palette)

# Matplotlib overrides: figure resolution and font sizes.
_RC_OVERRIDES = {
    "figure.dpi": 110,
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
}
plt.rcParams.update(_RC_OVERRIDES)

def label_from_dict(col: str, labels: dict) -> str:
    """Return the display label registered for ``col``.

    Centralising the lookup keeps every plot on the same wording.

    Args:
        col: Column name to look up.
        labels: Mapping from column name to human-readable label.

    Returns:
        ``labels[col]`` when the column is registered, otherwise ``col`` itself.
    """
    return labels[col] if col in labels else col

# ============================================================
# 2) LOAD + RENAME COLUMNS TO SPANISH (done once)
# ============================================================
# Name of the target column after renaming.
TARGET = "enfermedad_cardiaca"

# Original CSV column name -> Spanish column name used in the rest of the script.
RENAME = {
    "Age": "edad",
    "Sex": "sexo",
    "ChestPainType": "tipo_dolor_pecho",
    "RestingBP": "presion_reposo",
    "Cholesterol": "colesterol",
    "FastingBS": "glucosa_ayunas",
    "RestingECG": "ecg_reposo",
    "MaxHR": "fc_max",
    "ExerciseAngina": "angina_ejercicio",
    "Oldpeak": "oldpeak",
    "ST_Slope": "pendiente_st",
    "HeartDisease": "enfermedad_cardiaca",
}

# Spanish column name -> axis/legend label shown in plots.
LABELS = {
    "edad": "Edad del paciente (años)",
    "sexo": "Sexo",
    "tipo_dolor_pecho": "Tipo de dolor en el pecho",
    "presion_reposo": "Presión arterial en reposo (mm Hg)",
    "colesterol": "Colesterol sérico (mg/dl)",
    "glucosa_ayunas": "Glucosa en ayunas > 120 mg/dl",
    "ecg_reposo": "Electrocardiograma en reposo",
    "fc_max": "Frecuencia cardíaca máxima alcanzada",
    "angina_ejercicio": "Angina inducida por ejercicio",
    "oldpeak": "Depresión del segmento ST (oldpeak)",
    "pendiente_st": "Pendiente del segmento ST",
    "enfermedad_cardiaca": "Enfermedad cardíaca",
}

# Categorical column -> {raw level: display text}; used for visualisation only.
CATEGORY_VALUES = {
    "sexo": {"M": "Hombre", "F": "Mujer"},
    "angina_ejercicio": {"Y": "Sí", "N": "No"},
    "tipo_dolor_pecho": {
        "ASY": "Asintomático",
        "ATA": "Angina atípica",
        "NAP": "Dolor no anginoso",
        "TA": "Angina típica",
    },
    "ecg_reposo": {
        "Normal": "Normal",
        "ST": "Alteraciones ST-T",
        "LVH": "Hipertrofia ventricular izquierda",
    },
    "pendiente_st": {
        "Up": "Pendiente ascendente",
        "Flat": "Pendiente plana",
        "Down": "Pendiente descendente",
    },
}

# Read the raw file and keep it untouched; `df` is the renamed working copy.
df_raw = pd.read_csv(DATA_PATH)
renamed = df_raw.rename(columns=RENAME)
df = renamed.copy()

def translate_levels_for_plot(df_plot: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return a copy of ``df_plot`` with the levels of ``col`` translated for display.

    The input frame is never modified; translation is meant for plots only.

    Args:
        df_plot: Data to be plotted.
        col: Column whose levels should be translated.

    Returns:
        A copy of ``df_plot``. When ``col`` has an entry in ``CATEGORY_VALUES``
        its values are mapped through it; values without a translation keep
        their original text. Otherwise the copy is returned unchanged.
    """
    translated = df_plot.copy()
    if col not in CATEGORY_VALUES:
        return translated

    raw_levels = translated[col]
    # map() yields NaN for unknown levels; fall back to the raw value there.
    translated[col] = raw_levels.map(CATEGORY_VALUES[col]).fillna(raw_levels)
    return translated

# ============================================================
# 3) HOLD-OUT SPLIT (done first, before any data-driven decision)
# ============================================================
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Stratified on the target so both partitions keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

# ============================================================
# 4) VARIABLE TYPES
# ============================================================
# Column groups consumed by the EDA and by the preprocessing transformers.
numeric_cols = ["edad", "presion_reposo", "colesterol", "fc_max", "oldpeak"]
binary_cols = ["glucosa_ayunas"]
categorical_cols = ["sexo", "tipo_dolor_pecho", "ecg_reposo", "angina_ejercicio", "pendiente_st"]

# ============================================================
# 5) KEY EDA (only the plots that add real value)
# ============================================================
target_label = label_from_dict(TARGET, LABELS)

# 5.1) Target distribution (train)
plt.figure(figsize=(6, 4))
sns.countplot(x=y_train, palette=palette)
plt.title("Distribución del diagnóstico (train)")
plt.xlabel(target_label)
plt.ylabel("Número de pacientes")
plt.xticks([0, 1], ["No", "Sí"])
plt.tight_layout()
plt.show()

# 5.2) Correlation matrix (numeric + binary features + target), train only
num_df = pd.concat(
    [X_train[numeric_cols + binary_cols], y_train.rename(TARGET)],
    axis=1,
).copy()
# Zeros in these measurements are treated as missing for the correlation only.
for zero_col in ("presion_reposo", "colesterol", "fc_max"):
    if zero_col in num_df.columns:
        num_df[zero_col] = num_df[zero_col].replace(0, np.nan)

corr_num = num_df.corr(numeric_only=True)

plt.figure(figsize=(12, 9))
sns.heatmap(corr_num, annot=True, fmt=".2f", linewidths=0.5, square=True)
plt.title("Matriz de correlación (train: numéricas + target)")
plt.tight_layout()
plt.show()

# 5.3) Boxplots of selected features split by diagnosis (train only)
train_df = X_train.assign(**{TARGET: y_train})

for feature, title in (
    ("fc_max", "Frecuencia cardíaca máxima alcanzada según diagnóstico (train)"),
    ("oldpeak", "Depresión del segmento ST (oldpeak) según diagnóstico (train)"),
):
    plt.figure(figsize=(10, 4))
    sns.boxplot(x=TARGET, y=feature, data=train_df)
    plt.title(title)
    plt.xlabel(target_label)
    plt.ylabel(label_from_dict(feature, LABELS))
    plt.xticks([0, 1], ["No", "Sí"])
    plt.tight_layout()
    plt.show()

# ============================================================
# 6) PREPROCESSING (pipelines)
# ============================================================
# NOTE(review): the EDA section replaces zeros with NaN in presion_reposo,
# colesterol and fc_max, but nothing here does the same before imputation.
# If the raw data has no NaN, these imputers presumably act as no-ops — confirm
# whether zeros should be treated as missing for modelling too.
numeric_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")


def _make_preprocessor(scale_numeric: bool) -> ColumnTransformer:
    """Build the column-wise preprocessing, optionally standardising numeric columns."""
    numeric_steps = [("imputer", numeric_imputer)]
    if scale_numeric:
        numeric_steps.append(("scaler", StandardScaler()))

    categorical_steps = [
        ("imputer", cat_imputer),
        # Categories unseen during fit are encoded as all-zero rows instead of raising.
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]

    return ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=numeric_steps), numeric_cols + binary_cols),
            ("cat", Pipeline(steps=categorical_steps), categorical_cols),
        ],
        remainder="drop",
    )


# With scaling (used with LogisticRegression, SVC and KNN) and without it
# (used with the tree-based models).
preprocess_scaled = _make_preprocessor(scale_numeric=True)
preprocess_noscale = _make_preprocessor(scale_numeric=False)

# One stratified, shuffled 5-fold splitter shared by every hyper-parameter search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

def evaluate_on_test(model, X_test, y_test) -> dict:
    """Score a fitted binary classifier on the given evaluation data.

    Args:
        model: Fitted estimator exposing ``predict`` and either
            ``predict_proba`` or ``decision_function``.
        X_test: Features to evaluate on.
        y_test: True labels for ``X_test``.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc`` and ``pr_auc`` (in that order).
    """
    y_pred = model.predict(X_test)

    if not hasattr(model, "predict_proba"):
        # No probabilities available: min-max rescale the decision scores.
        # The epsilon avoids a division by zero when all scores are equal.
        raw_scores = model.decision_function(X_test)
        lowest = raw_scores.min()
        y_score = (raw_scores - lowest) / (raw_scores.max() - lowest + 1e-12)
    else:
        # Second column of predict_proba is used as the positive-class score.
        y_score = model.predict_proba(X_test)[:, 1]

    metrics = {"accuracy": accuracy_score(y_test, y_pred)}
    for metric_name, metric_fn in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        # zero_division=0 yields 0 when the metric's denominator is zero.
        metrics[metric_name] = metric_fn(y_test, y_pred, zero_division=0)

    metrics["roc_auc"] = roc_auc_score(y_test, y_score)
    metrics["pr_auc"] = average_precision_score(y_test, y_score)
    return metrics

# ============================================================
# 7) MODELS (at least 5) + GRIDSEARCHCV (pipelines)
# ============================================================
# XGBoost is optional: HAS_XGB records whether its classifier could be imported.
# Any failure while importing (not only a missing package) disables it.
HAS_XGB = True
try:
    from xgboost import XGBClassifier
except Exception:
    HAS_XGB = False

def _model_spec(name, preprocess, estimator, param_grid, why):
    """Bundle one candidate: its pipeline, its grid-search space and its rationale."""
    return {
        "name": name,
        "pipeline": Pipeline(steps=[
            ("preprocess", preprocess),
            ("model", estimator),
        ]),
        "param_grid": param_grid,
        "why": why,
    }


# Candidates evaluated by the grid search. Scale-sensitive models use the
# scaled preprocessing; tree-based models use the unscaled one.
models = [
    _model_spec(
        "LogisticRegression",
        preprocess_scaled,
        LogisticRegression(max_iter=2000, random_state=RANDOM_STATE),
        {
            "model__C": [0.01, 0.1, 1, 10],
            "model__penalty": ["l2"],
            "model__solver": ["lbfgs"],
        },
        "Yo lo uso como baseline interpretable y defendible en un contexto clínico.",
    ),
    _model_spec(
        "SVC",
        preprocess_scaled,
        # probability=True so the pipeline exposes predict_proba.
        SVC(probability=True, random_state=RANDOM_STATE),
        {
            "model__C": [0.1, 1, 10],
            "model__kernel": ["rbf", "linear"],
            "model__gamma": ["scale", "auto"],
        },
        "Yo lo pruebo porque puede capturar relaciones no lineales con buen rendimiento.",
    ),
    _model_spec(
        "KNN",
        preprocess_scaled,
        KNeighborsClassifier(),
        {
            "model__n_neighbors": [5, 11, 21, 31],
            "model__weights": ["uniform", "distance"],
            "model__p": [1, 2],
        },
        "Yo lo incluyo como referencia simple y no paramétrica; me ayuda a comparar complejidad vs rendimiento.",
    ),
    _model_spec(
        "RandomForest",
        preprocess_noscale,
        RandomForestClassifier(random_state=RANDOM_STATE),
        {
            "model__n_estimators": [200, 500],
            "model__max_depth": [None, 3, 5, 8],
            "model__min_samples_split": [2, 5, 10],
            "model__min_samples_leaf": [1, 2, 4],
        },
        "Yo lo uso por robustez y porque suele rendir bien en tabular con poco tuning.",
    ),
    _model_spec(
        "GradientBoosting",
        preprocess_noscale,
        GradientBoostingClassifier(random_state=RANDOM_STATE),
        {
            "model__n_estimators": [200, 400],
            "model__learning_rate": [0.03, 0.05, 0.1],
            "model__max_depth": [2, 3],
        },
        "Yo lo pruebo porque el boosting suele capturar bien interacciones y no linealidad.",
    ),
]

# Sixth candidate: XGBoost when it is importable, otherwise sklearn's
# histogram-based gradient boosting.
if HAS_XGB:
    models.append(_model_spec(
        "XGBoost",
        preprocess_noscale,
        XGBClassifier(
            random_state=RANDOM_STATE,
            eval_metric="logloss",
            n_jobs=-1,
        ),
        {
            "model__n_estimators": [300, 600],
            "model__learning_rate": [0.03, 0.05, 0.1],
            "model__max_depth": [2, 3, 4],
            "model__subsample": [0.8, 1.0],
            "model__colsample_bytree": [0.8, 1.0],
            "model__reg_lambda": [1.0, 5.0, 10.0],
        },
        "Yo lo incluyo como candidato final productivo en tabular; lo regularizo para evitar overfitting.",
    ))
else:
    models.append(_model_spec(
        "HistGradientBoosting",
        preprocess_noscale,
        HistGradientBoostingClassifier(random_state=RANDOM_STATE),
        {
            "model__learning_rate": [0.03, 0.05, 0.1],
            "model__max_depth": [3, 5, None],
            "model__max_iter": [300, 600],
        },
        "Yo uso este modelo como alternativa moderna de boosting en sklearn si XGBoost no está disponible.",
    ))

# ============================================================
# 8) GRID SEARCH PER MODEL + COMPARISON ON TEST
# ============================================================
# Metric keys returned by evaluate_on_test, in the column order of the table.
_TEST_METRICS = ("accuracy", "precision", "recall", "f1", "roc_auc", "pr_auc")

results = []          # one summary row per candidate
best_estimators = {}  # candidate name -> pipeline refitted with its best params

for spec in models:
    model_name = spec["name"]

    gs = GridSearchCV(
        estimator=spec["pipeline"],
        param_grid=spec["param_grid"],
        scoring=PRIMARY_SCORING,
        cv=cv,
        n_jobs=-1,
        refit=True,
    )
    gs.fit(X_train, y_train)

    # refit=True: best_estimator_ has been retrained on the whole training set.
    best_model = gs.best_estimator_
    best_estimators[model_name] = best_model

    test_metrics = evaluate_on_test(best_model, X_test, y_test)

    row = {"modelo": model_name, "cv_best_roc_auc": gs.best_score_}
    row.update({f"test_{metric}": test_metrics[metric] for metric in _TEST_METRICS})
    row["me_justifico"] = spec["why"]
    results.append(row)

# NOTE(review): the table is ordered by *test* ROC-AUC for display; any code
# that picks a model from this ordering would be selecting on the hold-out set.
results_df = pd.DataFrame(results).sort_values(by="test_roc_auc", ascending=False)

# One horizontal bar chart per test metric of interest.
for metric_col, chart_title, x_label in (
    ("test_roc_auc", "Comparativa de modelos (test) — ROC-AUC", "ROC-AUC (test)"),
    ("test_f1", "Comparativa de modelos (test) — F1-score", "F1 (test)"),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(data=results_df, x=metric_col, y="modelo", palette=palette)
    plt.title(chart_title)
    plt.xlabel(x_label)
    plt.ylabel("Modelo")
    plt.tight_layout()
    plt.show()

# ============================================================
# 9) RANDOMIZEDSEARCHCV (refines the best candidate, chosen by CV score)
# ============================================================
# Choose the candidate by its cross-validated ROC-AUC, which is computed on
# train only. Taking the first row of a table sorted by *test* ROC-AUC would
# use the hold-out set for model selection and bias the final test metrics.
_best_row = results_df.loc[results_df["cv_best_roc_auc"].idxmax()]
best_name = _best_row["modelo"]
grid_cv_score = _best_row["cv_best_roc_auc"]
base_best = best_estimators[best_name]

# Wider search spaces for the tree-based candidates; any other model keeps the
# grid-search result as is (empty space -> no randomized search).
_BOOSTING_DIST = {
    "model__learning_rate": [0.01, 0.03, 0.05, 0.1],
    "model__max_depth": [2, 3, 4, 5, None],
}
_PARAM_DISTRIBUTIONS = {
    "XGBoost": {
        "model__n_estimators": [300, 600, 900],
        "model__learning_rate": [0.01, 0.03, 0.05, 0.1],
        "model__max_depth": [2, 3, 4, 5],
        "model__subsample": [0.6, 0.8, 1.0],
        "model__colsample_bytree": [0.6, 0.8, 1.0],
        "model__min_child_weight": [1, 5, 10],
        "model__reg_alpha": [0.0, 0.1, 0.5],
        "model__reg_lambda": [1.0, 5.0, 10.0],
    },
    "RandomForest": {
        "model__n_estimators": [300, 600, 900],
        "model__max_depth": [None, 3, 5, 8, 12],
        "model__min_samples_split": [2, 5, 10, 20],
        "model__min_samples_leaf": [1, 2, 4, 8],
        "model__max_features": ["sqrt", "log2", None],
    },
    "GradientBoosting": dict(_BOOSTING_DIST),
    "HistGradientBoosting": {**_BOOSTING_DIST, "model__max_iter": [200, 400, 800]},
}
param_dist = _PARAM_DISTRIBUTIONS.get(best_name, {})

# Default to the grid-search winner; replace it only if the randomized search
# actually improves the cross-validated score.
final_model = base_best
search_suffix = ""

if param_dist:
    rs = RandomizedSearchCV(
        estimator=base_best,
        param_distributions=param_dist,
        n_iter=25,
        scoring=PRIMARY_SCORING,
        cv=cv,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        refit=True,
    )
    rs.fit(X_train, y_train)

    # Same splitter (fixed seed) and same metric as the grid search, so both
    # CV scores are directly comparable. 25 random draws are not guaranteed to
    # beat the grid optimum, hence the explicit check.
    if rs.best_score_ > grid_cv_score:
        final_model = rs.best_estimator_
        search_suffix = " (RandomizedSearch)"

# Final, one-off evaluation on the hold-out set.
final_metrics = evaluate_on_test(final_model, X_test, y_test)

# Confusion matrix: draw on an explicit Axes. Calling plt.figure() first and
# then from_predictions() without `ax` left an empty extra figure behind,
# because the display creates its own figure when no Axes is given.
fig, ax = plt.subplots(figsize=(6.5, 4.8))
ConfusionMatrixDisplay.from_predictions(
    y_test, final_model.predict(X_test), cmap="Reds", ax=ax
)
ax.set_title(f"Matriz de confusión (test) — Modelo final: {best_name}{search_suffix}")
fig.tight_layout()
plt.show()

# ROC and precision-recall curves of the final model on the hold-out set.
for display_cls, curve_name in (
    (RocCurveDisplay, "Curva ROC"),
    (PrecisionRecallDisplay, "Curva Precision-Recall"),
):
    display_cls.from_estimator(final_model, X_test, y_test)
    plt.title(f"{curve_name} (test) — Modelo final: {best_name}")
    plt.tight_layout()
    plt.show()

# ============================================================
# 10) SAVE THE FINAL MODEL (pickle)
# ============================================================
MODEL_PATH = "model/modelo_final.pkl"

# Create the target directory if needed; a pre-existing one is not an error.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Serialise the selected model so it can be reloaded without retraining.
with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(final_model, model_file)

# ============================================================
# 11) FINAL SUMMARY (table of test results)
# ============================================================
# Maximum number of characters of the justification shown in the table.
MAX_JUSTIFICATION_CHARS = 120

results_df_display = results_df.copy()

# Shorten long justifications for display. The ellipsis is added only to texts
# that are actually cut; appending it unconditionally made complete sentences
# look truncated.
_justification = results_df_display["me_justifico"]
results_df_display["me_justifico"] = _justification.where(
    _justification.str.len() <= MAX_JUSTIFICATION_CHARS,
    _justification.str.slice(0, MAX_JUSTIFICATION_CHARS) + "...",
)

# Last expression of the cell: the notebook renders the table.
results_df_display





