In [None]:
# Cell 1: installation (run only ONCE, before restarting the runtime)

!pip install -q "numpy==1.26.4" mljar-supervised


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.3/127.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.8/91.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.1/99.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [None]:
# ============================
# 0. Base imports + warning silencing
# ============================

import logging
import warnings

# Silence noisy matplotlib messages (Arial font, etc.): raise the level of the
# two matplotlib loggers to ERROR and ignore UserWarnings raised from matplotlib.
logging.getLogger("matplotlib.font_manager").setLevel(logging.ERROR)
logging.getLogger("matplotlib.axes._base").setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")

from supervised.automl import AutoML

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score
)

# Extra imports for PCA and preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA


# ==================
# 1. Data loading
# ==================

# Relative path to the semicolon-separated CSV; adjust it if the file has a
# different name.
DATA_PATH = "bank_subscription.csv"
df = pd.read_csv(DATA_PATH, sep=";")

# Quick look at what was loaded: dimensions, first rows and column names.
print(f"Shape: {df.shape}")
print(df.head())
print("\nColumnas del dataset:", list(df.columns), sep="\n")


# ==========================================
# 2. Target, features and segment definitions
# ==========================================

# Target variable
TARGET_COL = "Subscription"

# Features: every column of the dataset except the target
feature_cols = [col for col in df.columns if col != TARGET_COL]

# Numeric and categorical columns (the same lists used in the EDA)
numeric_cols = [
    "Age", "Balance (euros)",
    "Last Contact Day", "Last Contact Duration",
    "Campaign", "Pdays", "Previous",
]

cat_cols = [
    "Job", "Marital Status", "Education",
    "Credit", "Housing Loan", "Personal Loan",
    "Contact", "Last Contact Month", "Poutcome",
]

# Independent copies so later steps never write back into df
X = df.loc[:, feature_cols].copy()
y = df.loc[:, TARGET_COL].copy()

# --- Segments ---

# "There was a previous contact" means: Pdays different from -1 OR Previous > 0.
# Missing values are filled with the "no contact" value of each column first
# (-1 for Pdays, 0 for Previous), so a missing entry never counts as a contact.
pdays_clean = df["Pdays"].fillna(-1)
previous_clean = df["Previous"].fillna(0)

has_previous_contact = (pdays_clean != -1) | (previous_clean > 0)
_poutcome_is_success = df["Poutcome"] == "success"

# Segment A: no contact history
segment_A_mask = ~has_previous_contact

# Segment C: successful history (Poutcome == 'success' and a previous contact)
segment_C_mask = _poutcome_is_success & has_previous_contact

# Segment B: unsuccessful history (previous contact BUT Poutcome is not
# 'success'; a missing Poutcome also lands here because it does not equal
# 'success')
segment_B_mask = has_previous_contact & ~_poutcome_is_success

print("\nTotal registros:", len(df))
for _segment_label, _segment_mask in (
    ("Segmento A (sin historial):", segment_A_mask),
    ("Segmento B (historial no exitoso):", segment_B_mask),
    ("Segmento C (historial exitoso):", segment_C_mask),
):
    print(_segment_label, _segment_mask.sum())

# Row subsets of features and target for each segment
X_A, y_A = X.loc[segment_A_mask], y.loc[segment_A_mask]
X_B, y_B = X.loc[segment_B_mask], y.loc[segment_B_mask]
X_C, y_C = X.loc[segment_C_mask], y.loc[segment_C_mask]


# ===========================================
# 3. Helper function to run AutoML
# ===========================================

def run_automl_segment(X_seg, y_seg, segment_name,
                       total_time_limit=600,
                       random_state=42,
                       show_report=True,
                       test_size=0.3):
    """
    Train and evaluate an mljar-supervised AutoML model on one data segment.

    The data is split into stratified train/test parts, AutoML is fitted on
    the training part, and the confusion matrix, classification report and
    ROC AUC computed on the test part are printed.

    Parameters
    ----------
    X_seg : pandas.DataFrame
        Feature rows of the segment.
    y_seg : pandas.Series
        Target values aligned with ``X_seg``. Rows whose target is missing
        are discarded before splitting.
    segment_name : str
        Label used in the printed headers and, with spaces replaced by
        underscores, in the AutoML results folder name.
    total_time_limit : int, default 600
        Passed to ``AutoML(total_time_limit=...)``.
    random_state : int, default 42
        Seed used for both the train/test split and AutoML.
    show_report : bool, default True
        If true, ``automl.report()`` is called after the metrics are printed.
    test_size : float or int, default 0.3
        Passed to ``train_test_split(test_size=...)``.

    Returns
    -------
    dict
        ``{"segmento": segment_name, "auc": <float or nan>, "automl": <AutoML>}``.
        ``auc`` is ``nan`` when the ROC AUC could not be computed.
    """
    print("\n" + "#" * 80)
    print(f"AUTOML - SEGMENTO: {segment_name}")
    print("#" * 80)

    # Rows without a label cannot be used for training or scoring, and a NaN
    # in the target breaks the stratified split below, so drop them up front.
    has_label = pd.notna(y_seg)
    n_unlabelled = int((~has_label).sum())
    if n_unlabelled:
        print(f"Se descartan {n_unlabelled} filas sin valor en la variable objetivo.")
        X_seg, y_seg = X_seg[has_label], y_seg[has_label]

    # Stratified split so both parts keep the class proportions of the segment
    X_train, X_test, y_train, y_test = train_test_split(
        X_seg, y_seg,
        test_size=test_size,
        random_state=random_state,
        stratify=y_seg
    )

    # NOTE(review): mljar-supervised appears to reload a previous run when
    # results_path already exists instead of training again -- confirm, and
    # delete the folder (or change segment_name) when a fresh fit is needed.
    automl = AutoML(
        mode="Explain",          # explanatory mode (fast, produces reports)
        eval_metric="auc",       # main metric: AUC
        total_time_limit=total_time_limit,
        random_state=random_state,
        results_path=f"Automl_{segment_name.replace(' ', '_')}",
        verbose=0                # suppress the long training logs
    )

    # Training
    automl.fit(X_train, y_train)

    # Predictions with the best model found
    y_pred = automl.predict(X_test)

    # Best-effort AUC: any failure here (e.g. a test split with one class)
    # is reported and recorded as NaN instead of aborting the experiment.
    try:
        # Column 1 is taken as the score of the positive class (binary target).
        positive_scores = automl.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, positive_scores)
    except Exception as e:
        print("No se pudo calcular AUC ROC desde predict_proba:", e)
        auc = np.nan

    print("\nMatriz de confusión:")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification report:")
    print(classification_report(y_test, y_pred, zero_division=0))

    print("\nAUC ROC:", round(auc, 4) if not np.isnan(auc) else "NA")

    # Graphical AutoML report (leaderboard, curves, etc.)
    if show_report:
        print(f"\n=== Reporte AutoML para {segment_name} ===")
        automl.report()

    return {
        "segmento": segment_name,
        "auc": auc,
        "automl": automl
    }


# =============================================
# 4. Experiments WITHOUT PCA (full dataset and segments)
# =============================================

# Full dataset
res_global_sin_pca = run_automl_segment(
    X_seg=X,
    y_seg=y,
    segment_name="Global_sin_PCA",
    total_time_limit=900,
)

# Segment A - no contact history
res_A_sin_pca = run_automl_segment(
    X_seg=X_A,
    y_seg=y_A,
    segment_name="Segmento_A_sin_historial_sin_PCA",
    total_time_limit=600,
)

# Segment B - unsuccessful history
res_B_sin_pca = run_automl_segment(
    X_seg=X_B,
    y_seg=y_B,
    segment_name="Segmento_B_historial_no_exitoso_sin_PCA",
    total_time_limit=600,
)

# Segment C - successful history
res_C_sin_pca = run_automl_segment(
    X_seg=X_C,
    y_seg=y_C,
    segment_name="Segmento_C_historial_exitoso_sin_PCA",
    total_time_limit=300,
)


# ==================================================
# 5. Building the PCA dataset (on top of One-Hot encoding)
# ==================================================

# NOTE(review): the imputers, scaler, encoder and PCA below are fitted on
# every row of X, while run_automl_segment later holds out a test split from
# these same rows. The held-out rows therefore already influenced the medians,
# scaling and principal components (data leakage). Consider fitting these
# steps on training rows only.

# Preprocessing: imputation + scaling + One-Hot
# Numeric columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

# Fit and transform ALL of X (full dataset)
X_pre_all = preprocess.fit_transform(X)

# Make sure the matrix is dense for PCA: if the result exposes toarray()
# (as sparse matrices do), convert it.
if hasattr(X_pre_all, "toarray"):
    X_pre_all = X_pre_all.toarray()

# PCA keeping ~90% of the variance
pca = PCA(n_components=0.9, random_state=42)
X_pca_all = pca.fit_transform(X_pre_all)

print("\nShape original (después del preprocesamiento):", X_pre_all.shape)
print("Shape después de PCA:", X_pca_all.shape)

# DataFrame with the principal components; reusing df.index keeps the rows
# aligned with the boolean segment masks.
pca_cols = [f"PC_{i+1}" for i in range(X_pca_all.shape[1])]
X_pca_df = pd.DataFrame(X_pca_all, columns=pca_cols, index=df.index)

# Segments in PCA space (same row masks as before)
X_pca_global = X_pca_df
X_pca_A = X_pca_df[segment_A_mask]
X_pca_B = X_pca_df[segment_B_mask]
X_pca_C = X_pca_df[segment_C_mask]


# =============================================
# 6. Experiments WITH PCA (full dataset and segments)
# =============================================

# Full dataset, PCA features
res_global_con_pca = run_automl_segment(
    X_seg=X_pca_global,
    y_seg=y,
    segment_name="Global_con_PCA",
    total_time_limit=900,
)

# Segment A - no contact history, PCA features
res_A_con_pca = run_automl_segment(
    X_seg=X_pca_A,
    y_seg=y_A,
    segment_name="Segmento_A_sin_historial_con_PCA",
    total_time_limit=600,
)

# Segment B - unsuccessful history, PCA features
res_B_con_pca = run_automl_segment(
    X_seg=X_pca_B,
    y_seg=y_B,
    segment_name="Segmento_B_historial_no_exitoso_con_PCA",
    total_time_limit=600,
)

# Segment C - successful history, PCA features
res_C_con_pca = run_automl_segment(
    X_seg=X_pca_C,
    y_seg=y_C,
    segment_name="Segmento_C_historial_exitoso_con_PCA",
    total_time_limit=300,
)


# ==================================
# 7. Quick AUC summary per case
# ==================================

# Display label -> result dict returned by run_automl_segment, in the order
# the rows should appear before sorting.
_results_by_case = {
    "Global sin PCA": res_global_sin_pca,
    "Global con PCA": res_global_con_pca,
    "Segmento A sin PCA": res_A_sin_pca,
    "Segmento A con PCA": res_A_con_pca,
    "Segmento B sin PCA": res_B_sin_pca,
    "Segmento B con PCA": res_B_con_pca,
    "Segmento C sin PCA": res_C_sin_pca,
    "Segmento C con PCA": res_C_con_pca,
}

resumen_auc = pd.DataFrame([
    {"Caso": case_label, "AUC": case_result["auc"]}
    for case_label, case_result in _results_by_case.items()
])

# Best AUC first
print("\nResumen AUC por caso:")
print(resumen_auc.sort_values("AUC", ascending=False))


Shape: (45211, 17)
    Age           Job Marital Status  Education Credit  Balance (euros)  \
0  58.0           NaN            NaN        NaN     no           2143.0   
1  44.0    technician         single  secondary     no              NaN   
2  33.0  entrepreneur            NaN  secondary     no              2.0   
3  47.0           NaN        married    unknown     no           1506.0   
4  33.0       unknown         single    unknown    NaN              1.0   

  Housing Loan Personal Loan  Contact  Last Contact Day Last Contact Month  \
0          yes           NaN  unknown                 5                may   
1          yes            no  unknown                 5                may   
2          yes           yes  unknown                 5                may   
3          NaN            no  unknown                 5                may   
4           no            no  unknown                 5                may   

   Last Contact Duration  Campaign  Pdays  Previous Poutcome 