<a href="https://colab.research.google.com/github/EdwSanA/DPro_Tareas/blob/main/Aprendizaje_info_crediticia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Home Credit Default Risk — baseline robusto y rápido
# ----------------------------------------------------
# Qué se aprende/predice: probabilidad TARGET en application_test.csv
# Formato Kaggle: columnas -> SK_ID_CURR, TARGET ; Métrica: AUC-ROC

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# ---------- Parámetros de ejecución ----------
DO_CV = True            # pon False si quieres saltarte la CV y solo usar holdout
CV_FOLDS = 3            # CV liviana
CV_N_JOBS = 1           # evita paralelismo que a veces “cuelga” en Colab
RANDOM_STATE = 42
# --------------------------------------------

TARGET = "TARGET"
ID_COL = "SK_ID_CURR"

# 1) Cargar datos
train_df = pd.read_csv("application_train.csv")
test_df  = pd.read_csv("application_test.csv")

y = train_df[TARGET].astype(int)
X = train_df.drop(columns=[TARGET])

# Columnas por tipo
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

# 2) Preprocesamiento
num_pipe_std = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
num_pipe_nostd = Pipeline([
    ("imputer", SimpleImputer(strategy="median"))
])
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocess_std = ColumnTransformer([
    ("num", num_pipe_std,   num_cols),
    ("cat", cat_pipe,       cat_cols),
], remainder="drop")

preprocess_nostd = ColumnTransformer([
    ("num", num_pipe_nostd, num_cols),
    ("cat", cat_pipe,       cat_cols),
], remainder="drop")

# 3) Holdout para evaluación rápida
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# 4) Experimentos (sin SVM para evitar lentitud por calibración)
experiments = {
    "LogReg (std, C=1.0)": Pipeline([
        ("prep", preprocess_std),
        ("clf", LogisticRegression(solver="liblinear", C=1.0, random_state=RANDOM_STATE, max_iter=1000))
    ]),
    "LogReg (std, C=0.5)": Pipeline([
        ("prep", preprocess_std),
        ("clf", LogisticRegression(solver="liblinear", C=0.5, random_state=RANDOM_STATE, max_iter=1000))
    ]),
    "LogReg (std, C=2.0)": Pipeline([
        ("prep", preprocess_std),
        ("clf", LogisticRegression(solver="liblinear", C=2.0, random_state=RANDOM_STATE, max_iter=1000))
    ]),
    "DecisionTree (no-std, max_depth=8)": Pipeline([
        ("prep", preprocess_nostd),
        ("clf", DecisionTreeClassifier(max_depth=8, random_state=RANDOM_STATE))
    ]),
    "RandomForest (no-std, 100 trees)": Pipeline([
        ("prep", preprocess_nostd),
        ("clf", RandomForestClassifier(
            n_estimators=100, max_depth=None, min_samples_leaf=2,
            n_jobs=-1, random_state=RANDOM_STATE
        ))
    ]),
}

cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)

results = []
best_name  = None
best_auc   = -1.0
best_model = None

print("== Resultados (AUC-ROC) ==\n")
try:
    for name, pipe in experiments.items():
        # (Opcional) Validación cruzada liviana
        if DO_CV:
            cv_auc = cross_val_score(
                pipe, X_train, y_train,
                scoring="roc_auc", cv=cv, n_jobs=CV_N_JOBS, error_score="raise"
            )
            cv_mean, cv_std = float(cv_auc.mean()), float(cv_auc.std())
        else:
            cv_mean, cv_std = np.nan, np.nan

        # Holdout
        pipe.fit(X_train, y_train)
        val_pred = pipe.predict_proba(X_val)[:, 1]
        val_auc = roc_auc_score(y_val, val_pred)

        results.append({
            "Modelo": name,
            "CV AUC (mean)": cv_mean,
            "CV AUC (std)":  cv_std,
            "Holdout AUC":   val_auc
        })

        if DO_CV:
            print(f"{name:30s} | CV AUC={cv_mean:.4f} (+/-{cv_std:.4f}) | Holdout AUC={val_auc:.4f}")
        else:
            print(f"{name:30s} | Holdout AUC={val_auc:.4f}")

        if val_auc > best_auc:
            best_auc   = val_auc
            best_name  = name
            best_model = pipe

except KeyboardInterrupt:
    print("\n⚠️  Ejecución interrumpida por el usuario durante la validación. "
          "Se continúa con el mejor modelo visto hasta ahora…")

# Tabla resumen
results_df = pd.DataFrame(results).sort_values(by="Holdout AUC", ascending=False)
print("\n=== Tabla comparativa (ordenada por Holdout AUC) ===")
print(results_df.to_string(index=False))

print(f"\n➡️  Mejor modelo en holdout: {best_name} | AUC={best_auc:.4f}")

# 5) Entrenar el mejor con TODO el train y predecir test
if best_model is None:
    # Fallback seguro: usa LogReg baseline si algo interrumpió antes
    best_model = experiments["LogReg (std, C=1.0)"]
    print("\nℹ️  Usando fallback: LogReg (std, C=1.0)")

best_model.fit(X, y)
test_proba = best_model.predict_proba(test_df)[:, 1]

submission = pd.DataFrame({
    ID_COL: test_df[ID_COL],
    TARGET: test_proba
})
submission.to_csv("submission_baseline_plus.csv", index=False)
print("\n✅ Archivo de envío generado: submission_baseline_plus.csv")



== Resultados (AUC-ROC) ==

LogReg (std, C=1.0)            | CV AUC=0.7448 (+/-0.0018) | Holdout AUC=0.7489
LogReg (std, C=0.5)            | CV AUC=0.7446 (+/-0.0018) | Holdout AUC=0.7488
LogReg (std, C=2.0)            | CV AUC=0.7450 (+/-0.0017) | Holdout AUC=0.7489
DecisionTree (no-std, max_depth=8) | CV AUC=0.7114 (+/-0.0034) | Holdout AUC=0.7187
RandomForest (no-std, 100 trees) | CV AUC=0.7156 (+/-0.0025) | Holdout AUC=0.7164

=== Tabla comparativa (ordenada por Holdout AUC) ===
                            Modelo  CV AUC (mean)  CV AUC (std)  Holdout AUC
               LogReg (std, C=2.0)       0.744986      0.001727     0.748896
               LogReg (std, C=1.0)       0.744819      0.001755     0.748895
               LogReg (std, C=0.5)       0.744607      0.001778     0.748827
DecisionTree (no-std, max_depth=8)       0.711369      0.003410     0.718711
  RandomForest (no-std, 100 trees)       0.715614      0.002500     0.716354

➡️  Mejor modelo en holdout: LogReg (std, C=2.0) 