# Bibliotecas y configuración

In [7]:
# Imports (solo aquí)
import os
import re
import json
import pickle
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
    confusion_matrix,
    average_precision_score
)
from sklearn.metrics import (
    roc_auc_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_curve
)


# Parámetros generales
TEST_SIZE     = 0.20
RANDOM_STATE  = 42
MIN_IV_SELECT = 0.02
MAX_VARS      = 120

# --------- localizar raíz del proyecto de forma robusta ----------
def find_repo_root(start: Path, max_hops: int = 6) -> Path:
    p = start.resolve()
    for _ in range(max_hops):
        if (p / ".git").exists() or (p / "pyproject.toml").exists() or (p / "data").exists():
            return p
        p = p.parent
    return start.resolve()  # fallback (mejor que repetir data/interim)


CWD = Path.cwd()
BASE_DIR = Path.cwd().resolve().parent  # raíz del repo
DATA_DIR = BASE_DIR / "data"
INTERIM_DIR = DATA_DIR / "interim"
REFERENCES_DIR = BASE_DIR / "references"
REPORTS_DIR = BASE_DIR / "reports"
MODELS_DIR = BASE_DIR / "models"
REPORTS_IV_DIR = REPORTS_DIR / "iv"

# --------- carga ABT y selección por IV ----------
df = pd.read_csv(INTERIM_DIR / "abt_PE2020_clean_min.csv")

TARGET = "default_12m"

iv_all_df = pd.read_csv(REPORTS_IV_DIR / "iv_ranking_all.csv")[["variable", "iv"]]
vars_model = (
    iv_all_df.query("iv >= @MIN_IV_SELECT")
             .sort_values("iv", ascending=False)
             .head(MAX_VARS)["variable"]
             .tolist()
)

# Guardar inventario de variables seleccionadas
REFERENCES_DIR.mkdir(parents=True, exist_ok=True)
out_vars = REFERENCES_DIR / f"vars_model_woe_{datetime.now():%Y%m%d}.csv"
pd.Series(vars_model, name="variable").to_csv(out_vars, index=False)

# --------- vista rápida para verificar rutas y tamaños ----------
print("=== Rutas detectadas ===")
print("CWD      :", CWD)
print("BASE_DIR :", BASE_DIR)
print("INTERIM  :", INTERIM_DIR)
print("REFERENCES:", REFERENCES_DIR)
print("REPORTS  :", REPORTS_DIR)
print()

print(f"TARGET: {TARGET}")
print(f"Variables seleccionadas: {len(vars_model)}")
print("Primeras 10:", vars_model[:10])
print("ABT shape:", df.shape)
print("Inventario guardado en:", out_vars)


=== Rutas detectadas ===
CWD      : C:\Users\PC RYU\Documents\Galileo\Maestria\Product Development\repo_proyecto_pe\notebooks
BASE_DIR : C:\Users\PC RYU\Documents\Galileo\Maestria\Product Development\repo_proyecto_pe
INTERIM  : C:\Users\PC RYU\Documents\Galileo\Maestria\Product Development\repo_proyecto_pe\data\interim
REFERENCES: C:\Users\PC RYU\Documents\Galileo\Maestria\Product Development\repo_proyecto_pe\references
REPORTS  : C:\Users\PC RYU\Documents\Galileo\Maestria\Product Development\repo_proyecto_pe\reports

TARGET: default_12m
Variables seleccionadas: 120
Primeras 10: ['mxdiasatramesinte', 'rat_ven_sact', 'saldo_vencido_12med', 'saldo_vencido_12max', 'rat_ven_sact_12med', 'saldo_vencido_6max', 'saldo_vencido_6med', 'salmxincicap_v', 'salmedincicap_v', 'limmedcont_v']
ABT shape: (17524, 438)
Inventario guardado en: C:\Users\PC RYU\Documents\Galileo\Maestria\Product Development\repo_proyecto_pe\references\vars_model_woe_20251110.csv


# Transformación a WOE

In [3]:
# =========================
# Celda 2 — Transformación WOE + Split 80/20 (selección robusta de exportgroup y reconstrucción de WOE)
# =========================

def read_csv_robust(path: Path) -> pd.DataFrame:
    seps = [None, ",", ";", "\t", "|"]
    encs = ["utf-8-sig", "utf-8", "latin-1"]
    engines = ["python", "c"]
    last_err = None
    for sep in seps:
        for enc in encs:
            for eng in engines:
                try:
                    df = pd.read_csv(path, sep=sep, encoding=enc, engine=eng)
                    if df.shape[1] == 1:
                        continue
                    return df
                except Exception as e:
                    last_err = e
                    continue
    try:
        return pd.read_csv(path)
    except Exception:
        raise last_err or RuntimeError(f"No pude leer {path}")

def _norm(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[^a-z0-9]+", "_", s)
    return s.strip("_")

def pick_col(df: pd.DataFrame, candidates: list[str]) -> str:
    norm_map = {_norm(c): c for c in df.columns}
    for c in candidates:
        if _norm(c) in norm_map:
            return norm_map[_norm(c)]
    for nc, original in norm_map.items():
        if any(_norm(c) in nc for c in candidates):
            return original
    raise KeyError(f"No pude encontrar columnas tipo {candidates}. Columnas: {list(df.columns)}")

def normcols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [c.strip() for c in df.columns]
    return df

# --- localizar un exportgroup "útil" (detalle por bin) ---
exp_candidates = sorted(REFERENCES_DIR.glob("exportgroup*.csv"), reverse=True)
exp_path = None
exp_df = None

for p in exp_candidates:
    df_try = read_csv_robust(p)
    df_try = normcols(df_try)

    has_var  = any(_norm(c) in {_norm("variable"), _norm("feature"), _norm("var")} for c in df_try.columns)
    has_bin  = any(_norm(c) in {_norm("bin"), _norm("grupo"), _norm("categoria"), _norm("category")} for c in df_try.columns)
    has_woe  = any(_norm(c) == _norm("woe") for c in df_try.columns)
    has_dist = any(_norm(c) == _norm("dist_bad") for c in df_try.columns) and any(_norm(c) == _norm("dist_good") for c in df_try.columns)
    has_bg   = any(_norm(c) == _norm("bads") for c in df_try.columns) and any(_norm(c) == _norm("goods") for c in df_try.columns)

    # requisito mínimo: variable + bin + (woe o dist_* o bads/goods)
    if has_var and has_bin and (has_woe or has_dist or has_bg):
        exp_path = p
        exp_df   = df_try
        break

if exp_path is None:
    raise FileNotFoundError(
        "No encontré un exportgroup con detalle por bin en references/ "
        "(debe tener columnas variable+bin y woe o dist_* o bads/goods)."
    )

# --- localizar catálogo numérico ---
num_candidates = sorted(REFERENCES_DIR.glob("numeric_bins_catalog_*.csv"), reverse=True)
if not num_candidates:
    raise FileNotFoundError("No encontré numeric_bins_catalog_*.csv en references/")
num_path = num_candidates[0]
num = normcols(read_csv_robust(num_path))

# --- mapear columnas del exportgroup ---
col_var_exp = pick_col(exp_df, ["variable", "feature", "var"])
col_bin_exp = pick_col(exp_df, ["bin", "grupo", "categoria", "category"])

# Determinar cómo obtener WOE
woe_col_present = any(_norm(c) == _norm("woe") for c in exp_df.columns)
if woe_col_present:
    col_woe_exp = pick_col(exp_df, ["woe"])
else:
    # intentar reconstruir
    dist_bad_present  = any(_norm(c) == _norm("dist_bad") for c in exp_df.columns)
    dist_good_present = any(_norm(c) == _norm("dist_good") for c in exp_df.columns)
    if dist_bad_present and dist_good_present:
        col_db = pick_col(exp_df, ["dist_bad"])
        col_dg = pick_col(exp_df, ["dist_good"])
        exp_df["__woe__"] = np.log(
            exp_df[col_db].replace(0, np.nan) / exp_df[col_dg].replace(0, np.nan)
        ).fillna(0.0)
        col_woe_exp = "__woe__"
    else:
        # último intento: usar bads/goods y totales reales de la ABT
        if not (any(_norm(c) == _norm("bads") for c in exp_df.columns) and
                any(_norm(c) == _norm("goods") for c in exp_df.columns)):
            raise KeyError(
                "El exportgroup seleccionado no tiene ni woe, ni dist_bad/dist_good, "
                "ni bads/goods. No puedo construir WOE."
            )
        col_bads  = pick_col(exp_df, ["bads"])
        col_goods = pick_col(exp_df, ["goods"])
        total_bads  = int(df[TARGET].sum())
        total_goods = int(len(df) - total_bads)
        exp_df["__dist_bad__"]  = exp_df[col_bads]  / max(total_bads, 1)
        exp_df["__dist_good__"] = exp_df[col_goods] / max(total_goods, 1)
        exp_df["__woe__"] = np.log(
            exp_df["__dist_bad__"].replace(0, np.nan) / exp_df["__dist_good__"].replace(0, np.nan)
        ).fillna(0.0)
        col_woe_exp = "__woe__"

# --- Diccionario (variable, bin) -> WOE
woe_map = (
    exp_df[[col_var_exp, col_bin_exp, col_woe_exp]]
    .rename(columns={col_var_exp: "variable", col_bin_exp: "bin", col_woe_exp: "woe"})
    .set_index(["variable", "bin"])["woe"]
    .to_dict()
)

# --- columnas en numeric catalog (cortes numéricos) ---
col_var_num = pick_col(num, ["variable", "feature", "var"])
col_bin_num = pick_col(num, ["bin", "grupo", "categoria"])
col_left    = pick_col(num, ["left", "min", "low", "desde", "l"])
col_right   = pick_col(num, ["right", "max", "high", "hasta", "r"])

num_std = (
    num[[col_var_num, col_bin_num, col_left, col_right]]
    .rename(columns={col_var_num: "variable", col_bin_num: "bin", col_left: "left", col_right: "right"})
)
num_std["left"]  = pd.to_numeric(num_std["left"], errors="coerce")
num_std["right"] = pd.to_numeric(num_std["right"], errors="coerce")

cuts_dict: dict[str, list[tuple[float, float, str]]] = {}
for v, g in num_std.groupby("variable"):
    g = g.dropna(subset=["left", "right"]).sort_values(["left", "right"])
    cuts_dict[v] = list(zip(g["left"].to_list(), g["right"].to_list(), g["bin"].to_list()))

def assign_bin_numeric(val, cuts):
    if pd.isna(val) or not cuts:
        return None
    for left, right, b in cuts:
        if val > left and val <= right:   # (left, right]
            return b
    return None

def to_woe_numeric(series: pd.Series, varname: str) -> pd.Series:
    cuts = cuts_dict.get(varname, [])
    if not cuts:
        return pd.Series(0.0, index=series.index)
    bins = series.apply(lambda x: assign_bin_numeric(x, cuts))
    return bins.apply(lambda b: woe_map.get((varname, b), woe_map.get((varname, "ALL"), 0.0)))

def to_woe_categorical(series: pd.Series, varname: str) -> pd.Series:
    return series.astype(str).apply(
        lambda cat: woe_map.get((varname, cat), woe_map.get((varname, "ALL"), 0.0))
    )

# --- construir matriz WOE con las vars seleccionadas que existan en df ---
X_woe = pd.DataFrame(index=df.index)
vars_in_df = [v for v in vars_model if v in df.columns]

for v in vars_in_df:
    if v in cuts_dict:
        X_woe[v] = to_woe_numeric(df[v], v)
    else:
        X_woe[v] = to_woe_categorical(df[v], v)

y = df[TARGET].astype(int)

# --- split 80/20 estratificado ---
X_train, X_test, y_train, y_test = train_test_split(
    X_woe, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# --- guardar ---
OUT_MODELS_INTERIM = INTERIM_DIR / "models"
OUT_MODELS_INTERIM.mkdir(parents=True, exist_ok=True)

X_train.to_csv(OUT_MODELS_INTERIM / "X_train_woe.csv", index=False)
X_test.to_csv(OUT_MODELS_INTERIM / "X_test_woe.csv", index=False)
y_train.to_csv(OUT_MODELS_INTERIM / "y_train.csv", index=False)
y_test.to_csv(OUT_MODELS_INTERIM / "y_test.csv", index=False)

print("Exportgroup usado:", exp_path.name, "| columnas:", list(exp_df.columns)[:10], "...")
print("Numeric catalog   :", num_path.name,    "| columnas:", list(num.columns)[:10], "...")
print("WOE matrix shape:", X_woe.shape)
print("Train:", X_train.shape, "| Test:", X_test.shape)
print("Positive rate (train/test):", y_train.mean().round(4), y_test.mean().round(4))


  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)
  X_woe[v] = to_woe_numeric(df[v], v)


Exportgroup usado: exportgroup_20251103.csv | columnas: ['variable', 'var_type', 'bin', 'count', 'bads', 'goods', 'dist_bad', 'dist_good', 'woe', 'iv_component'] ...
Numeric catalog   : numeric_bins_catalog_20251103.csv | columnas: ['variable', 'bin', 'segment', 'left', 'right', 'closed'] ...
WOE matrix shape: (17524, 120)
Train: (14019, 120) | Test: (3505, 120)
Positive rate (train/test): 0.0527 0.0528


# Entrenamiento y evaluación

In [4]:
# --- rutas de E/S
IN_MODELS_INTERIM = INTERIM_DIR / "models"
OUT_MODELS_FINAL  = MODELS_DIR              # carpeta raíz /models
OUT_REPORTS       = REPORTS_DIR             # carpeta raíz /reports
OUT_FIGS          = REPORTS_DIR / "figures"
OUT_REPORTS.mkdir(parents=True, exist_ok=True)
OUT_FIGS.mkdir(parents=True, exist_ok=True)
OUT_MODELS_FINAL.mkdir(parents=True, exist_ok=True)

# --- cargar matrices WOE
X_train = pd.read_csv(IN_MODELS_INTERIM / "X_train_woe.csv")
X_test  = pd.read_csv(IN_MODELS_INTERIM / "X_test_woe.csv")
y_train = pd.read_csv(IN_MODELS_INTERIM / "y_train.csv").squeeze().astype(int)
y_test  = pd.read_csv(IN_MODELS_INTERIM / "y_test.csv").squeeze().astype(int)

print("Train shape:", X_train.shape, "| Test shape:", X_test.shape)
print("Pos rate (train/test):", y_train.mean().round(4), y_test.mean().round(4))

# --- modelo: Logistic Regression (WOE ya escalado; ponderación por desbalance)
clf = LogisticRegression(
    penalty="l2",
    solver="liblinear",
    class_weight="balanced",
    max_iter=2000,
    n_jobs=None
)

# entrenamiento
t0 = datetime.now()
clf.fit(X_train, y_train)
t1 = datetime.now()
print(f"Tiempo de entrenamiento: {(t1 - t0).total_seconds():.2f}s")

# --- scoring
train_score = clf.predict_proba(X_train)[:, 1]
test_score  = clf.predict_proba(X_test)[:, 1]

# --- métricas base
roc_auc   = roc_auc_score(y_test, test_score)
pr_auc    = average_precision_score(y_test, test_score)  # área bajo curva precisión-recall
gini      = 2 * roc_auc - 1

# KS
fpr, tpr, thr = roc_curve(y_test, test_score)
ks_values = tpr - fpr
ks = float(np.max(ks_values))
ks_thr = float(thr[np.argmax(ks_values)])

print(f"AUC ROC: {roc_auc:.4f} | PR AUC: {pr_auc:.4f} | Gini: {gini:.4f} | KS: {ks:.4f} @ thr≈{ks_thr:.4f}")

# --- umbral operativo (usamos el de máximo KS en validación)
best_thr = ks_thr

def confusion_at(th, y_true, score):
    y_hat = (score >= th).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_hat).ravel()
    return dict(threshold=float(th), TP=int(tp), FP=int(fp), TN=int(tn), FN=int(fn))

cm_test = confusion_at(best_thr, y_test, test_score)
print("Confusión (test) @KS:", cm_test)

# --- lift/top-decile y ganancias rápidas
def top_decile_lift(y_true, score, pct=0.1):
    n = len(score)
    k = max(1, int(np.floor(n * pct)))
    order = np.argsort(-score)
    top_k = y_true.iloc[order[:k]].sum()
    base  = y_true.mean() * k
    return float((top_k / max(base, 1e-12)))

lift10 = top_decile_lift(y_test, test_score, pct=0.10)
print(f"Lift Top 10%: {lift10:.2f}x")

# --- curvas y figuras
# ROC
plt.figure()
fpr_tr, tpr_tr, _ = roc_curve(y_train, train_score)
fpr_te, tpr_te, _ = roc_curve(y_test,  test_score)
plt.plot(fpr_tr, tpr_tr, label=f"Train ROC (AUC={roc_auc_score(y_train, train_score):.3f})")
plt.plot(fpr_te, tpr_te, label=f"Test ROC (AUC={roc_auc:.3f})")
plt.plot([0,1],[0,1],"--", lw=1, alpha=0.6)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve — Logistic (WOE)")
plt.legend()
roc_path = OUT_FIGS / "roc_woe_logreg.png"
plt.savefig(roc_path, dpi=150, bbox_inches="tight")
plt.close()

# PR
plt.figure()
prec, rec, _ = precision_recall_curve(y_test, test_score)
plt.plot(rec, prec, label=f"Test PR (AP={pr_auc:.3f})")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall — Logistic (WOE)")
plt.legend()
pr_path = OUT_FIGS / "pr_woe_logreg.png"
plt.savefig(pr_path, dpi=150, bbox_inches="tight")
plt.close()

# KS curve
plt.figure()
plt.plot(thr, ks_values, label="KS(th)")
plt.axvline(best_thr, color="r", ls="--", label=f"best thr={best_thr:.3f}")
plt.xlabel("Threshold")
plt.ylabel("KS")
plt.title("KS vs Threshold — Logistic (WOE)")
plt.legend()
ks_path = OUT_FIGS / "ks_woe_logreg.png"
plt.savefig(ks_path, dpi=150, bbox_inches="tight")
plt.close()

# --- guardar modelo y reportes
model_path = OUT_MODELS_FINAL / "woe_logistic_regression.pkl"
with open(model_path, "wb") as f:
    pickle.dump(dict(model=clf, features=X_train.columns.tolist(), threshold=best_thr), f)

metrics = dict(
    timestamp=datetime.now().isoformat(timespec="seconds"),
    n_features=X_train.shape[1],
    auc_roc=float(roc_auc),
    pr_auc=float(pr_auc),
    gini=float(gini),
    ks=float(ks),
    ks_threshold=float(best_thr),
    lift_top10=float(lift10),
    pos_rate_train=float(y_train.mean()),
    pos_rate_test=float(y_test.mean()),
    confusion_test=cm_test,
    paths=dict(roc=str(roc_path), pr=str(pr_path), ks=str(ks_path), model=str(model_path)),
)
with open(OUT_REPORTS / "woe_logreg_metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics, f, ensure_ascii=False, indent=2)

# --- archivo con scores de test (útil para auditoría)
scores_path = OUT_REPORTS / "scores_test_woe_logreg.csv"
pd.DataFrame({"y_true": y_test.values, "score": test_score}).to_csv(scores_path, index=False)

print("Artefactos guardados:")
print("  Modelo   :", model_path)
print("  Métricas :", OUT_REPORTS / 'woe_logreg_metrics.json')
print("  Figuras  :", roc_path.name, ",", pr_path.name, ",", ks_path.name)
print("  Scores   :", scores_path.name)


Train shape: (14019, 120) | Test shape: (3505, 120)
Pos rate (train/test): 0.0527 0.0528
Tiempo de entrenamiento: 0.40s
AUC ROC: 0.8321 | PR AUC: 0.3749 | Gini: 0.6643 | KS: 0.5233 @ thr≈0.5515
Confusión (test) @KS: {'threshold': 0.5514528368340634, 'TP': 123, 'FP': 470, 'TN': 2850, 'FN': 62}
Lift Top 10%: 5.58x
Artefactos guardados:
  Modelo   : C:\Users\PC RYU\Documents\Galileo\Maestria\Product Development\repo_proyecto_pe\models\woe_logistic_regression.pkl
  Métricas : C:\Users\PC RYU\Documents\Galileo\Maestria\Product Development\repo_proyecto_pe\reports\woe_logreg_metrics.json
  Figuras  : roc_woe_logreg.png , pr_woe_logreg.png , ks_woe_logreg.png
  Scores   : scores_test_woe_logreg.csv


# Deciles, KS y ganancias

In [5]:
# --- helper para deciles
def to_deciles(scores, n=10):
    # percentiles 0..100 en pasos de 10
    q = np.linspace(0, 1, n+1)
    cuts = np.quantile(scores, q)
    # para que el decil 1 sea el de mayor riesgo, invertimos los cortes
    cuts = np.unique(cuts)  # por si hay empates extremos
    return cuts

def decile_table(y_true, score, n=10):
    # Ordenar de mayor a menor score (más riesgoso primero)
    df_ = pd.DataFrame({"y": y_true.values, "score": score})
    df_ = df_.sort_values("score", ascending=False).reset_index(drop=True)
    df_["decile"] = pd.qcut(df_.index, q=n, labels=[f"D{i}" for i in range(1, n+1)])

    tab = (df_.groupby("decile", observed=False)
             .agg(
                 count=("y", "size"),
                 bads=("y", "sum"),
                 avg_score=("score", "mean")
             )
             .reset_index())
    tab["goods"] = tab["count"] - tab["bads"]
    tab["bad_rate"] = tab["bads"] / tab["count"]

    # acumulados (decil 1 = más riesgoso)
    tab["cum_bads"] = tab["bads"].cumsum()
    tab["cum_goods"] = tab["goods"].cumsum()
    total_bads = tab["bads"].sum()
    total_goods = tab["goods"].sum()
    tab["cum_capture_rate"] = tab["cum_bads"] / max(total_bads, 1)

    # lift por decil y lift acumulado
    overall_bad_rate = total_bads / (total_bads + total_goods)
    tab["lift"] = tab["bad_rate"] / max(overall_bad_rate, 1e-12)
    tab["cum_lift"] = (tab["cum_bads"] / tab["count"].cumsum()) / max(overall_bad_rate, 1e-12)

    # KS por decil (utilizando acumulados proporcionales)
    tab["cum_bad_pct"] = tab["cum_bads"] / max(total_bads, 1)
    tab["cum_good_pct"] = tab["cum_goods"] / max(total_goods, 1)
    tab["ks_by_decile"] = (tab["cum_bad_pct"] - tab["cum_good_pct"]).abs()

    return tab

deciles = decile_table(y_test, test_score, n=10)

# --- guardar CSV
out_csv = REPORTS_DIR / "deciles_woe_logreg.csv"
deciles.to_csv(out_csv, index=False)

# --- figuras simples (lift y KS por decil)
fig1_path = REPORTS_DIR / "figures" / "lift_by_decile_woe_logreg.png"
fig2_path = REPORTS_DIR / "figures" / "ks_by_decile_woe_logreg.png"

plt.figure(figsize=(7,4))
plt.plot(deciles["decile"], deciles["lift"], marker="o")
plt.title("Lift por decil (test)")
plt.xlabel("Decil (1=mayor riesgo)")
plt.ylabel("Lift")
plt.tight_layout()
plt.savefig(fig1_path, dpi=150)
plt.close()

plt.figure(figsize=(7,4))
plt.plot(deciles["decile"], deciles["ks_by_decile"], marker="o")
plt.title("KS por decil (test)")
plt.xlabel("Decil (1=mayor riesgo)")
plt.ylabel("KS")
plt.tight_layout()
plt.savefig(fig2_path, dpi=150)
plt.close()

print("Deciles guardados en:", out_csv.name)
print("Figuras:", fig1_path.name, ",", fig2_path.name)

# vistazo rápido
display(deciles)


Deciles guardados en: deciles_woe_logreg.csv
Figuras: lift_by_decile_woe_logreg.png , ks_by_decile_woe_logreg.png


Unnamed: 0,decile,count,bads,avg_score,goods,bad_rate,cum_bads,cum_goods,cum_capture_rate,lift,cum_lift,cum_bad_pct,cum_good_pct,ks_by_decile
0,D1,351,103,0.844888,248,0.293447,103,248,0.556757,5.559637,5.559637,0.556757,0.074699,0.482058
1,D2,350,24,0.590693,326,0.068571,127,574,0.686486,1.299151,3.432432,0.686486,0.172892,0.513595
2,D3,351,14,0.448408,337,0.039886,141,911,0.762162,0.755679,2.539333,0.762162,0.274398,0.487765
3,D4,350,12,0.372939,338,0.034286,153,1249,0.827027,0.649575,2.067568,0.827027,0.376205,0.450822
4,D5,351,9,0.325011,342,0.025641,162,1591,0.875676,0.485793,1.750852,0.875676,0.479217,0.396459
5,D6,350,3,0.28008,347,0.008571,165,1938,0.891892,0.162394,1.486486,0.891892,0.583735,0.308157
6,D7,350,9,0.225512,341,0.025714,174,2279,0.940541,0.487181,1.343903,0.940541,0.686446,0.254095
7,D8,351,9,0.161069,342,0.025641,183,2621,0.989189,0.485793,1.236486,0.989189,0.789458,0.199731
8,D9,350,1,0.094022,349,0.002857,184,2970,0.994595,0.054131,1.10528,0.994595,0.894578,0.100016
9,D10,351,1,0.035781,350,0.002849,185,3320,1.0,0.053977,1.0,1.0,1.0,0.0


# Selección de umbrales

In [8]:
# 1) Cargar scores de test
scores_path = REPORTS_DIR / "scores_test_woe_logreg.csv"
df_sc = pd.read_csv(scores_path)
y = df_sc["y_true"].astype(int).values
s = df_sc["score"].values

# 2) Curvas útiles
prec, rec, thr_pr = precision_recall_curve(y, s)
fpr, tpr, thr_roc  = roc_curve(y, s)

# 3) KS (conjunto de umbrales de ROC)
ks_vals = tpr - fpr
idx_ks  = np.argmax(ks_vals)
thr_ks  = thr_roc[idx_ks]
ks_max  = ks_vals[idx_ks]

# 4) F1 en una rejilla de umbrales
grid = np.unique(np.r_[np.linspace(0.01, 0.99, 99), thr_roc, thr_pr])
f1_list, p_list, r_list = [], [], []
for t in grid:
    yhat = (s >= t).astype(int)
    # evitemos divisiones raras si no hay positivos predichos
    if yhat.sum() == 0:
        f1_list.append(0.0); p_list.append(0.0); r_list.append(0.0)
        continue
    f1_list.append(f1_score(y, yhat))
    p_list.append(precision_score(y, yhat))
    r_list.append(recall_score(y, yhat))

idx_f1 = int(np.argmax(f1_list))
thr_f1 = float(grid[idx_f1])

# 5) Precision objetivo (ej: 30%) y Recall objetivo (ej: 60%) — ajusta a tu negocio
target_precision = 0.30
target_recall    = 0.60

# umbral mínimo que alcanza al menos la precisión objetivo
thr_p = None
for t, p in sorted(zip(grid, p_list), key=lambda z: z[0]):
    if p >= target_precision:
        thr_p = float(t); break

# umbral máximo que alcanza al menos el recall objetivo
thr_r = None
for t, r in sorted(zip(grid, r_list), key=lambda z: z[0]):
    if r >= target_recall:
        thr_r = float(t); break

# 6) Top-N% (ej: 10% más riesgosos)
top_rate = 0.10
thr_top = float(np.quantile(s, 1 - top_rate))  # 10% superior = score alto

def eval_at_threshold(t):
    yhat = (s >= t).astype(int)
    TP = int(((y==1)&(yhat==1)).sum())
    FP = int(((y==0)&(yhat==1)).sum())
    TN = int(((y==0)&(yhat==0)).sum())
    FN = int(((y==1)&(yhat==0)).sum())
    prec_ = precision_score(y, yhat) if (TP+FP)>0 else 0.0
    rec_  = recall_score(y, yhat)    if (TP+FN)>0 else 0.0
    f1_   = f1_score(y, yhat)        if (TP+FP)>0 else 0.0
    rate_ = yhat.mean()
    return dict(threshold=t, TP=TP, FP=FP, TN=TN, FN=FN,
                precision=prec_, recall=rec_, f1=f1_, approve_rate=rate_)

candidates = {
    "KS_max"            : thr_ks,
    "F1_max"            : thr_f1,
    "Prec>=30%"         : thr_p,
    "Recall>=60%"       : thr_r,
    "Top10%_score"      : thr_top,
}

rows = []
for name, t in candidates.items():
    if t is None:
        rows.append(dict(name=name, note="No alcanzado", threshold=np.nan))
        continue
    m = eval_at_threshold(t)
    m["name"] = name
    m["note"] = ""
    rows.append(m)

thr_df = pd.DataFrame(rows)

# 7) Guardar tabla de umbrales
out_thr = REPORTS_DIR / "thresholds_woe_logreg.csv"
thr_df.to_csv(out_thr, index=False)

print("Tabla de umbrales guardada en:", out_thr.name)
display(thr_df.sort_values("threshold"))


Tabla de umbrales guardada en: thresholds_woe_logreg.csv


Unnamed: 0,threshold,TP,FP,TN,FN,precision,recall,f1,approve_rate,name,note
3,0.005009,185,3320,0,0,0.052782,1.0,0.100271,1.0,Recall>=60%,
0,0.551453,123,470,2850,62,0.20742,0.664865,0.316195,0.169187,KS_max,
4,0.690493,103,248,3072,82,0.293447,0.556757,0.384328,0.100143,Top10%_score,
2,0.696021,103,240,3080,82,0.300292,0.556757,0.390152,0.09786,Prec>=30%,
1,0.790786,94,139,3181,91,0.403433,0.508108,0.449761,0.066476,F1_max,


# Evaluación de umbrales y recomendación

In [9]:
thr_path = REPORTS_DIR / "thresholds_woe_logreg.csv"
thr_df = pd.read_csv(thr_path)

# ordenar por umbral
thr_df = thr_df.sort_values("threshold").reset_index(drop=True)

print("=== Evaluación de umbrales ===")
display(thr_df)

# Selección según estrategia
sel = {}

# 1️⃣ Recall alto: prioriza capturar la mayor cantidad de defaults
sel["Estrategia_recall_alto"] = thr_df.loc[thr_df["name"].str.contains("KS_max", case=False, na=False)].iloc[0]

# 2️⃣ Balanceado: máximo F1
sel["Estrategia_balanceada"] = thr_df.loc[thr_df["name"].str.contains("F1_max", case=False, na=False)].iloc[0]

# 3️⃣ Precisión alta: objetivo >=30% (si existe)
prec_mask = thr_df["name"].str.contains("Prec", case=False, na=False)
if prec_mask.any():
    sel["Estrategia_precision_alta"] = thr_df.loc[prec_mask].iloc[0]
else:
    sel["Estrategia_precision_alta"] = thr_df.iloc[thr_df["precision"].idxmax()]

# Mostrar resumen
print("\n=== Recomendaciones de política ===")
for k, row in sel.items():
    print(f"\n{k}")
    print(f"  → Umbral sugerido: {row['threshold']:.4f}")
    print(f"  → Precisión: {row['precision']:.3f}")
    print(f"  → Recall: {row['recall']:.3f}")
    print(f"  → F1: {row['f1']:.3f}")
    print(f"  → Aprobación: {row['approve_rate']*100:.1f}%")

# Guardar resumen
out_summary = REPORTS_DIR / "thresholds_summary_woe_logreg.csv"
pd.DataFrame(sel).T.to_csv(out_summary, index=True)
print(f"\nResumen guardado en: {out_summary.name}")


=== Evaluación de umbrales ===


Unnamed: 0,threshold,TP,FP,TN,FN,precision,recall,f1,approve_rate,name,note
0,0.005009,185,3320,0,0,0.052782,1.0,0.100271,1.0,Recall>=60%,
1,0.551453,123,470,2850,62,0.20742,0.664865,0.316195,0.169187,KS_max,
2,0.690493,103,248,3072,82,0.293447,0.556757,0.384328,0.100143,Top10%_score,
3,0.696021,103,240,3080,82,0.300292,0.556757,0.390152,0.09786,Prec>=30%,
4,0.790786,94,139,3181,91,0.403433,0.508108,0.449761,0.066476,F1_max,



=== Recomendaciones de política ===

Estrategia_recall_alto
  → Umbral sugerido: 0.5515
  → Precisión: 0.207
  → Recall: 0.665
  → F1: 0.316
  → Aprobación: 16.9%

Estrategia_balanceada
  → Umbral sugerido: 0.7908
  → Precisión: 0.403
  → Recall: 0.508
  → F1: 0.450
  → Aprobación: 6.6%

Estrategia_precision_alta
  → Umbral sugerido: 0.6960
  → Precisión: 0.300
  → Recall: 0.557
  → F1: 0.390
  → Aprobación: 9.8%

Resumen guardado en: thresholds_summary_woe_logreg.csv
