# Scoring crediticio

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    confusion_matrix,
    classification_report
)

# Load the dataset and separate the predictor columns from the label.

target = "Default"
features = [
    "Edad",
    "Ingreso_Mensual",
    "Antiguedad_Laboral",
    "Nivel_Endeudamiento",
    "Historial_Morosidad",
]

df = pd.read_csv("credit_risk_scoring_420.csv")

# X holds the predictors (DataFrame); y holds the label column (Series).
X, y = df[features], df[target]

# Train/test split: 30% held out, stratified on the label so both
# partitions keep the same class proportions (the classes are imbalanced).

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

# Model: standardised features feeding a logistic regression.
# class_weight="balanced" up-weights the rarer class so that defaults are
# prioritised during fitting.
# NOTE(review): class re-weighting generally shifts predict_proba towards the
# up-weighted class, so the probabilities used later as "PD" may not be
# calibrated default probabilities -- confirm whether calibration is needed.

scaler_step = ("scaler", StandardScaler())
model_step = (
    "model",
    LogisticRegression(class_weight="balanced", max_iter=2000),
)
pipe = Pipeline(steps=[scaler_step, model_step])

pipe.fit(X_train, y_train)

# Probability of default (PD): second column of predict_proba, taken here as
# the probability of the positive class (assumes the labels are 0/1).

pd_test = pipe.predict_proba(X_test)[:, 1]

# Score on a 300-850 scale (higher score = lower risk), linear in PD:
#   score = 850 - PD * 550   =>   PD = 0 -> 850 ; PD = 1 -> 300
# The clip only guards the range; it is a no-op while PD stays within [0, 1].

score_test = np.clip(850 - 550 * pd_test, 300, 850)

# Scoring table: the test features plus the observed label, the PD and the
# score rounded to an integer. assign() returns a new frame, so X_test is
# left untouched.

scoring = X_test.assign(
    Default_real=y_test.values,
    PD=pd_test,
    Score=score_test.round(0).astype(int),
)

# bandas de riesgo

def banda(score, *, corte_bajo=750, corte_medio=650):
    """Map a credit score to its risk band label.

    A higher score means lower risk. With the default cut-offs, a score of
    at least 750 is "Bajo", a score of at least 650 (but below 750) is
    "Medio", and anything lower is "Alto".

    Args:
        score: Numeric credit score to classify.
        corte_bajo: Minimum score (inclusive) for the "Bajo" band.
        corte_medio: Minimum score (inclusive) for the "Medio" band.

    Returns:
        One of the strings "Bajo", "Medio" or "Alto".

    Raises:
        ValueError: If ``corte_medio`` is greater than ``corte_bajo``; the
            "Medio" band would then be unreachable.
    """
    # Reject inverted cut-offs instead of silently dropping the middle band.
    if corte_medio > corte_bajo:
        raise ValueError("corte_medio no puede ser mayor que corte_bajo")

    if score >= corte_bajo:
        return "Bajo"
    if score >= corte_medio:
        return "Medio"
    return "Alto"

# Label every scored row with its risk band.
scoring["Riesgo"] = scoring["Score"].apply(banda)

# Decision threshold for the yes/no call: a PD at or above it is predicted
# as a default (1), anything below as a non-default (0).

threshold = 0.35
supera_umbral = pd_test >= threshold
y_pred = supera_umbral.astype(int)

# Ranking metrics are computed from the raw probabilities.
auc_roc = roc_auc_score(y_test, pd_test)
print("ROC-AUC:", round(auc_roc, 3))
auc_pr = average_precision_score(y_test, pd_test)
print("PR-AUC (Average Precision):", round(auc_pr, 3))

# Confusion matrix and per-class report use the thresholded predictions.
print("\nMatriz de confusión (umbral =", threshold, "):")
matriz = confusion_matrix(y_test, y_pred)
print(matriz)
print("\nReporte:")
reporte = classification_report(y_test, y_pred)
print(reporte)

# Final table: the 15 test rows with the highest PD (i.e. the riskiest).

columnas_top = [
    "PD",
    "Score",
    "Riesgo",
    "Edad",
    "Ingreso_Mensual",
    "Antiguedad_Laboral",
    "Nivel_Endeudamiento",
    "Historial_Morosidad",
    "Default_real",
]

print("\nTop 15 más riesgosos (PD alta / score bajo):")
top_riesgosos = scoring.sort_values("PD", ascending=False).head(15)
print(top_riesgosos[columnas_top])

# Observed default rate per risk band on the test set, highest rate first.
print("\nTasa de default por banda (en test):")
tasa_por_banda = scoring.groupby("Riesgo")["Default_real"].mean()
print(tasa_por_banda.sort_values(ascending=False))


ROC-AUC: 0.788
PR-AUC (Average Precision): 0.443

Matriz de confusión (umbral = 0.35 ):
[[60 53]
 [ 2 11]]

Reporte:
              precision    recall  f1-score   support

           0       0.97      0.53      0.69       113
           1       0.17      0.85      0.29        13

    accuracy                           0.56       126
   macro avg       0.57      0.69      0.49       126
weighted avg       0.89      0.56      0.64       126


Top 15 más riesgosos (PD alta / score bajo):
           PD  Score Riesgo  Edad  Ingreso_Mensual  Antiguedad_Laboral  \
338  0.977569    312   Alto    56           634790                 1.3   
68   0.940840    333   Alto    53           549306                 2.0   
358  0.931175    338   Alto    32           136604                 2.1   
347  0.861857    376   Alto    50           469105                 0.9   
247  0.859155    377   Alto    58           286613                 8.0   
209  0.848592    383   Alto    43           211527                