In [1]:
# A1 — chemins + chargement artefacts + données
from pathlib import Path
import joblib, json, pandas as pd, numpy as np

ROOT = Path.cwd() if (Path.cwd()/ "data" / "processed").exists() else Path.cwd().parent
PROC, MODELS = ROOT/"data/processed", ROOT/"models"

rf   = joblib.load(MODELS/"rf_model.joblib")
imp  = joblib.load(MODELS/"imputer.joblib")
meta = json.load(open(MODELS/"model_meta.json"))
feat, thr = meta["features"], meta["threshold"]

df = pd.read_csv(PROC/"train_features_v1.csv")
X = df[feat].astype(float).values
y = df["TARGET"].astype(int).values
proba = rf.predict_proba(imp.transform(X))[:,1]

print("OK:", df.shape, "| seuil =", thr)


OK: (307511, 17) | seuil = 0.6


In [2]:
# A2 — on prend 3 cas autour du seuil : 1 TP, 1 FP, 1 FN
df_ = df.copy()
df_["proba"] = proba

near_thr_idx = (df_["proba"] - thr).abs().idxmin()
# faux négatif : y=1 mais proba<thr (le plus proche du seuil)
fn_idx = df_.query("TARGET==1 and proba < @thr")["proba"].sort_values(ascending=False).index[:1]
# faux positif : y=0 mais proba>=thr (le plus proche du seuil)
fp_idx = df_.query("TARGET==0 and proba >= @thr")["proba"].sort_values().index[:1]

ids = {
    "near_thr": int(df_.loc[near_thr_idx,"SK_ID_CURR"]),
    "FN":       int(df_.loc[fn_idx].iloc[0]["SK_ID_CURR"]),
    "FP":       int(df_.loc[fp_idx].iloc[0]["SK_ID_CURR"]),
}
ids


{'near_thr': 131387, 'FN': 329397, 'FP': 131387}

In [3]:
# A3 — mini "what-if": on change ±10% 5 features top et on voit l'impact sur la proba
top5 = [c for c in ["EXT_SOURCE_2","EXT_SOURCE_3","EXT_SOURCE_1",
                    "credit_term","employed_years"] if c in feat][:5]

def what_if_one(row):
    rows=[]
    for c in top5:
        base = row.copy()
        p0 = rf.predict_proba(imp.transform(base[feat].astype(float).to_frame().T.values))[:,1][0]
        for mult in (0.9, 1.1):
            test = base.copy()
            if pd.notna(test[c]):
                test[c] = test[c] * mult
            p = rf.predict_proba(imp.transform(test[feat].astype(float).to_frame().T.values))[:,1][0]
            rows.append({"feature": c, "changement": f"{int((mult-1)*100)}%",
                         "proba_base": round(p0,3), "proba_nouvelle": round(p,3),
                         "delta": round(p-p0,3)})
    return pd.DataFrame(rows).sort_values(["feature","changement"])

for tag, sk in ids.items():
    row = df_.loc[df_["SK_ID_CURR"]==sk].iloc[0]
    tbl = what_if_one(row)
    print(f"\n=== Cas {tag} — SK_ID_CURR={sk} — TARGET={int(row.TARGET)} — proba={row.proba:.3f}")
    display(tbl)



=== Cas near_thr — SK_ID_CURR=131387 — TARGET=0 — proba=0.600


Unnamed: 0,feature,changement,proba_base,proba_nouvelle,delta
4,EXT_SOURCE_1,-9%,0.6,0.599,-0.001
5,EXT_SOURCE_1,10%,0.6,0.595,-0.005
0,EXT_SOURCE_2,-9%,0.6,0.633,0.033
1,EXT_SOURCE_2,10%,0.6,0.544,-0.056
2,EXT_SOURCE_3,-9%,0.6,0.6,0.0
3,EXT_SOURCE_3,10%,0.6,0.599,-0.001
6,credit_term,-9%,0.6,0.637,0.037
7,credit_term,10%,0.6,0.587,-0.013
8,employed_years,-9%,0.6,0.602,0.002
9,employed_years,10%,0.6,0.603,0.003



=== Cas FN — SK_ID_CURR=329397 — TARGET=1 — proba=0.600


Unnamed: 0,feature,changement,proba_base,proba_nouvelle,delta
4,EXT_SOURCE_1,-9%,0.6,0.6,0.0
5,EXT_SOURCE_1,10%,0.6,0.6,0.0
0,EXT_SOURCE_2,-9%,0.6,0.606,0.006
1,EXT_SOURCE_2,10%,0.6,0.594,-0.006
2,EXT_SOURCE_3,-9%,0.6,0.608,0.008
3,EXT_SOURCE_3,10%,0.6,0.601,0.001
6,credit_term,-9%,0.6,0.609,0.009
7,credit_term,10%,0.6,0.515,-0.085
8,employed_years,-9%,0.6,0.596,-0.004
9,employed_years,10%,0.6,0.601,0.001



=== Cas FP — SK_ID_CURR=131387 — TARGET=0 — proba=0.600


Unnamed: 0,feature,changement,proba_base,proba_nouvelle,delta
4,EXT_SOURCE_1,-9%,0.6,0.599,-0.001
5,EXT_SOURCE_1,10%,0.6,0.595,-0.005
0,EXT_SOURCE_2,-9%,0.6,0.633,0.033
1,EXT_SOURCE_2,10%,0.6,0.544,-0.056
2,EXT_SOURCE_3,-9%,0.6,0.6,0.0
3,EXT_SOURCE_3,10%,0.6,0.599,-0.001
6,credit_term,-9%,0.6,0.637,0.037
7,credit_term,10%,0.6,0.587,-0.013
8,employed_years,-9%,0.6,0.602,0.002
9,employed_years,10%,0.6,0.603,0.003
