In [7]:
!pip install -q \
    lightgbm \
    shap \
    category_encoders \
    scikit-learn \
    pandas \
    numpy \
    matplotlib \
    tqdm \
    joblib


In [8]:
from pathlib import Path
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

BASE = Path("/content/Interpretable Machine Learning for Credit Risk Modeling using SHAP Values")
DATA_DIR = BASE / "data"
OUTPUT_DIR = BASE / "outputs"
DATA_DIR.mkdir(exist_ok=True)
OUTPUT_DIR.mkdir(exist_ok=True)

dataset_path = BASE / "default of credit card clients.csv"
if not dataset_path.exists():
    raise FileNotFoundError("Dataset not found at: " + str(dataset_path))

df = pd.read_csv(dataset_path)
df.columns = [c.strip() for c in df.columns]

possible_targets = [
    "default.payment.next.month",
    "default payment next month",
    "default_payment",
    "default",
    "DEFAULT",
    "target",
    "TARGET",
    "Y",
    "y",
    "Class"
]

target_col = None
clean_cols = {c: c.replace(".", "").replace("_", "").lower() for c in df.columns}
clean_expected = [t.replace(".", "").replace("_", "").lower() for t in possible_targets]

for c in df.columns:
    if clean_cols[c] in clean_expected:
        target_col = c
        break

if target_col is None:
    bin_candidates = [c for c in df.columns if set(df[c].dropna().unique()).issubset({0,1})]
    if len(bin_candidates) == 1:
        target_col = bin_candidates[0]
    else:
        raise ValueError("No binary target found.")

df = df.rename(columns={target_col: "target_default"})
df["target_default"] = pd.to_numeric(df["target_default"], errors="coerce")
df = df.dropna(subset=["target_default"])
df = df[df["target_default"].isin([0,1])]
df["target_default"] = df["target_default"].astype(int)

df.to_csv(DATA_DIR / "clean_dataset.csv", index=False)

print("Target column detected:", target_col)
print("Dataset cleaned and saved to:", DATA_DIR / "clean_dataset.csv")
df.head(3)


Target column detected: Y
Dataset cleaned and saved to: /content/Interpretable Machine Learning for Credit Risk Modeling using SHAP Values/data/clean_dataset.csv


Unnamed: 0.1,Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,target_default
1,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
2,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0


In [9]:
import os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import lightgbm as lgb
import shap
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

BASE = Path("/content/Interpretable Machine Learning for Credit Risk Modeling using SHAP Values")
DATA_PATH = BASE / "data" / "clean_dataset.csv"
OUT = BASE / "outputs"
OUT.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(DATA_PATH)

numeric_cols = [c for c in df.columns if c != "target_default" and pd.api.types.is_numeric_dtype(df[c])]
categorical_cols = [c for c in df.columns if c not in numeric_cols + ["target_default"]]

for c in numeric_cols:
    lo, hi = df[c].quantile(0.01), df[c].quantile(0.99)
    df[c] = df[c].clip(lo, hi)

df["interaction_limit_age"] = (df["LIMIT_BAL"] / df["AGE"].replace(0, df["AGE"].median())) if ("LIMIT_BAL" in df.columns and "AGE" in df.columns) else 0
bill_cols = [c for c in df.columns if "BILL" in c.upper()]
pay_cols = [c for c in df.columns if "PAY_AMT" in c.upper()]

df["avg_bill"] = df[bill_cols].mean(axis=1) if bill_cols else 0
df["avg_pay"] = df[pay_cols].mean(axis=1) if pay_cols else 0
df["interaction_pay_bill"] = df["avg_pay"] / df["avg_bill"].replace(0, df["avg_bill"].median())

engineered = ["interaction_limit_age","avg_bill","avg_pay","interaction_pay_bill"]
all_numeric = list(dict.fromkeys(numeric_cols + engineered))
all_numeric = [c for c in all_numeric if df[c].nunique() > 1]

imputer = IterativeImputer(random_state=42)
numeric_imputed = pd.DataFrame(imputer.fit_transform(df[all_numeric]), columns=all_numeric)
df[all_numeric] = numeric_imputed

X = df.drop(columns=["target_default"])
y = df["target_default"]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)

preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), all_numeric),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols)
    ],
    remainder="drop"
)

clf = lgb.LGBMClassifier(objective="binary", random_state=42)
pipe = Pipeline([("preprocessor", preprocessor), ("clf", clf)])

param_dist = {
    "clf__n_estimators": [200,400,600],
    "clf__learning_rate": [0.01,0.03,0.05],
    "clf__num_leaves": [31,63,127]
}

rs = RandomizedSearchCV(pipe, param_dist, n_iter=10, scoring="roc_auc", cv=3, random_state=42, n_jobs=-1)
rs.fit(X_train, y_train)

best = rs.best_estimator_
joblib.dump(best, OUT / "best_model.joblib")

y_proba = best.predict_proba(X_test)[:,1]
y_pred = (y_proba>=0.5).astype(int)

metrics = {
    "AUC": float(roc_auc_score(y_test,y_proba)),
    "F1": float(f1_score(y_test,y_pred)),
    "Precision": float(precision_score(y_test,y_pred)),
    "Recall": float(recall_score(y_test,y_pred))
}
json.dump(metrics, open(OUT/"metrics.json","w"), indent=2)

preproc = best.named_steps["preprocessor"]
clf = best.named_steps["clf"]

X_test_proc = preproc.transform(X_test)
X_test_proc_df = pd.DataFrame(X_test_proc)

explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_test_proc_df)
shap_pos = shap_values[1] if isinstance(shap_values, list) else shap_values

plt.figure(figsize=(10,6))
shap.summary_plot(shap_pos, X_test_proc_df, show=False)
plt.savefig(OUT/"shap_summary.png", dpi=200)
plt.close()

print("Training done. Outputs saved to:", OUT)


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003693 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3458
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221208 -> initscore=-1.258639
[LightGBM] [Info] Start training from score -1.258639
Training done. Outputs saved to: /content/Interpretable Machine Learning for Credit Risk Modeling using SHAP Values/outputs


In [10]:
import os, zipfile
from pathlib import Path
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
import joblib

BASE = Path("/content/Interpretable Machine Learning for Credit Risk Modeling using SHAP Values")
OUT = BASE / "outputs"
OUT.mkdir(exist_ok=True)

best = joblib.load(OUT/"best_model.joblib")

preproc = best.named_steps["preprocessor"]
clf = best.named_steps["clf"]

df = pd.read_csv(BASE/"data/clean_dataset.csv")
X = df.drop(columns=["target_default"])
y = df["target_default"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)

X_test_proc = preproc.transform(X_test)
X_test_proc_df = pd.DataFrame(X_test_proc)

explainer = shap.TreeExplainer(clf)
shap_vals = explainer.shap_values(X_test_proc_df)
shap_pos = shap_vals[1] if isinstance(shap_vals, list) else shap_vals

y_proba = clf.predict_proba(X_test_proc)[:,1]

pred_df = pd.DataFrame({"proba": y_proba, "true": y_test.reset_index(drop=True)})

idx_high = pred_df['proba'].idxmax()
idx_low = pred_df['proba'].idxmin()
idx_border = (pred_df['proba'] - 0.5).abs().idxmin()

selected = [idx_high, idx_low, idx_border]

for i, sel in enumerate(selected):
    fp = shap.force_plot(
        explainer.expected_value[1] if isinstance(explainer.expected_value, list) else explainer.expected_value,
        shap_pos[sel],
        X_test_proc_df.iloc[[sel]],
        matplotlib=False
    )
    shap.save_html(str(OUT / f"force_plot_case_{i+1}.html"), fp)

    shap_bar = pd.Series(shap_pos[sel]).sort_values()
    plt.figure(figsize=(8,6))
    shap_bar.tail(10).plot(kind='barh')
    plt.title(f"SHAP Contributions Case {i+1}")
    plt.tight_layout()
    plt.savefig(OUT / f"force_plot_case_{i+1}.png")
    plt.close()

with zipfile.ZipFile(BASE/"final_project_outputs.zip","w") as z:
    for root, dirs, files in os.walk(OUT):
        for file in files:
            fp = os.path.join(root,file)
            z.write(fp, arcname=os.path.relpath(fp, BASE))

print("ZIP created at:", BASE/"final_project_outputs.zip")


ZIP created at: /content/Interpretable Machine Learning for Credit Risk Modeling using SHAP Values/final_project_outputs.zip
