# Credit Default Prediction - Advanced Evaluation & Robustness

## 1. Introduction
**What:** This notebook presents a complete machine learning pipeline to predict credit default risk (Good vs. Bad credit).
**Why:** Credit default prediction is crucial for financial institutions to minimize risk and make informed lending decisions.
**Goal:** To train and evaluate multiple machine learning models, including advanced Gradient Boosting techniques (XGBoost, LightGBM, CatBoost) and Interpretability-focused models (GAMs).
**Advanced Features:**
*   **K-Fold Cross-Validation:** For robust performance estimation.
*   **Probability Calibration:** To ensure predicted probabilities reflect true risk.
*   **Cost-Sensitive Analysis:** Optimizing the decision threshold based on financial costs (False Negatives vs False Positives).
*   **Robustness Checks:** Comparing Class Weighting vs. SMOTE Resampling.


## 2. Setup and Imports
**What:** Importing necessary Python libraries.
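**Why:** We need `pandas`/`numpy` for data handling, `matplotlib`/`seaborn` for plotting, `sklearn` for preprocessing, modeling, and metrics, `xgboost`/`lightgbm`/`catboost` for gradient boosting, `imblearn` for resampling, and `interpret` for GAMs (Explainable Boosting Machines).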



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.metrics import (accuracy_score, roc_auc_score, confusion_matrix, classification_report, 
                             f1_score, average_precision_score, brier_score_loss, precision_recall_curve)
from sklearn.calibration import CalibratedClassifierCV, CalibrationDisplay
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Set visualization style: use seaborn's "whitegrid" theme for the plots below
sns.set_style("whitegrid")


## 3. Data Loading
**What:** Loading the German Credit dataset.
**Details:** Defining column names manually as the dataset lacks a header.


In [None]:
# Column names for the header-less data file: 20 attributes followed by the
# `credit_risk` label, in file order.
columns = [
    "status", "duration", "credit_history", "purpose", "credit_amount",
    "savings", "employment_duration", "installment_rate", "personal_status_sex",
    "other_debtors", "residence_since", "property", "age", "other_installment_plans",
    "housing", "existing_credits", "job", "people_liable", "telephone", "foreign_worker",
    "credit_risk"
]

def load_data(filepath="dataset.data"):
    """Load the space-separated credit dataset into a DataFrame.

    Args:
        filepath: Path of the header-less, single-space-delimited data file.
            Defaults to "dataset.data" in the current working directory.

    Returns:
        A DataFrame whose columns are named after `columns`, or None (after
        printing an error message) when `filepath` does not exist.
    """
    if not os.path.exists(filepath):
        print(f"Error: {filepath} not found.")
        return None
    # The file has no header row, so the names are supplied explicitly.
    return pd.read_csv(filepath, sep=" ", names=columns, header=None)

df = load_data()
if df is None:
    # Every later cell dereferences `df`; stop here with an explicit error
    # instead of failing further down with an AttributeError on None.
    raise FileNotFoundError(
        "Dataset could not be loaded; make sure 'dataset.data' is in the working directory."
    )
print("Dataset loaded successfully.")
display(df.head())


## 4. Data Cleaning and Mapping
**What:** Mapping cryptic codes to meaningful labels.
**Why:** To improve interpretability.


In [None]:
# Human-readable labels for the coded categorical attributes
# (codes taken from the dataset documentation).
mappings = {
    "status": {
        "A11": "< 0 DM",
        "A12": "0 <= ... < 200 DM",
        "A13": ">= 200 DM",
        "A14": "no checking",
    },
    "credit_history": {
        "A30": "no credits/paid",
        "A31": "all paid at this bank",
        "A32": "existing paid",
        "A33": "delay",
        "A34": "critical/other",
    },
    "purpose": {
        "A40": "car (new)",
        "A41": "car (used)",
        "A42": "furniture/equipment",
        "A43": "radio/tv",
        "A44": "domestic appliances",
        "A45": "repairs",
        "A46": "education",
        "A47": "vacation",
        "A48": "retraining",
        "A49": "business",
        "A410": "others",
    },
    "savings": {
        "A61": "< 100 DM",
        "A62": "100 <= ... < 500 DM",
        "A63": "500 <= ... < 1000 DM",
        "A64": ">= 1000 DM",
        "A65": "unknown/none",
    },
    "employment_duration": {
        "A71": "unemployed",
        "A72": "< 1 year",
        "A73": "1 <= ... < 4 years",
        "A74": "4 <= ... < 7 years",
        "A75": ">= 7 years",
    },
    "personal_status_sex": {
        "A91": "male: divorced/separated",
        "A92": "female: div/dep/mar",
        "A93": "male: single",
        "A94": "male: mar/wid",
        "A95": "female: single",
    },
    "other_debtors": {
        "A101": "none",
        "A102": "co-applicant",
        "A103": "guarantor",
    },
    "property": {
        "A121": "real estate",
        "A122": "building society/life ins",
        "A123": "car/other",
        "A124": "unknown/none",
    },
    "other_installment_plans": {
        "A141": "bank",
        "A142": "stores",
        "A143": "none",
    },
    "housing": {
        "A151": "rent",
        "A152": "own",
        "A153": "for free",
    },
    "job": {
        "A171": "unemployed/unskilled non-res",
        "A172": "unskilled res",
        "A173": "skilled",
        "A174": "management/self-employed",
    },
    "telephone": {
        "A191": "none",
        "A192": "yes",
    },
    "foreign_worker": {
        "A201": "yes",
        "A202": "no",
    },
}

# Decode each mapped column in place; columns absent from the frame are skipped.
for col in mappings:
    if col not in df.columns:
        continue
    decoded = df[col].map(mappings[col])
    # Codes missing from the mapping come back as NaN; keep their raw value.
    df[col] = decoded.fillna(df[col])

print("Data cleaning complete.")


## 5. Data Preprocessing
**What:** Preparing data for modeling.
**Steps:**
1.  **Target Label Recoding:** `credit_risk` 1 (Good) -> 0, 2 (Bad) -> 1, so that "Bad" (default) is the positive class.
2.  **Train/Test Split:** 80/20 split.
3.  **Preprocessing:** Scaling numericals, One-Hot Encoding categoricals.


In [None]:
# Recode the label so that "Bad" is the positive class:
# credit_risk == 1 (Good) -> 0, any other value (2 = Bad) -> 1
df['target'] = (df['credit_risk'] != 1).astype('int64')

# Features are everything except the raw label and its recoded copy
y = df['target']
X = df.drop(columns=['credit_risk', 'target'])

# Column groups are derived from dtypes: strings are categorical, ints/floats numeric
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Numeric columns are standardized; categorical columns are one-hot encoded
# into a dense array, and categories unseen during fit are ignored.
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    categorical_cols,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit on the training split only, then apply the fitted transform to the test split
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)
print(f"Training Set: {X_train.shape[0]}, Test Set: {X_test.shape[0]}")


## 6. Advanced Evaluation Setup
**What:** Defining the models and the evaluation framework.
**Key Components:**
*   **K-Fold Cross-Validation:** We use 5-fold stratified CV to get a stable estimate of model performance.
*   **Probability Calibration:** We apply Isotonic Regression to calibrate probabilities.
*   **Metrics:** We track Accuracy, ROC AUC, PR AUC, and Brier Score.
*   **Cost Analysis:** We define a custom cost function.


In [None]:
# Monotone constraints for the constrained XGBoost variant: +1 on `duration`
# and `credit_amount`, 0 (unconstrained) on every other preprocessed feature.
# The positions are looked up in `numerical_cols` and reused as column indices
# of the preprocessed matrix, which assumes the ColumnTransformer emits its
# 'num' block first.
numeric_feature_names = list(numerical_cols)
duration_idx = numeric_feature_names.index('duration')
amount_idx = numeric_feature_names.index('credit_amount')
n_features = X_train_preprocessed.shape[1]
monotone_constraints = tuple(
    1 if position in (duration_idx, amount_idx) else 0
    for position in range(n_features)
)

# Settings shared by both XGBoost variants
xgb_params = dict(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Base models, keyed by display name. The insertion order is the order in
# which they are trained and reported.
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000, class_weight='balanced', random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=100, class_weight='balanced', random_state=42
    ),
    "XGBoost": XGBClassifier(**xgb_params),
    "LightGBM": LGBMClassifier(random_state=42, verbose=-1),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42),
    "GAM (EBM)": ExplainableBoostingClassifier(random_state=42),
    "XGBoost (Constrained)": XGBClassifier(
        monotone_constraints=monotone_constraints, **xgb_params
    ),
}

# Cost Matrix
# Cost(FN) = 5 (Classifying Bad as Good)
# Cost(FP) = 1 (Classifying Good as Bad)
COST_FN = 5
COST_FP = 1

def calculate_cost(y_true, y_pred, cost_fn=None, cost_fp=None):
    """Return the total misclassification cost of binary predictions.

    Labels are expected to be 0 (good) / 1 (bad). False negatives and false
    positives are counted directly, so the result is well defined even when
    only one class occurs in the inputs (unpacking a confusion matrix fails
    there because the matrix collapses to 1x1).

    Args:
        y_true: Array-like of true labels (0/1).
        y_pred: Array-like of predicted labels (0/1), same length as `y_true`.
        cost_fn: Cost of one false negative; defaults to the module-level COST_FN.
        cost_fp: Cost of one false positive; defaults to the module-level COST_FP.

    Returns:
        FN * cost_fn + FP * cost_fp.
    """
    # Resolve defaults at call time so later changes to the constants are honored.
    if cost_fn is None:
        cost_fn = COST_FN
    if cost_fp is None:
        cost_fp = COST_FP

    # Plain arrays avoid pandas index alignment between a Series and an ndarray.
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    false_negatives = int(np.count_nonzero((actual == 1) & (predicted == 0)))
    false_positives = int(np.count_nonzero((actual == 0) & (predicted == 1)))
    return false_negatives * cost_fn + false_positives * cost_fp



## 7. Model Training, Calibration, and Evaluation
**What:** Training models, calibrating them, and evaluating on the test set.
**Why Calibration?** Many models (like Random Forest or Naive Bayes) do not output true probabilities. Calibration (Isotonic/Platt) adjusts the outputs so that a predicted probability of 0.8 actually means 80% of such cases are positive.
**Why PR-AUC?** For imbalanced datasets, Precision-Recall AUC is often more informative than ROC AUC.
**Why Brier Score?** It measures the accuracy of probabilistic predictions (lower is better).


In [None]:
# Per-model outputs: the final estimator, its test-set probabilities, and metrics.
results = {}

# Folds used for probability calibration. Isotonic calibration must be fitted
# on samples the base model was NOT trained on; fitting it on the training
# predictions of an already-fitted model yields a biased calibration map.
calibration_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print(f"{'Model':<25} | {'Acc':<6} | {'ROC':<6} | {'PR-AUC':<6} | {'Brier':<6} | {'Cost':<6}")
print("-" * 75)

for name, model in models.items():
    # Handle EBM (raw data) vs others (preprocessed)
    is_ebm = name == "GAM (EBM)"
    if is_ebm:
        X_curr = X_train
        X_test_curr = X_test
    else:
        X_curr = X_train_preprocessed
        X_test_curr = X_test_preprocessed

    # 1. Train (and calibrate) the model
    if is_ebm:
        # EBM is used as-is, without an extra calibration step.
        model.fit(X_curr, y_train)
        final_model = model
    else:
        # Cross-validated isotonic calibration: for each fold a clone of
        # `model` is fitted on the training part and its calibrator on the
        # held-out part; predictions average the per-fold calibrated models.
        # The instance stored in `models` is only a template and stays unfitted.
        final_model = CalibratedClassifierCV(model, method='isotonic', cv=calibration_cv)
        final_model.fit(X_curr, y_train)

    # 2. Predict probabilities of the positive class and hard labels
    y_prob = final_model.predict_proba(X_test_curr)[:, 1]
    y_pred = final_model.predict(X_test_curr)

    # 3. Calculate Metrics
    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_prob)
    pr_auc = average_precision_score(y_test, y_prob)  # average precision as PR-AUC
    brier = brier_score_loss(y_test, y_prob)
    cost = calculate_cost(y_test, y_pred)

    results[name] = {
        "model": final_model,
        "y_prob": y_prob,
        "metrics": {"acc": acc, "roc": roc, "pr_auc": pr_auc, "brier": brier, "cost": cost}
    }

    print(f"{name:<25} | {acc:.4f} | {roc:.4f} | {pr_auc:.4f} | {brier:.4f} | {cost:<6}")

    # 4. Calibration Curve
    CalibrationDisplay.from_predictions(y_test, y_prob, n_bins=10, name=name)
    plt.title(f"Calibration Curve - {name}")
    plt.show()



## 8. Cost-Sensitive Threshold Optimization
**What:** Finding the optimal probability threshold that minimizes the expected financial cost.
**Why:** The default threshold of 0.5 is rarely optimal when costs are asymmetric (Cost(FN) = 5 vs Cost(FP) = 1).
**Method:** We evaluate 100 evenly spaced thresholds between 0 and 1, calculate the total cost at each one, and pick the threshold with the minimum cost.


In [None]:
def plot_cost_curve(y_true, y_prob, model_name):
    """Plot total cost against the decision threshold and return the optimum.

    Sweeps 100 evenly spaced thresholds in [0, 1]; at each one a sample is
    predicted positive when its probability is >= the threshold, and the
    predictions are scored with `calculate_cost`.

    Args:
        y_true: True binary labels.
        y_prob: Predicted positive-class probabilities (NumPy array).
        model_name: Label used in the legend and the plot title.

    Returns:
        Tuple `(min_cost, best_thresh)`: the lowest total cost and the first
        threshold at which it is reached.
    """
    thresholds = np.linspace(0, 1, 100)
    costs = [
        calculate_cost(y_true, (y_prob >= cutoff).astype(int))
        for cutoff in thresholds
    ]

    # argmin returns the first minimum, so cost and threshold stay paired.
    best_idx = np.argmin(costs)
    min_cost = costs[best_idx]
    best_thresh = thresholds[best_idx]

    plt.figure(figsize=(8, 5))
    plt.plot(thresholds, costs, label=f'{model_name} (Min Cost: {min_cost})')
    plt.axvline(
        best_thresh,
        color='r',
        linestyle='--',
        label=f'Optimal Thresh: {best_thresh:.2f}',
    )
    plt.xlabel('Threshold')
    plt.ylabel('Total Cost')
    plt.title(f'Expected Cost by Threshold - {model_name}')
    plt.legend()
    plt.show()
    return min_cost, best_thresh

# Analyze Cost for the best performing model (e.g., CatBoost or EBM)
# The model is picked by a hard-coded name rather than selected from `results`;
# the name must be one of the keys stored in `results`.
best_model_name = "CatBoost" # Example choice
# Test-set probabilities of the positive class saved by the evaluation loop
y_prob_best = results[best_model_name]["y_prob"]
# NOTE(review): the threshold is tuned on the same test set it is reported on,
# so the minimum cost is likely optimistic — consider tuning on a validation split.
min_cost, best_thresh = plot_cost_curve(y_test, y_prob_best, best_model_name)
print(f"Optimal Threshold for {best_model_name}: {best_thresh:.2f} with Min Cost: {min_cost}")


## 9. Robustness: Class Weighting vs. SMOTE
**What:** Comparing two techniques to handle class imbalance.
**1. Class Weighting:** Assigns higher penalties to misclassifying the minority class directly in the loss function.
**2. SMOTE (Synthetic Minority Over-sampling Technique):** Generates synthetic examples for the minority class to balance the dataset.
**Why:** To see which method yields better robustness and performance for this specific dataset.


In [None]:
print("Comparing Class Weighting vs. SMOTE for Random Forest...")

# 1. Class Weighting: a fresh Random Forest with the same settings as the
#    entry in `models`, scored on hard (default-threshold) predictions.
rf_weighted = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf_weighted.fit(X_train_preprocessed, y_train)
y_pred_w = rf_weighted.predict(X_test_preprocessed)
cost_w = calculate_cost(y_test, y_pred_w)
print(f"Class Weighting Cost: {cost_w}")

# 2. SMOTE: the imblearn pipeline applies the resampler while fitting only,
#    so the test set is predicted as-is.
smote_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)) # No class_weight here
])
smote_pipeline.fit(X_train_preprocessed, y_train)
y_pred_s = smote_pipeline.predict(X_test_preprocessed)
cost_s = calculate_cost(y_test, y_pred_s)
print(f"SMOTE Cost:           {cost_s}")

# Lower total cost wins; equal costs are reported as a tie instead of being
# attributed to class weighting.
if cost_s < cost_w:
    print("Conclusion: SMOTE performed better.")
elif cost_w < cost_s:
    print("Conclusion: Class Weighting performed better.")
else:
    print("Conclusion: Both methods achieved the same cost.")


## 10. Parallel Cross-Validation
**What:** Running 5-Fold Stratified Cross-Validation in parallel (`n_jobs=-1`).
**Why:** To utilize all CPU cores for faster evaluation and get a statistically robust performance estimate.


In [None]:
print("Running Parallel Cross-Validation (5-Fold)...")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model_cv = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

# Wrap preprocessing and model in one pipeline and cross-validate on the RAW
# training frame, so the scaler/encoder are refit on each fold's training part
# and the held-out fold never influences the preprocessing statistics.
# cross_validate clones the pipeline, so the already-fitted `preprocessor`
# object itself is not modified.
cv_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', model_cv),
])

# Use cross_validate to get multiple metrics
scoring = {'accuracy': 'accuracy', 'roc_auc': 'roc_auc', 'f1': 'f1'}
scores = cross_validate(cv_pipeline, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)

# Mean and standard deviation over the five folds
print(f"Mean Accuracy: {scores['test_accuracy'].mean():.4f} (+/- {scores['test_accuracy'].std():.4f})")
print(f"Mean ROC AUC:  {scores['test_roc_auc'].mean():.4f} (+/- {scores['test_roc_auc'].std():.4f})")
print(f"Mean F1 Score: {scores['test_f1'].mean():.4f} (+/- {scores['test_f1'].std():.4f})")
