# Notebook 03 – Model Validation

Objective:  
- Run cross-validation for candidate models (LogReg, RF, XGBoost).  
- Evaluate using credit scoring metrics (ROC AUC, recall, precision, F1, accuracy, precision@k).  
- Select top hyperparameter configurations.  
- Export candidate definitions to `configs/training_candidates.json` for pipeline training.


In [2]:
import json

import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [3]:
# Load dataset (curated version with decoded categories)
df = pd.read_csv("../german_credit.csv")

# Split the frame into predictors (X) and the target column (y).
X = df.drop(columns="CreditRisk")
y = df["CreditRisk"]

# Load feature groups (saved from Notebook 01).
# The file is opened with an explicit encoding so the read does not depend on
# the platform's default locale.
with open("../configs/feature_groups.json", "r", encoding="utf-8") as f:
    feature_groups = json.load(f)

num_cols = feature_groups["num_cols"]
simple_cat_cols = feature_groups["simple_cat_cols"]
complex_cat_cols = feature_groups["complex_cat_cols"]

# Fail fast with a readable message if the saved groups name columns that are
# not in the dataset, rather than surfacing later inside the ColumnTransformer.
grouped_cols = [*num_cols, *simple_cat_cols, *complex_cat_cols]
missing_cols = sorted(set(grouped_cols) - set(X.columns))
assert not missing_cols, (
    f"Feature groups reference columns absent from the dataset: {missing_cols}"
)

# Last expression: display the three groups for a quick visual check.
num_cols, simple_cat_cols, complex_cat_cols

(['Duration',
  'CreditAmount',
  'InstallmentRate',
  'ResidenceSince',
  'Age',
  'ExistingCredits',
  'PeopleLiable'],
 ['OtherDetors',
  'OtherInstallmentPlans',
  'Housing',
  'Telephone',
  'ForeignWorker'],
 ['Status',
  'CreditHistory',
  'Purpose',
  'Savings',
  'Employment',
  'SexAndStatus',
  'Property',
  'Job'])

In [5]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# target and seeded, consistent with the preprocessing component.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Show the resulting (rows, columns) of both partitions.
(X_train.shape, X_test.shape)

((800, 20), (200, 20))

In [6]:
# Numeric features: fill missing values with the median, then standardise.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Simple categoricals: fill missing values with the mode, then one-hot encode.
# Categories unseen at fit time are ignored at transform time; output is dense.
simple_cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Complex categoricals: fill missing values with the mode, then target-encode.
complex_cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("targetenc", TargetEncoder()),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("simple_cat", simple_cat_pipeline, simple_cat_cols),
    ("complex_cat", complex_cat_pipeline, complex_cat_cols),
])


In [9]:
# Ratio of negative (0) to positive (1) training labels, passed to XGBoost
# as its scale_pos_weight.
n_neg = (y_train == 0).sum()
n_pos = (y_train == 1).sum()
scale_pos_weight = n_neg / n_pos

# Registry of candidates: model key -> (unfitted estimator, parameter grid).
# Grid keys carry the "model__" prefix because the estimator is later placed
# in a Pipeline step named "model".
models_and_grids = {}

models_and_grids["logreg"] = (
    LogisticRegression(class_weight="balanced", solver="liblinear", max_iter=1000),
    {
        "model__C": [0.01, 0.1, 1, 10],
        "model__penalty": ["l1", "l2"],
    },
)

models_and_grids["rf"] = (
    RandomForestClassifier(class_weight="balanced", random_state=42),
    {
        "model__n_estimators": [100, 200],
        "model__max_depth": [None, 5, 10],
        "model__min_samples_split": [2, 5],
    },
)

models_and_grids["xgb"] = (
    XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        eval_metric="auc",  # no more 'use_label_encoder'
    ),
    {
        "model__n_estimators": [100, 200],
        "model__max_depth": [3, 5],
        "model__learning_rate": [0.1, 0.01],
        "model__subsample": [0.8, 1.0],
    },
)


In [10]:
# Metrics computed for every grid point. Model selection is still driven by
# ROC AUC (see `refit` below); the remaining metrics are reported next to it.
# NOTE(review): precision@k from the notebook objective is not computed here
# because no value of k is defined in this notebook — TODO confirm k and add.
scoring = {
    "roc_auc": "roc_auc",
    "recall": "recall",
    "precision": "precision",
    "f1": "f1",
    "accuracy": "accuracy",
}

# One record per model with its best grid point, consumed by later cells.
cv_results = []
# Secondary CV metrics of each model's best grid point (reporting only).
cv_metric_rows = []

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for model_name, (estimator, param_grid) in models_and_grids.items():
    print(f"Running CV for {model_name}...")

    # Preprocessing lives inside the pipeline so it is re-fitted on each
    # training fold rather than on the full training set.
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("model", estimator)
    ])

    # Multi-metric search: `refit="roc_auc"` makes best_params_/best_score_
    # refer to ROC AUC, so the selected configuration is the same as with
    # single-metric `scoring="roc_auc"`.
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        refit="roc_auc",
        n_jobs=-1,
        verbose=1
    )

    grid.fit(X_train, y_train)

    # Log results
    best_params = grid.best_params_
    best_score = grid.best_score_
    print(f"Best CV ROC AUC for {model_name}: {best_score:.3f}")

    cv_results.append({
        "model": model_name,
        "best_params": best_params,
        "cv_score": best_score
    })

    # Mean test score of every tracked metric at the ROC-AUC-best grid point.
    best_idx = grid.best_index_
    cv_metric_rows.append({
        "model": model_name,
        **{
            metric: grid.cv_results_[f"mean_test_{metric}"][best_idx]
            for metric in scoring
        },
    })

# Side-by-side view of all tracked metrics for each model's best configuration.
cv_metrics_df = pd.DataFrame(cv_metric_rows).set_index("model")
cv_metrics_df


Running CV for logreg...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best CV ROC AUC for logreg: 0.779
Running CV for rf...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best CV ROC AUC for rf: 0.790
Running CV for xgb...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best CV ROC AUC for xgb: 0.792


In [11]:
# Rank the per-model records from highest to lowest CV score and show the table.
cv_results_df = pd.DataFrame(cv_results).sort_values("cv_score", ascending=False)
display(cv_results_df)

# Keep the two best-ranked rows as a list of plain dict records.
top_candidates = cv_results_df.iloc[:2].to_dict("records")
top_candidates


Unnamed: 0,model,best_params,cv_score
2,xgb,"{'model__learning_rate': 0.1, 'model__max_dept...",0.791778
1,rf,"{'model__max_depth': 5, 'model__min_samples_sp...",0.789918
0,logreg,"{'model__C': 10, 'model__penalty': 'l2'}",0.779464


[{'model': 'xgb',
  'best_params': {'model__learning_rate': 0.1,
   'model__max_depth': 3,
   'model__n_estimators': 100,
   'model__subsample': 1.0},
  'cv_score': 0.7917782738095238},
 {'model': 'rf',
  'best_params': {'model__max_depth': 5,
   'model__min_samples_split': 2,
   'model__n_estimators': 200},
  'cv_score': 0.7899181547619049}]

In [None]:
def _json_default(value):
    """Fallback for ``json.dump``: unwrap NumPy scalars into built-in Python values.

    Without this hook, a NumPy integer in ``best_params`` (e.g. from a grid
    built with ``np.arange``) makes ``json.dump`` raise ``TypeError``.

    Raises:
        TypeError: for any other object that ``json`` cannot serialise.
    """
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Write the selection metric, the acceptance threshold and the selected
# candidates to a single JSON file for pipeline training.
with open("../configs/training_candidates.json", "w", encoding="utf-8") as f:
    json.dump(
        {
            "selection_metric": "roc_auc",
            "acceptance_criteria": {"min_roc_auc": 0.70},
            "candidates": top_candidates,
        },
        f,
        indent=2,
        default=_json_default,
    )

print("Top candidates exported to configs/training_candidates.json")
