# Core Model Zoo — Tuning Template
This notebook demonstrates leak‑free preprocessing with ColumnTransformer + Pipeline, and uses RandomizedSearchCV / grids to tune a few strong baselines.
**Steps**: synth data → split → preprocess → tune → calibrate (logistic regression only) → evaluate → save.


In [None]:

import numpy as np, pandas as pd, joblib
from pathlib import Path
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score, brier_score_loss
from importlib import import_module

# --- Synthetic dataset ------------------------------------------------------
# Imbalanced binary problem (class weights 0.85 / 0.15) with 20 numeric
# features, 8 of which are informative.
X_num, y = make_classification(
    n_samples=6000,
    n_features=20,
    n_informative=8,
    weights=[0.85, 0.15],
    random_state=42,
)
X = pd.DataFrame(X_num, columns=[f"f{i}" for i in range(20)])

# Two categorical columns sampled from fixed probabilities, without reference
# to y. The draw order (country, then device) matters for the seeded generator.
rng = np.random.default_rng(42)
n_rows = len(X)
X["country"] = rng.choice(["US", "DE", "IN", "BR"], size=n_rows, p=[.4, .2, .25, .15])
X["device"] = rng.choice(["mobile", "desktop", "tablet"], size=n_rows, p=[.6, .35, .05])

# --- Hold-out split ---------------------------------------------------------
# Stratified so the class ratio is the same in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- Preprocessing ----------------------------------------------------------
# Column groups are derived from the training frame's dtypes.
num_cols = list(X_train.select_dtypes(include=np.number).columns)
cat_cols = list(X_train.select_dtypes(exclude=np.number).columns)

# Scale numeric columns and one-hot encode categoricals; categories unseen at
# fit time are ignored at transform time instead of raising.
pre = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# --- Search configuration ---------------------------------------------------
# Search spaces live in a separate `grids` module that must be importable.
grids = import_module("grids")
# Shared 5-fold stratified, shuffled CV splitter used by every search below.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


## 1) Logistic Regression baseline

In [None]:

# Logistic regression inside the shared preprocessing pipeline. The estimator
# is seeded (as the RF and HGB models are) so that a stochastic solver, if the
# search space samples one, gives repeatable results.
pipe_lr = Pipeline([
    ("pre", pre),
    ("model", LogisticRegression(max_iter=1000, random_state=42)),
])

# Tune on ROC AUC, the same objective the other searches use and the metric the
# final model selection ranks by. F1 at the default 0.5 threshold would be
# measured on the *uncalibrated* model, which is not what is evaluated below.
search_lr = RandomizedSearchCV(
    pipe_lr,
    param_distributions=grids.logreg_dist,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    scoring="roc_auc",
    random_state=42,
)
search_lr.fit(X_train, y_train)
best_lr = search_lr.best_estimator_

# Calibrate the tuned pipeline with 3-fold CV on the training data only; the
# whole pipeline (preprocessing included) is refit inside each fold.
cal_lr = CalibratedClassifierCV(best_lr, cv=3).fit(X_train, y_train)

# Positive-class probabilities on the held-out test set.
p_lr = cal_lr.predict_proba(X_test)[:, 1]
print("LR ROC AUC:", roc_auc_score(y_test, p_lr), "AP:", average_precision_score(y_test, p_lr), "Brier:", brier_score_loss(y_test, p_lr))
# Hard labels at the 0.5 probability threshold.
print(classification_report(y_test, (p_lr >= 0.5).astype(int)))


## 2) Random Forest

In [None]:

# Random forest on top of the shared preprocessor.
# NOTE(review): this pipeline names its estimator step "clf" while the other
# pipelines use "model"; the keys in grids.rf_grid must use the matching
# "clf__" prefix — confirm against the grids module.
rf_estimator = RandomForestClassifier(random_state=42, n_jobs=-1)
pipe_rf = Pipeline(steps=[("pre", pre), ("clf", rf_estimator)])

# Randomized search over grids.rf_grid, scored by cross-validated ROC AUC.
search_rf = RandomizedSearchCV(
    pipe_rf,
    param_distributions=grids.rf_grid,
    n_iter=20,
    cv=cv,
    n_jobs=-1,
    scoring="roc_auc",
    random_state=42,
)
search_rf.fit(X_train, y_train)
best_rf = search_rf.best_estimator_

# Positive-class probabilities on the held-out test set.
p_rf = best_rf.predict_proba(X_test)[:, 1]
print(
    "RF ROC AUC:", roc_auc_score(y_test, p_rf),
    "AP:", average_precision_score(y_test, p_rf),
)


## 3) Histogram-based Gradient Boosting

In [None]:

# Histogram-based gradient boosting with early stopping enabled, behind the
# shared preprocessor.
hgb_estimator = HistGradientBoostingClassifier(early_stopping=True, random_state=42)
pipe_hgb = Pipeline(steps=[("pre", pre), ("model", hgb_estimator)])

# Randomized search over grids.hgb_dist, scored by cross-validated ROC AUC.
search_hgb = RandomizedSearchCV(
    pipe_hgb,
    param_distributions=grids.hgb_dist,
    n_iter=20,
    cv=cv,
    n_jobs=-1,
    scoring="roc_auc",
    random_state=42,
)
search_hgb.fit(X_train, y_train)
best_hgb = search_hgb.best_estimator_

# Positive-class probabilities on the held-out test set.
p_hgb = best_hgb.predict_proba(X_test)[:, 1]
print(
    "HGB ROC AUC:", roc_auc_score(y_test, p_hgb),
    "AP:", average_precision_score(y_test, p_hgb),
)


## 4) SVC (RBF) — scaled

In [None]:

# RBF-kernel SVC behind the shared preprocessor (numeric features are scaled
# there). probability=True is required for predict_proba below; those
# probability estimates come from an internal, shuffled cross-validation, so
# the estimator is seeded to make them reproducible across runs.
pipe_svc = Pipeline([
    ("pre", pre),
    ("model", SVC(kernel="rbf", probability=True, random_state=42)),
])

# Randomized search over grids.svc_dist, scored by cross-validated ROC AUC.
search_svc = RandomizedSearchCV(
    pipe_svc,
    param_distributions=grids.svc_dist,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    scoring="roc_auc",
    random_state=42,
)
search_svc.fit(X_train, y_train)
best_svc = search_svc.best_estimator_

# Positive-class probabilities on the held-out test set.
p_svc = best_svc.predict_proba(X_test)[:, 1]
print("SVC ROC AUC:", roc_auc_score(y_test, p_svc), "AP:", average_precision_score(y_test, p_svc))


## 5) kNN (remember: scaling matters)

In [None]:

# NOTE(review): randint is not used in this cell; the import is kept so the
# name stays defined for anything that relies on it.
from scipy.stats import randint

# k-nearest neighbours behind the shared preprocessor, which scales the
# numeric features.
knn_estimator = KNeighborsClassifier()
pipe_knn = Pipeline(steps=[("pre", pre), ("model", knn_estimator)])

# Randomized search over grids.knn_grid, scored by cross-validated ROC AUC.
search_knn = RandomizedSearchCV(
    pipe_knn,
    param_distributions=grids.knn_grid,
    n_iter=20,
    cv=cv,
    n_jobs=-1,
    scoring="roc_auc",
    random_state=42,
)
search_knn.fit(X_train, y_train)
best_knn = search_knn.best_estimator_

# Positive-class probabilities on the held-out test set.
p_knn = best_knn.predict_proba(X_test)[:, 1]
print(
    "kNN ROC AUC:", roc_auc_score(y_test, p_knn),
    "AP:", average_precision_score(y_test, p_knn),
)


## Save the best (by AUC)

In [None]:

# Positive-class test probabilities for every candidate model.
cands = {"lr": p_lr, "rf": p_rf, "hgb": p_hgb, "svc": p_svc, "knn": p_knn}

# Score each candidate once, then take the highest test ROC AUC (ties resolve
# to the first candidate in insertion order).
# NOTE(review): choosing the winner on the test set makes its reported test
# score optimistic; a separate validation split would avoid that.
test_auc = {name: roc_auc_score(y_test, proba) for name, proba in cands.items()}
best_name = max(test_auc, key=test_auc.get)
best_model = {"lr": cal_lr, "rf": best_rf, "hgb": best_hgb, "svc": best_svc, "knn": best_knn}[best_name]
print("Best by ROC AUC:", best_name)

# Create the output directory first: joblib.dump does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
model_path = Path("../models/best_pipeline.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, str(model_path))
