In [None]:
# %%
# 🎯 04_hyperparam_tuning.ipynb – Systematic Model Tuning & Test Prediction Export
# ------------------------------------------------------------------------------

import sys
from pathlib import Path

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, brier_score_loss
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Resolve the project root: when launched from the "notebooks" folder, step up one level.
_cwd = Path.cwd()
BASE = _cwd.parent if _cwd.name == "notebooks" else _cwd

# Input features are read from data/interim; models and predictions are written to outputs/.
DATA_INTERIM = BASE.joinpath("data", "interim")
OUTPUTS = BASE.joinpath("outputs")
OUTPUTS.mkdir(exist_ok=True)

# Seed NumPy's global random generator.
np.random.seed(42)

# %%
# === Load Data ===
# Read the engineered training table and separate the label, the race identifier and the features.
train = pd.read_csv(DATA_INTERIM / "train_features.csv")
target = train["Winner"]
groups = train["Race_ID"]
X = train.drop(columns=["Race_ID", "Horse", "Winner"], errors="ignore")

# Keep only numeric feature columns; anything else (datetime, string/object) is reported and dropped.
non_numeric = X.select_dtypes(exclude=["number"]).columns
if not non_numeric.empty:
    print(f"Dropping non-numeric columns: {list(non_numeric)}")
    X = X.drop(columns=non_numeric)

# Hold out 20% of the rows for validation, stratified on the label for class balance.
# NOTE(review): rows are split individually, so the same Race_ID can appear on both sides;
# `groups` is carried through the split but does not constrain it - confirm this is intended.
(
    X_train, X_val,
    y_train, y_val,
    groups_train, groups_val,
) = train_test_split(
    X,
    target,
    groups,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

# %%
# === LightGBM Hyperparameter Tuning ===
# Candidate values sampled by the randomized search (10 combinations, 3-fold CV).
lgb_params = {
    "n_estimators": [100, 200, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5, 7, 10, -1],
    "num_leaves": [15, 31, 63, 127],
    "min_child_samples": [5, 10, 20, 50],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
}

# subsample_freq=1 enables row bagging on every boosting iteration. With LightGBM's
# default of 0 bagging is disabled, so the "subsample" candidates above would
# otherwise have no effect on the fitted models.
lgb_clf = lgb.LGBMClassifier(
    objective="binary",
    subsample_freq=1,
    random_state=42,
    n_jobs=-1,
)
lgb_search = RandomizedSearchCV(
    lgb_clf, lgb_params, n_iter=10, cv=3, scoring="neg_log_loss", verbose=2, random_state=42, n_jobs=-1
)
lgb_search.fit(X_train, y_train)
print("Best LGBM params:", lgb_search.best_params_)

# Score the best configuration on the held-out validation rows.
lgb_best = lgb_search.best_estimator_
lgb_probs = lgb_best.predict_proba(X_val)[:, 1]  # probability of the positive class
print("LGBM Log Loss:", log_loss(y_val, lgb_probs))
print("LGBM Brier Score:", brier_score_loss(y_val, lgb_probs))

# %%
# === Random Forest Hyperparameter Tuning ===
# Candidate values sampled by the randomized search.
rf_params = {
    "n_estimators": [100, 300, 500],
    "max_depth": [6, 10, 15, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
}

rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_params,
    n_iter=10,
    cv=3,
    scoring="neg_log_loss",
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_search.fit(X_train, y_train)
print("Best RF params:", rf_search.best_params_)

# Score the best configuration on the held-out validation rows.
rf_best = rf_search.best_estimator_
rf_probs = rf_best.predict_proba(X_val)[:, 1]  # probability of the positive class
print("Random Forest Log Loss:", log_loss(y_val, rf_probs))
print("Random Forest Brier Score:", brier_score_loss(y_val, rf_probs))

# %%
# === XGBoost Hyperparameter Tuning ===
# Candidate values sampled by the randomized search (10 combinations, 3-fold CV).
xgb_params = {
    "n_estimators": [100, 300, 500],
    "max_depth": [3, 6, 10],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
}

# `use_label_encoder` is intentionally not passed: XGBoost reports it as an
# unused parameter and emits a warning on every fit.
xgb_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
)
xgb_search = RandomizedSearchCV(
    xgb_clf, xgb_params, n_iter=10, cv=3, scoring="neg_log_loss", verbose=2, random_state=42, n_jobs=-1
)
xgb_search.fit(X_train, y_train)
print("Best XGB params:", xgb_search.best_params_)

# Score the best configuration on the held-out validation rows.
xgb_best = xgb_search.best_estimator_
xgb_probs = xgb_best.predict_proba(X_val)[:, 1]  # probability of the positive class
print("XGBoost Log Loss:", log_loss(y_val, xgb_probs))
print("XGBoost Brier Score:", brier_score_loss(y_val, xgb_probs))

# %%
# === CatBoost Hyperparameter Tuning ===
# Candidate values sampled by the randomized search.
cat_params = {
    "iterations": [200, 300, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "depth": [4, 6, 8],
    "l2_leaf_reg": [1, 3, 5, 7, 9],
}

cat = CatBoostClassifier(loss_function="Logloss", eval_metric="Logloss", random_seed=42, verbose=100)
cat_search = RandomizedSearchCV(
    estimator=cat,
    param_distributions=cat_params,
    n_iter=10,
    cv=3,
    scoring="neg_log_loss",
    random_state=42,
    verbose=2,
    n_jobs=-1,
)
cat_search.fit(X_train, y_train)
print("Best CatBoost params:", cat_search.best_params_)

# Score the best configuration on the held-out validation rows.
cat_best = cat_search.best_estimator_
cat_probs = cat_best.predict_proba(X_val)[:, 1]  # probability of the positive class
print("CatBoost Log Loss:", log_loss(y_val, cat_probs))
print("CatBoost Brier Score:", brier_score_loss(y_val, cat_probs))

# %%
# === MLP Hyperparameter Tuning ===
# Keys carry the "mlp__" prefix so the search routes them to the "mlp" step of the pipeline.
mlp_params = {
    "mlp__hidden_layer_sizes": [(64,), (128,), (64, 32)],
    "mlp__activation": ["relu", "tanh"],
    "mlp__solver": ["adam", "sgd"],
    "mlp__alpha": [0.0001, 0.001, 0.01],
    "mlp__learning_rate": ["constant", "adaptive"],
}

# Neural networks are sensitive to feature scale, so the inputs are standardized first.
# Keeping the scaler inside the pipeline means it is re-fitted on each CV training fold
# (no leakage from the validation fold) and is saved together with the network, so the
# same transformation is applied automatically at prediction time.
mlp = Pipeline([
    ("scaler", StandardScaler()),
    ("mlp", MLPClassifier(max_iter=200, random_state=42)),
])
mlp_search = RandomizedSearchCV(
    mlp, mlp_params, n_iter=10, cv=3, scoring="neg_log_loss", random_state=42, verbose=2, n_jobs=-1
)
mlp_search.fit(X_train, y_train)
print("Best MLP params:", mlp_search.best_params_)

# Score the best configuration (scaler + network) on the held-out validation rows.
mlp_best = mlp_search.best_estimator_
mlp_probs = mlp_best.predict_proba(X_val)[:, 1]  # probability of the positive class
print("MLP Log Loss:", log_loss(y_val, mlp_probs))
print("MLP Brier Score:", brier_score_loss(y_val, mlp_probs))

# %%
# === Save Best Models ===
# Each tuned estimator is serialized into OUTPUTS under a fixed file name.
tuned_models = {
    "best_lgbm_model.joblib": lgb_best,
    "best_rf_model.joblib": rf_best,
    "best_xgb_model.joblib": xgb_best,
    "best_cat_model.joblib": cat_best,
    "best_mlp_model.joblib": mlp_best,
}
for file_name, estimator in tuned_models.items():
    joblib.dump(estimator, OUTPUTS / file_name)
print("✅ All best models saved to outputs/")

# %%
# === Optional: Show All Results Side by Side ===
# Validation-set probabilities per model, in display order.
val_probs = {
    "LGBM": lgb_probs,
    "RF": rf_probs,
    "XGB": xgb_probs,
    "CatBoost": cat_probs,
    "MLP": mlp_probs,
}
results = pd.DataFrame({
    "Model": list(val_probs),
    "LogLoss": [log_loss(y_val, probs) for probs in val_probs.values()],
    "BrierScore": [brier_score_loss(y_val, probs) for probs in val_probs.values()],
})
print(results)

# %%
# === Generate and Save Test Predictions for All Tuned Models ===

# Identifier columns that are copied into every prediction file.
meta_cols = ["Race_ID", "Horse"]

# Read the test features fresh from disk to avoid column mismatches.
test_features_path = DATA_INTERIM / "test_features.csv"
X_test = pd.read_csv(test_features_path)

def _recorded_feature_names(model, available_columns):
    """Return the feature names stored on a fitted model, or None if unusable."""
    # scikit-learn estimators use `feature_names_in_`; CatBoost exposes
    # `feature_names_` and LightGBM `feature_name_`.
    for attr in ("feature_names_in_", "feature_names_", "feature_name_"):
        names = getattr(model, attr, None)
        if names is None or len(names) == 0:
            continue
        names = list(names)
        # Models fitted on plain arrays record synthetic names that are not
        # columns of the test frame; skip those instead of raising KeyError.
        if all(name in available_columns for name in names):
            return names
    return None


def predict_and_save(model_path, pred_name, is_catboost=False):
    """Load a saved model, score the test set and write the probabilities to CSV.

    Reads the module-level ``X_test`` frame and ``meta_cols`` list and writes
    ``test_preds_<pred_name>.csv`` into ``OUTPUTS``. The file contains the
    ``meta_cols`` columns plus ``Predicted_Probability`` (positive class).

    Args:
        model_path: File name of the joblib-serialized model inside ``OUTPUTS``.
        pred_name: Suffix used in the output file name.
        is_catboost: When true, wrap the features in a CatBoost ``Pool`` and
            declare object/category columns as categorical features.

    Returns:
        None. If the model file does not exist, a message is printed and
        nothing is written.
    """
    path = OUTPUTS / model_path
    if not path.exists():
        print(f"❌ Model not found: {path}")
        return
    # NOTE(review): joblib.load unpickles the file - only load artifacts produced by this project.
    model = joblib.load(path)

    # Align features with what the model saw at fit time. Checking only
    # `feature_names_in_` sends models that record their columns under another
    # attribute to the fallback below, which keeps every non-meta column of the
    # test file (including non-numeric ones the training step dropped).
    feature_cols = _recorded_feature_names(model, X_test.columns)
    if feature_cols is not None:
        X_test_model = X_test[feature_cols]
    else:
        X_test_model = X_test.drop(columns=meta_cols, errors="ignore")

    # CatBoost: use Pool and categorical indices
    if is_catboost:
        from catboost import Pool
        cat_features = [
            i for i, col in enumerate(X_test_model.columns)
            if X_test_model[col].dtype == 'object' or X_test_model[col].dtype.name == 'category'
        ]
        pool = Pool(X_test_model, cat_features=cat_features)
        probs = model.predict_proba(pool)[:, 1]
    else:
        probs = model.predict_proba(X_test_model)[:, 1]

    df_pred = X_test[meta_cols].copy()
    df_pred["Predicted_Probability"] = probs
    out_file = f"test_preds_{pred_name}.csv"
    df_pred.to_csv(OUTPUTS / out_file, index=False)
    print(f"✅ {out_file} saved.")

# (saved model file, prediction name, needs the CatBoost Pool path)
tuned_exports = [
    ("best_lgbm_model.joblib", "lgbm_tuned", False),
    ("best_rf_model.joblib", "rf_tuned", False),
    ("best_xgb_model.joblib", "xgb_tuned", False),
    ("best_cat_model.joblib", "cat_tuned", True),
    ("best_mlp_model.joblib", "mlp_tuned", False),
]
for model_file, tag, needs_pool in tuned_exports:
    predict_and_save(model_file, tag, is_catboost=needs_pool)

# %%
print("✅ Hyperparameter tuning AND test prediction export complete. Ready for stacking/ensemble in 05.")



Dropping non-numeric columns: ['Race_Time', 'Course', 'Going', 'Trainer', 'Jockey', 'RecentRun', 'Distance_Bucket']
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[LightGBM] [Info] Number of positive: 4278, number of negative: 37401
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004904 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4672
[LightGBM] [Info] Number of data points in the train set: 41679, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.102642 -> initscore=-2.168212
[LightGBM] [Info] Start training from score -2.168212
Best LGBM params: {'subsample': 0.8, 'num_leaves': 63, 'n_estimators': 100, 'min_child_samples': 5, 'max_depth': 5, 'learning_rate': 0.05, 'colsample_bytree': 1.0}
LGBM Log Loss: 0.3101526272057301
LGBM Brier Score: 0.08823321288721273
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best RF params: {'n_estimators': 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best XGB params: {'subsample': 0.6, 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 0.6}
XGBoost Log Loss: 0.30982421210153377
XGBoost Brier Score: 0.088065704486053
Fitting 3 folds for each of 10 candidates, totalling 30 fits
0:	learn: 0.6502730	total: 147ms	remaining: 43.9s
100:	learn: 0.3097936	total: 1.2s	remaining: 2.36s
200:	learn: 0.3056112	total: 2.15s	remaining: 1.06s
299:	learn: 0.3018855	total: 3.09s	remaining: 0us
Best CatBoost params: {'learning_rate': 0.05, 'l2_leaf_reg': 3, 'iterations': 300, 'depth': 4}
CatBoost Log Loss: 0.3098186956651067
CatBoost Brier Score: 0.0881245287108765
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best MLP params: {'solver': 'adam', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (64,), 'alpha': 0.01, 'activation': 'tanh'}
MLP Log Loss: 0.32946646494045595
MLP Brier Score: 0.09187602043015665
✅ All best models saved to outputs/
      Model   LogLoss  BrierScore
0      LGBM  0.310153    0.0882