In [1]:
# %% 
# 📟 03_models.ipynb – Baseline & Alternative ML Models
# -----------------------------------------------------
# Trains LightGBM LambdaRank model + alternative models to estimate win probabilities for each horse.
# Outputs OOF/test predictions for each, ready for stacking/blending.

# %% 
# === Imports & path setup ===
import sys
from pathlib import Path
import pandas as pd
import numpy as np

# The notebook may be started either from the project root (the directory
# that contains "src") or from a directory directly below it.
NOTEBOOK_DIR = Path.cwd()
if (NOTEBOOK_DIR / "src").exists():
    PROJECT_ROOT = NOTEBOOK_DIR
else:
    PROJECT_ROOT = NOTEBOOK_DIR.parent

# Put <project>/src on the import path so the project-local module below
# can be imported from the notebook.
sys.path.append(str(PROJECT_ROOT / "src"))

# LambdaRank training routine used in section 1 (expected to resolve from
# the "src" directory added to sys.path above).
from model import train_and_evaluate

# Input features are read from data/interim; every prediction file produced
# by this notebook is written to outputs/ (created here if it is absent).
DATA_INTERIM = PROJECT_ROOT / "data" / "interim"
OUTPUTS = PROJECT_ROOT / "outputs"
OUTPUTS.mkdir(exist_ok=True)

# %% 
# === Load features ===
train = pd.read_csv(DATA_INTERIM / "train_features.csv")
test = pd.read_csv(DATA_INTERIM / "test_features.csv")

# === Prepare labels and groupings ===
target = train["Winner"]
groups = train["Race_ID"]  # Each group = a race

# Identifier and label columns are not model inputs.
X = train.drop(columns=["Race_ID", "Horse", "Winner"])

# Build the test matrix from exactly the training feature columns, in the
# same order. Fail early with a clear message instead of letting a missing
# column surface later inside a model's predict call; any extra columns in
# the test file are left out of X_test.
missing_in_test = [col for col in X.columns if col not in test.columns]
if missing_in_test:
    raise ValueError(
        f"test_features.csv is missing feature columns: {missing_in_test}"
    )
X_test = test.loc[:, X.columns].copy()

# === Encode categorical variables ===
# Integer-encode object/category columns using the category set observed in
# the training data, so that train and test share one mapping. Values that
# are missing, or that appear only in the test data, receive the code -1.
cat_cols = X.select_dtypes(include=["object", "category"]).columns
for col in cat_cols:
    train_dtype = pd.CategoricalDtype(
        categories=X[col].astype("category").cat.categories
    )
    X[col] = X[col].astype(train_dtype).cat.codes
    X_test[col] = X_test[col].astype(train_dtype).cat.codes

# --- Utility: Softmax per race
def softmax_group(df, group_col, score_col):
    """Return the softmax of ``score_col`` computed separately per group.

    Args:
        df: DataFrame holding both the grouping column and the score column.
        group_col: Name of the column that defines the groups.
        score_col: Name of the column holding the scores to normalise.

    Returns:
        A Series indexed like ``df`` whose values sum to 1 within each group.
    """
    def _softmax(scores):
        # Shift by the group maximum before exponentiating so that large
        # scores do not overflow; the shift cancels out in the ratio.
        weights = np.exp(scores - np.max(scores))
        return weights / np.sum(weights)

    return df.groupby(group_col)[score_col].transform(_softmax)

def save_predictions(filename, df, prob_col="Predicted_Probability"):
    """Write ``df`` as a CSV file inside ``OUTPUTS`` and print a confirmation.

    Args:
        filename: File name (relative to ``OUTPUTS``) to write.
        df: DataFrame to save; it is written without its index.
        prob_col: Name of the probability column. Accepted for callers'
            convenience but not used by this function.
    """
    destination = OUTPUTS / filename
    df.to_csv(destination, index=False)
    print(f"✅ Saved: {destination}")

# ==========================================================
# 1. LightGBM LambdaRank Baseline
# ==========================================================
# train_and_evaluate is a project function; it is unpacked here into the
# fitted per-fold models, the out-of-fold predictions and feature importances.
models, oof_preds, feature_importance = train_and_evaluate(X, target, groups)

# OOF for LGBM
# NOTE(review): oof_preds is written as "Predicted_Probability" exactly as
# returned — presumably already normalised per race inside
# train_and_evaluate; confirm against that function.
oof_preds_df = pd.DataFrame({
    "Race_ID": train["Race_ID"],
    "Horse": train["Horse"],
    "True_Label": target,
    "Predicted_Probability": oof_preds
})
# The same frame is saved under both the generic and the model-specific name.
oof_preds_df.to_csv(OUTPUTS / "oof_preds.csv", index=False)
oof_preds_df.to_csv(OUTPUTS / "oof_preds_lgbm.csv", index=False)
print("✅ oof_preds.csv and oof_preds_lgbm.csv saved.")

# Test preds for LGBM
# Use every fold model rather than only the last one: the last fold is not
# "best" in any sense, and the other model sections in this notebook also
# ensemble across folds. Each model's raw scores are turned into per-race
# probabilities first and the probabilities are averaged, because raw ranking
# scores from separately trained models need not share a scale.
if len(models) == 0:
    raise ValueError("train_and_evaluate returned no fitted models.")
fold_test_probs = [
    softmax_group(
        test.assign(score=fold_model.predict(X_test)), "Race_ID", "score"
    ).to_numpy()
    for fold_model in models
]
test_out = test.copy()
# Each fold's probabilities sum to 1 per race, so their mean does as well.
test_out["Predicted_Probability"] = np.mean(fold_test_probs, axis=0)
submission = test_out[["Race_ID", "Horse", "Predicted_Probability"]]

# --- Sanity Checks
# Run before writing so that an invalid submission is never saved.
check = submission.groupby("Race_ID")["Predicted_Probability"].sum()
assert check.between(0.999, 1.001).all(), "Probabilities do not sum to 1 per race!"

# The same submission is saved under both the generic and the model-specific name.
submission.to_csv(OUTPUTS / "test_predictions.csv", index=False)
submission.to_csv(OUTPUTS / "test_preds_lgbm.csv", index=False)
print("✅ test_predictions.csv and test_preds_lgbm.csv saved!")

# ==========================================================
# 2. Random Forest (sklearn)
# ==========================================================
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import log_loss, brier_score_loss

# GroupKFold keeps all horses of one race in the same fold. NUM_FOLDS and gkf
# are reused by the model sections that follow.
NUM_FOLDS = 5
gkf = GroupKFold(n_splits=NUM_FOLDS)
oof_scores_rf = np.zeros(len(X))
models_rf = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(X, target, groups=groups)):
    print(f"\n🌲 RF Training Fold {fold + 1}/{NUM_FOLDS}...")
    rf = RandomForestClassifier(
        n_estimators=200, max_depth=8, min_samples_leaf=10,
        random_state=42, n_jobs=-1
    )
    rf.fit(X.iloc[train_idx], target.iloc[train_idx])
    # Column 1 of predict_proba is the second class (presumably Winner == 1
    # — confirm the label encoding).
    val_scores = rf.predict_proba(X.iloc[val_idx])[:, 1]
    oof_scores_rf[val_idx] = val_scores
    models_rf.append(rf)

# Per-race normalisation. predict_proba values already lie in [0, 1]; feeding
# them directly to softmax maps them onto [1, e] and flattens every race
# towards a uniform distribution. Softmax of the log-probabilities equals
# p / sum(p) within the race, which keeps the relative sizes of the model's
# probabilities. The clip avoids log(0); a race whose scores are all zero
# ends up uniform.
oof_rf = train[["Race_ID", "Horse", "Winner"]].copy()
oof_rf["Predicted_Probability"] = softmax_group(
    oof_rf.assign(score=np.log(np.clip(oof_scores_rf, 1e-12, None))),
    "Race_ID", "score"
)
save_predictions("oof_preds_rf.csv", oof_rf.rename(columns={"Winner": "True_Label"}))

# Test predictions: average the fold models, then normalise per race as above.
test_rf = test[["Race_ID", "Horse"]].copy()
rf_test_scores = np.mean([m.predict_proba(X_test)[:, 1] for m in models_rf], axis=0)
test_rf["Predicted_Probability"] = softmax_group(
    test_rf.assign(score=np.log(np.clip(rf_test_scores, 1e-12, None))),
    "Race_ID", "score"
)
save_predictions("test_preds_rf.csv", test_rf)

# ==========================================================
# 3. XGBoost
# ==========================================================
import xgboost as xgb

oof_scores_xgb = np.zeros(len(X))
models_xgb = []

# Uses the same grouped splitter (gkf) and fold count (NUM_FOLDS) defined in
# the Random Forest section.
for fold, (train_idx, val_idx) in enumerate(gkf.split(X, target, groups=groups)):
    print(f"\n🦾 XGB Training Fold {fold + 1}/{NUM_FOLDS}...")
    xgb_clf = xgb.XGBClassifier(
        n_estimators=300, max_depth=7, learning_rate=0.08,
        subsample=0.8, colsample_bytree=0.8, use_label_encoder=False,
        eval_metric="logloss", random_state=42, tree_method='hist', verbosity=0, n_jobs=-1
    )
    xgb_clf.fit(X.iloc[train_idx], target.iloc[train_idx])
    # Column 1 of predict_proba is the second class (presumably Winner == 1
    # — confirm the label encoding).
    val_scores = xgb_clf.predict_proba(X.iloc[val_idx])[:, 1]
    oof_scores_xgb[val_idx] = val_scores
    models_xgb.append(xgb_clf)

# Per-race normalisation. predict_proba values already lie in [0, 1]; feeding
# them directly to softmax maps them onto [1, e] and flattens every race
# towards a uniform distribution. Softmax of the log-probabilities equals
# p / sum(p) within the race, which keeps the relative sizes of the model's
# probabilities. The clip avoids log(0).
oof_xgb = train[["Race_ID", "Horse", "Winner"]].copy()
oof_xgb["Predicted_Probability"] = softmax_group(
    oof_xgb.assign(score=np.log(np.clip(oof_scores_xgb, 1e-12, None))),
    "Race_ID", "score"
)
save_predictions("oof_preds_xgb.csv", oof_xgb.rename(columns={"Winner": "True_Label"}))

# Test predictions: average the fold models, then normalise per race as above.
test_xgb = test[["Race_ID", "Horse"]].copy()
xgb_test_scores = np.mean([m.predict_proba(X_test)[:, 1] for m in models_xgb], axis=0)
test_xgb["Predicted_Probability"] = softmax_group(
    test_xgb.assign(score=np.log(np.clip(xgb_test_scores, 1e-12, None))),
    "Race_ID", "score"
)
save_predictions("test_preds_xgb.csv", test_xgb)

# ==========================================================
# 4. CatBoostClassifier
# ==========================================================
from catboost import CatBoostClassifier, Pool

oof_scores_cat = np.zeros(len(X))
models_cat = []
# Positions of the columns that were categorical before integer encoding;
# they are declared to CatBoost as categorical features.
cat_features_idx = [X.columns.get_loc(col) for col in cat_cols if col in X.columns]

# Uses the same grouped splitter (gkf) and fold count (NUM_FOLDS) defined in
# the Random Forest section.
for fold, (train_idx, val_idx) in enumerate(gkf.split(X, target, groups=groups)):
    print(f"\n🐈 CatBoost Training Fold {fold + 1}/{NUM_FOLDS}...")
    train_pool = Pool(X.iloc[train_idx], label=target.iloc[train_idx], cat_features=cat_features_idx)
    val_pool = Pool(X.iloc[val_idx], label=target.iloc[val_idx], cat_features=cat_features_idx)
    cat = CatBoostClassifier(
        iterations=500, learning_rate=0.08, depth=7, loss_function='Logloss',
        eval_metric='Logloss', random_seed=42, verbose=100, early_stopping_rounds=50
    )
    # NOTE(review): the held-out fold is also the early-stopping eval set, so
    # the iteration count is tuned on the very rows that receive OOF
    # predictions; the OOF scores are therefore somewhat optimistic.
    cat.fit(train_pool, eval_set=val_pool, use_best_model=True, verbose=100)
    # Column 1 of predict_proba is the second class (presumably Winner == 1
    # — confirm the label encoding).
    val_scores = cat.predict_proba(X.iloc[val_idx])[:, 1]
    oof_scores_cat[val_idx] = val_scores
    models_cat.append(cat)

# Per-race normalisation. predict_proba values already lie in [0, 1]; feeding
# them directly to softmax maps them onto [1, e] and flattens every race
# towards a uniform distribution. Softmax of the log-probabilities equals
# p / sum(p) within the race, which keeps the relative sizes of the model's
# probabilities. The clip avoids log(0).
oof_cat = train[["Race_ID", "Horse", "Winner"]].copy()
oof_cat["Predicted_Probability"] = softmax_group(
    oof_cat.assign(score=np.log(np.clip(oof_scores_cat, 1e-12, None))),
    "Race_ID", "score"
)
save_predictions("oof_preds_cat.csv", oof_cat.rename(columns={"Winner": "True_Label"}))

# Test predictions: average the fold models, then normalise per race as above.
test_cat = test[["Race_ID", "Horse"]].copy()
cat_test_scores = np.mean([m.predict_proba(X_test)[:, 1] for m in models_cat], axis=0)
test_cat["Predicted_Probability"] = softmax_group(
    test_cat.assign(score=np.log(np.clip(cat_test_scores, 1e-12, None))),
    "Race_ID", "score"
)
save_predictions("test_preds_cat.csv", test_cat)

# ==========================================================
# 5. MLP Neural Network (sklearn)
# ==========================================================
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

oof_scores_mlp = np.zeros(len(X))
models_mlp = []

# Uses the same grouped splitter (gkf) and fold count (NUM_FOLDS) defined in
# the Random Forest section.
for fold, (train_idx, val_idx) in enumerate(gkf.split(X, target, groups=groups)):
    print(f"\n🧠 MLP Training Fold {fold + 1}/{NUM_FOLDS}...")
    # MLPs are sensitive to feature scale, so each fold standardises its
    # inputs. The scaler sits inside the pipeline so that it is fitted on the
    # fold's training rows only and re-applied automatically at predict time.
    mlp = make_pipeline(
        StandardScaler(),
        MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=200, alpha=1e-4, random_state=42),
    )
    mlp.fit(X.iloc[train_idx], target.iloc[train_idx])
    # Column 1 of predict_proba is the second class (presumably Winner == 1
    # — confirm the label encoding).
    val_scores = mlp.predict_proba(X.iloc[val_idx])[:, 1]
    oof_scores_mlp[val_idx] = val_scores
    models_mlp.append(mlp)

# Per-race normalisation. predict_proba values already lie in [0, 1]; feeding
# them directly to softmax maps them onto [1, e] and flattens every race
# towards a uniform distribution. Softmax of the log-probabilities equals
# p / sum(p) within the race, which keeps the relative sizes of the model's
# probabilities. The clip avoids log(0).
oof_mlp = train[["Race_ID", "Horse", "Winner"]].copy()
oof_mlp["Predicted_Probability"] = softmax_group(
    oof_mlp.assign(score=np.log(np.clip(oof_scores_mlp, 1e-12, None))),
    "Race_ID", "score"
)
save_predictions("oof_preds_mlp.csv", oof_mlp.rename(columns={"Winner": "True_Label"}))

# Test predictions: average the fold models, then normalise per race as above.
test_mlp = test[["Race_ID", "Horse"]].copy()
mlp_test_scores = np.mean([m.predict_proba(X_test)[:, 1] for m in models_mlp], axis=0)
test_mlp["Predicted_Probability"] = softmax_group(
    test_mlp.assign(score=np.log(np.clip(mlp_test_scores, 1e-12, None))),
    "Race_ID", "score"
)
save_predictions("test_preds_mlp.csv", test_mlp)

print("\n🎯 All model OOF/test predictions saved for stacking or analysis.")

# %%




🚀 Training Fold 1/5...


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001234 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5531
[LightGBM] [Info] Number of data points in the train set: 41678, number of used features: 32
Training until validation scores don't improve for 100 rounds


[100]	valid_0's ndcg@1: 0.404494	valid_0's ndcg@2: 0.464407	valid_0's ndcg@3: 0.522771	valid_0's ndcg@4: 0.563287	valid_0's ndcg@5: 0.598259
Early stopping, best iteration is:
[31]	valid_0's ndcg@1: 0.424157	valid_0's ndcg@2: 0.470995	valid_0's ndcg@3: 0.521938	valid_0's ndcg@4: 0.567361	valid_0's ndcg@5: 0.60086


📊 Fold 1 — Log Loss: 0.31529 | Brier: 0.08914

🚀 Training Fold 2/5...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002609 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5471
[LightGBM] [Info] Number of data points in the train set: 41680, number of used features: 32
Training until validation scores don't improve for 100 rounds


[100]	valid_0's ndcg@1: 0.420806	valid_0's ndcg@2: 0.482626	valid_0's ndcg@3: 0.530518	valid_0's ndcg@4: 0.573032	valid_0's ndcg@5: 0.602722
Early stopping, best iteration is:
[33]	valid_0's ndcg@1: 0.417994	valid_0's ndcg@2: 0.476267	valid_0's ndcg@3: 0.524956	valid_0's ndcg@4: 0.567228	valid_0's ndcg@5: 0.605977


📊 Fold 2 — Log Loss: 0.31399 | Brier: 0.08874

🚀 Training Fold 3/5...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002602 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5539
[LightGBM] [Info] Number of data points in the train set: 41680, number of used features: 32
Training until validation scores don't improve for 100 rounds


[100]	valid_0's ndcg@1: 0.424555	valid_0's ndcg@2: 0.47634	valid_0's ndcg@3: 0.531757	valid_0's ndcg@4: 0.571788	valid_0's ndcg@5: 0.603109
Early stopping, best iteration is:
[11]	valid_0's ndcg@1: 0.430178	valid_0's ndcg@2: 0.476831	valid_0's ndcg@3: 0.53516	valid_0's ndcg@4: 0.578537	valid_0's ndcg@5: 0.605266


📊 Fold 3 — Log Loss: 0.31828 | Brier: 0.08958

🚀 Training Fold 4/5...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002610 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5467
[LightGBM] [Info] Number of data points in the train set: 41680, number of used features: 32
Training until validation scores don't improve for 100 rounds


[100]	valid_0's ndcg@1: 0.428304	valid_0's ndcg@2: 0.474365	valid_0's ndcg@3: 0.524029	valid_0's ndcg@4: 0.56499	valid_0's ndcg@5: 0.596539
Early stopping, best iteration is:
[13]	valid_0's ndcg@1: 0.432052	valid_0's ndcg@2: 0.480195	valid_0's ndcg@3: 0.531125	valid_0's ndcg@4: 0.565505	valid_0's ndcg@5: 0.601354


📊 Fold 4 — Log Loss: 0.31772 | Brier: 0.08951

🚀 Training Fold 5/5...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5540
[LightGBM] [Info] Number of data points in the train set: 41678, number of used features: 32
Training until validation scores don't improve for 100 rounds


[100]	valid_0's ndcg@1: 0.42603	valid_0's ndcg@2: 0.48398	valid_0's ndcg@3: 0.540293	valid_0's ndcg@4: 0.581146	valid_0's ndcg@5: 0.606618
Early stopping, best iteration is:
[54]	valid_0's ndcg@1: 0.431648	valid_0's ndcg@2: 0.488093	valid_0's ndcg@3: 0.545575	valid_0's ndcg@4: 0.583865	valid_0's ndcg@5: 0.613681


📊 Fold 5 — Log Loss: 0.31209 | Brier: 0.08850



📈 Final Out-of-Fold Evaluation
-------------------------------
Mean Log Loss:    0.31547
Mean Brier Score: 0.08909


✅ oof_preds.csv and oof_preds_lgbm.csv saved.


✅ test_predictions.csv and test_preds_lgbm.csv saved!

🌲 RF Training Fold 1/5...



🌲 RF Training Fold 2/5...



🌲 RF Training Fold 3/5...



🌲 RF Training Fold 4/5...



🌲 RF Training Fold 5/5...


✅ Saved: C:\Users\dylan\Documents\Projects\horse_model_project\outputs\oof_preds_rf.csv


✅ Saved: C:\Users\dylan\Documents\Projects\horse_model_project\outputs\test_preds_rf.csv

🦾 XGB Training Fold 1/5...



🦾 XGB Training Fold 2/5...



🦾 XGB Training Fold 3/5...



🦾 XGB Training Fold 4/5...



🦾 XGB Training Fold 5/5...


✅ Saved: C:\Users\dylan\Documents\Projects\horse_model_project\outputs\oof_preds_xgb.csv


✅ Saved: C:\Users\dylan\Documents\Projects\horse_model_project\outputs\test_preds_xgb.csv

🐈 CatBoost Training Fold 1/5...


0:	learn: 0.6254182	test: 0.6257008	best: 0.6257008 (0)	total: 233ms	remaining: 1m 56s


100:	learn: 0.2169690	test: 0.3138206	best: 0.3128708 (78)	total: 8.9s	remaining: 35.1s


Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3128708412
bestIteration = 78

Shrink model to first 79 iterations.

🐈 CatBoost Training Fold 2/5...


0:	learn: 0.6254590	test: 0.6256469	best: 0.6256469 (0)	total: 77ms	remaining: 38.4s


100:	learn: 0.2179266	test: 0.3163662	best: 0.3144512 (65)	total: 8.35s	remaining: 33s


Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3144512144
bestIteration = 65

Shrink model to first 66 iterations.



🐈 CatBoost Training Fold 3/5...


0:	learn: 0.6255274	test: 0.6253623	best: 0.6253623 (0)	total: 76.6ms	remaining: 38.2s


100:	learn: 0.2160727	test: 0.3150985	best: 0.3150567 (98)	total: 8.78s	remaining: 34.7s


Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3150566777
bestIteration = 98

Shrink model to first 99 iterations.



🐈 CatBoost Training Fold 4/5...


0:	learn: 0.6253991	test: 0.6257205	best: 0.6257205 (0)	total: 83.5ms	remaining: 41.7s


100:	learn: 0.2162310	test: 0.3147368	best: 0.3121565 (76)	total: 9.63s	remaining: 38.1s


Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3121564518
bestIteration = 76

Shrink model to first 77 iterations.

🐈 CatBoost Training Fold 5/5...


0:	learn: 0.6256269	test: 0.6253493	best: 0.6253493 (0)	total: 81.6ms	remaining: 40.7s


100:	learn: 0.2169536	test: 0.3098649	best: 0.3098649 (100)	total: 9.8s	remaining: 38.7s


Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.309212181
bestIteration = 128

Shrink model to first 129 iterations.


✅ Saved: C:\Users\dylan\Documents\Projects\horse_model_project\outputs\oof_preds_cat.csv


✅ Saved: C:\Users\dylan\Documents\Projects\horse_model_project\outputs\test_preds_cat.csv

🧠 MLP Training Fold 1/5...



🧠 MLP Training Fold 2/5...



🧠 MLP Training Fold 3/5...



🧠 MLP Training Fold 4/5...



🧠 MLP Training Fold 5/5...


✅ Saved: C:\Users\dylan\Documents\Projects\horse_model_project\outputs\oof_preds_mlp.csv


✅ Saved: C:\Users\dylan\Documents\Projects\horse_model_project\outputs\test_preds_mlp.csv

🎯 All model OOF/test predictions saved for stacking or analysis.
