In [1]:
# %%
# 📟 02_features.ipynb – Feature Engineering Pipeline
# ---------------------------------------------------
# Transforms cleaned data into feature-rich form for model training and testing.

# %%
# === Imports & Setup ===
import sys
from pathlib import Path
import pandas as pd

# Resolve the project root: when the kernel's working directory is named
# "notebooks", the root is its parent; otherwise it is the working directory.
BASE = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()

# Put the project root on sys.path so the `src` package import below resolves.
# The membership check keeps this cell idempotent: re-running it on a live
# kernel no longer appends a duplicate entry each time.
if str(BASE) not in sys.path:
    sys.path.append(str(BASE))

# Import feature logic (project-local module, found via the path entry above)
from src.features import engineer_features, feature_summary

# Define I/O paths
DATA_PROCESSED = BASE / "data" / "processed"  # cleaned train/test CSVs are read from here
DATA_INTERIM = BASE / "data" / "interim"      # engineered feature CSVs are written here
DATA_MARKET = BASE / "data" / "market"        # market odds CSV is read from here
# Only the output directory is created; the input directories are expected to exist.
DATA_INTERIM.mkdir(parents=True, exist_ok=True)

# %%
# === Load Cleaned Data ===
# Read the cleaned train and test splits from data/processed/.
train = pd.read_csv(DATA_PROCESSED.joinpath("train_cleaned.csv"))
test = pd.read_csv(DATA_PROCESSED.joinpath("test_cleaned.csv"))

# Report row counts so the merge and engineering steps can be sanity-checked against them.
print(f"📂 Loaded: {len(train)} training rows, {len(test)} test rows")

# %%
# === Merge SAFE Market Odds (Pre-race only) ===
# Only betfair_sp_prerace.csv is merged: it is the market file intended to be
# free of post-race (leaky) information.
market = pd.read_csv(DATA_MARKET / "betfair_sp_prerace.csv")

# Left joins keep every runner row, including those with no matching market row.
# validate="many_to_one" makes pandas raise MergeError when the market file holds
# more than one row for a (Race_ID, Horse) key; without it, duplicate keys would
# silently multiply runner rows in train/test.
train = train.merge(market, on=["Race_ID", "Horse"], how="left", validate="many_to_one")
test = test.merge(market, on=["Race_ID", "Horse"], how="left", validate="many_to_one")

# === Assert NO leakage columns ===
# Column names this notebook treats as leakage; none may be present after the merge.
# NOTE(review): the check is an exact name match, so a column that pandas renamed
# with an _x/_y suffix during the merge would not be caught here.
LEAK_COLS = ["betfairSP", "Position", "timeSecs", "pdsBeaten", "NMFP"]
for col in LEAK_COLS:
    assert col not in train.columns, f"Leakage column found in train: {col}"
    assert col not in test.columns, f"Leakage column found in test: {col}"

# %%
# === Apply Feature Engineering ===
# Each split is copied first so engineer_features receives its own frame and
# the merged `train` / `test` frames are not the objects handed to it.
train_feat = train.copy().pipe(engineer_features)
test_feat = test.copy().pipe(engineer_features)

# Shapes are printed side by side to make column-count differences visible.
print(f"✅ Engineered train shape: {train_feat.shape}")
print(f"✅ Engineered test shape:  {test_feat.shape}")

# %%
# === Optional: Preview Most Predictive Features ===
# The summary is computed against the "Winner" label, so it is skipped when
# the engineered training frame does not carry that column.
if "Winner" not in train_feat.columns:
    print("⚠️ Skipping feature summary: 'Winner' column not found.")
else:
    summary = feature_summary(train_feat, label_col="Winner")
    # Show only the first 20 rows of the summary table.
    display(summary.head(20))

# %%
# === Drop market odds and any derived columns post-feature engineering ===
# Names to strip from both engineered frames before saving. The middle group
# repeats the LEAK_COLS names; the rest are market columns and, presumably,
# columns engineer_features derives from them (confirm against src.features).
drop_cols = [
    "Market_Odds",
    "Market_Prob",
    "betfairSP",
    "Position",
    "timeSecs",
    "pdsBeaten",
    "NMFP",
    "log_MarketOdds",
    "Market_Prob_Z",
    "MarketRank",
    "RaceMeanOdds",
    "RaceStdOdds",
    "OddsToMeanRatio",
]
# errors="ignore" drops only the labels that exist, so a name missing from one
# split (or from both) is skipped without raising.
train_feat = train_feat.drop(columns=drop_cols, errors="ignore")
test_feat = test_feat.drop(columns=drop_cols, errors="ignore")

# --- Diagnostics: list surviving training columns whose name mentions
# "market", "prob" or "odds" (case-insensitive). This only prints; it does not fail.
leak_suspects = [
    name
    for name in train_feat.columns
    if any(token in name.lower() for token in ("market", "prob", "odds"))
]
print("⚠️ Features with 'market', 'prob', or 'odds' still present:", leak_suspects)

# Hard check: none of the leakage or dropped names may remain in either split.
for col in [*LEAK_COLS, *drop_cols]:
    assert col not in train_feat.columns, f"Leakage column found after engineering: {col}"
    assert col not in test_feat.columns, f"Leakage column found after engineering: {col}"

print("Remaining columns after leakage drop:", train_feat.columns.tolist())

# %%
# === Save Output ===
# Write both engineered frames as CSV under data/interim/, omitting the row index.
train_feat.to_csv(DATA_INTERIM.joinpath("train_features.csv"), index=False)
test_feat.to_csv(DATA_INTERIM.joinpath("test_features.csv"), index=False)
print("💾 Feature sets saved to data/interim/")



📂 Loaded: 52099 training rows, 11275 test rows


✅ Engineered train shape: (52099, 44)
✅ Engineered test shape:  (11275, 43)


Unnamed: 0,Feature,Null_%,Corr_to_Label,Type
0,Market_Prob,0.0,0.30491,float64
1,Market_Prob_Z,0.0,0.263429,float64
2,NMFPLTO,0.0,0.118656,float64
3,z_Speed_PreviousRun,0.0,0.101387,float64
4,rel_Speed_PreviousRun,0.0,0.093472,float64
5,SpeedAvg2,0.0,0.078984,float64
6,Speed_PreviousRun,0.0,0.078334,float64
7,Speed_2ndPreviousRun,0.0,0.049201,float64
8,TrainerRecencyCombo,0.0,0.036116,float64
9,DamsireRating,0.0,0.01196,float64


⚠️ Features with 'market', 'prob', or 'odds' still present: ['MarketOdds_PreviousRun', 'MarketOdds_2ndPreviousRun', 'OddsAvg2']
Remaining columns after leakage drop: ['Race_Time', 'Race_ID', 'Course', 'Distance', 'distanceYards', 'Prize', 'Going', 'Horse', 'Trainer', 'Jockey', 'Runners', 'Age', 'Speed_PreviousRun', 'Speed_2ndPreviousRun', 'NMFPLTO', 'MarketOdds_PreviousRun', 'MarketOdds_2ndPreviousRun', 'TrainerRating', 'JockeyRating', 'daysSinceLastRun', 'SireRating', 'DamsireRating', 'meanRunners', 'Winner', 'rel_Speed_PreviousRun', 'rank_Speed_PreviousRun', 'z_Speed_PreviousRun', 'Field_Size', 'JockeyTrainerCombo', 'SpeedTrainerInt', 'RecentRun', 'SpeedAvg2', 'OddsAvg2', 'TrainerFormCombo', 'TrainerRecencyCombo', 'Distance_Bucket']


💾 Feature sets saved to data/interim/
