# 02 Models

We train multiple models with different bias-variance behaviours:
- seasonal naive baseline
- ridge regression
- histogram gradient boosting
- random forest

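We store every model's predictions on the held-out test rows in a single file, so that downstream evaluation compares all models on exactly the same observations.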


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor

# Where the input series is read from and where this notebook's predictions go.
DATA = Path("../data/processed/series.parquet")
OUT = Path("../data/processed/preds.parquet")

# Read the series, then order it by date and renumber the index from zero so
# that the row-position operations further down (shifts, iloc split) follow time.
df = pd.read_parquet(DATA)
df = df.sort_values("date").reset_index(drop=True)

# Feature engineering, done in one pass:
#   dow / month      - calendar fields derived from the date column
#   lag_1 / lag_7    - target value 1 and 7 rows earlier
#   yhat_seasonal_7  - seasonal-naive baseline: the target 7 rows earlier
#                      (numerically the same as lag_7, kept as its own column
#                      so it can be reported alongside the model predictions)
df = df.assign(
    dow=df["date"].dt.dayofweek,
    month=df["date"].dt.month,
    lag_1=df["y"].shift(1),
    lag_7=df["y"].shift(7),
    yhat_seasonal_7=df["y"].shift(7),
)

# The shifts leave NaNs in the leading rows. dropna() removes every row that
# contains a NaN in any column, after which the index is renumbered from zero.
df = df.dropna().reset_index(drop=True)

# Model inputs and the column being predicted.
target = "y"
features = ["dow","month","lag_1","lag_7"]

# Positional hold-out: the first int(0.8 * n) rows are used for fitting and the
# remaining rows for testing. Copies are taken so that adding prediction
# columns to `test` later does not write back into `df`.
split = int(len(df) * 0.8)
train, test = df.iloc[:split].copy(), df.iloc[split:].copy()

# Candidate models. Ridge is wrapped in a pipeline so the features are
# standardised before the linear fit; the two tree ensembles take the raw
# features and are seeded for repeatable results.
ridge = Pipeline([("scaler", StandardScaler()), ("model", Ridge(alpha=1.0))])
hgb = HistGradientBoostingRegressor(max_depth=5, learning_rate=0.05, max_iter=300, random_state=42)
rf = RandomForestRegressor(n_estimators=300, min_samples_leaf=2, random_state=42, n_jobs=-1)

# Fit each model on the training rows and write its test-set predictions into
# the column it is keyed by. Dict order fixes both the fitting order and the
# order in which the prediction columns are appended to `test`.
models = {"yhat_ridge": ridge, "yhat_hgb": hgb, "yhat_rf": rf}
for pred_col, estimator in models.items():
    estimator.fit(train[features], train[target])
    test[pred_col] = estimator.predict(test[features])

# Columns written out: date, observed value, the is_shock_window column, the
# seasonal-naive baseline, and one prediction column per fitted model.
pred_columns = [
    "date",
    "y",
    "is_shock_window",
    "yhat_seasonal_7",
    "yhat_ridge",
    "yhat_hgb",
    "yhat_rf",
]
preds = test.loc[:, pred_columns].copy()

# Persist without the row index, then show the first rows as the cell output.
preds.to_parquet(OUT, index=False)
preds.head()


Unnamed: 0,date,y,is_shock_window,yhat_seasonal_7,yhat_ridge,yhat_hgb,yhat_rf
954,2022-08-19,113.853254,True,112.274894,111.869047,112.807209,112.814957
955,2022-08-20,106.412795,True,115.631201,112.856524,112.74915,111.095935
956,2022-08-21,114.057384,False,110.825789,107.436693,108.952533,111.099069
957,2022-08-22,117.26419,False,116.44918,113.51285,112.276181,111.16931
958,2022-08-23,113.586671,False,112.906129,113.314943,115.048717,111.055994
