# 🛒 Capstone Phase 2: Univariate Modeling

**Goal**: Predict future sales using **ONLY** the history of sales. 
No prices, no promotions, no holiday info. Just: *"Sales were 10, 12, 10... what comes next?"*

**Why start here?** 
This tells us the "signal strength" of the time-series pattern itself. If a Univariate model is 90% accurate, you don't need complex data pipelines.

---

## 1. Setup & Load Data

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA, SeasonalNaive
from prophet import Prophet

# Load the raw sales table
df = pd.read_parquet('../data/raw/m5_lite_synthetic.parquet')

# 1. Long format for StatsForecast: columns unique_id / ds / y,
#    where unique_id is "<store_id>_<item_id>".
df_sf = df.assign(
    unique_id=df['store_id'].astype(str) + '_' + df['item_id'].astype(str)
).rename(columns={'date': 'ds', 'sales': 'y'})

# Subset for speed: keep the first five series ids in order of appearance
# (this is NOT a ranking by sales volume).
top_items = df_sf['unique_id'].unique()[:5]
df_subset = df_sf.loc[df_sf['unique_id'].isin(top_items)]

# 2. Prepare for LightGBM (Lags Only)
# We strictly use target lags. No 'is_promo', no 'sell_price'.
def create_univariate_features(df, lags=(7, 14, 28)):
    """Add lagged-sales columns for every (store_id, item_id) series.

    Args:
        df: Frame with ``store_id``, ``item_id``, ``date`` and ``sales``
            columns. It is not modified; a sorted copy is returned.
        lags: Row offsets to shift ``sales`` by within each series. Each
            offset ``k`` produces a ``lag_k`` column. Defaults to
            ``(7, 14, 28)``.

    Returns:
        A copy of ``df`` sorted by store, item and date with the lag columns
        appended. Rows that lack any requested lag, or the ``sales`` target
        itself, are removed.
    """
    df = df.sort_values(['store_id', 'item_id', 'date']).copy()
    lag_cols = []
    for lag in lags:
        col = f'lag_{lag}'
        # shift() moves by rows within the series, so ``lag_k`` means
        # "k days ago" only when each series has exactly one row per day.
        df[col] = df.groupby(['store_id', 'item_id'])['sales'].shift(lag)
        lag_cols.append(col)
    # Restrict the NaN filter to the model inputs and target. A bare dropna()
    # would also discard rows with gaps in unrelated columns, which these
    # lag-only features never read.
    return df.dropna(subset=lag_cols + ['sales'])

df_lgb = create_univariate_features(df)
# Keep only the series selected in `top_items`. The id must be composed in
# full BEFORE calling .isin(): without the parentheses, .isin() binds only to
# the item_id Series and the expression tries to add a string to a boolean mask.
df_lgb = df_lgb[
    (df_lgb['store_id'].astype(str) + '_' + df_lgb['item_id'].astype(str)).isin(top_items)
]

## 2. Statistical (AutoARIMA)
ARIMA = AutoRegressive (lags of the series) + Integrated (differencing) + Moving Average (lags of the forecast errors).
It is the classic univariate model: the forecast depends only on the series' own past.

In [None]:
# Forecast length in days; later cells reuse this value for their own hold-out.
horizon = 28

# AutoARIMA with a 7-step season on daily-frequency data, n_jobs=-1 for parallelism.
sf = StatsForecast(models=[AutoARIMA(season_length=7)], freq='D', n_jobs=-1)

# A single backtest window of `horizon` steps per series.
cv_stats = sf.cross_validation(
    df=df_subset.loc[:, ['unique_id', 'ds', 'y']],
    h=horizon,
    n_windows=1,
    step_size=horizon,
)

# MAE per series first, then the unweighted mean across series.
scores = (
    cv_stats
    .drop(columns=['ds', 'cutoff'])
    .groupby('unique_id')
    .apply(lambda grp: (grp['y'] - grp['AutoARIMA']).abs().mean())  # MAE
)
print(f"Average MAE (AutoARIMA): {scores.mean():.2f}")

## 3. Prophet (Univariate)
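Here we use Prophet **without** `add_regressor`, so it sees no external variables: it models only the trend and seasonality of the series itself (weekly seasonality is switched on explicitly; yearly seasonality is left at Prophet's automatic default).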

In [None]:
# Per-series MAE of a regressor-free Prophet model on the last `horizon` rows.
mae_prophet = []

for uid in top_items:
    # Sort chronologically so the positional split below is a genuine
    # time-based hold-out even if the source rows are not in date order.
    item_df = df_subset[df_subset['unique_id'] == uid].sort_values('ds')
    train = item_df.iloc[:-horizon]
    test = item_df.iloc[-horizon:]

    # No add_regressor calls: the model is fit on ds and y only.
    m = Prophet(daily_seasonality=False, weekly_seasonality=True)
    m.fit(train[['ds', 'y']])

    # Predict on the held-out dates themselves rather than on a regenerated
    # daily calendar. This keeps each yhat paired with the matching actual
    # when a series has missing days, and skips re-predicting the history.
    future = test[['ds']]
    forecast = m.predict(future)

    preds = forecast['yhat'].values
    mae = np.mean(np.abs(test['y'].values - preds))
    mae_prophet.append(mae)

print(f"Average MAE (Prophet Univariate): {np.mean(mae_prophet):.2f}")

## 4. LightGBM (Univariate / Lags Only)
We train a Gradient Boosting model, but we starve it of info. It only gets `lag_7`, `lag_14`, `lag_28`.
It has no idea if it's Christmas or if there's a 50% off sale.

In [None]:
features = ['lag_7', 'lag_14', 'lag_28']
target = 'sales'

# Split: hold out exactly the last `horizon` calendar days so the test window
# matches the one used for the other models. Subtracting a full 28 days and
# comparing with >= would keep 29 distinct dates (both endpoints inclusive).
test_start = df_lgb['date'].max() - pd.Timedelta(days=horizon - 1)
train_lgb = df_lgb[df_lgb['date'] < test_start]
test_lgb = df_lgb[df_lgb['date'] >= test_start]

model = lgb.LGBMRegressor(verbose=-1)
model.fit(train_lgb[features], train_lgb[target])

# NOTE(review): lag_7 and lag_14 for dates deeper than 7/14 days into the test
# window are actual sales from inside that window, so this score reflects
# shorter-range forecasts than a single `horizon`-step-ahead forecast would.
preds = model.predict(test_lgb[features])
# Pooled MAE over all test rows (every series and date weighted equally).
mae_lgb = np.mean(np.abs(test_lgb[target] - preds))

print(f"Average MAE (LightGBM Univariate): {mae_lgb:.2f}")