In [None]:
import os
import pandas as pd
import numpy as np
import warnings
from statsmodels.tsa.statespace.sarimax import SARIMAX, SARIMAXResults
from pmdarima import auto_arima
from sklearn.metrics import mean_squared_error

# Suppress warnings (optional)
# This installs a blanket "ignore" filter: from here on, every warning raised
# through the `warnings` module is hidden for the rest of the kernel session.
# Remove or narrow this filter when a model fit needs to be debugged.
warnings.filterwarnings("ignore")

In [None]:
# ——— Config ———
# DATA_PATH   : CSV that the load step reads (parsed with a `date` column).
# MODEL_DIR   : directory that receives one pickled SARIMAX fit per family.
# METRICS_CSV : file the final cell writes the per-family RMSE table to.
DATA_PATH   = '../data/dm/train.csv'
MODEL_DIR   = 'models'
METRICS_CSV  = 'sarimax_family_rmse.csv'
# Create the model directory up front so saving a fit never fails on a missing
# folder; exist_ok=True makes re-running this cell harmless.
os.makedirs(MODEL_DIR, exist_ok=True)

In [None]:
# ——— Load & Prep ———
# Read the training table with `date` parsed as datetimes and discard the `id`
# column right away; nothing downstream in this notebook refers to it.
df = pd.read_csv(DATA_PATH, parse_dates=['date']).drop(columns=['id'])

# Distinct family labels in order of first appearance, and how many there are
# (the count is only used for the progress message in the fitting loop).
families = df['family'].unique()
total = len(families)

def make_safe(name: str) -> str:
    """Convert a family label into an upper-case token for a flat file name.

    Every '/' and ' ' becomes '_', every ',' is removed, and the result is
    upper-cased. All other characters are passed through unchanged.

    Args:
        name: The family label to convert.

    Returns:
        The converted, upper-cased label.
    """
    # Single pass over the string: map '/' and ' ' to '_', delete ','.
    table = str.maketrans({'/': '_', ' ': '_', ',': None})
    return name.translate(table).upper()

# ——— Per-family fit / evaluate loop ———
# For each product family: sum sales and promotions over all stores per day,
# model log1p(sales) with a weekly-seasonal SARIMAX that uses the promotion
# count as exogenous regressor, hold out the last `n_test` days, and record the
# RMSE of the back-transformed forecast on that hold-out.

# Hold-out length in days. It is the same for every family, so set it once.
n_test = 14

# Split the frame by family a single time instead of re-scanning every row with
# a boolean mask on each iteration. sort=False keeps first-appearance order.
rows_by_family = df.groupby('family', sort=False)

metrics = []  # one {'family', 'rmse', 'model'} record per family

for idx, fam in enumerate(families, start=1):
    print(f"[{idx}/{total}] Processing family: {fam}")
    safe_fam   = make_safe(fam)
    model_path = os.path.join(MODEL_DIR, f"sarimax_{safe_fam}.pkl")

    # 1) Aggregate across all stores
    sub = rows_by_family.get_group(fam)
    ts  = sub.groupby('date')[['sales','onpromotion']].sum()
    # NOTE(review): to_period('D') attaches a daily frequency but does not
    # insert rows for calendar days that are absent from the data. If any day
    # is missing, the weekly season (m=7) drifts out of phase after the gap --
    # confirm the dates are contiguous or reindex to a full daily range first.
    ts.index = pd.DatetimeIndex(ts.index).to_period('D')
    ts['y']  = np.log1p(ts['sales'])  # model on the log1p scale

    # 2) Train/test split (last n_test days)
    train, test = ts.iloc[:-n_test], ts.iloc[-n_test:]
    exog_train = train[['onpromotion']]
    exog_test  = test [['onpromotion']]

    # 3) Reuse a cached fit only if it was trained on a window of this length.
    #    A pickle fitted on a different window (data grew, n_test changed) would
    #    forecast from the wrong origin or overlap the hold-out, giving a
    #    silently wrong RMSE, so such a cache is discarded and refitted.
    fit = None
    if os.path.exists(model_path):
        # SARIMAXResults.load unpickles the file: only point MODEL_DIR at
        # pickles produced by this notebook, never at untrusted files.
        cached = SARIMAXResults.load(model_path)
        if int(cached.nobs) == len(train):
            fit = cached
        else:
            print(
                f"    cached model has {int(cached.nobs)} training rows, "
                f"expected {len(train)}; refitting"
            )

    if fit is None:
        # pick (p,d,q)(P,D,Q,7) via auto_arima
        # NOTE(review): the order search sees only the target series, while the
        # final model below also includes the exogenous regressor -- confirm
        # this is intended.
        step = auto_arima(
            train['y'],
            seasonal=True, m=7,
            start_p=0, start_q=0, max_p=3, max_q=3,
            start_P=0, start_Q=0, max_P=1, max_Q=1,
            d=None, D=None,
            trace=False, error_action='ignore',
            suppress_warnings=True
        )
        order          = step.order
        seasonal_order = step.seasonal_order

        model = SARIMAX(
            train['y'],
            exog=exog_train,
            order=order,
            seasonal_order=seasonal_order,
            enforce_stationarity=False,
            enforce_invertibility=False
        )
        fit = model.fit(disp=False)
        fit.save(model_path)  # overwrites any stale pickle for this family

    # 4) Forecast & evaluate
    pred_log = fit.get_forecast(steps=n_test, exog=exog_test).predicted_mean
    pred     = np.expm1(pred_log)  # back to the original sales scale

    # Scored on the original scale; the comparison is positional, in day order.
    mse  = mean_squared_error(test['sales'], pred)
    rmse = np.sqrt(mse)

    metrics.append({
        'family': fam,
        'rmse':    rmse,
        'model':   model_path
    })

In [None]:
# ——— save a summary ———
# Build the per-family table, order it by ascending RMSE, and write it to
# METRICS_CSV without the index column.
metrics_df = pd.DataFrame(metrics)
metrics_df = metrics_df.sort_values(by='rmse')
metrics_df.to_csv(METRICS_CSV, index=False)

# Echo the same table to the cell output below a short banner.
print("\nAll done! RMSE summary:", metrics_df, sep="\n")