# SARIMAX Modeling for All Families

This notebook performs the following steps:

1. **Import necessary libraries.**
2. **Configure file paths and parameters.**
3. **Read `train.csv`** and identify unique product families.
4. **Loop through each family** to:
   - Aggregate daily sales and promotions across all stores.
   - Split data into training and test sets (last 28 days as test).
   - Use `pmdarima.auto_arima` to find best `(p,d,q)(P,D,Q,m)` parameters.
   - Fit a SARIMAX model (log‐transformed sales as endog, promotions as exog).
   - Save each fitted model to the `Models/` folder.
   - Forecast the last 14 days and compute performance metrics: RMSE, MAE, R².
5. **Save performance metrics** for all families into `performance_results.csv`.

> **Note:**
> - Adjust `AUTO_ARIMA_ARGS` if your data’s frequency or seasonal assumptions differ.
> - This notebook assumes data is daily (`freq = "D"`) with a weekly seasonality (`m=7`).
> - Models will be saved in `Models/` and metrics saved in `performance_results.csv`.


In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX, SARIMAXResults
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from pmdarima import auto_arima

# Suppress warnings (optional)
warnings.filterwarnings("ignore")

In [None]:
# ────────────────────────────────────────────────────────────────────────────────
# 1. CONFIGURABLE PARAMETERS
# ────────────────────────────────────────────────────────────────────────────────

# File path for train.csv (relative to notebook location)
# (Adjust this path if your train.csv is in a different folder.)
TRAIN_CSV = "../Data/DM/train.csv"

# Folder to store all SARIMAX models
ALL_MODELS_DIR = "Models"
os.makedirs(ALL_MODELS_DIR, exist_ok=True)

# Number of last days to reserve for test set (forecast horizon)
N_TEST = 14

# Frequency of the time series: 'D' = daily, 'W' = weekly, 'M' = monthly, etc.
FREQ = "D"

# Arguments for auto_arima to find best (p,d,q)(P,D,Q,m)
AUTO_ARIMA_ARGS = {
    "seasonal": True,
    "m": 7,                 # Assumes a weekly seasonal pattern (7 days)
    "start_p": 0, "start_q": 0, "max_p": 5, "max_q": 5,
    "start_P": 0, "start_Q": 0, "max_P": 2, "max_Q": 2,
    "d": None, "D": None,
    "trace": False,         # Set True to see details of auto_arima
    "error_action": "ignore",
    "suppress_warnings": True,
    "stepwise": True,
    "information_criterion": "aic",
}

In [3]:
# ────────────────────────────────────────────────────────────────────────────────
# 2. READ TRAIN.CSV & IDENTIFY UNIQUE FAMILIES
# ────────────────────────────────────────────────────────────────────────────────

print(f"Reading file: {TRAIN_CSV}")
df_all = pd.read_csv(TRAIN_CSV, parse_dates=["date"])

# Extract unique product families
unique_families = df_all["family"].unique()
print(f"Found {len(unique_families)} unique families.")

Reading file: ../Data/train.csv
Found 33 unique families.
Found 33 unique families.


In [4]:
# ────────────────────────────────────────────────────────────────────────────────
# 3. LOOP THROUGH EACH FAMILY
# ────────────────────────────────────────────────────────────────────────────────

performance_list = []

for family in unique_families:
    print("===================================================")
    print(f"Processing family: {family}")

    # Filter data for this family and aggregate daily across all stores
    df_family = df_all[df_all["family"] == family].copy()
    if df_family.empty:
        print(f"  No data for family {family}, skipping.")
        continue

    # Aggregate: sum 'sales' and 'onpromotion' by date
    df_agg = (
        df_family
        .groupby("date")[["sales", "onpromotion"]]
        .sum()
        .rename(columns={"onpromotion": "onpromo"})
        .sort_index()
    )

    # Convert index to PeriodIndex (daily) and enforce a continuous date index
    df_agg.index = pd.DatetimeIndex(df_agg.index).to_period(FREQ)
    df_agg = df_agg.asfreq(FREQ)
    df_agg["sales"]   = df_agg["sales"].fillna(0).astype(float)
    df_agg["onpromo"] = df_agg["onpromo"].fillna(0).astype(float)

    # Skip if not enough data points to hold out N_TEST days
    if len(df_agg) < N_TEST + 1:
        print(f"  Only {len(df_agg)} data points for {family}, skipping.")
        continue

    # Log-transform sales (log1p) to stabilize zeros
    df_agg["y_log"] = np.log1p(df_agg["sales"])

    # Split into train/test (last N_TEST as test)
    train_df = df_agg.iloc[:-N_TEST]
    test_df  = df_agg.iloc[-N_TEST:]
    print(f"  Train length: {len(train_df)}, Test length: {len(test_df)}")

    ts_log_train = train_df["y_log"]
    exog_train   = train_df["onpromo"]
    exog_test    = test_df["onpromo"]

    # Auto-ARIMA to find optimal SARIMAX orders
    try:
        print("  Running auto_arima for parameter tuning...")
        model_auto = auto_arima(
            ts_log_train,
            exogenous=exog_train.values.reshape(-1, 1),
            **AUTO_ARIMA_ARGS
        )
        order_opt          = model_auto.order
        seasonal_order_opt = model_auto.seasonal_order
        print(f"  Found order: {order_opt}, seasonal_order: {seasonal_order_opt}")
    except Exception as e:
        print(f"  auto_arima error for {family}: {e}")
        continue

    # Fit SARIMAX on log1p(sales) with exogenous onpromo
    try:
        print("  Fitting SARIMAX model...")
        model = SARIMAX(
            ts_log_train,
            exog=exog_train,
            order=order_opt,
            seasonal_order=seasonal_order_opt,
            enforce_stationarity=False,
            enforce_invertibility=False
        )
        sarimax_fit = model.fit(disp=False)
    except Exception as e:
        print(f"  SARIMAX fit error for {family}: {e}")
        continue

    # Save the fitted model to ALL_MODELS_DIR
    safe_name = family.replace(" ", "_").replace("/", "_").replace("\\", "_")
    model_filename = f"sarimax_family_{safe_name}.pkl"
    model_path = os.path.join(ALL_MODELS_DIR, model_filename)
    try:
        sarimax_fit.save(model_path)
        print(f"  Model saved to {model_path}")
    except Exception as e:
        print(f"  Error saving model for {family}: {e}")

    # Forecast last N_TEST days (log scale), then convert back to original scale
    try:
        pred_log = sarimax_fit.get_forecast(steps=N_TEST, exog=exog_test).predicted_mean
        pred_sales = np.expm1(pred_log)
    except Exception as e:
        print(f"  Forecast error for {family}: {e}")
        continue

    # Compute performance metrics on original scale
    actual_sales = test_df["sales"].values
    rmse = np.sqrt(mean_squared_error(actual_sales, pred_sales))
    mae  = mean_absolute_error(actual_sales, pred_sales)
    r2   = r2_score(actual_sales, pred_sales)
    print(f"  Metrics for {family}: RMSE={rmse:.2f}, MAE={mae:.2f}, R²={r2:.3f}")

    performance_list.append({
        "family": family,
        "RMSE": rmse,
        "MAE": mae,
        "R2": r2
    })

# After the loop, create a DataFrame of all performance metrics
perf_df = pd.DataFrame(performance_list)
perf_df

Processing family: AUTOMOTIVE
  Train length: 1656, Test length: 28
  Running auto_arima for parameter tuning...
  Found order: (0, 1, 4), seasonal_order: (1, 0, 1, 7)
  Fitting SARIMAX model...
  Found order: (0, 1, 4), seasonal_order: (1, 0, 1, 7)
  Fitting SARIMAX model...
  Model saved to Models\sarimax_family_AUTOMOTIVE.pkl
  Metrics for AUTOMOTIVE: RMSE=54.01, MAE=38.84, R²=0.579
Processing family: BABY CARE
  Train length: 1656, Test length: 28
  Running auto_arima for parameter tuning...
  Model saved to Models\sarimax_family_AUTOMOTIVE.pkl
  Metrics for AUTOMOTIVE: RMSE=54.01, MAE=38.84, R²=0.579
Processing family: BABY CARE
  Train length: 1656, Test length: 28
  Running auto_arima for parameter tuning...
  Found order: (2, 1, 1), seasonal_order: (1, 0, 1, 7)
  Fitting SARIMAX model...
  Found order: (2, 1, 1), seasonal_order: (1, 0, 1, 7)
  Fitting SARIMAX model...
  Model saved to Models\sarimax_family_BABY_CARE.pkl
  Metrics for BABY CARE: RMSE=4.07, MAE=3.46, R²=-0.091
Pr

Unnamed: 0,family,RMSE,MAE,R2
0,AUTOMOTIVE,54.01362,38.83867,0.579162
1,BABY CARE,4.066891,3.455096,-0.091135
2,BEAUTY,86.148931,69.973589,0.054929
3,BEVERAGES,62899.859675,55026.00805,-2.441777
4,BOOKS,2.120236,1.994549,-4.093057
5,BREAD/BAKERY,2760.089011,2283.606417,0.520756
6,CELEBRATION,189.524901,142.511266,-0.180126
7,CLEANING,19002.601688,13131.21332,-0.178348
8,DAIRY,5781.207215,4510.309199,0.285011
9,DELI,2315.506038,1924.258297,0.294238


In [5]:
# ────────────────────────────────────────────────────────────────────────────────
# 4. SAVE PERFORMANCE METRICS TO CSV
# ────────────────────────────────────────────────────────────────────────────────

perf_csv_path = "performance_results.csv"
perf_df.to_csv(perf_csv_path, index=False)
print(f"Performance results saved to {perf_csv_path}") 
perf_df

Performance results saved to performance_results.csv


Unnamed: 0,family,RMSE,MAE,R2
0,AUTOMOTIVE,54.01362,38.83867,0.579162
1,BABY CARE,4.066891,3.455096,-0.091135
2,BEAUTY,86.148931,69.973589,0.054929
3,BEVERAGES,62899.859675,55026.00805,-2.441777
4,BOOKS,2.120236,1.994549,-4.093057
5,BREAD/BAKERY,2760.089011,2283.606417,0.520756
6,CELEBRATION,189.524901,142.511266,-0.180126
7,CLEANING,19002.601688,13131.21332,-0.178348
8,DAIRY,5781.207215,4510.309199,0.285011
9,DELI,2315.506038,1924.258297,0.294238
