In [2]:
import os
import warnings
import pandas as pd
import numpy as np
import statsmodels.api as sm
from tqdm import tqdm

# NOTE: this blanket filter also hides statsmodels convergence warnings, so a
# fit that completes below has not necessarily converged.
warnings.filterwarnings("ignore")

FILE = "consolidated_file_cleaned_v2.csv"
TARGET = "sold/m"
DATE_COL = "time"
CATEGORY_COL = "second-level_category"
MIN_OBS = 18  # series with fewer monthly slots than this are skipped

# ========== Load & preprocess ==========
df = pd.read_csv(FILE)
df[DATE_COL] = pd.to_datetime(df[DATE_COL])
df = df[[DATE_COL, CATEGORY_COL, TARGET]].dropna()

# Values that cannot be parsed as numbers are coerced to NaN and then counted as 0.
df[TARGET] = pd.to_numeric(df[TARGET], errors="coerce").fillna(0)

# Aggregate to one row per (category, month-start).
monthly = df.groupby([CATEGORY_COL, pd.Grouper(key=DATE_COL, freq="MS")])[TARGET].sum().reset_index()

# Distinct categories; one model is fitted per category below.
categories = monthly[CATEGORY_COL].unique()

os.makedirs("sarima_results", exist_ok=True)

all_resids = []
# (category, error description) for every category whose fit or save raised,
# so failures are visible instead of being dropped silently.
failed_categories = []

print("\n=== Training SARIMA per Category ===\n")
for cat in tqdm(categories):
    # asfreq("MS") inserts NaN rows for months with no data inside the span.
    series = monthly[monthly[CATEGORY_COL] == cat].set_index(DATE_COL)[TARGET].asfreq("MS")

    # Skip extremely short series
    if len(series) < MIN_OBS:
        continue

    # Fixed (1,1,1)x(1,1,1,12) specification for every category; the orders
    # are NOT selected automatically. Adjust them here if needed for accuracy.
    try:
        model = sm.tsa.statespace.SARIMAX(
            series,
            order=(1, 1, 1),
            seasonal_order=(1, 1, 1, 12),
            enforce_stationarity=False,
            enforce_invertibility=False,
        ).fit(disp=False)

        # A path separator inside the category name would point into a
        # non-existent sub-directory and make save() fail.
        safe_name = str(cat).replace("/", "_").replace(os.sep, "_")
        model.save(f"sarima_results/{safe_name}.pkl")

        resid = model.resid.dropna()
        all_resids.append(
            pd.DataFrame({
                CATEGORY_COL: cat,
                "date": resid.index,
                "residual": resid.values,
            })
        )
    except Exception as exc:  # best-effort per category, but keep a record
        failed_categories.append((cat, f"{type(exc).__name__}: {exc}"))

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the same columns when no category produced residuals.
if all_resids:
    res_df = pd.concat(all_resids, ignore_index=True)
else:
    res_df = pd.DataFrame(columns=[CATEGORY_COL, "date", "residual"])
res_df.to_csv("sarima_residuals.csv", index=False)

print("\n✨ SARIMA Training Complete!")
print("Residuals file saved as: sarima_residuals.csv")

if failed_categories:
    print(f"\n{len(failed_categories)} categories failed; first few:")
    for failed_cat, reason in failed_categories[:5]:
        print(f"  {failed_cat}: {reason}")


=== Training SARIMA per Category ===



100%|██████████| 215/215 [00:11<00:00, 19.32it/s]


✨ SARIMA Training Complete!
Residuals file saved as: sarima_residuals.csv





In [None]:
# --- Evaluation Metrics: RMSE, MAE, MAPE ---
from sklearn.metrics import mean_squared_error, mean_absolute_error

def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, in percent, over non-zero actuals.

    Positions where ``y_true`` equals 0 are excluded because the relative
    error is undefined there.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The MAPE scaled by 100, or ``np.nan`` when ``y_true`` has no
        non-zero entry.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    usable = actual != 0

    if not np.any(usable):
        return np.nan

    relative_error = (actual[usable] - predicted[usable]) / actual[usable]
    return np.mean(np.abs(relative_error)) * 100

# Store in-sample fit metrics for each category. Residuals are observed minus
# fitted values, so these describe fit quality, not out-of-sample accuracy.
sarima_metrics = []
# (category, error description) for every category that raised, so failures
# are reported instead of silently producing an empty metrics table.
metric_failures = []

for cat in tqdm(categories):
    series = monthly[monthly[CATEGORY_COL] == cat].set_index(DATE_COL)[TARGET].asfreq("MS")
    # Skip extremely short series
    if len(series) < 18:
        continue
    try:
        model = sm.tsa.statespace.SARIMAX(
            series,
            order=(1, 1, 1),
            seasonal_order=(1, 1, 1, 12),
            enforce_stationarity=False,
            enforce_invertibility=False,
        ).fit(disp=False)

        # A path separator inside the category name would point into a
        # non-existent sub-directory and make save() fail.
        safe_name = str(cat).replace("/", "_").replace(os.sep, "_")
        model.save(f"sarima_results/{safe_name}.pkl")

        # Residuals are deliberately NOT appended to `all_resids` here: that
        # list is filled by the training cell, and appending again would
        # duplicate every residual each time this cell is run.
        resid = model.resid.dropna()
        if resid.empty:
            metric_failures.append((cat, "no non-missing residuals"))
            continue

        # --- Evaluation ---
        # Align by timestamp rather than taking the last len(resid) rows:
        # dropped residuals are not necessarily at the start of the series.
        y_true = series.loc[resid.index]
        y_pred = y_true - resid
        # np.sqrt(MSE) instead of squared=False, which newer scikit-learn
        # releases no longer accept.
        rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
        mae = mean_absolute_error(y_true, y_pred)
        mape = mean_absolute_percentage_error(y_true, y_pred)
        sarima_metrics.append({'category': cat, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape})
        print(f"Category: {cat}")
        print(f"  RMSE: {rmse:.4f}")
        print(f"  MAE: {mae:.4f}")
        print(f"  MAPE: {mape:.2f}%\n")
    except Exception as exc:  # best-effort per category, but keep a record
        metric_failures.append((cat, f"{type(exc).__name__}: {exc}"))

# Explicit columns keep the frame well-formed even when no category succeeded.
metrics_df = pd.DataFrame(sarima_metrics, columns=['category', 'RMSE', 'MAE', 'MAPE'])

if metric_failures:
    print(f"{len(metric_failures)} categories produced no metrics; first few:")
    for failed_cat, reason in metric_failures[:5]:
        print(f"  {failed_cat}: {reason}")

metrics_df.head()