In [None]:
# ============================================================
# HYBRID SARIMA-LSTM: Seasonal Demand Forecasting (Per Category)
# Dataset: consolidated_file_cleaned_v2.csv
# ============================================================

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input
from tensorflow.keras.callbacks import EarlyStopping

# -----------------------------
# 1. Load dataset
# -----------------------------
FILE = "consolidated_file_cleaned_v2.csv"
df = pd.read_csv(FILE)

# Clean and prepare
df['time'] = pd.to_datetime(df['time'])
df = df.sort_values(by='time')
df = df[df['suitable_for_seasonal_analysis'] == True]

# Focus on monthly demand column
df['month'] = df['time'].dt.to_period('M').dt.to_timestamp()
category_data = df.groupby(['month','second-level_category'])['sold/m'].sum().reset_index()

# -----------------------------
# 2. SARIMA fitting helper
# -----------------------------
def safe_sarima_fit(ts):
    """Attempt SARIMA fitting with fallback parameters."""
    sarima_orders = [
        ((1,1,1), (1,1,1,12)),
        ((0,1,1), (1,1,1,12)),
        ((1,1,0), (0,1,1,12)),
        ((0,1,1), (0,1,1,12)),
        ((1,0,0), (1,0,0,12))
    ]
    for order, sorder in sarima_orders:
        try:
            model = SARIMAX(
                ts,
                order=order,
                seasonal_order=sorder,
                enforce_stationarity=False,
                enforce_invertibility=False
            )
            fitted = model.fit(disp=False)
            return fitted
        except Exception:
            continue
    return None

# -----------------------------
# 3. Hybrid SARIMA + LSTM
# -----------------------------
hybrid_results = []
FORECAST_STEPS = 12  # forecast 12 months ahead

for cat in category_data['second-level_category'].unique():
    print(f"\nProcessing Category: {cat}")

    data_cat = category_data[category_data['second-level_category'] == cat]
    ts = data_cat.set_index('month')['sold/m']

    # Skip categories with insufficient variation
    if len(ts) < 18 or ts.sum() == 0 or ts.nunique() == 1:
        print(f"Skipping {cat}: not enough data or variation")
        continue

    # -----------------------------
    # SARIMA Model
    # -----------------------------
    sarima_fit = safe_sarima_fit(ts)
    if sarima_fit is None:
        print(f"⚠️ Skipping {cat}: SARIMA could not fit")
        continue

    sarima_forecast = sarima_fit.forecast(steps=FORECAST_STEPS)
    residuals = sarima_fit.resid

    # -----------------------------
    # LSTM on residuals
    # -----------------------------
    scaler = MinMaxScaler()
    scaled_res = scaler.fit_transform(residuals.values.reshape(-1,1))

    X, y = [], []
    for i in range(12, len(scaled_res)):
        X.append(scaled_res[i-12:i])
        y.append(scaled_res[i])
    X, y = np.array(X), np.array(y)

    lstm_model = Sequential([
        Input(shape=(X.shape[1], 1)),
        LSTM(32, activation='tanh', return_sequences=True),
        LSTM(16, activation='tanh'),
        Dense(1)
    ])
    lstm_model.compile(optimizer='adam', loss='mse')

    es = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
    lstm_model.fit(X, y, epochs=50, batch_size=8, verbose=0, callbacks=[es])

    # Predict future residuals
    last_sequence = scaled_res[-12:]
    residual_preds = []
    seq = last_sequence.copy()
    for _ in range(FORECAST_STEPS):
        inp = seq.reshape(1, 12, 1)
        pred = lstm_model.predict(inp, verbose=0)
        residual_preds.append(pred[0][0])
        seq = np.roll(seq, -1)
        seq[-1] = pred

    residual_forecast = scaler.inverse_transform(np.array(residual_preds).reshape(-1,1)).flatten()

    # -----------------------------
    # Hybrid forecast = SARIMA + LSTM residuals
    # -----------------------------
    hybrid_forecast = sarima_forecast.values + residual_forecast

    forecast_dates = pd.date_range(start=ts.index.max() + pd.offsets.MonthBegin(), 
                                   periods=FORECAST_STEPS, freq='MS')

    for d, f in zip(forecast_dates, hybrid_forecast):
        hybrid_results.append({
            'category': cat,
            'forecast_date': d,
            'predicted_sold_per_month': round(max(f,0),2)
        })

# -----------------------------
# 4. Save to CSV
# -----------------------------
hybrid_df = pd.DataFrame(hybrid_results)
hybrid_df.to_csv("hybrid_sarima_lstm_forecasts.csv", index=False)
print("\n✅ Hybrid forecasts saved to: hybrid_sarima_lstm_forecasts.csv")
hybrid_df.head()


Processing Category: Accessories

Processing Category: Accessories Sets & Packages

Processing Category: Additional Accessories

Processing Category: Alcoholic Beverages

Processing Category: Amplifiers & Mixers

Processing Category: Anklets

Processing Category: Art Supplies

Processing Category: Audio & Video Cables & Converters

Processing Category: Automobile Exterior Accessories

Processing Category: Automobile Interior Accessories

Processing Category: Automobile Spare Parts

Processing Category: Automotive Care

Processing Category: Automotive Keychains & Key Covers

Processing Category: Automotive Oils & Lubes

Processing Category: Automotive Tools

Processing Category: Baby & Kids Accessories

Processing Category: Baby Clothes

Processing Category: Baby Healthcare

Processing Category: Baby Mittens & Footwear

Processing Category: Baby Safety

Processing Category: Baby Travel Essentials

Processing Category: Backpacks

Processing Category: Bag Accessories

Processing Category

Unnamed: 0,category,forecast_date,predicted_sold_per_month
0,Accessories,2025-12-01,39714003.36
1,Accessories,2026-01-01,38966048.66
2,Accessories,2026-02-01,39160454.72
3,Accessories,2026-03-01,40002218.83
4,Accessories,2026-04-01,40203148.68


In [4]:
# --- Evaluation Metrics: RMSE, MAE, MAPE ---
from sklearn.metrics import mean_squared_error, mean_absolute_error

def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    nonzero = y_true != 0
    return np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100 if np.any(nonzero) else np.nan

# Evaluate only for categories with enough data and actuals available
metrics = []
for cat in category_data['second-level_category'].unique():
    data_cat = category_data[category_data['second-level_category'] == cat]
    ts = data_cat.set_index('month')['sold/m']
    if len(ts) < 18 or ts.sum() == 0 or ts.nunique() == 1:
        continue
    # Get actuals for the last FORECAST_STEPS months
    actuals = ts[-FORECAST_STEPS:]
    preds = hybrid_df[hybrid_df['category'] == cat]['predicted_sold_per_month'].values[:len(actuals)]
    if len(actuals) == 0 or len(preds) == 0:
        continue
    rmse = np.sqrt(mean_squared_error(actuals, preds))
    mae = mean_absolute_error(actuals, preds)
    mape = mean_absolute_percentage_error(actuals, preds)
    metrics.append({'category': cat, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape})
    print(f"Category: {cat}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"  MAPE: {mape:.2f}%\n")

metrics_df = pd.DataFrame(metrics)
metrics_df.head()

Category: Accessories
  RMSE: 35551344.6916
  MAE: 34775912.0008
  MAPE: 1194.57%

Category: Accessories Sets & Packages
  RMSE: 83879.2104
  MAE: 69651.4633
  MAPE: 552.45%

Category: Additional Accessories
  RMSE: 249499.8460
  MAE: 128214.0142
  MAPE: 12.26%

Category: Alcoholic Beverages
  RMSE: 88240.6803
  MAE: 69148.8217
  MAPE: 550.37%

Category: Amplifiers & Mixers
  RMSE: 31429.5962
  MAE: 15010.5833
  MAPE: 100.00%

Category: Anklets
  RMSE: 1975.1441
  MAE: 1822.8517
  MAPE: 37.72%

Category: Art Supplies
  RMSE: 7007077.6798
  MAE: 6170425.4125
  MAPE: 802.41%

Category: Audio & Video Cables & Converters
  RMSE: 335463.6950
  MAE: 171788.8792
  MAPE: 1125.70%

Category: Automobile Exterior Accessories
  RMSE: 174531.9704
  MAE: 138294.2700
  MAPE: 111.59%

Category: Automobile Interior Accessories
  RMSE: 247171.4278
  MAE: 228448.1567
  MAPE: 71.84%

Category: Automobile Spare Parts
  RMSE: 2043100.2791
  MAE: 1885847.0550
  MAPE: 537.59%

Category: Automotive Care
  RMSE

Unnamed: 0,category,RMSE,MAE,MAPE
0,Accessories,35551340.0,34775910.0,1194.573836
1,Accessories Sets & Packages,83879.21,69651.46,552.453204
2,Additional Accessories,249499.8,128214.0,12.260387
3,Alcoholic Beverages,88240.68,69148.82,550.373733
4,Amplifiers & Mixers,31429.6,15010.58,100.0


In [5]:
# Display metrics table for first 10 categories
metrics_df = pd.DataFrame(metrics)
print(f"\n✅ Evaluation complete for {len(metrics_df)} categories")
metrics_df.head(10)


✅ Evaluation complete for 214 categories


Unnamed: 0,category,RMSE,MAE,MAPE
0,Accessories,35551340.0,34775910.0,1194.573836
1,Accessories Sets & Packages,83879.21,69651.46,552.453204
2,Additional Accessories,249499.8,128214.0,12.260387
3,Alcoholic Beverages,88240.68,69148.82,550.373733
4,Amplifiers & Mixers,31429.6,15010.58,100.0
5,Anklets,1975.144,1822.852,37.722886
6,Art Supplies,7007078.0,6170425.0,802.406606
7,Audio & Video Cables & Converters,335463.7,171788.9,1125.698528
8,Automobile Exterior Accessories,174532.0,138294.3,111.587263
9,Automobile Interior Accessories,247171.4,228448.2,71.838544
