# Lab 06: Forecast Evaluation
**BSAD 8310: Business Forecasting — University of Nebraska at Omaha**

## Objectives
1. Compute RMSE, MAE, MAPE, and MASE for a set of models
2. Implement walk-forward (rolling-origin) validation
3. Plot horizon accuracy profiles for three model families (Seasonal Naive, SARIMA, ETS)
4. Run the Diebold-Mariano test for equal predictive accuracy
5. Combine forecasts (equal weights, RMSE weights, OLS weights)

## Dataset
- **RSXFS**: Advance Retail Sales — Retail and Food Services (monthly, SA, millions USD)
- Source: FRED (Federal Reserve Bank of St. Louis)
- Fallback: statsmodels `macrodata` (quarterly real GDP, `realgdp`), used automatically when FRED cannot be reached
- Walk-forward: 36 expanding-window origins, H = 12 horizons

In [None]:
# ── 1. Setup ──────────────────────────────────────────────────────────────────
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from pathlib import Path
from scipy import stats

# Seed NumPy's global RNG once so every run of the lab is reproducible.
np.random.seed(42)

# UNO brand colours reused by all figures below.
UNO_BLUE, UNO_RED, UNO_GRAY = '#005CA9', '#E41C38', '#525252'
UNO_GREEN, UNO_ORANGE = '#15803d', '#d97706'

# Shared matplotlib look: high-resolution output, no top/right spines,
# compact font sizes.
_PLOT_STYLE = {
    'figure.dpi': 150,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.titlesize': 11,
    'axes.labelsize': 10,
    'legend.fontsize': 9,
}
plt.rcParams.update(_PLOT_STYLE)

# Output directory for saved figures (sibling of the working directory).
FIGURES = Path('..') / 'Figures'
FIGURES.mkdir(exist_ok=True)
print('Setup complete.')

In [None]:
# ── 2. Load Data ──────────────────────────────────────────────────────────────
# Primary source: the RSXFS series pulled from FRED via pandas_datareader.
# Every statement of the FRED path sits inside the try, so any failure there
# (missing package, download error, index conversion) switches to the
# statsmodels macrodata fallback below.
try:
    import pandas_datareader.data as web
    start, end = '2000-01-01', '2023-12-31'
    rsxfs = web.DataReader('RSXFS', 'fred', start, end)
    series = rsxfs['RSXFS'].dropna()
    # Monthly PeriodIndex; plotting cells later call .to_timestamp() on it.
    series.index = pd.PeriodIndex(series.index, freq='M')
    FREQ = 'M'
    m    = 12  # seasonal period
    print(f'FRED data: {len(series)} months, {series.index[0]} to {series.index[-1]}')
except Exception as e:
    # Deliberate best-effort fallback: report why FRED failed, then continue
    # with the quarterly realgdp column shipped with statsmodels.
    print(f'FRED unavailable ({e}); using statsmodels macrodata fallback.')
    import statsmodels.api as sm
    macro = sm.datasets.macrodata.load_pandas().data
    series = macro['realgdp'].copy()
    # Rebuild a quarterly PeriodIndex starting at 1959Q1, one period per row.
    series.index = pd.period_range('1959Q1', periods=len(series), freq='Q')
    FREQ = 'Q'
    m    = 4  # seasonal period for quarterly data
    print(f'Fallback: {len(series)} quarters, {series.index[0]} to {series.index[-1]}')

# Summary statistics of whichever series was loaded.
print(series.describe().round(1))

In [None]:
# ── 3. Error Metric Functions ─────────────────────────────────────────────────
def rmse(actual, forecast):
    """Root mean squared error between `actual` and `forecast` (array-likes)."""
    residuals = np.subtract(np.asarray(actual), np.asarray(forecast))
    return np.sqrt(np.mean(np.square(residuals)))

def mae(actual, forecast):
    """Mean absolute error between `actual` and `forecast` (array-likes)."""
    abs_residuals = np.absolute(np.asarray(actual) - np.asarray(forecast))
    return np.mean(abs_residuals)

def mape(actual, forecast):
    """Mean absolute percentage error, in percent.

    Observations whose actual value is exactly zero are excluded, because the
    percentage error is undefined there.
    """
    a = np.asarray(actual, float)
    f = np.asarray(forecast, float)
    nonzero = a != 0
    kept_actual = a[nonzero]
    relative_err = (kept_actual - f[nonzero]) / kept_actual
    return 100 * np.mean(np.abs(relative_err))

def mase(actual, forecast, train_series, m=12):
    """Mean absolute scaled error.

    The forecast MAE is divided by the in-sample MAE of a seasonal-naive
    forecast with period `m` computed on `train_series`. Returns NaN when
    that in-sample scale is exactly zero.
    """
    history = np.asarray(train_series)
    # Seasonal-naive in-sample errors: y[t] - y[t - m]
    scale = np.abs(history[m:] - history[:-m]).mean()
    return np.nan if scale == 0 else mae(actual, forecast) / scale

def compute_metrics(actual, forecast, train_series=None, m=12):
    """Score a forecast with RMSE, MAE and MAPE, plus MASE when possible.

    MASE is only added when `train_series` is supplied, since it needs the
    training sample (and seasonal period `m`) for its scaling term.
    """
    scorers = (('RMSE', rmse), ('MAE', mae), ('MAPE', mape))
    scores = {label: scorer(actual, forecast) for label, scorer in scorers}
    if train_series is None:
        return scores
    scores['MASE'] = mase(actual, forecast, train_series, m)
    return scores

# Quick demo: seasonal naive vs. drift on the last 12 periods
hold = series.iloc[-12:]
train_demo = series.iloc[:-12]

# Seasonal naive: repeat the last full season of the training sample across
# the hold-out (tiled when the season is shorter than 12 periods).
last_season = train_demo.values[-m:].astype(float)
naive_fc = np.tile(last_season, -(-12 // m))[:12]

# Drift: last observed value plus h times the average historical change,
# for h = 1..12. The average change is (y_T - y_1) / (T - 1).
last_value = float(train_demo.iloc[-1])
avg_change = (last_value - float(train_demo.iloc[0])) / max(len(train_demo) - 1, 1)
drift_fc = last_value + avg_change * np.arange(1, 13)

print('Demo metrics (last 12 periods):')
for name, fc in [('Seasonal Naive', naive_fc), ('Drift', drift_fc)]:
    m_dict = compute_metrics(hold.values, fc, train_demo.values, m)
    print(f"  {name}: RMSE={m_dict['RMSE']:.0f}  MAE={m_dict['MAE']:.0f}"
          f"  MAPE={m_dict['MAPE']:.1f}%  MASE={m_dict['MASE']:.2f}")

In [None]:
# ── 4. Walk-Forward Validation Framework ──────────────────────────────────────
def walk_forward_eval(series, model_fn, T0, H, window='expanding', window_size=None):
    """
    Evaluate a forecasting callable over successive forecast origins.

    For every origin t in [T0, len(series) - H) the model is fitted on the
    observations before t and its H-step forecast is compared with the
    realised values series[t], ..., series[t + H - 1].

    Parameters
    ----------
    series      : pd.Series holding the full sample
    model_fn    : callable(train_series) -> indexable of forecasts (length H)
    T0          : int, position of the first forecast origin (0-based)
    H           : int, number of horizons scored per origin
    window      : 'expanding' (train on everything before t) or
                  'rolling' (train on the last `window_size` points before t)
    window_size : int, required when window == 'rolling'

    Returns
    -------
    pd.DataFrame with columns: origin, horizon, actual, forecast, error
    (error = actual - forecast). If model_fn raises, or returns fewer than
    H values, the missing forecasts are recorded as NaN.

    Raises
    ------
    ValueError if `window` is unknown or a rolling window has no size.
    """
    rows = []
    n_obs = len(series)
    for origin in range(T0, n_obs - H):
        # Choose where the training slice starts for this origin.
        if window == 'expanding':
            first = 0
        elif window == 'rolling':
            if window_size is None:
                raise ValueError('window_size required for rolling window')
            first = max(0, origin - window_size)
        else:
            raise ValueError(f'Unknown window type: {window}')
        train = series.iloc[first:origin]

        # A failing model must not abort the whole evaluation: score it as NaN.
        try:
            fc = model_fn(train)
        except Exception:
            fc = np.full(H, np.nan)

        for step in range(H):
            observed = float(series.iloc[origin + step])
            predicted = float(fc[step]) if step < len(fc) else np.nan
            rows.append({
                'origin':   origin,
                'horizon':  step + 1,
                'actual':   observed,
                'forecast': predicted,
                'error':    observed - predicted,
            })
    return pd.DataFrame(rows)


# --- Define model callables ---------------------------------------------------
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing

def seasonal_naive_fn(train):
    """Seasonal-naive forecast: repeat the last `m` training values out to `H` steps.

    Reads the module-level seasonal period `m` and horizon `H`.
    """
    season = train.values[-m:]
    n_copies = H // m + 1  # always enough copies to cover H steps
    return np.tile(season, n_copies)[:H]

def sarima_fn(train):
    """Fit SARIMA(1,1,1)(0,1,1)[m] on `train` and return an `H`-step forecast.

    Stationarity and invertibility constraints are both switched off.
    Reads the module-level seasonal period `m` and horizon `H`.
    """
    spec = dict(
        order=(1, 1, 1),
        seasonal_order=(0, 1, 1, m),
        enforce_stationarity=False,
        enforce_invertibility=False,
    )
    fitted = SARIMAX(train.values, **spec).fit(disp=False)
    return fitted.forecast(H)

def ets_fn(train):
    """Fit additive-trend, additive-seasonal exponential smoothing; forecast `H` steps.

    Initial states are estimated and the smoothing parameters optimised.
    Reads the module-level seasonal period `m` and horizon `H`.
    """
    model = ExponentialSmoothing(
        train.values,
        trend='add',
        seasonal='add',
        seasonal_periods=m,
        initialization_method='estimated',
    )
    return model.fit(optimized=True).forecast(H)

# The forecast horizon must be bound before it is interpolated below;
# otherwise a top-to-bottom run of the notebook stops here with a NameError.
# The model callables above read this same global when they are called.
H = 12

print('Model callables defined.')
print(f'Walk-forward config: T0={len(series)-36-H}, H={H}, window=expanding')

In [None]:
# ── 5. Run Walk-Forward for Each Model ────────────────────────────────────────
# Keep the final 36 origins (each scored over H steps) for evaluation;
# everything before T0 is always available as training data.
H = 12
T0 = len(series) - 36 - H

print(f'Running walk-forward: T0={T0}, {len(series)-T0-H} origins, H={H}')

results = {}

# (progress label, key in `results`, model callable), evaluated in this order
for _label, _key, _model_fn in (
    ('Seasonal Naive',       'Seasonal Naive', seasonal_naive_fn),
    ('SARIMA(1,1,1)(0,1,1)', 'SARIMA',         sarima_fn),
    ('ETS (add/add)',        'ETS',            ets_fn),
):
    print(f'{_label}...', end=' ')
    results[_key] = walk_forward_eval(series, _model_fn, T0, H)
    print('done.')

print('All walk-forward evaluations complete.')

In [None]:
# ── 6. Horizon Profiles and Summary Table ─────────────────────────────────────
# Scaling sample for MASE: all observations before the first walk-forward
# origin T0, as a plain array (read by scalar_mase below).
train_for_mase = series.iloc[:T0].values

def horizon_rmse(df):
    """RMSE per forecast horizon, computed on rows without missing values.

    Returns a Series indexed by `horizon`.
    """
    def _group_rmse(group):
        return rmse(group['actual'], group['forecast'])

    complete = df.dropna()
    return complete.groupby('horizon').apply(_group_rmse)

def scalar_mase(df):
    """Single MASE over all complete rows of a walk-forward result frame.

    Scaled with the module-level `train_for_mase` sample and seasonal period `m`.
    """
    complete = df.dropna()
    observed = complete['actual'].values
    predicted = complete['forecast'].values
    return mase(observed, predicted, train_for_mase, m)

# Build horizon RMSE profiles: one Series (indexed by horizon) per model
profiles = {name: horizon_rmse(df) for name, df in results.items()}

# Summary table: RMSE at short, medium and long horizons plus overall MASE
rows = []
for name, df in results.items():
    rows.append({
        'Model':    name,
        'RMSE(h=1)': profiles[name].get(1, np.nan),
        'RMSE(h=3)': profiles[name].get(3, np.nan),
        'RMSE(h=12)':profiles[name].get(12, np.nan),
        'MASE(avg)': scalar_mase(df),
    })

# Round per column: RMSE is in data units (whole numbers are enough), but
# MASE is a ratio near 1, so rounding it to 0 decimals would erase it.
summary = pd.DataFrame(rows).set_index('Model').round({
    'RMSE(h=1)': 0,
    'RMSE(h=3)': 0,
    'RMSE(h=12)': 0,
    'MASE(avg)': 2,
})
print('Horizon accuracy summary:')
print(summary.to_string())

# ── Horizon profile plot ──────────────────────────────────────────────────────
# One RMSE-vs-horizon line per model, saved to FIGURES and shown inline.
fig, ax = plt.subplots(figsize=(8, 4))
# One colour per model, paired by position with `profiles`; zip stops at the
# shorter of the two, so a fourth model would need a fourth colour here.
colors = [UNO_GRAY, UNO_BLUE, UNO_GREEN]
for (name, profile), color in zip(profiles.items(), colors):
    ax.plot(profile.index, profile.values,
            color=color, lw=1.8, marker='o', markersize=4,
            label=name)
ax.set_xlabel('Forecast horizon $h$')
ax.set_ylabel('RMSE')
ax.set_title('Walk-Forward RMSE by Horizon', fontweight='bold')
ax.legend()
# Thousands separators, no decimals, on the y axis.
ax.yaxis.set_major_formatter(mticker.FuncFormatter(
    lambda x, _: f'{x:,.0f}'))
plt.tight_layout()
plt.savefig(FIGURES / 'lecture06_horizon_profile.png', bbox_inches='tight')
plt.show()
print('Horizon profile saved.')

In [None]:
# ── 7. Diebold-Mariano Test ───────────────────────────────────────────────────
import statsmodels.stats.sandwich_covariance as sw

def dm_test(errors1, errors2, h=1, loss='MSE'):
    """
    Diebold-Mariano test for equal predictive accuracy.
    H0: E[g(e1)] = E[g(e2)]  (equal expected loss).

    The loss differential is d_t = g(e1_t) - g(e2_t), so a positive mean
    differential means the first forecast has the larger average loss.

    Parameters
    ----------
    errors1, errors2 : array-like of equal length, walk-forward errors.
                       Pairs in which either error is NaN are dropped.
    h    : forecast horizon; the Newey-West bandwidth is max(1, h - 1),
           capped at n - 1 so every autocovariance has data behind it
    loss : 'MSE' (squared error) or 'MAE' (absolute error)

    Returns
    -------
    DM statistic, two-sided p-value (standard normal), mean loss differential.
    With fewer than two usable pairs the statistic and p-value are NaN.

    Raises
    ------
    ValueError if `loss` is neither 'MSE' nor 'MAE'.
    """
    e1 = np.asarray(errors1, float)
    e2 = np.asarray(errors2, float)
    usable = ~(np.isnan(e1) | np.isnan(e2))
    e1, e2 = e1[usable], e2[usable]

    if loss == 'MSE':
        d = e1**2 - e2**2
    elif loss == 'MAE':
        d = np.abs(e1) - np.abs(e2)
    else:
        raise ValueError('loss must be MSE or MAE')

    n = len(d)
    if n < 2:
        # A variance cannot be estimated from zero or one differential.
        d_bar = d.mean() if n else np.nan
        return np.nan, np.nan, d_bar

    d_bar = d.mean()

    # Newey-West long-run variance with Bartlett weights.
    bw = min(max(1, h - 1), n - 1)
    gamma0 = np.var(d, ddof=0)
    gamma_sum = sum(
        (1 - k / (bw + 1)) * np.mean((d[k:] - d_bar) * (d[:-k] - d_bar))
        for k in range(1, bw + 1)
    )
    lrv = gamma0 + 2 * gamma_sum
    # Floor the variance so a zero/negative estimate cannot divide by zero.
    se = np.sqrt(max(lrv, 1e-10) / n)

    dm_stat = d_bar / se
    # Survival function keeps precision in the far tail, where 1 - cdf
    # would round to exactly zero.
    p_val = 2 * stats.norm.sf(abs(dm_stat))
    return dm_stat, p_val, d_bar


# --- Pairwise DM tests on the one-step-ahead errors ---------------------------
print('Diebold-Mariano test results (MSE loss, h=1):')
print('-' * 60)
pairs = [
    ('SARIMA', 'ETS'),
    ('SARIMA', 'Seasonal Naive'),
    ('ETS',    'Seasonal Naive'),
]
for m1, m2 in pairs:
    # One-step-ahead errors of each model, in origin order
    e1 = results[m1].loc[results[m1]['horizon'] == 1, 'error'].values
    e2 = results[m2].loc[results[m2]['horizon'] == 1, 'error'].values
    dm, pv, dbar = dm_test(e1, e2, h=1)
    # Conventional significance stars
    if pv < 0.01:
        sig = '***'
    elif pv < 0.05:
        sig = '**'
    elif pv < 0.10:
        sig = '*'
    else:
        sig = ''
    print(f'  {m1} vs {m2}: DM={dm:.2f}, p={pv:.4f} {sig}, d_bar={dbar:.0f}')
print('Significance: *** p<0.01, ** p<0.05, * p<0.10')

# --- Loss differential plot for SARIMA vs ETS --------------------------------
# One-step-ahead errors of the two models, then their squared-error
# differential per origin (positive = SARIMA had the larger squared error).
e_sar = results['SARIMA'].query('horizon == 1')['error'].values
e_ets = results['ETS'].query('horizon == 1')['error'].values
min_len = min(len(e_sar), len(e_ets))
d_vals  = e_sar[:min_len]**2 - e_ets[:min_len]**2
# NOTE(review): a NaN error (failed fit) would make this mean NaN — confirm
# whether NaNs should be dropped here as dm_test does.
d_bar_  = d_vals.mean()

fig, ax = plt.subplots(figsize=(9, 3))
# Red bars where the differential is positive, green otherwise.
colors_bar = [UNO_RED if v > 0 else UNO_GREEN for v in d_vals]
# x axis counts origins 1..len(d_vals) rather than using the origin index.
ax.bar(np.arange(1, len(d_vals) + 1), d_vals,
       color=colors_bar, alpha=0.75, width=0.8)
# Dashed line marks the mean differential.
ax.axhline(d_bar_, color=UNO_BLUE, lw=2, ls='--',
           label=f'$\\bar{{d}}$ = {d_bar_:,.0f}')
ax.axhline(0, color='black', lw=0.6)
ax.set_xlabel('Walk-forward origin')
ax.set_ylabel('$d_t = e^2_{SARIMA} - e^2_{ETS}$')
ax.set_title('Loss Differential: SARIMA vs. ETS (h=1)', fontweight='bold')
ax.legend()
# Show values of a million or more as e.g. "1.2M"; smaller ones with commas.
ax.yaxis.set_major_formatter(mticker.FuncFormatter(
    lambda x, _: f'{x/1e6:.1f}M' if abs(x) >= 1e6 else f'{x:,.0f}'))
plt.tight_layout()
plt.savefig(FIGURES / 'lecture06_dm_plot.png', bbox_inches='tight')
plt.show()
print('Loss differential plot saved.')

In [None]:
# ── 8. Forecast Combination ───────────────────────────────────────────────────
# Pivot to (origin, horizon) → forecast matrix
def get_forecast_matrix(results_dict, horizon=1):
    """Collect each model's forecasts at one horizon into a single frame.

    Returns
    -------
    (forecasts, actuals): a DataFrame indexed by origin with one column per
    model, and the actual values (taken from the last model in the dict)
    indexed the same way.
    """
    columns = {}
    for model_name, frame in results_dict.items():
        at_horizon = frame.loc[frame['horizon'] == horizon,
                               ['origin', 'forecast', 'actual']]
        at_horizon = at_horizon.set_index('origin')
        columns[model_name] = at_horizon['forecast']
        actuals = at_horizon['actual']  # identical for every model
    return pd.DataFrame(columns), actuals


def equal_weight_combo(fc_df):
    """Simple average of the model forecasts in each row, as a NumPy array."""
    row_means = fc_df.mean(axis='columns')
    return row_means.to_numpy()


def rmse_weight_combo(fc_df, errors_dict, horizon=1):
    """Combine forecasts with weights proportional to 1 / RMSE.

    Each model's RMSE is computed from its errors at `horizon` in
    `errors_dict`; models whose RMSE is not strictly positive are left out.
    Weights are normalised to sum to one.

    Returns
    -------
    (combined forecast as NumPy array, pd.Series of weights by model name)
    """
    inverse_rmse = {}
    for model_name, frame in errors_dict.items():
        errs = frame.loc[frame['horizon'] == horizon, 'error'].values
        score = rmse(errs, np.zeros_like(errs))
        if score > 0:
            inverse_rmse[model_name] = 1 / score
    weights = pd.Series(inverse_rmse)
    weights = weights / weights.sum()
    combo = sum(fc_df[model_name] * weight for model_name, weight in weights.items())
    return combo.values, weights


def ols_weight_combo(fc_df, actuals):
    """Combine forecasts with least-squares weights (no intercept).

    The weights are fitted on the same rows they are applied to, so the
    returned combination is an in-sample fit.

    Returns
    -------
    (combined forecast as NumPy array, weight vector in column order)
    """
    design = fc_df.to_numpy()
    target = actuals.to_numpy()
    coef = np.linalg.lstsq(design, target, rcond=None)[0]
    return design @ coef, coef


# --- Evaluate combinations at h=1 --------------------------------------------
# Keep only origins where every model produced a forecast, and align the
# actuals to exactly those origins.
fc_df, actuals = get_forecast_matrix(results, horizon=1)
fc_df = fc_df.dropna()
actuals = actuals.loc[fc_df.index]

equal_fc = equal_weight_combo(fc_df)
rmse_fc, rmse_w = rmse_weight_combo(fc_df, results, horizon=1)
ols_fc,  ols_w  = ols_weight_combo(fc_df, actuals)

print('Combination results (h=1):')
print(f'RMSE weights: {dict(rmse_w.round(3))}')
print(f'OLS weights:  {dict(zip(fc_df.columns, ols_w.round(3)))}')
print()

# SARIMA is read from the same filtered matrix as the combinations. Querying
# `results` again and truncating by length could pair forecasts with the
# wrong origins whenever another model had a missing forecast.
sarima_fc = fc_df['SARIMA'].values

combo_metrics = {}
for name, fc in [
    ('Best individual (SARIMA)', sarima_fc),
    ('Equal weight',             equal_fc),
    ('RMSE weight',              rmse_fc),
    ('OLS weight',               ols_fc),
]:
    combo_metrics[name] = compute_metrics(
        actuals.values, fc,
        series.iloc[:T0].values, m)

combo_df = pd.DataFrame(combo_metrics).T.round(1)
print('Combination accuracy comparison:')
print(combo_df.to_string())

# Run DM: equal-weight combo vs. best individual, on identical origins.
# Errors follow the walk-forward convention: actual - forecast.
print('\nDM test: Equal weight vs. SARIMA (h=1):')
e_sarima = actuals.values - sarima_fc
e_combo  = actuals.values - equal_fc
dm, pv, dbar = dm_test(e_sarima, e_combo, h=1)
print(f'  DM={dm:.2f}, p={pv:.4f}, d_bar={dbar:.0f}')
# dbar > 0 means SARIMA's squared error exceeds the combination's on average.
conclusion = 'Combination significantly better' if pv < 0.10 and dbar > 0 else 'No significant difference'
print(f'  Conclusion: {conclusion}')

In [None]:
# ── 9. Final Forecast Visualization ───────────────────────────────────────────
# Single figure: history, realised values over the last forecast window, each
# model's forecast from the final origin, and their equal-weight average.
fig, ax = plt.subplots(figsize=(11, 5))

# Historical (last 48 periods before the walk-forward window)
# NOTE(review): assumes T0 >= 48; a smaller T0 makes the start index negative
# and the slice would no longer be "the 48 periods before T0" — confirm.
# NOTE(review): history stops at T0 while the forecasts start at the last
# origin, so the periods in between are not drawn — confirm this is intended.
hist = series.iloc[T0-48:T0]
ax.plot(hist.index.to_timestamp(), hist.values,
        color=UNO_GRAY, lw=1.2, label='History')

# Final-origin forecasts for all models (last origin in walk-forward)
last_origin = max(r['origin'].max() for r in results.values())
# The H periods starting at the last origin, and what actually happened.
test_idx    = series.index[last_origin: last_origin + H]
actuals_final = series.iloc[last_origin: last_origin + H].values

ax.plot(test_idx.to_timestamp(), actuals_final,
        color='black', lw=2, ls='-', label='Actual')

# Colours are paired with models by position in `results`.
model_colors = [UNO_GRAY, UNO_BLUE, UNO_GREEN]
for (name, color) in zip(results.keys(), model_colors):
    # Rows for the final origin, ordered h = 1..H to line up with test_idx.
    fc_row = results[name].query('origin == @last_origin').sort_values('horizon')
    ax.plot(test_idx.to_timestamp(), fc_row['forecast'].values,
            color=color, lw=1.4, ls='-', label=name)

# Equal-weight combination for final origin
fc_final_df = pd.DataFrame({
    name: results[name].query('origin == @last_origin').sort_values('horizon')['forecast'].values
    for name in results
})
# Row-wise mean across models = equal-weight combined forecast per horizon.
combo_final = fc_final_df.mean(axis=1).values
ax.plot(test_idx.to_timestamp(), combo_final,
        color=UNO_RED, lw=2, ls='--', label='Equal-weight combo')

# Dotted vertical line marks the first forecasted period.
ax.axvline(test_idx[0].to_timestamp(), color='black', lw=0.8, ls=':')
ax.set_title('Retail Sales: Walk-Forward Final-Origin Forecasts',
             fontsize=13, fontweight='bold')
ax.set_ylabel('USD millions')
ax.legend(ncol=3, fontsize=8)
# Thousands separators, no decimals, on the y axis.
ax.yaxis.set_major_formatter(mticker.FuncFormatter(
    lambda x, _: f'{x:,.0f}'))
plt.tight_layout()
plt.savefig(FIGURES / 'lecture06_forecasts.png', bbox_inches='tight')
plt.show()
print('Final forecast plot saved.')

print('\n── Discussion questions ──')
# Prompts for the in-class discussion; question 5 continues on a second line.
questions = [
    '1. Which metric should a retailer use if stockout costs 5x overstock?',
    '2. Should you use rolling or expanding window for post-COVID retail data?',
    '3. SARIMA DM p=0.12 vs ETS: do you switch models? What would change your mind?',
    '4. OLS weights perform worse than equal weights — what causes this?',
    '5. The equal-weight combination is not significantly better than SARIMA.',
    '   Is combination still useful? Why or why not?',
]
# One prompt per output line.
print(*questions, sep='\n')