# Lab 02: Regression-Based Forecasting
**BSAD 8310: Business Forecasting — University of Nebraska at Omaha**

## Objectives
1. Fit three regression-based forecasting models to US retail sales:
   - Model 1: Linear trend regression
   - Model 2: Trend + seasonal dummy variables
   - Model 3: AR($p$) with BIC lag selection
2. Compute 95% prediction intervals for the trend model
3. Compare out-of-sample (OOS) accuracy against Lab 01 benchmarks (naïve, seasonal naïve, mean, drift)
4. Diagnose residuals using the ACF

## Dataset
US Advance Retail Sales (RSXFS) from FRED — same series as Lab 01 for continuity.
Monthly. Note: FRED publishes `RSXFS` seasonally adjusted and in millions of dollars; the not-seasonally-adjusted counterpart is `RSXFSN`.

## Key notation (matches lecture slides)
- $y_t$: retail sales at time $t$
- $\hat{y}_{T+h|T}$: forecast of $y_{T+h}$ made at time $T$
- $e_t = y_t - \hat{y}_{t|t-1}$: forecast error
- $T$: last training observation; $H=24$: test horizon (2 years)

In [None]:
# =============================================================================
# Section 1: Setup
# =============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from pathlib import Path
from statsmodels.tsa.stattools import acf
from statsmodels.graphics.tsaplots import plot_acf
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so any random draws repeat across runs
np.random.seed(42)

# Figures are written to a "Figures" folder next to the working directory's parent
ROOT = Path('..').resolve()
FIG_DIR = ROOT / 'Figures'
FIG_DIR.mkdir(exist_ok=True)

# UNO color palette: primary tones first, then the light tints
UNO_BLUE, UNO_RED, UNO_GRAY, UNO_GREEN = '#005CA9', '#E41C38', '#525252', '#15803d'
UNO_LBLUE, UNO_LRED, UNO_LGREEN = '#E8F0FA', '#FDECEA', '#F0FAF4'

# Matplotlib defaults shared by every figure in this lab
plt.rcParams.update({
    # Canvas and font
    'figure.dpi': 150,
    'figure.figsize': (10, 4),
    'font.family': 'sans-serif',
    # Drop the top and right frame lines
    **{f'axes.spines.{side}': False for side in ('top', 'right')},
    # Default line colors follow the palette
    'axes.prop_cycle': plt.cycler('color', [UNO_BLUE, UNO_RED, UNO_GREEN, UNO_GRAY]),
    # Text sizes
    'axes.labelsize': 11,
    **{f'{axis}.labelsize': 9 for axis in ('xtick', 'ytick')},
})
print('Setup complete.')

In [None]:
# =============================================================================
# Section 2: Load Data
# =============================================================================
# Primary: FRED RSXFS via pandas_datareader
# Fallback: AirPassengers from statsmodels (same structure as Lab 01)
#
# Units: FRED reports RSXFS in millions of dollars, while this notebook labels
# the series and every axis as "bn USD", so the values are rescaled to billions
# right after download.
# NOTE(review): FRED lists RSXFS as the seasonally adjusted series; RSXFSN is
# the not-seasonally-adjusted variant. Confirm which one the seasonal-dummy
# exercise is meant to use.

try:
    import pandas_datareader.data as web
    from datetime import datetime
    raw = web.DataReader('RSXFS', 'fred', datetime(2000, 1, 1), datetime(2023, 12, 31))
    # Millions -> billions so the data agree with the 'bn USD' labels below
    y = raw['RSXFS'].dropna() / 1000.0
    series_name = 'US Advance Retail Sales (RSXFS, bn USD)'
    print(f'Loaded FRED RSXFS: {len(y)} monthly observations ({y.index[0].date()} – {y.index[-1].date()})')
except Exception as e:
    # Deliberate best-effort: any failure (missing package, no network, API
    # change) is reported and the lab continues on a bundled teaching dataset.
    print(f'FRED unavailable ({e}). Falling back to AirPassengers.')
    from statsmodels.datasets import get_rdataset
    ap = get_rdataset('AirPassengers').data
    y = pd.Series(
        ap['value'].values,
        index=pd.date_range('1949-01', periods=len(ap), freq='MS'),
        name='AirPassengers'
    )
    series_name = 'International Air Passengers (thousands)'
    print(f'Loaded AirPassengers: {len(y)} monthly observations')

print(f'Series: {series_name}')
print(f'Mean: {y.mean():.1f},  Std: {y.std():.1f},  Min: {y.min():.1f},  Max: {y.max():.1f}')

In [None]:
# =============================================================================
# Section 3: Train / Test Split
# =============================================================================
# Hold out the final H months; everything before them is training data
H = 24
T = len(y) - H

y_train, y_test = y.iloc[:T], y.iloc[T:]

# 1-based integer time index (lecture notation): t = 1..T for training,
# t = T+1..T+H for the test window
t_train, t_test = np.split(np.arange(1, T + H + 1), [T])

# Calendar month of each observation (1=Jan, ..., 12=Dec) for the seasonal dummies
month_train, month_test = y_train.index.month, y_test.index.month

print(f'Train: {T} obs  ({y_train.index[0].date()} – {y_train.index[-1].date()})')
print(f'Test:  {H} obs  ({y_test.index[0].date()} – {y_test.index[-1].date()})')

# Visual sanity check of where the split falls
fig, ax = plt.subplots(figsize=(11, 3.5))
for segment, segment_color, segment_label in (
    (y_train, UNO_BLUE, 'Training'),
    (y_test, UNO_RED, 'Test (H=24)'),
):
    ax.plot(segment.index, segment, color=segment_color, lw=1.5, label=segment_label)
ax.axvline(y_train.index[-1], color=UNO_GRAY, ls='--', lw=1)  # last training date
ax.set_title('Train / Test Split', fontsize=12, fontweight='bold')
if 'RSXFS' in series_name:
    ax.set_ylabel('Sales (bn USD)')
else:
    ax.set_ylabel('Passengers')
ax.legend(frameon=False)
plt.tight_layout()
plt.savefig(FIG_DIR / 'lecture02_split.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# =============================================================================
# Section 4: Model 1 — Linear Trend Regression
# =============================================================================
# y_t = beta_0 + beta_1 * t + eps_t

from numpy.linalg import lstsq
from scipy import stats as scipy_stats

def build_trend_matrix(t):
    """Return the (len(t), 2) design matrix for a linear trend.

    Column 0 is the intercept (all ones); column 1 holds the time index t.
    """
    design = np.empty((len(t), 2))
    design[:, 0] = 1.0
    design[:, 1] = t
    return design

# Design matrices for the estimation window and the forecast window
X_trend_train, X_trend_test = (build_trend_matrix(idx) for idx in (t_train, t_test))

# Least-squares coefficients [beta_0, beta_1]; lstsq avoids forming (X'X)^{-1}
# explicitly for the point estimates
beta_trend = lstsq(X_trend_train, y_train.values, rcond=None)[0]

# In-sample residuals and their standard error with T - 2 degrees of freedom
# (two estimated coefficients: intercept and slope)
resid_trend = y_train.values - X_trend_train @ beta_trend
s_e = np.sqrt(np.square(resid_trend).sum() / (T - 2))

# Point forecasts for t = T+1, ..., T+H
y_hat_trend = X_trend_test @ beta_trend

# 95% prediction intervals:
#   y_hat +/- t_{0.975, T-2} * s_e * sqrt(1 + x'(X'X)^{-1}x)
XtX_inv = np.linalg.inv(X_trend_train.T @ X_trend_train)
t_crit  = scipy_stats.t.ppf(0.975, df=T - 2)

# x'(X'X)^{-1}x for each forecast row; it grows as t moves away from the
# training window, which widens the interval
leverage = np.array([row @ XtX_inv @ row for row in X_trend_test])
pi_half = t_crit * s_e * np.sqrt(1 + leverage)
pi_lower, pi_upper = y_hat_trend - pi_half, y_hat_trend + pi_half

print(f'beta_0 (intercept): {beta_trend[0]:.2f}')
print(f'beta_1 (trend):     {beta_trend[1]:.4f}  (avg monthly change)')
print(f'Residual std:       {s_e:.2f}')

In [None]:
# =============================================================================
# Section 5: Model 2 — Trend + Seasonal Dummies
# =============================================================================
# y_t = beta_0 + beta_1*t + gamma_2*D2 + ... + gamma_12*D12 + eps_t
# Base category: January (month 1); include dummies for months 2-12.

def build_trend_seasonal_matrix(t, months):
    """
    Design matrix: [1, t, D2, D3, ..., D12]

    Parameters
    ----------
    t : array-like of length n
        Time index used for the trend column.
    months : array-like of length n
        Month numbers (1=Jan, ..., 12=Dec). Any array-like is accepted
        (list, NumPy array, pandas Index); January is the base category
        and therefore has no dummy column.

    Returns
    -------
    numpy.ndarray of shape (n, 13)
        Column 0 is the intercept, column 1 the trend, and column j
        (j = 2..12) is 1.0 where the month equals j and 0.0 elsewhere.

    Raises
    ------
    ValueError
        If t and months are not one-dimensional sequences of equal length.
    """
    t = np.asarray(t, dtype=float)
    # Coerce so that the comparison below is element-wise even for plain lists
    # (a list compared to an int would collapse to a single Python bool)
    months = np.asarray(months)
    if t.ndim != 1 or months.shape != t.shape:
        raise ValueError('t and months must be 1-D sequences of the same length')

    X = np.zeros((t.size, 13))     # intercept + trend + 11 seasonal dummies
    X[:, 0] = 1.0                  # intercept
    X[:, 1] = t                    # trend
    # Column j flags month j; broadcasting fills all 11 dummy columns at once
    X[:, 2:] = months[:, None] == np.arange(2, 13)
    return X

# Design matrices: intercept, trend, and the Feb..Dec dummies
X_seas_train = build_trend_seasonal_matrix(t_train, month_train)
X_seas_test  = build_trend_seasonal_matrix(t_test,  month_test)

# Least-squares fit; keep only the coefficient vector from lstsq's result tuple
beta_seas = lstsq(X_seas_train, y_train.values, rcond=None)[0]

# Out-of-sample point forecasts and in-sample residuals
y_hat_seas = X_seas_test @ beta_seas
resid_seas = y_train.values - X_seas_train @ beta_seas

# Report each gamma_j, i.e. the month's shift relative to the January baseline
months_label = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]
print('Seasonal effects (deviation from January baseline):')
print('  Jan (base): 0.00')
# beta_seas[2:] lines up with Feb..Dec because columns 0-1 are intercept and trend
for month_name, gamma in zip(months_label[1:], beta_seas[2:]):
    print(f'  {month_name}: {gamma:+.2f}')

In [None]:
# =============================================================================
# Section 6: Model 3 — AR(p) with BIC Lag Selection
# =============================================================================
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.stattools import arma_order_select_ic

# Search over p = 1, ..., 24 using BIC.
# Every candidate is fit with hold_back=p_max so that all of them are estimated
# on the same observations (t = p_max+1, ..., T). Without it, AR(p) drops only
# its own first p observations, each model sees a different sample, and the
# information criteria are not comparable across lag orders.
p_max = 24
bic_values = {}
for p in range(1, p_max + 1):
    try:
        ar_fit = AutoReg(y_train, lags=p, hold_back=p_max, old_names=False).fit()
    except (ValueError, np.linalg.LinAlgError) as err:
        # Best-effort search: report the lag order that could not be estimated
        # (e.g. too few observations) instead of hiding it, then move on.
        print(f'AR({p}) could not be estimated and is skipped: {err}')
        continue
    bic_values[p] = ar_fit.bic

if not bic_values:
    raise RuntimeError('No AR(p) model could be estimated; cannot select a lag order.')

p_star = min(bic_values, key=bic_values.get)
print(f'BIC-selected lag order: p* = {p_star}')

# Refit with optimal p on the full training sample (default hold_back, so only
# the first p_star observations are lost to the lags)
ar_model = AutoReg(y_train, lags=p_star, old_names=False).fit()
print(ar_model.summary().tables[0])  # brief summary

# Out-of-sample forecast (recursive, H steps ahead from T)
# AutoReg.predict with dynamic=True implements recursive substitution
ar_forecasts = ar_model.predict(
    start=len(y_train),
    end=len(y_train) + H - 1,
    dynamic=True
)
y_hat_ar = ar_forecasts.values

# Plot BIC profile
fig, ax = plt.subplots(figsize=(7, 3))
ax.plot(list(bic_values.keys()), list(bic_values.values()),
        color=UNO_BLUE, lw=1.5, marker='o', ms=3)
ax.axvline(p_star, color=UNO_RED, ls='--', lw=1.5,
           label=f'BIC-optimal: p={p_star}')
ax.set_xlabel('AR lag order $p$')
ax.set_ylabel('BIC')
ax.set_title('BIC Profile for AR Lag Selection', fontweight='bold')
ax.legend(frameon=False)
plt.tight_layout()
plt.savefig(FIG_DIR / 'lecture02_bic_profile.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# =============================================================================
# Section 7: Metrics — All Models vs. Lab 01 Benchmarks
# =============================================================================

def compute_metrics(y_true, y_pred, label):
    """RMSE, MAE, MAPE for a forecast vs. actuals.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Forecasts aligned element-wise with y_true.
    label : str
        Model name stored under the 'Model' key.

    Returns
    -------
    dict
        Keys 'Model', 'RMSE', 'MAE' and 'MAPE (%)'. MAPE is averaged over
        the observations whose actual value is non-zero, and is NaN when
        every actual is zero (the percentage error is undefined there).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    e = y_true - y_pred
    rmse = np.sqrt(np.mean(e**2))
    mae  = np.mean(np.abs(e))

    # MAPE: guard against zero actuals. Dividing by zero would give inf/nan
    # and poison the mean, so those observations are excluded from the average.
    nonzero = y_true != 0
    with np.errstate(divide='ignore', invalid='ignore'):
        ape = np.abs(e / y_true)
    mape = np.mean(ape[nonzero]) * 100 if nonzero.any() else np.nan
    return {'Model': label, 'RMSE': rmse, 'MAE': mae, 'MAPE (%)': mape}

# Actuals over the test window as a plain array
y_true = y_test.values

# ---- Lab 01 benchmarks (re-implemented for completeness) -------------------
# Naive: carry the last training value forward for all H steps
y_hat_naive = np.full(H, y_train.iloc[-1])

# Seasonal naive: cycle through the last m training values, so each forecast
# month reuses the most recent observation of that same month (m=12)
m = 12
y_hat_snv = y_train.iloc[-m:].to_numpy()[np.arange(H) % m]

# Historical mean of the training sample
y_hat_mean = np.full(H, y_train.mean())

# Random walk + drift: extend the straight line joining first and last
# training observations
drift = (y_train.iloc[-1] - y_train.iloc[0]) / (T - 1)
y_hat_drift = y_train.iloc[-1] + drift * np.arange(1, H + 1)

# ---- Collect all results ---------------------------------------------------
# Benchmarks first, then the three Lab 02 regression models
forecasts = [
    (y_hat_naive, 'Naïve'),
    (y_hat_snv,   'Seasonal Naïve'),
    (y_hat_mean,  'Historical Mean'),
    (y_hat_drift, 'RW + Drift'),
    (y_hat_trend, 'Trend (L2)'),
    (y_hat_seas,  'Trend + Seasonal (L2)'),
    (y_hat_ar,    f'AR({p_star}) BIC (L2)'),
]
results = [compute_metrics(y_true, forecast, name) for forecast, name in forecasts]

results_df = pd.DataFrame(results).set_index('Model').round(2)

# Print the table and name the winner on RMSE and on MAE
print('\n=== Out-of-Sample Forecast Accuracy (H=24) ===')
print(results_df.to_string())
print(f'\nBest RMSE: {results_df["RMSE"].idxmin()} ({results_df["RMSE"].min():.2f})')
print(f'Best MAE:  {results_df["MAE"].idxmin()} ({results_df["MAE"].min():.2f})')

In [None]:
# =============================================================================
# Section 8: Prediction Intervals + Forecast Plot
# =============================================================================

fig, ax = plt.subplots(figsize=(12, 4.5))

# Training and test actuals (only the last 48 training months, to keep the
# forecast window readable)
ax.plot(y_train.index[-48:], y_train.iloc[-48:],
        color=UNO_BLUE, lw=1.8, label='Actual (train)')
ax.plot(y_test.index, y_true,
        color=UNO_BLUE, lw=1.8, ls='--', label='Actual (test)')

# Model forecasts
ax.plot(y_test.index, y_hat_snv,   color=UNO_GRAY,  lw=1.2, ls=':',  label='Seasonal Naïve')
ax.plot(y_test.index, y_hat_trend, color=UNO_RED,   lw=1.5, label='Trend')
ax.plot(y_test.index, y_hat_seas,  color=UNO_GREEN, lw=1.5, label='Trend + Seasonal')
ax.plot(y_test.index, y_hat_ar,    color='#7C3AED', lw=1.5, label=f'AR({p_star})')

# 95% PI for trend model
ax.fill_between(y_test.index, pi_lower, pi_upper,
                color=UNO_RED, alpha=0.12, label='95% PI (Trend)')

# Forecast origin: the annotation sits at the current lower y-limit, so it is
# placed after every series has been drawn
ax.axvline(y_train.index[-1], color=UNO_GRAY, ls='--', lw=1, alpha=0.6)
ax.text(y_train.index[-1], ax.get_ylim()[0],
        '  Forecast origin', color=UNO_GRAY, fontsize=8, va='bottom')

# Name the dataset that was actually loaded, so the title stays correct on the
# AirPassengers fallback (the y-label below already branches the same way)
title_subject = 'US Retail Sales' if 'RSXFS' in series_name else 'International Air Passengers'
ax.set_title(f'Regression Forecasts vs. Actuals — {title_subject}',
             fontsize=12, fontweight='bold')
ax.set_ylabel('Sales (bn USD)' if 'RSXFS' in series_name else 'Passengers')
ax.legend(frameon=False, ncol=3, fontsize=8)
plt.tight_layout()
plt.savefig(FIG_DIR / 'lecture02_forecasts.png', dpi=150, bbox_inches='tight')
plt.show()
print('Figure saved to Figures/lecture02_forecasts.png')

In [None]:
# =============================================================================
# Section 9: Residual Diagnostics — ACF of Regression Residuals
# =============================================================================
# Check whether residuals are white noise (required for valid PIs).
# Significant ACF spikes at short lags indicate unexploited autocorrelation.

fig, axes = plt.subplots(1, 3, figsize=(13, 3.5))

# The same three residual series feed both the ACF panels and the text summary
residual_sets = (resid_trend, resid_seas, ar_model.resid.values)
panel_titles = (
    'Trend model residuals',
    'Trend + Seasonal residuals',
    f'AR({p_star}) residuals',
)

for ax, resid, title in zip(axes, residual_sets, panel_titles):
    n = len(resid)
    nlags = min(40, n // 4)            # at most 40 lags, and no more than n/4
    ci_bound = 1.96 / np.sqrt(n)       # approximate 95% band under white noise
    acf_vals = acf(resid, nlags=nlags, fft=True)
    lags = np.arange(nlags + 1)        # lag 0 is included in acf_vals

    ax.bar(lags, acf_vals, color=UNO_BLUE, alpha=0.7, width=0.6)
    for level in (ci_bound, -ci_bound):
        ax.axhline(level, color=UNO_RED, ls='--', lw=1)
    ax.axhline(0, color=UNO_GRAY, lw=0.8)
    ax.set_title(title, fontsize=10, fontweight='bold')
    ax.set_xlabel('Lag')
    ax.set_ylabel('ACF')
    ax.set_ylim(-0.6, 1.05)

plt.suptitle('Residual ACF: Are Errors White Noise?',
             fontsize=12, fontweight='bold', y=1.01)
plt.tight_layout()
plt.savefig(FIG_DIR / 'lecture02_residual_acf.png', dpi=150, bbox_inches='tight')
plt.show()

# Summary judgment: count lags 1-12 whose ACF falls outside the 95% band
summary_labels = ('Trend', 'Trend+Seasonal', f'AR({p_star})')
for resid, label in zip(residual_sets, summary_labels):
    n = len(resid)
    ci = 1.96 / np.sqrt(n)
    acf_v = acf(resid, nlags=12, fft=True)[1:]   # drop lag 0, which is always 1
    n_sig = np.sum(np.abs(acf_v) > ci)
    if n_sig > 0:
        verdict = 'AUTOCORRELATED (PIs invalid)'
    else:
        verdict = 'White noise OK'
    print(f'{label:20s}: {n_sig} significant spikes (lags 1-12) → {verdict}')