# Lab 09: Tree-Based Methods — Random Forests and XGBoost

**BSAD 8310: Business Forecasting | University of Nebraska at Omaha**

## Objectives

1. Build the lag-feature matrix for RSXFS retail sales (same as Lab 08)
2. Fit a Random Forest with TimeSeriesSplit hyperparameter tuning
3. Fit XGBoost with early stopping on the validation set
4. Visualize feature importance (impurity, permutation, XGBoost gain)
5. Compare RF and XGBoost against a SARIMA baseline fitted in this lab (the Elastic Net baseline comes from Lab 08 and is not refitted here)

## Packages Required
```
numpy, pandas, matplotlib, scikit-learn, xgboost, statsmodels
pandas_datareader (optional — FRED data)
```

In [None]:
# =============================================================================
# Section 1: Setup
# =============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import (
    TimeSeriesSplit, RandomizedSearchCV, cross_val_score
)
from sklearn.metrics import mean_squared_error

# xgboost is optional: record whether it imports so that later cells can
# skip the XGBoost section instead of failing.
try:
    import xgboost as xgb
except ImportError:
    print('xgboost not installed — XGBoost section will be skipped.')
    XGB_AVAILABLE = False
else:
    XGB_AVAILABLE = True

# Seed NumPy's global random number generator.
np.random.seed(42)

# UNO color palette (hex codes keyed by color name)
UNO = dict(
    blue='#005CA9',
    red='#E41C38',
    gray='#525252',
    green='#15803d',
    lightblue='#cce0f5',
    lightgray='#e5e5e5',
)

# Matplotlib defaults shared by every figure in the lab
plt.rcParams['figure.dpi'] = 150
plt.rcParams['axes.spines.top'] = False
plt.rcParams['axes.spines.right'] = False
plt.rcParams['font.size'] = 11
plt.rcParams['axes.titlesize'] = 13

# Directory that receives the saved figures; created if it does not exist.
import os
FIGURE_DIR = '../Figures'
os.makedirs(FIGURE_DIR, exist_ok=True)
print('Setup complete.')

In [None]:
# =============================================================================
# Section 2: Load Data and Build Feature Matrix
# =============================================================================
# Identical feature engineering to Lab 08 for fair comparison.

# --- Load data ---
# Preferred source: the RSXFS series from FRED via pandas_datareader, indexed
# by monthly periods. If anything in that path fails, fall back to the
# macrodata set bundled with statsmodels so the lab still runs.
try:
    import pandas_datareader.data as web
    raw = web.DataReader('RSXFS', 'fred',
                         start='1992-01-01', end='2024-12-01')
    y_all = raw['RSXFS'].dropna()
    y_all.index = pd.to_datetime(y_all.index).to_period('M')
    print(f'Loaded FRED RSXFS: {len(y_all)} monthly observations')
except Exception as exc:
    # Deliberately broad (missing package, network error, API change, ...),
    # but report the cause so the switch of data set is never silent.
    print(f'FRED download unavailable ({type(exc).__name__}: {exc}).')
    import statsmodels.api as sm
    macro = sm.datasets.macrodata.load_pandas().data
    macro.index = pd.period_range('1959Q1', periods=len(macro), freq='Q')
    # NOTE: this fallback is quarterly 'realgdp', whereas later cells use a
    # 12-period seasonal SARIMA and a retail-sales axis label; read those
    # results with that mismatch in mind.
    y_all = macro['realgdp']
    print('Loaded statsmodels macrodata fallback.')

# --- Feature engineering (leakage-free) ---
def make_features(y, n_lags=12, roll_windows=(3, 6, 12), add_calendar=True):
    """Build a lag / rolling-statistic / calendar feature matrix for a series.

    Every feature for period t uses values from t-1 or earlier: lags are
    ``y.shift(k)`` with k >= 1 and the rolling statistics are computed on the
    once-shifted series, so the target never enters its own features.

    Args:
        y: pandas Series holding the target values.
        n_lags: number of lag columns, named ``lag_1`` ... ``lag_{n_lags}``.
        roll_windows: window lengths; each one adds ``roll_mean_{w}`` and
            ``roll_std_{w}``.
        add_calendar: when True and the index exposes months (through
            ``to_timestamp()`` or a ``month`` attribute), add 0/1 dummy
            columns ``month_2`` ... ``month_12``; month 1 has no column.

    Returns:
        Tuple ``(X, target)``: the feature DataFrame and the matching target
        Series, after dropping every row that contains a NaN.
    """
    columns = {'y': y}
    for lag in range(1, n_lags + 1):
        columns[f'lag_{lag}'] = y.shift(lag)

    # Rolling statistics are taken over the series shifted by one period.
    previous = y.shift(1)
    for window in roll_windows:
        rolling = previous.rolling(window)
        columns[f'roll_mean_{window}'] = rolling.mean()
        columns[f'roll_std_{window}'] = rolling.std()

    frame = pd.DataFrame(columns)

    # Resolve the month number of each observation, if the index provides one.
    months = None
    if add_calendar:
        index = y.index
        if hasattr(index, 'to_timestamp'):
            months = index.to_timestamp().month
        elif hasattr(index, 'month'):
            months = index.month

    if months is not None:
        for month_no in range(2, 13):
            frame[f'month_{month_no}'] = (months == month_no).astype(int)

    # Leading rows lack a full lag/rolling history and are removed here.
    frame = frame.dropna()
    return frame.drop(columns=['y']), frame['y']

# Supervised-learning matrix: 12 lags plus 3/6/12-period rolling statistics.
X, y = make_features(y_all, n_lags=12, roll_windows=(3, 6, 12))

# --- Three-way split ---
# Chronological blocks: the last 15% of rows are the test set, the 15%
# before that the validation set, and everything earlier is training data.
n = len(y)
n_val = n_test = int(0.15 * n)
n_train = n - n_val - n_test
val_start, test_start = n_train, n_train + n_val

X_train, X_val, X_test = (
    X.iloc[:val_start], X.iloc[val_start:test_start], X.iloc[test_start:]
)
y_train, y_val, y_test = (
    y.iloc[:val_start], y.iloc[val_start:test_start], y.iloc[test_start:]
)

# Train and validation combined, for steps that tune by cross-validation.
X_trainval, y_trainval = X.iloc[:test_start], y.iloc[:test_start]

feat_names = X.columns.tolist()
print(f'Features: {len(feat_names)} | Train: {n_train} | Val: {n_val} | Test: {n_test}')

In [None]:
# =============================================================================
# Section 3: Random Forest — Hyperparameter Tuning
# =============================================================================

# Time-ordered cross-validation folds with no gap between train and test.
tscv = TimeSeriesSplit(n_splits=5, gap=0)

# Candidate values sampled by the randomized search.
rf_param_grid = dict(
    n_estimators=[200, 500],
    max_features=['sqrt', 0.33, 0.5],
    min_samples_leaf=[1, 3, 5],
    max_depth=[None, 10, 20],
)

print('Fitting Random Forest via RandomizedSearchCV (TimeSeriesSplit)...')
rf_base = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=rf_param_grid,
    n_iter=20,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    n_jobs=-1,
    random_state=42,
)
rf_search.fit(X_trainval, y_trainval)

best_rf_params = rf_search.best_params_
print(f'Best RF params: {best_rf_params}')
# The scorer is negated RMSE, so flip the sign for reporting.
print(f'CV RMSE:        {-rf_search.best_score_:.1f}')

# best_estimator_ is the search's refit on all of X_trainval with the
# winning parameters; it produces the test-set predictions.
rf_model = rf_search.best_estimator_
rf_test_pred = rf_model.predict(X_test)
y_pred_rf = pd.Series(rf_test_pred, index=y_test.index)

rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print(f'Test RMSE (RF): {rmse_rf:.1f}')

In [None]:
# =============================================================================
# Section 4: XGBoost — Early Stopping on Validation Set
# =============================================================================

if XGB_AVAILABLE:
    # DMatrix containers for each split (features plus labels).
    dtrain = xgb.DMatrix(X_train,    label=y_train)
    dval   = xgb.DMatrix(X_val,      label=y_val)
    dtest  = xgb.DMatrix(X_test,     label=y_test)
    dtrainval = xgb.DMatrix(X_trainval, label=y_trainval)

    xgb_params = {
        'learning_rate':    0.05,
        'max_depth':        4,
        'subsample':        0.8,
        'colsample_bytree': 0.8,
        'reg_lambda':       1.0,
        'objective':        'reg:squarederror',
        'eval_metric':      'rmse',
        'seed':             42,
    }

    # Stage 1: train on the training split only and monitor validation RMSE;
    # stop once it has not improved for 50 consecutive rounds.
    print('Training XGBoost with early stopping...')
    xgb_model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=2000,
        evals=[(dval, 'val')],
        early_stopping_rounds=50,
        verbose_eval=100
    )
    # best_iteration is the 0-based index of the best boosting round, so the
    # number of rounds to keep is best_iteration + 1. Passing the raw index
    # as num_boost_round would drop the best tree (and build no trees at all
    # when the first round is the best one).
    best_rounds = xgb_model.best_iteration + 1
    print(f'Best round: {best_rounds}')

    # Stage 2: refit on train+validation with the selected number of rounds.
    xgb_final = xgb.train(
        xgb_params, dtrainval,
        num_boost_round=best_rounds
    )
    y_pred_xgb = pd.Series(
        xgb_final.predict(dtest), index=y_test.index
    )
    rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
    print(f'Test RMSE (XGBoost): {rmse_xgb:.1f}')
else:
    # Placeholders so later cells can test for a missing XGBoost forecast.
    print('Skipping XGBoost (not installed).')
    y_pred_xgb = None
    rmse_xgb   = float('nan')

In [None]:
# =============================================================================
# Section 5: Feature Importance — Three Methods
# =============================================================================

n_top = 12  # show top N features

# --- 1. Impurity-based importance (RF default) ---
imp_df = pd.DataFrame({
    'feature': feat_names,
    'impurity': rf_model.feature_importances_
}).sort_values('impurity', ascending=False).head(n_top)

# --- 2. Permutation importance (on val set) ---
# NOTE: rf_model was fitted on X_trainval, which contains X_val, so these
# scores are measured on rows the model has already seen.
perm_result = permutation_importance(
    rf_model, X_val, y_val,
    n_repeats=10, random_state=42, scoring='neg_root_mean_squared_error'
)
# importances_mean is (baseline score - permuted score). With a negated-RMSE
# scorer that difference is already the RMSE increase caused by shuffling,
# so it must NOT be negated again; doing so would rank the least important
# features first.
perm_df = pd.DataFrame({
    'feature': feat_names,
    'perm_mean': perm_result.importances_mean,
    'perm_std':  perm_result.importances_std,
}).sort_values('perm_mean', ascending=False).head(n_top)

# --- 3. XGBoost gain importance ---
if XGB_AVAILABLE:
    # get_score() only lists features that appear in a split. Explicit
    # column names keep the frame sortable even when that dict is empty.
    xgb_imp = xgb_final.get_score(importance_type='gain')
    xgb_imp_df = pd.DataFrame(
        list(xgb_imp.items()), columns=['feature', 'gain']
    ).sort_values('gain', ascending=False).head(n_top)

# --- Plot ---
n_panels = 3 if XGB_AVAILABLE else 2
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 5))

# Impurity (series reversed so the largest bar is drawn at the top)
ax = axes[0]
ax.barh(imp_df['feature'][::-1], imp_df['impurity'][::-1],
        color=UNO['blue'], edgecolor='white')
ax.set_title('RF: Impurity Importance')
ax.set_xlabel('Mean decrease in impurity')

# Permutation (error bars: standard deviation over the repeats)
ax = axes[1]
ax.barh(perm_df['feature'][::-1], perm_df['perm_mean'][::-1],
        xerr=perm_df['perm_std'][::-1],
        color=UNO['green'], edgecolor='white', ecolor=UNO['gray'], capsize=3)
ax.set_title('RF: Permutation Importance (val)')
ax.set_xlabel('Mean RMSE increase when shuffled')

# XGBoost gain
if XGB_AVAILABLE:
    ax = axes[2]
    ax.barh(xgb_imp_df['feature'][::-1], xgb_imp_df['gain'][::-1],
            color=UNO['red'], edgecolor='white')
    ax.set_title('XGBoost: Gain Importance')
    ax.set_xlabel('Mean gain per split')

plt.tight_layout()
plt.savefig(f'{FIGURE_DIR}/lecture09_feature_importance.png',
            dpi=150, bbox_inches='tight')
plt.show()
print('Saved lecture09_feature_importance.png')

In [None]:
# =============================================================================
# Section 6: OOB Error vs. n_estimators
# =============================================================================
# Show how OOB error stabilizes as number of trees grows.

# Forest sizes to evaluate.
n_tree_range = [10, 25, 50, 100, 200, 350, 500]

# Settings shared by every forest in the sweep: the tuned hyperparameters
# (with defaults if a key is absent) plus out-of-bag scoring.
rf_fixed = dict(
    max_features=best_rf_params.get('max_features', 0.33),
    min_samples_leaf=best_rf_params.get('min_samples_leaf', 1),
    max_depth=best_rf_params.get('max_depth', None),
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)

# One fresh forest per size; record the RMSE of its out-of-bag predictions.
oob_errors = []
for n_trees in n_tree_range:
    rf_tmp = RandomForestRegressor(n_estimators=n_trees, **rf_fixed)
    rf_tmp.fit(X_trainval, y_trainval)
    oob_errors.append(
        np.sqrt(mean_squared_error(y_trainval, rf_tmp.oob_prediction_))
    )

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(n_tree_range, oob_errors,
        marker='o', linestyle='-', color=UNO['blue'], lw=2, ms=6)
ax.set(
    xlabel='Number of trees (n_estimators)',
    ylabel='OOB RMSE',
    title='OOB Error Stabilizes with More Trees',
)
# Reference line at the largest forest size in the sweep.
ax.axvline(500, color=UNO['gray'], ls='--', lw=1.2, label='n=500 (recommended)')
ax.legend()
plt.tight_layout()
plt.savefig(f'{FIGURE_DIR}/lecture09_oob_curve.png',
            dpi=150, bbox_inches='tight')
plt.show()
print('Saved lecture09_oob_curve.png')

In [None]:
# =============================================================================
# Section 7: SARIMA Baseline
# =============================================================================
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Fit SARIMA(1,1,1)(1,1,1)_12 on train+validation and forecast len(y_test)
# steps beyond the end of that sample. If fitting or forecasting raises,
# fall back to a constant forecast equal to the train+validation mean and
# record the failure in sarima_ok.
sarima_spec = dict(
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
    enforce_stationarity=False,
    enforce_invertibility=False,
)

try:
    sarima_mod = SARIMAX(y_trainval, **sarima_spec)
    sarima_res = sarima_mod.fit(disp=False)
    y_pred_sarima = sarima_res.forecast(len(y_test))
except Exception as e:
    print(f'SARIMA failed: {e}')
    sarima_ok = False
    y_pred_sarima = pd.Series(
        np.full(len(y_test), y_trainval.mean()), index=y_test.index
    )
else:
    sarima_ok = True
    print('SARIMA baseline fit complete.')

In [None]:
# =============================================================================
# Section 8: Model Comparison
# =============================================================================

def rmse(actual, predicted):
    """Return the root mean squared error between two sequences.

    Both inputs are converted to arrays. If their lengths differ, only the
    leading elements that both have in common are compared.
    """
    actual_arr = np.asarray(actual)
    pred_arr = np.asarray(predicted)
    overlap = min(len(actual_arr), len(pred_arr))
    squared_error = mean_squared_error(actual_arr[:overlap], pred_arr[:overlap])
    return np.sqrt(squared_error)

def mae(actual, predicted):
    """Return the mean absolute error between two sequences.

    Both inputs are converted to arrays. If their lengths differ, only the
    leading elements that both have in common are compared.
    """
    actual_arr = np.asarray(actual)
    pred_arr = np.asarray(predicted)
    overlap = min(len(actual_arr), len(pred_arr))
    abs_errors = np.abs(actual_arr[:overlap] - pred_arr[:overlap])
    return np.mean(abs_errors)

# When the SARIMA fit failed, y_pred_sarima holds the constant mean fallback;
# label that row honestly instead of presenting it as a SARIMA result.
sarima_label = (
    'SARIMA(1,1,1)(1,1,1)_12' if sarima_ok
    else 'Mean forecast (SARIMA failed)'
)

# One (model, RMSE, MAE) row per available forecast, scored on the test set.
rows = [
    (sarima_label,
     rmse(y_test, y_pred_sarima), mae(y_test, y_pred_sarima)),
    ('Random Forest',
     rmse(y_test, y_pred_rf), mae(y_test, y_pred_rf)),
]
# The XGBoost row is added only when that section actually ran.
if XGB_AVAILABLE and y_pred_xgb is not None:
    rows.append((
        'XGBoost (early stop)',
        rmse(y_test, y_pred_xgb), mae(y_test, y_pred_xgb)
    ))

results = pd.DataFrame(rows, columns=['Model', 'RMSE', 'MAE'])
results['RMSE'] = results['RMSE'].round(1)
results['MAE']  = results['MAE'].round(1)
print('Test-set results:')
print(results.to_string(index=False))

In [None]:
# =============================================================================
# Section 9: Forecast Comparison Plot
# =============================================================================

fig, ax = plt.subplots(figsize=(11, 4))

# Period labels are plotted as strings; the test-set labels are reused below.
test_labels = y_test.index.astype(str)
xgb_ready = XGB_AVAILABLE and y_pred_xgb is not None

# History context: the last 24 observations before the test window
history = y_trainval.iloc[-24:]
ax.plot(history.index.astype(str), history.values,
        color=UNO['lightgray'], lw=1.5, label='History')

# Actuals (drawn above the forecasts via zorder)
ax.plot(test_labels, y_test.values,
        color='black', lw=2, label='Actual', zorder=5)

# Baseline: shown only when the SARIMA fit succeeded
if sarima_ok:
    sarima_vals = np.asarray(y_pred_sarima)[:len(y_test)]
    ax.plot(test_labels[:len(sarima_vals)], sarima_vals,
            color=UNO['gray'], lw=1.5, ls='--', label='SARIMA')

# RF
ax.plot(test_labels, y_pred_rf.values,
        color=UNO['blue'], lw=2, ls='-.', label='Random Forest')

# XGBoost
if xgb_ready:
    ax.plot(test_labels, y_pred_xgb.values,
            color=UNO['red'], lw=2, label='XGBoost')

ax.set(
    title='Forecast Comparison: SARIMA vs. Tree-Based Models (Test Set)',
    xlabel='Period',
    ylabel='Retail Sales (Millions USD)',
)
ax.legend(loc='upper left', fontsize=9)
ax.xaxis.set_major_locator(mticker.MaxNLocator(8))
plt.xticks(rotation=30)

# RMSE annotations, stacked downward from the top-right corner of the axes
annot_models = [('RF', UNO['blue'], y_pred_rf)]
if xgb_ready:
    annot_models += [('XGB', UNO['red'], y_pred_xgb)]

for row, (label, color, pred) in enumerate(annot_models):
    score = rmse(y_test, pred)
    ax.annotate(f'{label}: RMSE={score:,.0f}',
                xy=(0.99, 0.97 - row * 0.09),
                xycoords='axes fraction',
                ha='right', va='top', fontsize=9, color=color)

plt.tight_layout()
plt.savefig(f'{FIGURE_DIR}/lecture09_forecast_comparison.png',
            dpi=150, bbox_inches='tight')
plt.show()
print('Saved lecture09_forecast_comparison.png')

print('\n=== Final Results ===')
print(results.to_string(index=False))