# Model Training and Evaluation: SARIMA & LightGBM Ensemble

This notebook fits a SARIMA model to each economic indicator, then uses the SARIMA forecasts as input features for LightGBM models that predict recession probability 1, 3, and 6 months ahead. It also evaluates each model and visualizes its predicted probabilities.

In [3]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
    --------------------------------------- 0.0/1.5 MB 1.3 MB/s eta 0:00:02
    --------------------------------------- 0.0/1.5 MB 1.3 MB/s eta 0:00:02
    --------------------------------------- 0.0/1.5 MB 1.3 MB/s eta 0:00:02
    --------------------------------------- 0.0/1.5 MB 1.3 MB/s eta 0:00:02
   -- ------------------------------------- 0.1/1.5 MB 403.5 kB/s eta 0:00:04
   -- ------------------------------------- 0.1/1.5 MB 403.5 kB/s eta 0:00:04
   --- ------------------------------------ 0.1/1.5 MB 344.8 kB/s eta 0:00:04
   ---- ----------------------------------- 0.2/1.5 MB 416.7 kB/s eta 0:00:04
   ---- ----------------------------------- 0.2/1.5 MB 416.7 kB/s eta 0:00:04
   ---- ----------------------------------- 0.2/1.5 MB 416.7 kB/s eta 0:00:04
   ---- ---


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\dulak\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.statespace.sarimax import SARIMAX
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report

# 2. Load Feature-Engineered Data
# The first CSV column becomes the index and is parsed as dates where possible.
df = pd.read_csv('../data/processed/feature_engineered_economic_indicators.csv', index_col=0, parse_dates=True)
print(f"Loaded feature-engineered data: {df.shape}")
print("\nAvailable columns:")
print(df.columns.tolist())

# Set target columns explicitly for this dataset
target_1m = '1_month_recession_probability'
target_3m = '3_month_recession_probability'
target_6m = '6_month_recession_probability'

targets = [target_1m, target_3m, target_6m]

# Fail fast when a target column is absent. Without this check the notebook would
# fit a SARIMA model for every indicator before hitting a bare KeyError on the
# first target lookup.
missing_targets = [name for name in targets if name not in df.columns]
if missing_targets:
    raise ValueError(
        f"Target columns not found in the loaded data: {missing_targets}. "
        f"Available columns: {df.columns.tolist()}. "
        "Set target_1m / target_3m / target_6m to existing column names."
    )

# Every numeric, non-target column is treated as an indicator to forecast.
indicator_cols = [col for col in df.select_dtypes(include=[np.number]).columns if col not in targets]

# 3. SARIMA Forecasting for Each Indicator
# Number of trailing rows held out of each SARIMA fit and filled with
# out-of-sample forecasts.
FORECAST_STEPS = 6

if len(df) <= FORECAST_STEPS:
    raise ValueError(
        f"Need more than {FORECAST_STEPS} rows to hold out a forecast window; got {len(df)}."
    )

# Collect one full-length array per indicator and build the frame once at the end,
# rather than inserting columns into a DataFrame one at a time.
sarima_columns = {}
for col in indicator_cols:
    print(f"Fitting SARIMA for {col}...")
    train = df[col].iloc[:-FORECAST_STEPS]
    model = SARIMAX(train, order=(1,1,1), seasonal_order=(0,1,1,12), enforce_stationarity=False, enforce_invertibility=False)
    results = model.fit(disp=False)
    forecast = results.get_forecast(steps=FORECAST_STEPS)
    # Rows covered by the fit get the in-sample one-step-ahead predictions; the
    # held-out tail gets the multi-step forecast. Previously only the tail was
    # populated and the rest was forward-filled, which cannot fill leading gaps,
    # so every row before the tail stayed NaN.
    # NOTE(review): the earliest fitted values are presumably dominated by the
    # state-space initialization under differencing; consider masking a burn-in
    # period. Confirm against the statsmodels docs.
    sarima_columns[col+'_sarima'] = np.concatenate([
        np.asarray(results.fittedvalues, dtype=float),
        np.asarray(forecast.predicted_mean, dtype=float),
    ])
sarima_forecasts = pd.DataFrame(sarima_columns, index=df.index)
print("SARIMA forecasts complete.")

# 4. Prepare LightGBM Data (using SARIMA forecasts as features)
# Features are a copy of the SARIMA forecast frame; each target is looked up on
# the same index so features and labels stay row-aligned.
X = sarima_forecasts.copy()
y_1m = df.loc[X.index, target_1m]
y_3m = df.loc[X.index, target_3m]
y_6m = df.loc[X.index, target_6m]

# 5. Train/Test Split
# Positional split with no shuffling: the first 80% of rows train, the rest test.
split = int(len(X)*0.8)
X_train = X.iloc[:split]
X_test = X.iloc[split:]
y_train_1m = y_1m.iloc[:split]
y_test_1m = y_1m.iloc[split:]
y_train_3m = y_3m.iloc[:split]
y_test_3m = y_3m.iloc[split:]
y_train_6m = y_6m.iloc[:split]
y_test_6m = y_6m.iloc[split:]

# 6. LightGBM Model Training and Evaluation
def train_eval_lgb(X_train, y_train, X_test, y_test, horizon):
    """Train a LightGBM binary classifier for one horizon, print metrics, and plot predictions.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``lgb.Dataset``.
    X_test, y_test : held-out features and labels. They are used both as the
        early-stopping validation set and for the reported metrics, so the
        reported numbers are optimistic.
    horizon : str
        Human-readable horizon label used in log messages and the plot title.

    Returns
    -------
    lgb.Booster
        The trained model.

    Notes
    -----
    The ``binary`` objective is used, which presumably requires 0/1 labels;
    confirm the target columns are binary indicators, not probabilities.
    """
    print(f"\nTraining LightGBM for {horizon} ahead...")
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    params = {'objective': 'binary', 'metric': 'auc', 'verbosity': -1}
    # LightGBM 4.x removed the `early_stopping_rounds` and `verbose_eval` keyword
    # arguments of `lgb.train`; the equivalent behavior is requested via callbacks.
    gbm = lgb.train(
        params,
        lgb_train,
        num_boost_round=100,
        valid_sets=[lgb_train, lgb_eval],
        callbacks=[
            lgb.early_stopping(stopping_rounds=10, verbose=False),
            lgb.log_evaluation(period=0),
        ],
    )
    y_pred = gbm.predict(X_test)
    # ROC AUC is undefined when the test labels contain a single class, and
    # roc_auc_score raises in that case. Report NaN rather than abort the run.
    if np.unique(y_test).size < 2:
        print("AUC is undefined: the test labels contain a single class.")
        auc = float('nan')
    else:
        auc = roc_auc_score(y_test, y_pred)
    y_pred_label = (y_pred > 0.5).astype(int)
    acc = accuracy_score(y_test, y_pred_label)
    print(f"AUC: {auc:.3f}, Accuracy: {acc:.3f}")
    print(classification_report(y_test, y_pred_label))
    # Explicit figure/axes interface instead of the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(6,4))
    ax.hist(y_pred, bins=30, alpha=0.7, label='Predicted Prob')
    ax.set_title(f'Predicted Recession Probability ({horizon})')
    ax.set_xlabel('Probability')
    ax.set_ylabel('Frequency')
    ax.legend()
    plt.show()
    return gbm

# Train one model per forecast horizon; all three share the same feature matrices
# and differ only in the target series and the display label.
model_1m = train_eval_lgb(
    X_train=X_train, y_train=y_train_1m, X_test=X_test, y_test=y_test_1m, horizon='1 month'
)
model_3m = train_eval_lgb(
    X_train=X_train, y_train=y_train_3m, X_test=X_test, y_test=y_test_3m, horizon='3 months'
)
model_6m = train_eval_lgb(
    X_train=X_train, y_train=y_train_6m, X_test=X_test, y_test=y_test_6m, horizon='6 months'
)

print("\nAll models trained and evaluated.")

Loaded feature-engineered data: (658, 13)

Available columns:
['value', 'realtime_end', 'date', 'value_lag1', 'value_lag3', 'value_lag6', 'value_lag12', 'value_rollmean3', 'value_rollstd3', 'value_rollmean6', 'value_rollstd6', 'value_rollmean12', 'value_rollstd12']


ValueError: Could not find all target columns. Found: 1m=None, 3m=None, 6m=None

In [None]:
# Troubleshooting: if the target column names set in the previous cell are not
# present in the loaded data, inspect the DataFrame here and set the names manually.
print("\nFirst few rows of the DataFrame:")
display(df.head())
# Example: uncomment the lines below, replace the placeholders with real column
# names, then re-run the training steps:
# target_1m = 'your_column_name_for_1m'
# target_3m = 'your_column_name_for_3m'
# target_6m = 'your_column_name_for_6m'