In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.statespace.varmax import VARMAX
from statsmodels.tsa.stattools import adfuller
import optuna
import warnings
# Silence every warning category for the rest of the session. Note that this
# is a blanket filter: it hides all warnings raised by any library used below.
warnings.simplefilter('ignore')

# ---------- Configuration ----------
# Column names of the macro-financial series that can be forecast.
financial_indicators = [
    '1_year_rate',
    '3_months_rate',
    '6_months_rate',
    'CPI',
    'INDPRO',
    '10_year_rate',
    'share_price',
    'unemployment_rate',
    'PPI',
    'OECD_CLI_index',
    'CSI_index',
    'gdp_per_capita',
]

# Recession-probability columns; prepare_varmax_data never uses these as
# exogenous features.
recession_targets = [
    'recession_probability',
    '1_month_recession_probability',
    '3_month_recession_probability',
    '6_month_recession_probability',
]

# ---------- Helper functions ----------

def check_stationarity(series):
    """Tell whether ``series`` passes the Augmented Dickey-Fuller test at 5%.

    Missing values are dropped before the test is run.

    Returns:
        True when the p-value (second element of the ``adfuller`` result) is
        below 0.05, False otherwise.
    """
    adf_output = adfuller(series.dropna())
    p_value = adf_output[1]
    return p_value < 0.05

def make_stationary(series, max_diff=2):
    """Difference ``series`` until it looks stationary or ``max_diff`` is hit.

    The series is tested as-is first, then after each successive first
    difference. The search stops as soon as one of these holds:

    * fewer than 20 observations remain (too few to test, so the current
      series is returned untested),
    * the remaining values are constant (or all missing),
    * the Augmented Dickey-Fuller p-value is below 0.05,
    * ``max_diff`` differences have been applied.

    Args:
        series: pandas Series to transform.
        max_diff: maximum number of differences to apply. Values below zero
            behave like zero differencing without testing.

    Returns:
        Tuple ``(transformed_series, diff_order)`` where ``diff_order`` is the
        number of differences that were applied.
    """
    # Start from the input itself so the result is defined even when the loop
    # body never runs (max_diff < 0).
    current_series = series
    diff_order = 0
    for d in range(max_diff + 1):
        if d > 0:
            current_series = current_series.diff().dropna()
            diff_order = d
        if len(current_series) < 20:
            break
        tested = current_series.dropna()
        # adfuller rejects constant input (e.g. the difference of a linear
        # trend); a constant series needs no further differencing anyway.
        if tested.nunique() <= 1:
            break
        if adfuller(tested)[1] < 0.05:
            break
    return current_series, diff_order

def _fill_missing(frame):
    """Return a copy of ``frame`` with infinities and gaps filled.

    Infinities become NaN, then gaps are forward-filled and back-filled.
    """
    filled = frame.replace([np.inf, -np.inf], np.nan).ffill().bfill()
    for col in filled.columns:
        if filled[col].isnull().any():
            # Last-resort median fill. Assigning the result back (instead of a
            # chained in-place fillna) makes sure the frame is really updated.
            # A column with no valid value at all has a NaN median and
            # therefore stays NaN.
            filled[col] = filled[col].fillna(filled[col].median())
    return filled


def prepare_varmax_data(train_df, test_df, target_indicators, max_exog_features=10):
    """Split train/test frames into date-indexed target and exogenous frames.

    Exogenous candidates are all ``train_df`` columns except ``'date'``, the
    module-level ``recession_targets`` and ``target_indicators``. When there
    are more than ``max_exog_features`` candidates, the ones with the largest
    summed absolute correlation with the targets (computed on ``train_df``)
    are kept.

    Args:
        train_df: training DataFrame containing a ``'date'`` column.
        test_df: test DataFrame with the same columns.
        target_indicators: list of column names to forecast.
        max_exog_features: maximum number of exogenous columns to keep.

    Returns:
        Tuple ``(train_targets, test_targets, train_exog, test_exog,
        available_exog)``; the four frames are indexed by ``'date'`` and have
        infinities/NaNs filled, ``available_exog`` is the list of selected
        exogenous column names. The input frames are not modified.
    """
    features_to_exclude = set(['date'] + recession_targets + target_indicators)
    available_exog = [c for c in train_df.columns if c not in features_to_exclude]

    if len(available_exog) > max_exog_features:
        def _abs_corr(feature, target):
            # Undefined correlations (e.g. a constant column) count as zero.
            corr = train_df[feature].corr(train_df[target])
            return 0 if np.isnan(corr) else abs(corr)

        correlations = {
            feature: sum(_abs_corr(feature, t) for t in target_indicators)
            for feature in available_exog
        }
        # sorted() is stable, so ties keep their original column order.
        available_exog = sorted(correlations, key=correlations.get, reverse=True)[:max_exog_features]

    # Prepare targets and exog, filling missing values on fresh copies
    train_targets = _fill_missing(train_df[['date'] + target_indicators].set_index('date'))
    test_targets = _fill_missing(test_df[['date'] + target_indicators].set_index('date'))
    train_exog = _fill_missing(train_df[['date'] + available_exog].set_index('date'))
    test_exog = _fill_missing(test_df[['date'] + available_exog].set_index('date'))

    return train_targets, test_targets, train_exog, test_exog, available_exog

def check_and_make_stationary(data):
    """Apply ``make_stationary`` to every column of ``data``.

    Returns:
        Tuple ``(stationary_data, diff_orders)``: a copy of ``data`` holding
        the transformed columns, restricted to the rows where every column
        has a value, and a dict mapping each column name to the differencing
        order that was applied to it.
    """
    diff_orders = {}
    stationary_data = data.copy()
    for name in data.columns:
        transformed, order = make_stationary(data[name])
        # Blank the column first so rows removed by differencing remain NaN
        # and get discarded by the final dropna().
        stationary_data[name] = np.nan
        stationary_data.loc[transformed.index, name] = transformed
        diff_orders[name] = order
    return stationary_data.dropna(), diff_orders

def make_varmax_objective(train_targets, train_exog, val_targets, val_exog):
    """Build an Optuna objective that scores a VARMAX configuration.

    Args:
        train_targets: endogenous training DataFrame.
        train_exog: exogenous training DataFrame.
        val_targets: endogenous validation DataFrame; one forecast step is
            produced per row.
        val_exog: exogenous validation DataFrame covering the same rows.

    Returns:
        A function ``objective(trial)`` that samples ``p`` (1-4), ``q`` (0-2)
        and ``trend``, fits VARMAX on the training data and returns the mean
        per-column RMSE of the validation forecast. Any exception raised
        while fitting or scoring yields ``inf`` so the study can continue;
        the exception text is stored in the trial's ``error`` user attribute.
    """
    def objective(trial):
        order = (trial.suggest_int('p', 1, 4), trial.suggest_int('q', 0, 2))
        trend = trial.suggest_categorical('trend', ['n', 'c', 't', 'ct'])
        try:
            model = VARMAX(
                endog=train_targets,
                exog=train_exog,
                order=order,
                trend=trend,
                enforce_stationarity=False,
                enforce_invertibility=False
            )
            fitted = model.fit(disp=False, maxiter=100, method='lbfgs')
            forecast = fitted.forecast(steps=len(val_targets), exog=val_exog.values)
            # The forecast may come back as a pandas object or an ndarray
            # depending on the input type; `forecast[:, i]` is only valid for
            # arrays, so normalise before slicing by column position.
            predicted = np.asarray(forecast)
            rmse_sum = sum(np.sqrt(mean_squared_error(val_targets[col], predicted[:, i]))
                           for i, col in enumerate(val_targets.columns))
            return rmse_sum / len(val_targets.columns)
        except Exception as exc:
            # Best effort: a failed configuration gets the worst score, but
            # the reason is kept on the trial instead of being lost.
            trial.set_user_attr('error', repr(exc))
            return float('inf')
    return objective

def fit_varmax_model(train_targets, test_targets, train_exog, test_exog, n_trials=20):
    """Tune, fit and evaluate a VARMAX model.

    The last ``val_size`` training rows (at most 12, at most the test length,
    at most a quarter of the training rows) are held out to tune ``p``, ``q``
    and ``trend`` with Optuna. The best configuration is then refit on the
    full training data and used to forecast ``len(test_targets)`` steps.

    Args:
        train_targets: endogenous training DataFrame.
        test_targets: endogenous test DataFrame (same columns).
        train_exog: exogenous training DataFrame.
        test_exog: exogenous test DataFrame, one row per forecast step.
        n_trials: number of Optuna trials.

    Returns:
        Tuple ``(fitted_model, results)`` where ``results`` maps each target
        column to a dict with keys ``forecast``, ``actual``, ``mae``,
        ``rmse`` and ``mape`` (percent; ``inf`` when every actual is zero).

    Raises:
        ValueError: if the inputs are too small to hold out at least one
            validation row.
    """
    val_size = min(12, len(test_targets), len(train_targets) // 4)
    if val_size < 1:
        # With val_size == 0, `iloc[:-0]` is empty and `iloc[-0:]` is the whole
        # frame, which would silently tune on nothing and validate on the
        # training data itself.
        raise ValueError(
            "fit_varmax_model needs at least 4 training rows and 1 test row to "
            f"hold out a validation window (got {len(train_targets)} training "
            f"and {len(test_targets)} test rows)."
        )
    train_fit, train_exog_fit = train_targets.iloc[:-val_size], train_exog.iloc[:-val_size]
    val_targets, val_exog = train_targets.iloc[-val_size:], train_exog.iloc[-val_size:]

    study = optuna.create_study(direction='minimize')
    study.optimize(make_varmax_objective(train_fit, train_exog_fit, val_targets, val_exog), n_trials=n_trials)
    best_order = (study.best_params['p'], study.best_params['q'])
    best_trend = study.best_params['trend']

    final_model = VARMAX(endog=train_targets, exog=train_exog, order=best_order, trend=best_trend,
                         enforce_stationarity=False, enforce_invertibility=False)
    fitted_model = final_model.fit(disp=False, maxiter=200, method='lbfgs')

    # The forecast may be a pandas object or an ndarray depending on the input
    # type; convert so columns can be taken by position below.
    forecast = np.asarray(fitted_model.forecast(steps=len(test_targets), exog=test_exog.values))

    results = {}
    for i, col in enumerate(train_targets.columns):
        y_true = test_targets[col].values
        y_pred = forecast[:, i]
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        # MAPE is undefined where the actual value is zero, so skip those rows.
        mask = y_true != 0
        mape = np.mean(np.abs((y_true[mask]-y_pred[mask])/y_true[mask]))*100 if mask.any() else np.inf
        results[col] = {'forecast': y_pred, 'actual': y_true, 'mae': mae, 'rmse': rmse, 'mape': mape}
    return fitted_model, results

def invert_differencing(forecast_df, last_train_values, diff_orders):
    """Integrate differenced forecasts back to the original level scale.

    Args:
        forecast_df: DataFrame of forecasts on the differenced scale, one
            column per target, rows in forecast order.
        last_train_values: either a Series holding the last training level of
            each column, or a DataFrame of training history (levels, oldest
            row first). Inverting a d-th order difference needs the last
            training value of every lower-order difference; these can only be
            derived from history. With a Series, the unknown lower-order
            differences are taken as 0, which is exact for d <= 1 only.
        diff_orders: dict mapping column name to the differencing order that
            was applied; missing columns are treated as order 0.

    Returns:
        A new DataFrame shaped like ``forecast_df`` with every differenced
        column cumulatively summed back to levels.
    """
    inv_df = forecast_df.copy()
    has_history = isinstance(last_train_values, pd.DataFrame)
    for col in forecast_df.columns:
        d = diff_orders.get(col, 0)
        if d <= 0:
            continue
        if has_history:
            # seeds[k] is the last training value of the k-th difference
            # (k = 0 is the level itself).
            seeds = []
            history = last_train_values[col]
            for _ in range(d):
                seeds.append(history.iloc[-1] if len(history) else 0.0)
                history = history.diff().dropna()
        else:
            seeds = [last_train_values[col]] + [0.0] * (d - 1)
        restored = inv_df[col]
        # Undo the highest-order difference first, finishing with the level.
        for seed in reversed(seeds):
            restored = restored.cumsum() + seed
        inv_df[col] = restored
    return inv_df

# ---------- Main Pipeline ----------
def _difference_with_history(train_frame, test_frame, diff_orders):
    """Difference the test columns, seeding the first rows from the training tail."""
    combined = pd.concat([train_frame, test_frame])
    for col, d in diff_orders.items():
        for _ in range(d):
            combined[col] = combined[col].diff()
    # Keep only the test rows; none are lost because their predecessors come
    # from the end of the training frame.
    return combined.iloc[len(train_frame):]


def run_varmax_pipeline(train_df, test_df, target_indicators=None, max_exog_features=10,
                        n_trials=20, save_model=True, save_plots=True):
    """Run the full VARMAX workflow: prepare, difference, tune/fit, forecast, plot.

    Args:
        train_df: training DataFrame with a ``'date'`` column.
        test_df: test DataFrame with the same columns.
        target_indicators: columns to forecast; defaults to CPI,
            unemployment_rate, 1_year_rate, share_price and INDPRO.
        max_exog_features: maximum number of exogenous columns to use.
        n_trials: number of Optuna trials for hyper-parameter tuning.
        save_model: pickle the fitted model to ``varmax_models/`` when True.
        save_plots: save one PNG per target to ``varmax_plots/`` when True.

    Returns:
        Tuple ``(model, results, forecast_orig)``. ``results`` holds the
        per-target forecasts, actuals and error metrics on the differenced
        (stationary) scale as produced by ``fit_varmax_model``;
        ``forecast_orig`` is a DataFrame of forecasts converted back to the
        original level scale, one row per test observation.
    """
    if target_indicators is None:
        target_indicators = ['CPI', 'unemployment_rate', '1_year_rate', 'share_price', 'INDPRO']

    train_targets, test_targets, train_exog, test_exog, _exog_features = prepare_varmax_data(
        train_df, test_df, target_indicators, max_exog_features
    )

    train_targets_stat, target_diff_orders = check_and_make_stationary(train_targets)
    train_exog_stat, exog_diff_orders = check_and_make_stationary(train_exog)

    # Targets and exogenous features are differenced independently, so the two
    # frames can lose a different number of leading rows. Differencing only
    # removes rows at the start, so keeping the common tail realigns them.
    n_aligned = min(len(train_targets_stat), len(train_exog_stat))
    train_targets_stat = train_targets_stat.tail(n_aligned)
    train_exog_stat = train_exog_stat.tail(n_aligned)

    # Apply the same differencing to the test data. The training tail supplies
    # the preceding values, so every test row is kept and forecast step k
    # lines up with test row k.
    test_targets_stat = _difference_with_history(train_targets, test_targets, target_diff_orders)
    test_exog_stat = _difference_with_history(train_exog, test_exog, exog_diff_orders)

    model, results = fit_varmax_model(train_targets_stat, test_targets_stat, train_exog_stat, test_exog_stat, n_trials)

    # Invert differencing for original scale; the full training history is
    # passed so higher-order differences are seeded correctly.
    forecast_df = pd.DataFrame({col: np.asarray(results[col]['forecast']) for col in target_indicators})
    forecast_orig = invert_differencing(forecast_df, train_targets, target_diff_orders)

    # Save model
    if save_model:
        os.makedirs('varmax_models', exist_ok=True)
        with open('varmax_models/varmax_model.pkl', 'wb') as f:
            pickle.dump(model, f)
        print("Saved model: varmax_models/varmax_model.pkl")

    # Plot results on the original scale: observed test levels vs. the
    # integrated forecast.
    if save_plots:
        os.makedirs('varmax_plots', exist_ok=True)
    for col in target_indicators:
        fig, ax = plt.subplots(figsize=(8, 4))
        ax.plot(test_targets.index, test_targets[col].values, label='Actual', color='green')
        ax.plot(test_targets.index, forecast_orig[col].values, label='Forecast', color='red')
        ax.set_title(f"{col} Forecast")
        ax.legend()
        fig.tight_layout()
        if save_plots:
            fig.savefig(f'varmax_plots/{col}_forecast.png')
            print(f"Saved plot: varmax_plots/{col}_forecast.png")
        plt.show()
        # Release the figure so looping over many targets does not pile up
        # open figures.
        plt.close(fig)

    return model, results, forecast_orig


In [2]:
# Load the train and test splits of the feature-selected recession dataset.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/fix/feature_selected_recession_train.csv',
        '../data/fix/feature_selected_recession_test.csv',
    )
)

In [3]:
# Forecast every series in `financial_indicators`, the list declared once in
# the configuration section of the first cell. It is reused here rather than
# re-declared, so the two copies cannot drift apart.
#
# NOTE: the recorded run of this cell was interrupted by hand during the very
# first Optuna trial, more than six hours after the study was created. For a
# quick check, pass a shorter `target_indicators` list and/or a smaller
# `n_trials`.

# Run the full VARMAX pipeline
model, results, forecast_orig = run_varmax_pipeline(
    train_df=train_df,
    test_df=test_df,
    target_indicators=financial_indicators,
    max_exog_features=10,   # Maximum number of exogenous features
    n_trials=20,            # Number of Optuna trials for hyperparameter tuning
    save_model=True,        # Save model as .pkl
    save_plots=True         # Save plots to varmax_plots/
)

# Check results for each indicator
for ind in financial_indicators:
    print(f"{ind}: MAE={results[ind]['mae']:.4f}, RMSE={results[ind]['rmse']:.4f}, MAPE={results[ind]['mape']:.2f}%")

# Quick view of forecast
print(forecast_orig.head())

[I 2025-10-12 10:42:05,022] A new study created in memory with name: no-name-ca95a2cd-2fb7-4ba0-aac6-66c5b192f6ee
[W 2025-10-12 17:04:46,027] Trial 0 failed with parameters: {'p': 2, 'q': 0, 'trend': 't'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/donaldaadithiyan/Personal learn dev/RecessionRadar/venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/vx/lz_s92194473v0lcw_034qq40000gn/T/ipykernel_67672/1264124305.py", line 101, in objective
    fitted = model.fit(disp=False, maxiter=100, method='lbfgs')
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/donaldaadithiyan/Personal learn dev/RecessionRadar/venv/lib/python3.12/site-packages/statsmodels/tsa/statespace/mlemodel.py", line 704, in fit
    mlefit = super(MLEModel, self).fit(start_params, method=method,
             ^^^^^^^^

KeyboardInterrupt: 