# Lag Selection for Bayesian VAR Models Using BIC

This notebook implements a systematic approach to select the optimal number of lags for Bayesian Vector Autoregression (BVAR) models. The lag selection process is performed on expanding splits of the data for three regions: Switzerland (CH), the Euro Area (EU), and the United States (US).

The Bayesian Information Criterion (BIC) is used to evaluate up to 12 lags for each split, and the lag with the lowest BIC value is chosen as the optimal lag.

## 1. Setup and Configuration

In [None]:
import sys
import pandas as pd
import numpy as np
from pathlib import Path

# --- Path and Import Setup ---
# Assumes the notebook is in `notebooks/` and the project root is one level up.
# NOTE: "../" is resolved against the kernel's current working directory, so the
# notebook must be started from its own folder for ROOT_DIR to be correct.
ROOT_DIR = Path("../").resolve()
# Make the project root importable; this has to run before the `src.*` imports below.
sys.path.append(str(ROOT_DIR))

# Import the BVAR analysis and helper functions
# NOTE(review): `bvar_analyt` is not referenced in the cells visible here;
# presumably it is kept for interactive use — confirm before removing.
from src.bvar_utils.bvar_analyt import bvar_analyt
from src.bvar_utils.helpers import preprocess_multivar, calculate_bic

# --- Configuration ---
# Paths are relative to the project root
# Directory the per-region `<region>_data_transformed_win.csv` files are read from.
DATA_DIR = ROOT_DIR / "data/processed"

# Model Parameters
REGIONS = ["ch", "eu", "us"]  # Switzerland, Euro Area, United States
MAX_LAGS = 12  # upper bound on the lag order evaluated per split

# Variables fed into the model for each region, keyed by region code.
# One variable per line so additions and removals show up cleanly in diffs.
VARIABLE_SETS = dict(
    ch=[
        "cpi_total_yoy_t",
        "cpi_goods_cat_goods_ind_t",
        "cpi_goods_cat_services_ind_t",
        "cpi_housing_energy_ind_t",
        "cpi_food_nonalcoholic_beverages_ind_t",
        "cpi_transport_ind_t",
        "cpi_health_ind_t",
        "cpi_clothing_footwear_ind_t",
        "cpi_alcoholic_beverages_tobacco_ind_t",
        "cpi_household_furniture_furnishings_routine_maintenance_ind_t",
        "cpi_restaurants_hotels_ind_t",
        "cpi_recreation_culture_ind_t",
        "cpi_communications_ind_t",
        "cpi_education_ind_t",
        "mon_stat_mon_agg_m0_total_chf_t",
        "ppi_total_base_month_december_2020_ind_t",
        "ipi_total_base_month_december_2020_ind_t",
        "oilpricex_t",
    ],
    eu=[
        "hcpi_yoy_t",
        "irt3m_eacc_t",
        "irt6m_eacc_t",
        "ltirt_eacc_t",
        "ppicag_ea_t",
        "ppicog_ea_t",
        "ppindcog_ea_t",
        "ppidcog_ea_t",
        "ppiing_ea_t",
        "ppinrg_ea_t",
        "hicpnef_ea_t",
        "hicpg_ea_t",
        "hicpin_ea_t",
        "hicpsv_ea_t",
        "hicpng_ea_t",
        "curr_eacc_t",
        "m2_eacc_t",
        "m1_eacc_t",
        "oilpricex_t",
    ],
    us=[
        "cpi_all_yoy_t",
        "m1sl_t",
        "m2sl_t",
        "m2real_t",
        "busloans_t",
        "fedfunds_t",
        "tb3ms_t",
        "tb6ms_t",
        "gs1_t",
        "gs5_t",
        "gs10_t",
        "ppicmm_t",
        "oilpricex_t",
        "cpiappsl_t",
        "cpitrnsl_t",
        "cpimedsl_t",
        "cusr0000sac_t",
        "cusr0000sad_t",
        "cusr0000sas_t",
        "pcepi_t",
    ],
)

## 2. Data Loading and Preprocessing

In [None]:
# Read each region's winsorized CSV from DATA_DIR, pass it through
# `preprocess_multivar` with that region's variable list, and keep the result
# in `multivar_data_win` keyed by region code.
multivar_data_win = {}
for region in REGIONS:
    filepath = DATA_DIR.joinpath(f"{region}_data_transformed_win.csv")
    dataset = pd.read_csv(filepath)
    processed = preprocess_multivar(dataset, VARIABLE_SETS[region])
    multivar_data_win[region] = processed
    # Report the shape and covered time span as a quick sanity check.
    # NOTE(review): `.date()` presumes a datetime-like index — confirm in `preprocess_multivar`.
    print(f"Loaded {region.upper()} winsorized data: {multivar_data_win[region].shape}")
    print(f"  Time range: {multivar_data_win[region].index.min().date()} to {multivar_data_win[region].index.max().date()}\n")

## 3. Lag Selection

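The following cells run the lag selection separately for each prior type (Minnesota, Diffuse, Normal-Wishart). The selection logic is deliberately kept identical across the three cells, mirroring the original, working implementation; the cells differ only in the prior type and the prior hyperparameters passed to `calculate_bic`.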

### 3.1 Minnesota Prior

In [None]:
# Lag selection with the Minnesota prior on the transformed, winsorized data.
#
# For every region the BIC is computed for lags 1..max_lags on six expanding
# windows (40%-90% of the sample), and the lag with the smallest finite BIC is
# recorded per window. Results are stored in `minnesota_results[region]` as
# {"optimal_lags": [...], "final_bic_values": [...]}, with `None` entries for
# windows where no lag produced a usable BIC.
minnesota_results = {}
max_lags = MAX_LAGS  # reuse the config-cell constant instead of re-hardcoding 12
prior_type = 2  # Minnesota
# Prior hyperparameters forwarded unchanged to `calculate_bic`.
a_bar_1 = 0.3
a_bar_2 = 0.15
a_bar_3 = 10

for region in REGIONS:
    print(f"\nPerforming lag selection with Minnesota prior for {region.upper()} (winsorized data)...")
    data_matrix = multivar_data_win[region].values  # Use winsorized data
    T_total = data_matrix.shape[0]

    # Define splits from 40% to 90% of the data
    splits = [int(T_total * p) for p in [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]]

    optimal_lags = []
    final_bic_values = []  # Store final BIC values for selected lags
    for t_idx, t in enumerate(splits):
        # Use an expanding window: start from the beginning of the dataset
        Y_train = data_matrix[0:t, :]

        # Finite BIC per lag, keyed by the lag itself so no index-to-lag
        # conversion is needed afterwards.
        bic_by_lag = {}
        for p_lags in range(1, max_lags + 1):
            try:
                bic = calculate_bic(Y_train, p_lags, prior_type=prior_type, a_bar_1=a_bar_1, a_bar_2=a_bar_2, a_bar_3=a_bar_3)
                # Reject inf, -inf and NaN alike: NaN would make `min` below
                # order-dependent, and -inf would always "win" spuriously.
                if not np.isfinite(bic):
                    raise ValueError(f"BIC could not be calculated for lag {p_lags} at split {t_idx + 1}/{len(splits)}.")
                bic_by_lag[p_lags] = bic
            except ValueError as e:
                print(f"Error calculating BIC for lag {p_lags} with prior_type {prior_type}: {e}")
                # As before, stop at the first failing lag; higher lags are not attempted.
                break
            except Exception as e:
                print(f"Unexpected error at lag {p_lags}: {e}")
                break

        if bic_by_lag:
            # Ties resolve to the smallest lag because lags were inserted in ascending order.
            optimal_lag = min(bic_by_lag, key=bic_by_lag.get)
            final_bic = bic_by_lag[optimal_lag]
        else:
            print(f"Warning: Unable to calculate BIC for any lag in range 1 to {max_lags}.")
            optimal_lag = None
            final_bic = None
        optimal_lags.append(optimal_lag)
        final_bic_values.append(final_bic)

    minnesota_results[region] = {
        "optimal_lags": optimal_lags,
        "final_bic_values": final_bic_values,
    }
    print(f"[{region.upper()}] Optimal lags with max_lag = {max_lags} for splits {splits}: {optimal_lags}")
    print(f"[{region.upper()}] Final BIC values for selected lags: {final_bic_values}")

### 3.2 Diffuse Prior

In [None]:
# Lag selection with the diffuse prior on the winsorized data.
#
# For every region the BIC is computed for lags 1..max_lags on six expanding
# windows (40%-90% of the sample), and the lag with the smallest finite BIC is
# recorded per window. Results are stored in `diffuse_results[region]` as
# {"optimal_lags": [...], "final_bic_values": [...]}, with `None` entries for
# windows where no lag produced a usable BIC.
diffuse_results = {}
max_lags = MAX_LAGS  # reuse the config-cell constant instead of re-hardcoding 12
prior_type = 1  # Diffuse

for region in REGIONS:
    print(f"\nPerforming lag selection with Diffuse prior for {region.upper()}...")
    data_matrix = multivar_data_win[region].values
    T_total = data_matrix.shape[0]

    # Define splits from 40% to 90% of the data
    splits = [int(T_total * p) for p in [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]]

    optimal_lags = []
    final_bic_values = []  # Store final BIC values for selected lags
    for t_idx, t in enumerate(splits):
        # Use an expanding window: start from the beginning of the dataset
        Y_train = data_matrix[0:t, :]

        # Finite BIC per lag, keyed by the lag itself so no index-to-lag
        # conversion is needed afterwards.
        bic_by_lag = {}
        for p_lags in range(1, max_lags + 1):
            try:
                bic = calculate_bic(Y_train, p_lags, prior_type=prior_type)
                # Reject inf, -inf and NaN alike: NaN would make `min` below
                # order-dependent, and -inf would always "win" spuriously.
                if not np.isfinite(bic):
                    raise ValueError(f"BIC could not be calculated for lag {p_lags} at split {t_idx + 1}/{len(splits)}.")
                bic_by_lag[p_lags] = bic
            except ValueError as e:
                print(f"Error calculating BIC for lag {p_lags} with prior_type {prior_type}: {e}")
                # As before, stop at the first failing lag; higher lags are not attempted.
                break
            except Exception as e:
                print(f"Unexpected error at lag {p_lags}: {e}")
                break

        if bic_by_lag:
            # Ties resolve to the smallest lag because lags were inserted in ascending order.
            optimal_lag = min(bic_by_lag, key=bic_by_lag.get)
            final_bic = bic_by_lag[optimal_lag]
        else:
            print(f"Warning: Unable to calculate BIC for any lag in range 1 to {max_lags}.")
            optimal_lag = None
            final_bic = None
        optimal_lags.append(optimal_lag)
        final_bic_values.append(final_bic)

    diffuse_results[region] = {
        "optimal_lags": optimal_lags,
        "final_bic_values": final_bic_values,
    }
    print(f"[{region.upper()}] Optimal lags with max_lag = {max_lags} for splits {splits}: {optimal_lags}")
    print(f"[{region.upper()}] Final BIC values for selected lags: {final_bic_values}")

### 3.3 Normal-Wishart Prior

In [None]:
# Lag selection with the Normal-Wishart prior on the winsorized data.
#
# For every region the BIC is computed for lags 1..max_lags on six expanding
# windows (40%-90% of the sample), and the lag with the smallest finite BIC is
# recorded per window. Results are stored in `wishart_results[region]` as
# {"optimal_lags": [...], "final_bic_values": [...]}, with `None` entries for
# windows where no lag produced a usable BIC.
wishart_results = {}
max_lags = MAX_LAGS  # reuse the config-cell constant instead of re-hardcoding 12
prior_type = 3  # Normal-Wishart

for region in REGIONS:
    print(f"\nPerforming lag selection with Wishart prior for {region.upper()}...")
    data_matrix = multivar_data_win[region].values
    T_total = data_matrix.shape[0]

    # Define splits from 40% to 90% of the data
    splits = [int(T_total * p) for p in [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]]

    optimal_lags = []
    final_bic_values = []  # Store final BIC values for selected lags
    for t_idx, t in enumerate(splits):
        # Use an expanding window: start from the beginning of the dataset
        Y_train = data_matrix[0:t, :]

        # Finite BIC per lag, keyed by the lag itself so no index-to-lag
        # conversion is needed afterwards.
        bic_by_lag = {}
        for p_lags in range(1, max_lags + 1):
            try:
                bic = calculate_bic(Y_train, p_lags, prior_type=prior_type)
                # Reject inf, -inf and NaN alike: NaN would make `min` below
                # order-dependent, and -inf would always "win" spuriously.
                if not np.isfinite(bic):
                    raise ValueError(f"BIC could not be calculated for lag {p_lags} at split {t_idx + 1}/{len(splits)}.")
                bic_by_lag[p_lags] = bic
            except ValueError as e:
                print(f"Error calculating BIC for lag {p_lags} with prior_type {prior_type}: {e}")
                # As before, stop at the first failing lag; higher lags are not attempted.
                break
            except Exception as e:
                print(f"Unexpected error at lag {p_lags}: {e}")
                break

        if bic_by_lag:
            # Ties resolve to the smallest lag because lags were inserted in ascending order.
            optimal_lag = min(bic_by_lag, key=bic_by_lag.get)
            final_bic = bic_by_lag[optimal_lag]
        else:
            print(f"Warning: Unable to calculate BIC for any lag in range 1 to {max_lags}.")
            optimal_lag = None
            final_bic = None
        optimal_lags.append(optimal_lag)
        final_bic_values.append(final_bic)

    wishart_results[region] = {
        "optimal_lags": optimal_lags,
        "final_bic_values": final_bic_values,
    }
    print(f"[{region.upper()}] Optimal lags with max_lag = {max_lags} for splits {splits}: {optimal_lags}")
    print(f"[{region.upper()}] Final BIC values for selected lags: {final_bic_values}")

## 4. Results

The table below shows the optimal lag length for each region, prior, and expanding data window (split).

In [None]:
# Collect the per-prior result dictionaries under their display names.
results_map = {
    "Minnesota": minnesota_results,
    "Diffuse": diffuse_results,
    "Normal-Wishart": wishart_results
}

# Column headers: one per expanding-window split, in the order the lags were stored.
split_percentages = ["40%", "50%", "60%", "70%", "80%", "90%"]

# One table row per (prior, region) pair: identifiers first, then the
# selected lag for each split.
all_results_data = []
for prior_name, results_dict in results_map.items():
    for region, region_data in results_dict.items():
        lags_by_split = {
            split_percentages[i]: lag
            for i, lag in enumerate(region_data['optimal_lags'])
        }
        all_results_data.append(
            {"Region": region.upper(), "Prior": prior_name, **lags_by_split}
        )

# Build the table in one go and index it by (Region, Prior) for readability.
summary_df = pd.DataFrame(all_results_data).set_index(["Region", "Prior"])

# Display the final summary table
print("--- Summary of Optimal Lags Across All Priors ---")
display(summary_df)