In [1]:
import os
import pandas as pd
import pickle
from statsmodels.tsa.api import VAR
import numpy as np


In [2]:
# Folder that receives one pickled VAR results object per location.
model_save_folder = 'trained_var_models'

# Create the folder up front; exist_ok=True makes re-running this cell a no-op.
os.makedirs(name=model_save_folder, exist_ok=True)

In [3]:
def train_and_save_var(location_train_csv):
    """Fit a lag-1 VAR model on one location's training CSV and pickle it.

    The location name is the CSV's base name with the '_train.csv' suffix
    removed. On success the fitted results object is pickled to
    '<model_save_folder>/<location_name>_var_model.pkl', where
    ``model_save_folder`` is the module-level setting.

    Parameters
    ----------
    location_train_csv : str
        Path to the training CSV. The file is read with
        ``parse_dates=['date']``; 'date', 'location_id' and 'precipitation'
        are excluded from the modelled variables when present.

    Returns
    -------
    None
        Progress and diagnostics are printed. Nothing is saved when there
        are too few usable rows or when both fit attempts raise
        ``numpy.linalg.LinAlgError``.
    """
    location_name = os.path.basename(location_train_csv).replace('_train.csv', '')
    print(f"\nTraining VAR model for location: {location_name}")

    df = pd.read_csv(location_train_csv, parse_dates=['date'])

    # VAR interprets the row order as the time order, so make sure the rows
    # are chronological before 'date' is discarded. Only reorder when the
    # column really parsed as datetimes (sorting raw strings would be
    # lexicographic) and leave already-ordered files untouched.
    if 'date' in df.columns and pd.api.types.is_datetime64_any_dtype(df['date']):
        if not df['date'].is_monotonic_increasing:
            df = df.sort_values('date', kind='mergesort').reset_index(drop=True)

    # Drop 'date' and 'location_id' columns
    df_numeric = df.drop(columns=['date', 'location_id'], errors='ignore')

    # Drop one of the perfectly correlated variables to reduce multicollinearity
    if 'precipitation' in df_numeric.columns:
        df_numeric = df_numeric.drop(columns=['precipitation'])
        print("Dropped 'precipitation' due to perfect correlation with 'rainfall'.")

    # Drop columns with all NaNs and rows with any NaN
    df_numeric = df_numeric.dropna(axis=1, how='all').dropna()

    print("Variance per variable:")
    print(df_numeric.var())
    print("Correlation matrix:")
    print(df_numeric.corr())

    n_obs = len(df_numeric)
    n_vars = df_numeric.shape[1]

    fixed_lag = 1  # fixed lag order

    # Each equation estimates n_vars * lag coefficients plus a constant from
    # the n_obs - lag rows that remain after lagging. Require strictly more
    # usable rows than parameters, and refuse an empty set of variables.
    params_per_equation = n_vars * fixed_lag + 1
    usable_obs = n_obs - fixed_lag
    if n_vars == 0 or usable_obs <= params_per_equation:
        print(f"Not enough observations ({n_obs}) for number of variables ({n_vars}), skipping.")
        return

    model = VAR(df_numeric)

    try:
        var_model = model.fit(fixed_lag, trend='c')
        print(f"Fitted VAR model with lag {fixed_lag}")
    except np.linalg.LinAlgError as e:
        print(f"LinAlgError during fit: {e}, trying lag={fixed_lag} again without constant")
        try:
            # 'n' is the documented "no constant, no trend" code for VAR.fit;
            # the legacy 'nc' spelling is not among the documented options.
            var_model = model.fit(fixed_lag, trend='n')
        except np.linalg.LinAlgError as retry_error:
            # Skip this location instead of aborting a whole batch run.
            print(f"LinAlgError during fit without constant: {retry_error}, skipping.")
            return

    try:
        print(var_model.summary())
    except np.linalg.LinAlgError as e:
        print(f"Cannot print summary due to LinAlgError: {e}")

    model_path = os.path.join(model_save_folder, f'{location_name}_var_model.pkl')
    with open(model_path, 'wb') as f:
        pickle.dump(var_model, f)
    print(f"Saved VAR model to {model_path}")

In [4]:
# Batch run: fit and save one VAR model for every '*_train.csv' file found
# in the splits folder.
train_folder = 'train_val_test_splits'
for file in os.listdir(train_folder):
    # Anything that is not a training split is ignored.
    if not file.endswith('_train.csv'):
        continue
    train_csv_path = os.path.join(train_folder, file)
    train_and_save_var(train_csv_path)


Training VAR model for location: 0
Dropped 'precipitation' due to perfect correlation with 'rainfall'.
Variance per variable:
temperature      0.915459
rainfall       115.268661
wind_speed      20.538635
dtype: float64
Correlation matrix:
             temperature  rainfall  wind_speed
temperature     1.000000 -0.355362    0.003182
rainfall       -0.355362  1.000000    0.039134
wind_speed      0.003182  0.039134    1.000000
Fitted VAR model with lag 1
  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Thu, 10, Jul, 2025
Time:                     10:49:14
--------------------------------------------------------------------
No. of Equations:         3.00000    BIC:                    6.38183
Nobs:                     3690.00    HQIC:                   6.36882
Log likelihood:          -27432.9    FPE:                    579.188
AIC:                      6.36163    Det(Omega_mle):         577.308
------------------------