In [12]:
import pandas as pd

# Read in the CSV file
df = pd.read_csv('data/fred_data_scaled.csv')

# Print the first 5 rows of the DataFrame
print(df.head())


         DATE  Real_GDP       CPI  Unemployment_Rate  Federal_Funds_Rate  \
0  2010-01-31 -1.661221 -1.432244           1.648484           -0.615660   
1  2010-02-28 -1.624934 -1.442768           1.648484           -0.592263   
2  2010-03-31 -1.588648 -1.439107           1.693053           -0.557169   
3  2010-04-30 -1.552362 -1.436565           1.693053           -0.510375   
4  2010-05-31 -1.523090 -1.442310           1.559346           -0.510375   

   Labor_Force_Participation_Rate  Industrial_Production  Vehicle_Sales  \
0                        2.095690              -2.580118      -2.269470   
1                        2.210160              -2.498748      -2.536812   
2                        2.210160              -2.334615      -1.862906   
3                        2.553572              -2.249684      -2.009991   
4                        2.210160              -1.929908      -1.743573   

   Disposable_Income  Personal_Consumption_Expenditures  10Y_Treasury_Rate  \
0          -1.

In [13]:
# Set "DATE" column as index
df = df.set_index('DATE')

# Print the DataFrame with "DATE" as the index
print(df.head())


            Real_GDP       CPI  Unemployment_Rate  Federal_Funds_Rate  \
DATE                                                                    
2010-01-31 -1.661221 -1.432244           1.648484           -0.615660   
2010-02-28 -1.624934 -1.442768           1.648484           -0.592263   
2010-03-31 -1.588648 -1.439107           1.693053           -0.557169   
2010-04-30 -1.552362 -1.436565           1.693053           -0.510375   
2010-05-31 -1.523090 -1.442310           1.559346           -0.510375   

            Labor_Force_Participation_Rate  Industrial_Production  \
DATE                                                                
2010-01-31                        2.095690              -2.580118   
2010-02-28                        2.210160              -2.498748   
2010-03-31                        2.210160              -2.334615   
2010-04-30                        2.553572              -2.249684   
2010-05-31                        2.210160              -1.929908   

    

In [15]:
from statsmodels.tsa.stattools import adfuller

# Loop through each column of the DataFrame
for col in df.columns:
    # Run Dickey-Fuller test on the column
    result = adfuller(df[col])
    pvalue = result[1]
    print(f'Column {col}: p-value = {pvalue}')
    
    # Differentiate the column until it is stationary
    while pvalue > 0.05:
        diff = df[col].diff().dropna()
        result = adfuller(diff)
        pvalue = result[1]
        print(f'  differentiated: p-value = {pvalue}')
        df[col] = diff

# Print the DataFrame with all columns differentiated
print(df.head())


Column Real_GDP: p-value = 0.9442271925129032
  differentiated: p-value = 0.0046071994191671396
Column CPI: p-value = 0.9990876548203566
  differentiated: p-value = 0.05022097547955895
  differentiated: p-value = 3.023055171548809e-08
Column Unemployment_Rate: p-value = 0.0821348256144766
  differentiated: p-value = 6.249880118633057e-17
Column Federal_Funds_Rate: p-value = 0.41139057390151396
  differentiated: p-value = 0.7085531679432417
  differentiated: p-value = 1.251410628017575e-17
Column Labor_Force_Participation_Rate: p-value = 0.1733369505776165
  differentiated: p-value = 1.0282438110611127e-19
Column Industrial_Production: p-value = 0.0638252028101994
  differentiated: p-value = 2.1159114028850787e-18
Column Vehicle_Sales: p-value = 0.1485164533216542
  differentiated: p-value = 4.005320249334033e-12
Column Disposable_Income: p-value = 0.7363676263107957
  differentiated: p-value = 0.049301580970367916
Column Personal_Consumption_Expenditures: p-value = 0.9945711456550443
 

In [17]:
# Define the date string to filter by
date_string = '2011-01-01'

# Filter the DataFrame for rows not older than the given date
df_filtered = df[df.index >= date_string]

# Print the filtered DataFrame
print(df_filtered.head())

            Real_GDP       CPI  Unemployment_Rate  Federal_Funds_Rate  \
DATE                                                                    
2011-01-31 -0.009239 -0.008490          -0.089138        0.000000e+00   
2011-02-28  0.025941 -0.000203          -0.044569       -1.110223e-16   
2011-03-31  0.025941  0.022218           0.000000       -1.169829e-02   
2011-04-30  0.025941 -0.005135           0.044569       -2.339658e-02   
2011-05-31 -0.001493 -0.016981          -0.044569        3.509487e-02   

            Labor_Force_Participation_Rate  Industrial_Production  \
DATE                                                                
2011-01-31                        -0.11447              -0.050272   
2011-02-28                        -0.11447              -0.096570   
2011-03-31                         0.11447               0.251077   
2011-04-30                         0.00000              -0.084002   
2011-05-31                        -0.11447               0.034917   

    

In [31]:
from statsmodels.tsa.api import VAR
from statsmodels.tsa.vector_ar.var_model import VARResults
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

# Split data into training and testing sets
train_size = int(len(df_filtered) * 0.8)
train_data, test_data = df_filtered.iloc[:train_size], df_filtered.iloc[train_size:]

# Tune model hyperparameters using TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3)
best_aic = np.inf
best_order = None
for p in range(1, 5):
    for q in range(1, 5):
        model = VAR(train_data)
        for train_index, test_index in tscv.split(train_data):
            train = train_data.iloc[train_index]
            test = train_data.iloc[test_index]
            fitted_model = model.fit(maxlags=p, ic='aic', trend='c', method='ols')
            y_hat = fitted_model.forecast(train.to_numpy(), steps=len(test))
            residuals = test.to_numpy() - y_hat
            aic = fitted_model.aic
            if aic < best_aic:
                best_aic = aic
                best_order = (p, q)

# Train model with best hyperparameters on entire training set
model = VAR(train_data)
fitted_model = model.fit(maxlags=best_order[0], ic='aic', trend='c', method='ols')

# Evaluate model on testing set
y_hat = fitted_model.forecast(train_data.to_numpy(), steps=len(test_data))
residuals = test_data.to_numpy() - y_hat
mse = np.mean(residuals**2)

# Save model to models folder
fitted_model.save('models/var_model.pkl')


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [32]:
import pickle
import statsmodels.api as sm

# Load the pre-trained VAR model from the models folder
with open('models/var_model.pkl', 'rb') as f:
    var_model = pickle.load(f)

# Print the summary of the VAR model
print(var_model.summary())


  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Wed, 08, Mar, 2023
Time:                     18:05:24
--------------------------------------------------------------------
No. of Equations:         11.0000    BIC:                   -38.5380
Nobs:                     111.000    HQIC:                  -45.7193
Log likelihood:           1571.94    FPE:                1.78453e-22
AIC:                     -50.6210    Det(Omega_mle):     4.22383e-24
--------------------------------------------------------------------
Results for equation Real_GDP
                                          coefficient       std. error           t-stat            prob
-------------------------------------------------------------------------------------------------------
const                                        0.005166         0.008795            0.587           0.557
L1.Real_GDP                                  0.899600         0.12247

In [34]:
lag_order = 10

In [35]:
# Forecast
forecast_input = df_filtered.values[-lag_order:]
fc = var_model.forecast(y=forecast_input, steps=24)

# Convert forecast results to dataframe
fc_df = pd.DataFrame(fc, index=range(df_filtered.shape[0], df_filtered.shape[0]+24), columns=df_filtered.columns + '_forecast')

# Print forecast results
print(fc_df)


     Real_GDP_forecast  CPI_forecast  Unemployment_Rate_forecast  \
144          -0.066883     -0.012357                    0.487571   
145           0.085776      0.046911                   -0.054340   
146           0.049861      0.012944                    0.310632   
147           0.054535     -0.026454                   -1.189531   
148           0.017090      0.001860                    0.483715   
149           0.041358     -0.028407                    0.751700   
150           0.187342      0.024543                   -1.674040   
151          -0.272686     -0.013099                    0.585855   
152          -0.095532     -0.001533                    0.282991   
153           0.130970      0.022642                    1.892103   
154          -0.065013      0.009057                   -1.963713   
155          -0.140719     -0.019711                   -1.611820   
156          -0.185216     -0.070843                    6.022399   
157           0.888813      0.123825            

In [46]:
import pandas as pd

start_date = '2023-01-01'
end_date = '2024-12-31'

idx = pd.date_range(start=start_date, end=end_date, freq='M')


In [47]:
idx

DatetimeIndex(['2023-01-31', '2023-02-28', '2023-03-31', '2023-04-30',
               '2023-05-31', '2023-06-30', '2023-07-31', '2023-08-31',
               '2023-09-30', '2023-10-31', '2023-11-30', '2023-12-31',
               '2024-01-31', '2024-02-29', '2024-03-31', '2024-04-30',
               '2024-05-31', '2024-06-30', '2024-07-31', '2024-08-31',
               '2024-09-30', '2024-10-31', '2024-11-30', '2024-12-31'],
              dtype='datetime64[ns]', freq='M')

In [48]:
fc_df.index = idx


In [50]:
fc_df.columns = df_filtered.columns


In [51]:
concatenated_df = pd.concat([fc_df, df_filtered], axis=0)

In [53]:
concatenated_df.to_csv("data/concatenated_df.csv", index=True)