# VAR USA

Vector auto-regressive models

1. [Imports](#imports)
2. [Ingestion](#ingestion)
3. [Plotting](#plotting)
4. [Statistical tests](#stattests1)
5. [Differencing](#diff)


### VAR with First-Order Differencing
1. [Train test split - first order differencing](#traintest1)
2. [Find order p of VAR](#var_p_1)
3. [VAR(8) Model](#var8_1)
4. [Plots of first differenced predictions](#diff1_plot)
5. [Undifferencing and predicting](#undiff_1)
6. [MAPE](#mape1)
7. [Rolling forecasts](#roll1)


### VARMA
1. [VARMA](#varma)
2. [Rolling forecasts](#roll2)

<a name=imports></a>
## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

<a name=ingestion></a>
## Ingestion

In [None]:
daily_cases_usa = pd.read_csv('../../cleaned_datasets/usa/daily_cases_usa.csv', parse_dates=['Date'])
daily_vacc_usa = pd.read_csv('../../cleaned_datasets/usa/daily_vacc_usa.csv', parse_dates=['date'])

In [None]:
daily_cases_usa.dtypes

In [None]:
daily_vacc_usa.dtypes

In [None]:
daily_cases_usa

In [None]:
daily_vacc_usa

In [None]:
cases_vacc = daily_cases_usa.merge(daily_vacc_usa, how='outer', left_on='Date', right_on='date')
cases_vacc = cases_vacc[["Date", "Confirmed", "Total_Doses"]]
cases_vacc

In [None]:
# Replace every missing value left by the outer merge with 0, then index the rows by date
cases_vacc = cases_vacc.fillna(0)
indexed = cases_vacc.set_index('Date')
indexed

<a name=plotting></a>
## Plot initial data

In [None]:
def plot_subplots(indexed):
    """Plot every column of the input on its own axis in a two-column grid.

    Parameters
    ----------
    indexed : pandas.DataFrame or list
        Either a single DataFrame (each column is drawn as one blue line), or
        a two-element list ``[first, second]`` of DataFrames. In the list case
        column ``i`` of the first frame is drawn in blue (legend "Train") and
        column ``i`` of the second frame in red (legend "Test") on the same
        axis, titled with the first frame's column name.

    Notes
    -----
    The grid has ``ceil(n_columns / 2)`` rows, so an odd number of columns
    keeps its last column; an axis left without a column is hidden.
    """
    is_pair = isinstance(indexed, list)
    frame = indexed[0] if is_pair else indexed
    n_columns = len(frame.columns)

    # Round up so that an odd column count still gets an axis for its last column
    nrows = max(1, -(-n_columns // 2))

    _, axes = plt.subplots(nrows=nrows, ncols=2, dpi=120, figsize=(8, 4))
    for i, ax in enumerate(axes.flatten()):
        if i >= n_columns:
            # Spare cell of the grid: nothing to draw here
            ax.set_visible(False)
            continue

        if is_pair:
            ax.plot(indexed[0][indexed[0].columns[i]], color='blue', label='Train')
            ax.plot(indexed[1][indexed[1].columns[i]], color='red', label='Test')
            ax.legend(loc='best')
            # Decorations
            ax.set_title(indexed[0].columns[i])
        else:
            data = indexed[indexed.columns[i]]
            ax.plot(data, color='blue', linewidth=1)
            # Decorations
            ax.set_title(indexed.columns[i])

        ax.xaxis.set_ticks_position('none')
        ax.yaxis.set_ticks_position('none')
        ax.spines["top"].set_alpha(0)
        ax.tick_params(labelsize=6)

    plt.tight_layout()
    
plot_subplots(indexed)

<a name=stattests1></a>
## Statistical tests

### Johansen co-integration test

In [None]:
from statsmodels.tsa.vector_ar.vecm import coint_johansen

def cointegration_test(df, alpha=0.05):
    """Perform Johansen's cointegration test and print a summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to test; one printed row is produced per column.
    alpha : float, default 0.05
        Significance level. Must be 0.10, 0.05 or 0.01, the three levels for
        which a critical-value column is selected below.

    Raises
    ------
    ValueError
        If ``alpha`` is not one of the supported levels.
    """
    # Critical-value column to read for each confidence level (in percent)
    cv_column = {90: 0, 95: 1, 99: 2}

    # Compare numerically: str(1 - 0.1) is '0.9', which never matched a '0.90' key
    confidence = round((1 - alpha) * 100, 6)
    if confidence not in cv_column:
        raise ValueError(
            f"alpha must be one of 0.10, 0.05 or 0.01, got {alpha!r}"
        )

    # Positional arguments are det_order=-1 and k_ar_diff=5
    out = coint_johansen(df, -1, 5)
    traces = out.lr1
    cvts = out.cvt[:, cv_column[confidence]]

    def adjust(val, length=6):
        return str(val).ljust(length)

    # Summary
    print(f'Name   ::  Test Stat > C({int(confidence)}%)    =>   Signif  \n', '--'*20)
    for col, trace, cvt in zip(df.columns, traces, cvts):
        print(adjust(col), ':: ', adjust(round(trace, 2), 9), ">", adjust(cvt, 8), ' =>  ', trace > cvt)

cointegration_test(indexed)

### Augmented DF Test

In [None]:
from statsmodels.tsa.stattools import adfuller

def run_dicky_fuller(ts):
    """Run the Augmented Dickey-Fuller test on `ts` and print its statistics.

    The printed report lists the test statistic, the p-value, the number of
    lags used and the number of observations used, followed by one critical
    value per entry of the test's critical-value mapping.
    """
    print("Observations of Dickey-fuller test")
    result = adfuller(ts, autolag='AIC')

    # The first four entries of the result are labelled in this order
    labels = ['Test Statistic', 'p-value', '#lags used', 'number of observations used']
    report = dict(zip(labels, result[0:4]))

    # The fifth entry maps a level name to its critical value
    report.update(
        {'critical value (%s)' % level: threshold for level, threshold in result[4].items()}
    )
    print(pd.Series(report))


# ADF Test on each column
# (DataFrame.iteritems() was removed in pandas 2.0; items() yields the same (name, column) pairs)
for name, column in indexed.items():
    run_dicky_fuller(column)
    print('\n')

The time series is not stationary.

<a name=diff></a>
## Differencing

## First order differencing

In [None]:
df_diff_1 = indexed.diff().dropna()
df_diff_1

In [None]:
# ADF Test on each column
# (DataFrame.iteritems() was removed in pandas 2.0; items() yields the same (name, column) pairs)
for name, column in df_diff_1.items():
    run_dicky_fuller(column)
    print('\n')

In [None]:
plot_subplots(df_diff_1)

Even though the differenced data appears to be stationary, two small sinusoidal patterns remain, corresponding to the two waves of cases. Therefore, second-order differencing might be required.

<a name=traintest1></a>
## Train-test split

### Train-test split - first order differenced

In [None]:
# Chronological split: the first 90% of the differenced rows form the training set,
# the remaining rows are held out as the test set
percent_90 = int(len(df_diff_1)*0.9)

# df_diff_1 was already built with .dropna(), so this extra .dropna() is only a safeguard
train = df_diff_1.iloc[:percent_90].dropna()
test = df_diff_1.iloc[percent_90:]

# Passing a [train, test] list makes plot_subplots overlay both sets for each column
plot_subplots([train, test])

<a name=var_p_1></a>
### Finding the best value of p for VAR(p)

Using AIC, BIC, FPE, HQIC

In [None]:
from statsmodels.tsa.vector_ar.var_model import VAR

model = VAR(train)

x = model.select_order(maxlags=25)
x.summary()

At p = 8, the values of BIC and HQIC are at their local minima.

<a name=var8_1></a>
### VAR(8) Model

In [None]:
model_fitted = model.fit(8)
model_fitted.summary()

In [None]:
# Seed the forecast with the last k_ar rows of the training set, so that the
# predicted steps cover the test period. Seeding with the test values would
# forecast the period *after* the test set while labelling it with test dates.
lag_order = model_fitted.k_ar
forecast_input = train.values[-lag_order:]

# Forecast: one step for every row of the test set
fc = model_fitted.forecast(y=forecast_input, steps=len(test))

df_forecast = pd.DataFrame(fc, index=test.index, columns=test.columns + '_forecast')

# Actual and forecast values side by side, one frame per variable
forecast_vs_actual = df_forecast.join(test)
forecasted_conf = forecast_vs_actual[['Confirmed', 'Confirmed_forecast']]
forecasted_vacc = forecast_vs_actual[['Total_Doses', 'Total_Doses_forecast']]

<a name=diff1_plot></a>
### Plot Predictions on Diffed

In [None]:
def plot_fore_test(test, fore, title):
    """Draw the test series and the forecast on a single 7x7-inch figure.

    `test` is drawn in blue with the legend label "Test", `fore` in red with
    the label "Forecast"; `title` becomes the figure title.
    """
    figure, axis = plt.subplots()
    figure.set_size_inches(7, 7)

    # (data, colour, legend label) for each line, drawn in this order
    lines = ((test, 'blue', 'Test'), (fore, 'red', 'Forecast'))
    for series, colour, label in lines:
        axis.plot(series, color=colour, label=label)

    axis.legend(loc='best')
    plt.title(title)
    plt.show()


In [None]:
plot_fore_test(forecasted_conf[['Confirmed']], forecasted_conf[['Confirmed_forecast']], title='Daily cases')

In [None]:
plot_fore_test(forecasted_vacc[['Total_Doses']], forecasted_vacc[['Total_Doses_forecast']], title='Daily vaccinations')

<a name=undiff_1></a>

## Un-differencing and Plotting

In [None]:
def invert_transformation(diffed, original):
    """Revert first-order differencing to bring values back to the original scale.

    Parameters
    ----------
    diffed : pandas.DataFrame
        First differences. Must have exactly ``len(original) - 1`` rows; they
        are matched to the rows of ``original`` after the first one by
        position, not by index label.
    original : pandas.DataFrame
        Undifferenced data. Its first row supplies the starting level for
        every column of ``diffed``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``original`` in which every column named in ``diffed`` is
        replaced by ``undiff_<column>``: the starting level followed by the
        running sum of the differences.

    Raises
    ------
    ValueError
        If ``diffed`` does not have exactly ``len(original) - 1`` rows.
    """
    expected_rows = max(len(original) - 1, 0)
    if len(diffed) != expected_rows:
        raise ValueError(
            f"diffed must have {expected_rows} rows (len(original) - 1), got {len(diffed)}"
        )

    df_copy = original.copy()

    for col in diffed.columns:
        # Build the whole column in one array instead of writing through a
        # chained selection (df[col][1:] = ...), which assigns to a temporary
        # object when pandas Copy-on-Write is active.
        start_level = original[col].to_numpy()[:1]
        steps = diffed[col].to_numpy()
        df_copy[f'undiff_{col}'] = np.concatenate((start_level, steps)).cumsum()

        df_copy = df_copy.drop(columns=col)

    return df_copy

In [None]:
# Check if un-diffed df_diff_1 is the same as original

#inverted = invert_transformation(df_diff_1, indexed)
#(inverted[['undiff_Confirmed']] - indexed[['Confirmed']]).isna().sum()

In [None]:
# Un-diff the test dataset

# Start one row before the first test date: invert_transformation uses that row's
# values as the level from which the differences are accumulated
start_index = indexed.index.get_loc(test.index[0])-1
test_original = invert_transformation(test, indexed.iloc[start_index:])

#test_original

In [None]:
# Drop the '_forecast' suffix so the forecast columns carry the same names as the columns of `indexed`
renamed_df = df_forecast.rename(columns={'Confirmed_forecast': 'Confirmed', 'Total_Doses_forecast': 'Total_Doses'}, inplace=False)


# Begin one row before the first forecast date so that row supplies the starting level
start_index = indexed.index.get_loc(renamed_df.index[0])-1
fore_original = invert_transformation(renamed_df, indexed.iloc[start_index:])

#fore_original

In [None]:
# Every row before the first row of fore_original is treated as the training segment
end_index = indexed.index.get_loc(fore_original.index[0])
train_original = indexed[:end_index]
# NOTE: plot_subplots labels the second frame "Test" in the legend; here it holds the forecast
plot_subplots([train_original, fore_original])

### Plot the forecasts

In [None]:
plot_fore_test(test_original[['undiff_Confirmed']], fore_original[['undiff_Confirmed']], title='Daily cases')

In [None]:
plot_fore_test(test_original[['undiff_Total_Doses']], fore_original[['undiff_Total_Doses']], title='Daily vaccinations')

In [None]:
def plot_train_test_fore(train, test, fore, title='Forecast vs Actuals', ylabel='', xlabel='Date', figpath=None):
    """Plot training data, actual values and forecast on one figure.

    Parameters
    ----------
    train, test, fore
        Data drawn with the legend labels "training", "actual" and
        "forecast" respectively.
    title : str
        Figure title.
    ylabel, xlabel : str
        Axis labels.
    figpath : str or None
        If given, the figure is also written to this path in EPS format.
    """
    fig = plt.figure(figsize=(10, 5), dpi=100)
    plt.plot(train, label='training')
    plt.plot(test, label='actual')
    plt.plot(fore, label='forecast')

    # Call the labelling functions. Assigning to plt.xlabel / plt.ylabel would
    # replace them with strings: no label is drawn and later calls break.
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    plt.title(title)
    plt.legend(loc='upper left', fontsize=8)

    # Save before show(): depending on the backend, the figure may be closed
    # once it has been shown
    if figpath is not None:
        fig.savefig(figpath, format='eps', bbox_inches='tight')

    plt.show()

In [None]:
# Plot of daily cases
plot_train_test_fore(train_original.Confirmed, test_original[['undiff_Confirmed']], fore_original[['undiff_Confirmed']], title='Daily cases', figpath='../../figures/v_ar/usa_cases.eps')



In [None]:
# Plot of daily doses
plot_train_test_fore(train_original.Total_Doses, test_original[['undiff_Total_Doses']], fore_original[['undiff_Total_Doses']], title='Daily doses', figpath='../../figures/v_ar/usa_vacc.eps')




<a name=mape1></a>
### MAPE

In [None]:
def MAPE(Y_actual, Y_Predicted, title):
    """Print and return the mean absolute percentage error (MAPE), in percent.

    Entries whose actual value is 0 are excluded, because the percentage
    error is undefined there.

    Parameters
    ----------
    Y_actual, Y_Predicted : pandas.DataFrame or pandas.Series
        Actual and predicted values; pandas aligns them on their labels when
        subtracting. For a DataFrame, the MAPE of its first column is reported.
    title : str
        Name of the quantity, used in the printed message.

    Returns
    -------
    float
        The MAPE in percent (NaN if no entry has a non-zero actual value).
    """
    mask = Y_actual != 0
    abs_pct_error = np.abs((Y_actual - Y_Predicted) / Y_actual)[mask]

    if isinstance(abs_pct_error, pd.DataFrame):
        # Reduce per column explicitly. np.mean(DataFrame) returns a single
        # scalar on recent pandas versions, which has no `.index` to pick the
        # first column from.
        mape = abs_pct_error.mean(axis=0).iloc[0] * 100
    else:
        mape = np.nanmean(np.asarray(abs_pct_error, dtype=float)) * 100

    print(f"MAPE of {title} is {mape}%")
    return mape


mape_vacc = MAPE(test_original[['undiff_Total_Doses']], fore_original[['undiff_Total_Doses']], title="Daily vaccinations")
mape_cases = MAPE(test_original[['undiff_Confirmed']], fore_original[['undiff_Confirmed']], title="Daily cases")

In [None]:
from sklearn.metrics import mean_absolute_error


mean_absolute_error(test_original[['undiff_Confirmed']], fore_original[['undiff_Confirmed']])

Clearly, a VAR model is not good enough to make predictions.

<a name=varma></a>

## VARMA

In [None]:
from statsmodels.tsa.statespace.varmax import VARMAX
from sklearn.metrics import mean_squared_error

# Score a single VARMA model of order (p, q) by the MSE of its forecast
def evaluate_varma_model(train, test, varma_order):
    """Fit VARMAX on `train` and return the MSE of its forecast against `test`.

    `varma_order` is passed to VARMAX as `order`; the fitted model forecasts
    as many steps as `test` has rows.
    """
    fitted = VARMAX(train, order=varma_order).fit()
    predictions = fitted.forecast(len(test))
    return mean_squared_error(test, predictions)

In [None]:
# Grid search


# Evaluate combinations of p and q values for a VARMA model
def evaluate_models(train, test, p_values, q_values):
    """Grid-search VARMA orders and print the MSE of each and of the best one.

    Parameters
    ----------
    train, test
        Passed unchanged to ``evaluate_varma_model`` for every order.
    p_values, q_values : iterable of int
        Candidate values; every ``(p, q)`` combination is evaluated.

    Notes
    -----
    An order whose evaluation raises is skipped, and the reason is printed so
    that failures are not mistaken for orders that were never tried.
    """
    best_score, best_cfg = float("inf"), None

    for p in p_values:
        for q in q_values:
            order = (p, q)
            # Keep the try body minimal: only the evaluation itself may fail
            try:
                mse = evaluate_varma_model(train, test, order)
            except Exception as exc:
                print('VARMA%s skipped: %s' % (order, exc))
                continue

            if mse < best_score:
                best_score, best_cfg = mse, order
            print('VARMA%s MSE=%.3f' % (order, mse))

    print()
    if best_cfg is None:
        print('No VARMA model could be evaluated')
    else:
        print('Best VARMA%s MSE=%.3f' % (best_cfg, best_score))

In [None]:
import warnings
warnings.filterwarnings("ignore")

p_values = range(1, 3)
q_values = range(1, 3)

evaluate_models(train, test, p_values, q_values)

In [None]:
from statsmodels.tsa.statespace.varmax import VARMAX

model = VARMAX(train, order=(1,1))
model_fit = model.fit()
yhat = model_fit.forecast(len(test))
#yhat

In [None]:
plot_fore_test(test[['Confirmed']], yhat[['Confirmed']], title='Diffed Daily cases')

In [None]:
plot_fore_test(test[['Total_Doses']], yhat[['Total_Doses']], title='Diffed Daily Doses')

In [None]:

# Un-diff the VARMA forecast (yhat), anchored at the row of `indexed` selected by start_index

fore_original_3 = invert_transformation(yhat, indexed.iloc[start_index:])
fore_original_3

In [None]:
plot_fore_test(test_original[['undiff_Confirmed']], fore_original_3[['undiff_Confirmed']], title='Daily cases')

In [None]:
plot_fore_test(test_original[['undiff_Total_Doses']], fore_original_3[['undiff_Total_Doses']], title='Daily Doses')

In [None]:
plot_subplots([train_original, fore_original_3])

<a name=roll2></a>

## Rolling Forecasts

In [None]:
# Rolling (walk-forward) forecast: refit on everything seen so far, predict one
# step ahead, then add the actual observation to the history before the next step
history = train.copy()
predicted = test.copy()

for t in range(len(test)):
    model = VARMAX(history, order=(1,2))
    model_fit = model.fit()
    yhat = model_fit.forecast()

    newindex = history.index[-1] + pd.to_timedelta(1, 'D')

    # Write each scalar forecast with a single .loc call. Chained indexing
    # (predicted.loc[row][col] = ...) may assign to a temporary copy of the
    # row, leaving `predicted` equal to the test values it was copied from.
    for col in ('Confirmed', 'Total_Doses'):
        predicted.loc[newindex, col] = yhat[col].iloc[0]

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
    history = pd.concat([history, test.iloc[[t]]])

    print('predicted =', yhat.values, ' ; actual =', test.iloc[t].values)

In [None]:
fore_original_4 = invert_transformation(predicted, indexed.iloc[start_index:])

In [None]:
plot_train_test_fore(train_original.Confirmed, test_original[['undiff_Confirmed']], fore_original_4[['undiff_Confirmed']], title='Daily cases', figpath='../../figures/varma/usa_cases.eps')



In [None]:
mape_cases = MAPE(test_original[['undiff_Confirmed']], fore_original_4[['undiff_Confirmed']], title="Daily cases")


In [None]:
mean_absolute_error(test_original[['undiff_Confirmed']], fore_original_4[['undiff_Confirmed']])