In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the cleaned daily-case and daily-vaccination tables, parsing their date
# columns as datetimes. The first CSV column of the cases file becomes its index.
daily_cases_india = pd.read_csv(
    '../../cleaned_datasets/india/daily_cases_india.csv',
    index_col=0,
    parse_dates=['Date'],
)
daily_vacc_india = pd.read_csv(
    '../../cleaned_datasets/india/daily_vacc_india.csv',
    parse_dates=['Updated On'],
)

In [None]:
# Inspect the column dtypes of the cases table.
daily_cases_india.dtypes

In [None]:
# Display the cases table.
daily_cases_india

In [None]:
# Inspect the column dtypes of the vaccination table.
daily_vacc_india.dtypes

In [None]:
# Display the vaccination table.
daily_vacc_india

In [None]:
# Outer-join cases and vaccinations on their date columns, keeping only the
# date, the confirmed-case count and the total-dose count.
cases_vacc = daily_cases_india.merge(
    daily_vacc_india,
    left_on='Date',
    right_on='Updated On',
    how='outer',
)[["Date", "Confirmed", "Total_Doses"]]
cases_vacc

In [None]:
# Treat missing counts (dates present in only one of the two tables) as zero.
# Only the value columns are filled: a blanket fillna(0) would also write the
# integer 0 into any missing 'Date' produced by the outer join, turning the
# datetime column into a mixed-type one. A new frame is assigned instead of
# mutating in place so the cell is safe to re-run.
cases_vacc = cases_vacc.fillna({'Confirmed': 0, 'Total_Doses': 0})
indexed = cases_vacc.set_index('Date')
indexed

In [None]:
def plot_subplots(indexed):
    """Plot each column of a frame, or of a train/test pair, on a two-column grid.

    Parameters
    ----------
    indexed : pandas.DataFrame or list
        Either a single DataFrame, whose columns are each drawn in blue on
        their own axes, or a list ``[train, test]`` of two DataFrames, in
        which case column ``i`` of the first frame is drawn in blue ("Train")
        and column ``i`` of the second in red ("Test") on the same axes.
    """
    is_pair = isinstance(indexed, list)
    n_series = len(indexed[0].columns) if is_pair else len(indexed.columns)

    # Round up so that an odd number of columns still gets one axis per column
    # (and a single column does not ask for zero rows).
    nrows = max(1, -(-n_series // 2))

    fig, axes = plt.subplots(nrows=nrows, ncols=2, dpi=120, figsize=(8, 4 * nrows))
    for i, ax in enumerate(axes.flatten()):
        if i >= n_series:
            # Spare axis left over when the number of columns is odd.
            ax.set_visible(False)
            continue

        if is_pair:
            train_frame, test_frame = indexed[0], indexed[1]
            ax.plot(train_frame[train_frame.columns[i]], color='blue', label='Train')
            ax.plot(test_frame[test_frame.columns[i]], color='red', label='Test')
            ax.legend(loc='best')
            ax.set_title(train_frame.columns[i])
        else:
            ax.plot(indexed[indexed.columns[i]], color='blue', linewidth=1)
            ax.set_title(indexed.columns[i])

        # Decorations: no tick marks, no top spine, small tick labels.
        ax.xaxis.set_ticks_position('none')
        ax.yaxis.set_ticks_position('none')
        ax.spines["top"].set_alpha(0)
        ax.tick_params(labelsize=6)

    plt.tight_layout()
    
# Plot the confirmed-case and total-dose series on the original scale.
plot_subplots(indexed)

In [None]:
from statsmodels.tsa.vector_ar.vecm import coint_johansen

def cointegration_test(df, alpha=0.05):
    """Perform Johansen's cointegration test and print a per-column summary.

    Parameters
    ----------
    df : pandas.DataFrame
        The series to test, one per column.
    alpha : float, default 0.05
        Significance level. Only 0.10, 0.05 and 0.01 are supported, because
        critical values are looked up for the 90%, 95% and 99% levels.

    Raises
    ------
    ValueError
        If ``alpha`` does not correspond to one of the supported levels.
    """
    critical_value_column = {'0.90': 0, '0.95': 1, '0.99': 2}

    # Format with two decimals: str(1 - 0.1) is '0.9', which would miss the
    # '0.90' key, and float noise such as 0.8999999 would miss it as well.
    level = f'{1 - alpha:.2f}'
    if level not in critical_value_column:
        raise ValueError(
            f"alpha must be one of 0.10, 0.05 or 0.01; got {alpha!r}"
        )

    out = coint_johansen(df, -1, 5)
    traces = out.lr1
    cvts = out.cvt[:, critical_value_column[level]]

    def adjust(val, length=6):
        return str(val).ljust(length)

    # Summary
    percent = int(round((1 - alpha) * 100))
    print(f'Name   ::  Test Stat > C({percent}%)    =>   Signif  \n', '--'*20)
    for col, trace, cvt in zip(df.columns, traces, cvts):
        print(adjust(col), ':: ', adjust(round(trace, 2), 9), ">", adjust(cvt, 8), ' =>  ', trace > cvt)

# Run the cointegration test on the undifferenced series.
cointegration_test(indexed)

### Augmented DF Test

In [None]:
from statsmodels.tsa.stattools import adfuller

def run_dicky_fuller(ts):
    """Run the Augmented Dickey-Fuller test on ``ts`` and print its statistics.

    The printed table holds the test statistic, the p-value, the number of
    lags used, the number of observations used, and one critical value per
    level reported by the test.
    """
    print("Observations of Dickey-fuller test")
    result = adfuller(ts, autolag='AIC')

    labels = ['Test Statistic', 'p-value', '#lags used', 'number of observations used']
    report = dict(zip(labels, result[0:4]))
    report.update(
        {'critical value (%s)' % level: critical for level, critical in result[4].items()}
    )
    print(pd.Series(report))


# ADF test on each column of the original series.
# DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent
# and exists in older versions too.
for name, column in indexed.items():
    print(f'Column: {name}')
    run_dicky_fuller(column)
    print('\n')

The time series is not stationary.

### First order differencing

In [None]:
# First-order difference; the first row has no predecessor and is dropped.
df_diff_1 = indexed.diff().dropna()
df_diff_1

In [None]:
# ADF test on each column of the first-differenced series.
# DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent
# and exists in older versions too.
for name, column in df_diff_1.items():
    print(f'Column: {name}')
    run_dicky_fuller(column)
    print('\n')

In [None]:
# Plot the first-differenced series.
plot_subplots(df_diff_1)

Even though the data seems to be stationary, there are 2 small sinusoids corresponding with the two waves of cases. Therefore, 2nd order differencing might be required.

## Second-order differencing

In [None]:
# Difference the first-differenced series once more (second-order difference).
df_diff_2 = df_diff_1.diff().dropna()
df_diff_2

## Train-test split - first order differenced

In [None]:
# Chronological split: the first 90% of rows train the model, the rest test it.
percent_90 = int(len(df_diff_1) * 0.9)

train, test = (
    df_diff_1.iloc[:percent_90].dropna(),
    df_diff_1.iloc[percent_90:],
)

plot_subplots([train, test])

### Finding the best value of p for VAR(p)

Using AIC, BIC, FPE, HQIC

In [None]:
from statsmodels.tsa.vector_ar.var_model import VAR

# Build a VAR model on the first-differenced training data.
model = VAR(train)

# Compare candidate lag orders up to 25 and show the selection summary.
x = model.select_order(maxlags=25)
x.summary()

At p = 13, the values of BIC and HQIC are at a local minimum.

In [None]:
# Fit the VAR model with the lag order chosen above (p = 13).
model_fitted = model.fit(13)
model_fitted.summary()

In [None]:
# Seed the forecast with the last `k_ar` training observations so that the
# predicted steps cover the held-out test period. Seeding with the test
# values themselves would forecast the period *after* the test set while
# using the very data the forecast is later evaluated against.
lag_order = model_fitted.k_ar
forecast_input = train.values[-lag_order:]

# Forecast one step per test row.
fc = model_fitted.forecast(y=forecast_input, steps=len(test))

df_forecast = pd.DataFrame(fc, index=test.index, columns=test.columns + '_forecast')

# Put forecasts next to the actual test values, one frame per variable.
forecast_vs_test = df_forecast.join(test)
forecasted_conf = forecast_vs_test[['Confirmed', 'Confirmed_forecast']]
forecasted_vacc = forecast_vs_test[['Total_Doses', 'Total_Doses_forecast']]

In [None]:
def plot_fore_test(test, fore, title):
    """Overlay the test values (blue) and the forecast (red) on one 7x7 inch figure.

    ``test`` and ``fore`` are passed to ``Axes.plot`` as given; ``title`` is
    used as the plot title.
    """
    figure, axis = plt.subplots()
    figure.set_size_inches(7, 7)

    curves = ((test, 'blue', 'Test'), (fore, 'red', 'Forecast'))
    for values, colour, label in curves:
        axis.plot(values, color=colour, label=label)

    axis.legend(loc='best')
    plt.title(title)
    plt.show()


In [None]:
# Confirmed cases on the first-differenced scale: test values vs. forecast.
plot_fore_test(forecasted_conf[['Confirmed']], forecasted_conf[['Confirmed_forecast']], title='Daily cases')

In [None]:
# Total doses on the first-differenced scale: test values vs. forecast.
plot_fore_test(forecasted_vacc[['Total_Doses']], forecasted_vacc[['Total_Doses_forecast']], title='Daily vaccinations')

## Un-differencing

In [None]:
def invert_transformation(diffed, original):
    """Revert first-order differencing to bring values back to the original scale.

    Parameters
    ----------
    diffed : pandas.DataFrame
        Differenced values. Must have exactly one row fewer than ``original``;
        rows are matched to ``original`` by position, not by index label.
    original : pandas.DataFrame
        Frame on the un-differenced scale. Its first row supplies the starting
        level for every column of ``diffed``, and its index is used for the
        result. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``original`` in which each column named in ``diffed`` is
        replaced by an ``undiff_<col>`` column holding the cumulative sum of
        the starting level followed by the differenced values.

    Raises
    ------
    ValueError
        If ``diffed`` does not have exactly ``len(original) - 1`` rows.
    """
    if len(diffed) != len(original) - 1:
        raise ValueError(
            f"diffed must have exactly one row fewer than original "
            f"(got {len(diffed)} and {len(original)})"
        )

    columns = list(diffed.columns)
    restored = original.drop(columns=columns)

    for col in columns:
        # Starting level followed by the differences. Building the array
        # explicitly avoids chained assignment (`df[col][1:] = ...`), which
        # does not write through under pandas copy-on-write.
        steps = np.concatenate(([original[col].iloc[0]], diffed[col].to_numpy()))
        restored[f'undiff_{col}'] = pd.Series(steps, index=original.index).cumsum()

    return restored

In [None]:
# Check that un-differencing df_diff_1 reproduces the original series.
# Series are compared rather than single-column frames: subtracting frames
# whose column names differ aligns on the labels and yields only NaN, so the
# check would always report the row count and verify nothing.

inverted = invert_transformation(df_diff_1, indexed)
# Largest absolute round-trip error.
(inverted['undiff_Confirmed'] - indexed['Confirmed']).abs().max()

In [None]:
# Un-diff the test dataset

# Start one row before the first test date: that row of `indexed` supplies the
# level from which the differences are accumulated.
start_index = indexed.index.get_loc(test.index[0])-1
test_original = invert_transformation(test, indexed.iloc[start_index:])

test_original

In [None]:
# Give the forecast columns the names of the original series so that
# invert_transformation can look them up in `indexed`.
renamed_df = df_forecast.rename(
    columns={'Confirmed_forecast': 'Confirmed', 'Total_Doses_forecast': 'Total_Doses'}
)

# Begin one row before the first forecast date; that row is the starting level.
start_index = indexed.index.get_loc(renamed_df.index[0]) - 1
fore_original = invert_transformation(renamed_df, indexed.iloc[start_index:])

fore_original

## Plot the forecasts

In [None]:
# Confirmed cases on the original scale: test values vs. forecast.
plot_fore_test(test_original[['undiff_Confirmed']], fore_original[['undiff_Confirmed']], title='Daily cases')

In [None]:
# Total doses on the original scale: test values vs. forecast.
plot_fore_test(test_original[['undiff_Total_Doses']], fore_original[['undiff_Total_Doses']], title='Daily vaccinations')

### MAPE

In [None]:
def MAPE(Y_actual, Y_Predicted, title):
    """Print and return the mean absolute percentage error, in percent.

    Observations whose actual value is zero are excluded, since the
    percentage error is undefined there.

    Parameters
    ----------
    Y_actual, Y_Predicted : pandas.DataFrame, pandas.Series or array-like
        Actual and predicted values. pandas inputs are aligned on their
        labels; other array-likes are flattened and compared by position.
        For a DataFrame, the error of its first column is reported.
    title : str
        Label used in the printed message.

    Returns
    -------
    float
        The MAPE in percent (NaN if every actual value is zero).
    """
    if not isinstance(Y_actual, (pd.Series, pd.DataFrame)):
        Y_actual = pd.Series(np.asarray(Y_actual, dtype=float).ravel())
        Y_Predicted = pd.Series(np.asarray(Y_Predicted, dtype=float).ravel())

    mask = Y_actual != 0
    abs_pct_error = ((Y_actual - Y_Predicted) / Y_actual).abs()

    # Use the pandas reduction rather than np.mean: on pandas >= 2.0,
    # np.mean(DataFrame) collapses both axes to a scalar.
    mape = abs_pct_error[mask].mean() * 100
    if isinstance(mape, pd.Series):
        # DataFrame input: one value per column; report the first column.
        mape = mape.iloc[0]

    print(f"MAPE of {title} is {mape}%")
    return mape


# MAPE of the first-order model's forecasts on the original scale.
mape_vacc = MAPE(test_original[['undiff_Total_Doses']], fore_original[['undiff_Total_Doses']], title="Daily vaccinations")
mape_cases = MAPE(test_original[['undiff_Confirmed']], fore_original[['undiff_Confirmed']], title="Daily cases")

Clearly, a VAR model is not good enough to make predictions

## Train-test split - second order differenced


In [None]:
# Chronological split of the second-differenced series: first 90% for
# training, the rest for testing.
percent_90 = int(len(df_diff_2) * 0.9)

train_2, test_2 = (
    df_diff_2.iloc[:percent_90].dropna(),
    df_diff_2.iloc[percent_90:],
)

plot_subplots([train_2, test_2])

### Finding the best value of p for VAR(p)

Using AIC, BIC, FPE, HQIC

In [None]:
# Build a VAR model on the second-differenced training data.
model_2 = VAR(train_2)

# Compare candidate lag orders up to 25 and show the selection summary.
x_2 = model_2.select_order(maxlags=25)
x_2.summary()

At p = 13, the values of AIC, BIC and HQIC are at a local minimum.

In [None]:
# Fit the second-order VAR model with the lag order chosen above (p = 13).
model_fitted_2 = model_2.fit(13)
model_fitted_2.summary()

In [None]:
# Seed the forecast with the last `k_ar` training observations so that the
# predicted steps cover the held-out test period. Seeding with the test
# values themselves would forecast the period *after* the test set while
# using the very data the forecast is later evaluated against.
lag_order_2 = model_fitted_2.k_ar
forecast_input_2 = train_2.values[-lag_order_2:]

# Forecast one step per test row.
fc_2 = model_fitted_2.forecast(y=forecast_input_2, steps=len(test_2))

df_forecast_2 = pd.DataFrame(fc_2, index=test_2.index, columns=test_2.columns + '_forecast')

# Put forecasts next to the actual test values, one frame per variable.
forecast_vs_test_2 = df_forecast_2.join(test_2)
forecasted_conf_2 = forecast_vs_test_2[['Confirmed', 'Confirmed_forecast']]
forecasted_vacc_2 = forecast_vs_test_2[['Total_Doses', 'Total_Doses_forecast']]

In [None]:
# Test values and forecast for confirmed cases (second-differenced scale).
forecasted_conf_2

In [None]:
# Daily cases on the twice-differenced scale: test values vs. forecast

plot_fore_test(forecasted_conf_2[['Confirmed']], forecasted_conf_2[['Confirmed_forecast']], title='Daily cases')


In [None]:
# Daily vaccinations on the twice-differenced scale: test values vs. forecast

plot_fore_test(forecasted_vacc_2[['Total_Doses']], forecasted_vacc_2[['Total_Doses_forecast']], title='Daily vaccinations')


In [None]:
# Un-diff the test_2 dataset once (back to the first-differenced scale).
# The base is the full first-differenced frame, not `test`: if test_2 begins
# on the first date of `test`, `get_loc(...) - 1` would be -1 and the slice
# would wrap around to the last row. `test` is the tail of df_diff_1, so the
# result is unchanged whenever the old lookup was valid.

start_index_2 = df_diff_1.index.get_loc(test_2.index[0]) - 1
test_2_original = invert_transformation(test_2, df_diff_1.iloc[start_index_2:])


test_2_original


In [None]:
# Un-diff the second-order forecast once (back to the first-differenced scale).

# Give the forecast columns the names of the series they predict.
renamed_df_2 = df_forecast_2.rename(
    columns={'Confirmed_forecast': 'Confirmed', 'Total_Doses_forecast': 'Total_Doses'}
)

# The base is the full first-differenced frame, not `test`: if the forecast
# begins on the first date of `test`, `get_loc(...) - 1` would be -1 and the
# slice would wrap around to the last row. `test` is the tail of df_diff_1, so
# the result is unchanged whenever the old lookup was valid.
start_index_2 = df_diff_1.index.get_loc(renamed_df_2.index[0]) - 1
fore_original_2 = invert_transformation(renamed_df_2, df_diff_1.iloc[start_index_2:])


fore_original_2


In [None]:
# Daily cases after un-differencing once: test values vs. forecast

plot_fore_test(test_2_original[['undiff_Confirmed']], fore_original_2[['undiff_Confirmed']], title='Daily cases')


In [None]:
# Daily vaccinations after un-differencing once: test values vs. forecast.
# (The title previously read 'Daily cases' although the doses are plotted.)

plot_fore_test(test_2_original[['undiff_Total_Doses']], fore_original_2[['undiff_Total_Doses']], title='Daily vaccinations')


In [None]:
# Un-diff the test_2 data a second time (back to the original scale).

# Rename into a new frame instead of in place: earlier cells read
# test_2_original through its 'undiff_*' column names and would fail on
# re-run if those names were changed underneath them.
test_2_first_diff = test_2_original.rename(
    columns={'undiff_Confirmed': 'Confirmed', 'undiff_Total_Doses': 'Total_Doses'}
)

# Begin one row before the first row to restore; that row of `indexed` is the
# starting level.
start_index_1 = indexed.index.get_loc(test_2_first_diff.index[0]) - 1
test_original_1 = invert_transformation(test_2_first_diff, indexed.iloc[start_index_1:])

test_original_1


In [None]:
# Un-diff the second-order forecast a second time (back to the original scale).

# Rename into a new frame instead of in place: other cells read
# fore_original_2 through its 'undiff_*' column names and would fail on
# re-run if those names were changed underneath them.
fore_2_first_diff = fore_original_2.rename(
    columns={'undiff_Confirmed': 'Confirmed', 'undiff_Total_Doses': 'Total_Doses'}
)

# Begin one row before the first row to restore; that row of `indexed` is the
# starting level.
start_index_1 = indexed.index.get_loc(fore_2_first_diff.index[0]) - 1
fore_original_1 = invert_transformation(fore_2_first_diff, indexed.iloc[start_index_1:])

fore_original_1

In [None]:
# Difference between the once-un-differenced forecast and the raw
# (twice-differenced) forecast. Both frames are first brought to the same
# plain column names: a subtraction between frames with different column
# labels aligns on those labels and returns only NaN.
fore_once = fore_original_2.rename(columns=lambda name: name.replace('undiff_', ''))
fore_twice_diffed = df_forecast_2.rename(columns=lambda name: name.replace('_forecast', ''))
fore_once - fore_twice_diffed


In [None]:
# Raw forecast on the second-differenced scale.
df_forecast_2

## Plot the forecasts

In [None]:
# Confirmed cases after un-differencing twice: test values vs. forecast.
plot_fore_test(test_original_1[['undiff_Confirmed']], fore_original_1[['undiff_Confirmed']], title='Daily cases')



In [None]:
# Total doses after un-differencing twice: test values vs. forecast.
# (The title previously read 'Daily cases' although the doses are plotted.)
plot_fore_test(test_original_1[['undiff_Total_Doses']], fore_original_1[['undiff_Total_Doses']], title='Daily vaccinations')

# wavelet, svm, ann

In [None]:
# Position of the first forecast row within the full series; every row before
# it is drawn as the first ("Train") curve.
end_index = indexed.index.get_loc(fore_original_1.index[0])

train_2_original = indexed[:end_index]

# NOTE(review): plot_subplots labels the second frame "Test", but the frame
# passed here is the forecast restored to the original scale.
plot_subplots([train_2_original, fore_original_1])

Predicting with first order differenced data shows a better forecast than second order differenced.