# Using SARIMAX for Time Series Forecasting
SARIMA is used for non-stationary series, that is, series whose mean, variance and covariance do not stay constant over time. The model can capture both trend and seasonality, which is what makes it so useful.

AR: Auto regressive model (can be a simple, multiple or non-linear regression)
MA: Moving averages model. The moving average models can use weighting factors.

--Combining AR and MA gives the ARMA model, but this model is suitable only for stationary series (mean and variance constant over time).

--If the series has a trend, it is necessary to use the ARIMA model.
ARIMA is used for non-stationary series. In this model, a differencing step I (d) is used to remove the non-stationarity.

--The integrated element “I” (differencing) allows the method to support time series with a trend. However, this model still does not capture seasonality.

--Finally, we arrive at the SARIMA model, which adds seasonal autoregressive, differencing and moving-average terms and can therefore capture the seasonality of the time series.

In [None]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.linear_model import LinearRegression 
from sklearn.preprocessing import PolynomialFeatures 
import time
import seaborn as sn
import datetime
import statsmodels.api as sm
import plotly.graph_objects as go

  import pandas.util.testing as tm


In [None]:
# Download the per-country COVID-19 time series and keep only the records for India.
url = "https://pomber.github.io/covid19/timeseries.json"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
r.raise_for_status()
json_response = r.json()
indian_data = json_response['India']

# Split the daily records into one plain list per field; the overview plot
# reads these lists directly.
date = [record['date'] for record in indian_data]
confirmed = [record['confirmed'] for record in indian_data]
deaths = [record['deaths'] for record in indian_data]
recovered = [record['recovered'] for record in indian_data]

# Build the frame column by column so each column keeps its own dtype.
# (np.column_stack on mixed str/int lists would coerce every value to a string.)
df = pd.DataFrame({
    'date': date,
    'confirmed': confirmed,
    'deaths': deaths,
    'recovered': recovered,
})
df.to_csv("Covid_19_deaths.csv", index=False)

# Alternative source kept for reference: load the series from a local CSV instead.
# df = pd.read_csv("case_time_series.csv")
# date = df.date.values
# confirmed = df.total_confirmed.values
# recovered = df.total_recovered.values
# deaths = df.total_deceased.values
# print(confirmed,recovered,deaths)

In [None]:
import plotly.express as px

# Overlay the three series on a single figure, one trace per series.
fig = go.Figure()
for series, label in (
    (confirmed, "Confirmed"),
    (recovered, "Recovered"),
    (deaths, "Dead"),
):
    fig.add_trace(go.Scatter(x=date, y=series, name=label))

# update_layout is the last expression so the notebook renders the figure.
fig.update_layout(
    title='Analysis of Death/Cured/Confirmed in India with time',
    xaxis=dict(title='Time in Date'),
    yaxis=dict(title='Values'),
    width=1000,
    height=500,
)

In [None]:
# Bare expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,date,confirmed,deaths,recovered
0,2020-1-22,0,0,0
1,2020-1-23,0,0,0
2,2020-1-24,0,0,0
3,2020-1-25,0,0,0
4,2020-1-26,0,0,0
...,...,...,...,...
119,2020-5-20,112028,3434,45422
120,2020-5-21,118226,3584,48553
121,2020-5-22,124794,3726,51824
122,2020-5-23,131423,3868,54385


In [None]:
# First column of df, kept 2-D (n, 1) in X; `dates` is the same data flattened to 1-D.
X = df.iloc[:, [0]].to_numpy()
dates = X.ravel()

# Modelling target: the column at position 3, cast to integers.
# NOTE(review): with the column order used to build df, position 3 is 'recovered',
# while the prediction plot title says "Confirmed" — confirm which series is intended.
y = df.iloc[:, 3].to_numpy().astype(int)
y

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     3,     3,
           3,     3,     3,     3,     3,     3,     3,     3,     3,
           3,     3,     3,     3,     3,     3,     3,     3,     3,
           3,     3,     3,     4,     4,     4,     4,     4,    13,
          13,    14,    14,    15,    20,    23,    27,    27,    40,
          43,    45,    73,    84,    95,   102,   123,   148,   191,
         192,   229,   229,   375,   421,   506,   620,   774,   969,
        1080,  1181,  1359,  1432,  1768,  2041,  2463,  2854,  3273,
        3975,  4370,  5012,  5498,  5939,  6523,  7137,  7747,  8437,
        9068, 10007, 10819, 11775, 12847, 14142, 15331, 16776, 17887,
       19301, 20969, 22549, 24420, 26400, 27969, 30258, 34224, 36795,
       39233, 42309, 45422, 48553, 51824, 54385, 57692])

In [None]:
# Rebind X to a 0-based integer day index with one entry per row of the old X.
X = np.arange(len(X))


In [None]:
# Fit a SARIMAX model on y with non-seasonal order (p=14, d=1, q=0); no
# seasonal_order is passed. Stationarity and invertibility are not enforced.
model = sm.tsa.statespace.SARIMAX(
    endog=y,
    order=(14, 1, 0),
    enforce_stationarity=False,
    enforce_invertibility=False,
)
results = model.fit()

# Report the Akaike information criterion of the fitted model.
print(results.aic)


1448.8808980946374


In [None]:
# In-sample predictions for every observed day, plus their confidence bounds.
yhat = results.get_prediction()
predict_ci = yhat.conf_int()
# Use the predicted mean as the point estimate. Column 1 of conf_int() is the
# UPPER confidence bound, which overstates every prediction.
preds = yhat.predicted_mean

# Out-of-sample forecast for the 20 steps after the last observation.
forecast = results.forecast(steps=20)

# Extend the 0-based day index past the observed range and join the in-sample
# predictions with the forecast; values are truncated to whole numbers.
X_preds = np.concatenate([X, np.arange(len(X), len(X) + len(forecast))])
y_preds = [int(value) for value in np.concatenate([preds, forecast])]

# The first 40 points are left out of both traces.
# NOTE(review): the x values are day indices, not dates, although the axis title
# says 'Time in Date' — confirm the intended labelling.
fig = go.Figure()
fig.add_trace(go.Scatter(x=X_preds[40:], y=y_preds[40:], mode='lines+markers', name="Predictions"))
fig.add_trace(go.Scatter(x=X[40:], y=y[40:], mode='lines+markers', name="Actual"))

fig.update_layout(yaxis=dict(title='Values'), width=1200, height=800,
                  title='SARIMAX Confirmed Cases Prediction Vs Actual Values',
                  xaxis=dict(title='Time in Date'))


In [None]:
# Tabulate the next five forecast values next to their day numbers.
forecast_1 = results.forecast(steps=5)
# NOTE(review): numbering starts at len(X)+1 here, whereas the prediction plot
# places the first forecast step at len(X) — confirm which convention is meant.
days = range(len(X) + 1, len(X) + 6)
pred_df = pd.DataFrame({"Predictions": forecast_1, "Days": days})
print(pred_df)

    Predictions  Days
0  61784.260278   125
1  66063.083571   126
2  72633.929040   127
3  79488.851522   128
4  84245.471959   129


In [None]:
# Per-day comparison of the in-sample predictions against the observed values.
abs_error = np.absolute(preds - y)
final_df = pd.DataFrame({
    "dates": dates,
    "Prediction": preds,
    "Actual Data": y,
    "Error": abs_error,
})
print(final_df)
final_df.to_csv("Infection_Predictions.csv", index=False)

# Mean absolute error from day index 54 onward (presumably to skip the early,
# near-zero part of the series — confirm the choice of cutoff).
# The absolute value must wrap the difference: abs(preds) - y is a signed error
# in which over- and under-predictions cancel each other out.
error = int(np.absolute(preds[54:] - y[54:]).mean())
print("\nMean Error cases: ",error)
# Express the mean absolute error relative to the mean observed value over the same window.
print("Error Percentage: ",error/y[54:].mean()*100)

         dates    Prediction  Actual Data        Error
0    2020-1-22   2771.807649            0  2771.807649
1    2020-1-23   2014.657934            0  2014.657934
2    2020-1-24   1990.977505            0  1990.977505
3    2020-1-25   2098.517045            0  2098.517045
4    2020-1-26   2015.629979            0  2015.629979
..         ...           ...          ...          ...
119  2020-5-20  45880.016450        45422   458.016450
120  2020-5-21  49127.514716        48553   574.514716
121  2020-5-22  52362.533540        51824   538.533540
122  2020-5-23  54794.373230        54385   409.373230
123  2020-5-24  57970.826093        57692   278.826093

[124 rows x 4 columns]

Mean Error cases:  289
Error Percentage:  2.598733904035681


Therefore, we get a mean error of about 289 cases (roughly 2.6%) in our predictions over the last 70 days of the series (day index 54 onward).