In [130]:
import os
import urllib.request
import pandas as pd

import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
import plotly.express as px

import statsmodels.formula.api as smf
from sklearn.preprocessing import MinMaxScaler

import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam 
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.layers import LSTM
from sklearn.model_selection import KFold, cross_val_score, train_test_split
import numpy as np

from datetime import date
from datetime import timedelta
from datetime import datetime

In [131]:
# Notebook configuration.
# (Re)load the autoreload extension and use mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [158]:
# Read the per-location daily case/death table from the local CSV file.
full_data = pd.read_csv('full_data.csv')
# NOTE(review): `people` is not defined in any cell visible in this notebook.
# It must already exist in the kernel, so this cell would fail on
# "Restart Kernel -> Run All" -- confirm where it is supposed to be loaded.
recognised_locations = people['Country (or dependency)']
# Keep only the rows whose location appears in that reference column.
df = full_data[full_data['location'].isin(recognised_locations)]
df.head()

Unnamed: 0,date,location,new_cases,new_deaths,total_cases,total_deaths
0,2019-12-31,Afghanistan,0,0,0,0
1,2020-01-01,Afghanistan,0,0,0,0
2,2020-01-02,Afghanistan,0,0,0,0
3,2020-01-03,Afghanistan,0,0,0,0
4,2020-01-04,Afghanistan,0,0,0,0


In [159]:
# Restrict the data to a single location: the United States.
is_united_states = df['location'] == 'United States'
df_country = df.loc[is_united_states]

In [160]:
# Only the date and the total_cases column are needed from here on;
# renumber the rows from 0 after the country filter.
df_country = df_country.loc[:, ['date', 'total_cases']].reset_index(drop=True)

In [161]:
# Display the last 10 rows as a quick check of the selected series.
df_country.tail(10)

Unnamed: 0,date,total_cases
104,2020-04-13,557571
105,2020-04-14,582594
106,2020-04-15,609516
107,2020-04-16,639664
108,2020-04-17,671331
109,2020-04-18,702164
110,2020-04-19,735086
111,2020-04-20,759687
112,2020-04-21,787752
113,2020-04-22,825041


In [162]:
# Build the day-over-day difference of total_cases.
#
# The frame is derived with `assign`, which returns a new DataFrame each time.
# This avoids two problems of the previous version:
#   * `df_diff = df_country` only aliased the frame, so adding `prev_total`
#     silently added that column to `df_country` as well;
#   * assigning `diff` on the result of `dropna()` triggered pandas'
#     SettingWithCopyWarning.
df_diff = (
    df_country
    # previous row's total_cases, aligned to the current row
    .assign(prev_total=lambda frame: frame['total_cases'].shift(1))
    # the first row has no predecessor (NaN prev_total) and is dropped
    .dropna()
    # increase in total_cases relative to the previous row
    .assign(diff=lambda frame: frame['total_cases'] - frame['prev_total'])
)
df_diff.tail(10)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,date,total_cases,prev_total,diff
104,2020-04-13,557571,529951.0,27620.0
105,2020-04-14,582594,557571.0,25023.0
106,2020-04-15,609516,582594.0,26922.0
107,2020-04-16,639664,609516.0,30148.0
108,2020-04-17,671331,639664.0,31667.0
109,2020-04-18,702164,671331.0,30833.0
110,2020-04-19,735086,702164.0,32922.0
111,2020-04-20,759687,735086.0,24601.0
112,2020-04-21,787752,759687.0,28065.0
113,2020-04-22,825041,787752.0,37289.0


In [163]:
# Turn the difference series into a supervised-learning table: every row gets
# the 14 preceding values of `diff` as columns lag_1 .. lag_14.
# A 14-day window is used because of what is known about COVID-19.
lag_columns = {'lag_' + str(lag): df_diff['diff'].shift(lag) for lag in range(1, 15)}
df_supervised = df_diff.drop(columns=['prev_total']).assign(**lag_columns)
# Rows without a complete lag history contain NaN; discard them and renumber.
df_supervised = df_supervised.dropna().reset_index(drop=True)


In [164]:
# Display the last rows to check that the lag columns line up with `diff`.
df_supervised.tail()

Unnamed: 0,date,total_cases,diff,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,lag_10,lag_11,lag_12,lag_13,lag_14
94,2020-04-18,702164,30833.0,31667.0,30148.0,26922.0,25023.0,27620.0,28391.0,35527.0,33901.0,33323.0,30613.0,30561.0,25398.0,34272.0,32425.0
95,2020-04-19,735086,32922.0,30833.0,31667.0,30148.0,26922.0,25023.0,27620.0,28391.0,35527.0,33901.0,33323.0,30613.0,30561.0,25398.0,34272.0
96,2020-04-20,759687,24601.0,32922.0,30833.0,31667.0,30148.0,26922.0,25023.0,27620.0,28391.0,35527.0,33901.0,33323.0,30613.0,30561.0,25398.0
97,2020-04-21,787752,28065.0,24601.0,32922.0,30833.0,31667.0,30148.0,26922.0,25023.0,27620.0,28391.0,35527.0,33901.0,33323.0,30613.0,30561.0
98,2020-04-22,825041,37289.0,28065.0,24601.0,32922.0,30833.0,31667.0,30148.0,26922.0,25023.0,27620.0,28391.0,35527.0,33901.0,33323.0,30613.0


In [165]:
# Sanity check before modelling: regress `diff` on its 14 lags with ordinary
# least squares and print the adjusted R^2, i.e. how much of the variation in
# `diff` the lag columns explain.
ols_formula = 'diff ~ lag_1 + lag_2 + lag_3 + lag_4 + lag_5 + lag_6 + lag_7 + lag_8 + lag_9 + lag_10 + lag_11 + lag_12 + lag_13 + lag_14'
model = smf.ols(formula=ols_formula, data=df_supervised)
model_fit = model.fit()
regression_adj_rsq = model_fit.rsquared_adj
print(regression_adj_rsq)

0.9915655373950867


In [166]:
# The model only sees `diff` and its lags, so drop the non-feature columns.
df_model = df_supervised.drop(columns=['total_cases', 'date'])
# Positional hold-out: the final 6 rows form the test set and every earlier
# row is used for training.
train_set = df_model.iloc[:-6].values
test_set = df_model.iloc[-6:].values

In [167]:
#apply Min Max Scaler
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_set)
# reshape training set
train_set = train_set.reshape(train_set.shape[0], train_set.shape[1])
train_set_scaled = scaler.transform(train_set)
# reshape test set
test_set = test_set.reshape(test_set.shape[0], test_set.shape[1])
test_set_scaled = scaler.transform(test_set)

In [168]:
def _to_lstm_xy(scaled):
    """Split a scaled 2-D array into LSTM inputs and targets.

    Column 0 is the target and is returned as an (n, 1) array. The remaining
    columns are the features, returned with an extra length-1 middle axis,
    i.e. shaped (n, 1, n_features).
    """
    features = np.expand_dims(scaled[:, 1:], axis=1)
    target = scaled[:, 0:1]
    return features, target


X_train, y_train = _to_lstm_xy(train_set_scaled)
X_test, y_test = _to_lstm_xy(test_set_scaled)

In [169]:
# Small stateful LSTM (4 units) followed by a single linear output unit.
# The batch size is fixed to 1 in the input shape, and the rows are fed in
# their original order (shuffle=False).
# NOTE(review): no random seed is set in the cells shown, so the fitted
# weights and the predictions will differ from run to run.
_, n_timesteps, n_features = X_train.shape
model = Sequential([
    LSTM(4, batch_input_shape=(1, n_timesteps, n_features), stateful=True),
    Dense(1),
])
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, epochs=100, batch_size=1, verbose=1, shuffle=False)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 100/100


<keras.callbacks.callbacks.History at 0x1d32887b688>

In [170]:
# Predict the scaled target for every test row. batch_size=1 matches the batch
# size the model was built with (batch_input_shape) and trained with.
y_pred = model.predict(X_test,batch_size=1)

In [171]:
# Give the predictions a length-1 middle axis so they line up with X_test:
# (rows, outputs) -> (rows, 1, outputs).
y_pred = y_pred.reshape(y_pred.shape[0], 1, y_pred.shape[1])
# Put each prediction in front of its own feature row. This recreates the
# column layout the scaler was fitted on (target first, then the lags), which
# inverse_transform needs.
stacked = np.concatenate([y_pred, X_test], axis=2)
# Drop the length-1 middle axis again: (rows, 1, columns) -> (rows, columns).
pred_test_set = stacked.reshape(stacked.shape[0], stacked.shape[2])
# Undo the [-1, 1] scaling; column 0 now holds the predicted `diff` in
# original units.
pred_test_set_inverted = scaler.inverse_transform(pred_test_set)

In [173]:
#create dataframe that shows the predicted total_cases
# The last 7 rows of df_country supply the dates and actual totals.
# Prediction i is a predicted difference: it is added to actual total i and
# labelled with date i + 1, so the slice is one row longer than the 6-row
# test set.
sales_dates = list(df_country[-7:].date)
act = list(df_country[-7:]['total_cases'])
result_list = [
    {
        'pred_value': int(predicted_row[0] + act[position]),
        'date': sales_dates[position + 1],
    }
    for position, predicted_row in enumerate(pred_test_set_inverted)
]
df_result = pd.DataFrame(result_list)

In [174]:
# Display the predicted total_cases alongside the date each value refers to.
df_result

Unnamed: 0,pred_value,date
0,667663,2020-04-17
1,697660,2020-04-18
2,726692,2020-04-19
3,756015,2020-04-20
4,782090,2020-04-21
5,810818,2020-04-22


In [175]:
# Plot the actual total_cases series together with the predicted values.
# The left join keeps every date of df_country; pred_value is NaN for dates
# that have no prediction.
df_sales_pred = df_country.merge(df_result, on='date', how='left')
plot_data = [
    go.Scatter(x=df_sales_pred['date'], y=df_sales_pred[column], name=label)
    for column, label in (('total_cases', 'actual'), ('pred_value', 'predicted'))
]
plot_layout = go.Layout(title='COVID-19 cases')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

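# The predicted totals track the actual ones closely over the 6 held-out days. Note that each prediction is a one-day-ahead difference added to the previous day's actual total, so this does not show how well the model forecasts several days ahead.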