In [None]:
# Load Data
* Read the confirmed-cases time series from the CSV file into a pandas DataFrame.

In [1]:
import pandas as pd

# Relative path of the confirmed-cases time-series CSV inside the input directory.
confirmed_cases_csv = '../input/novel-corona-virus-2019-dataset/time_series_covid_19_confirmed.csv'

# Load the whole file into a DataFrame; later cells filter it by country.
covid_data = pd.read_csv(confirmed_cases_csv)

# Data Preparation
* Since we want to predict cases for the US, we will extract the row containing US confirmed covid cases.

In [None]:
# Boolean mask that is True for rows whose 'Country/Region' is exactly 'US'.
is_us = covid_data['Country/Region'] == 'US'
us_covid_data = covid_data[is_us]

# Last expression of the cell: render the selected row(s) for inspection.
us_covid_data

### Preprocess Data
* We can drop the columns for 'Province/State', 'Country/Region', 'Lat', and 'Long' since we know the data is only for the US and these columns are not needed for prediction.

In [None]:
# Remove the identifier/location columns so only the remaining columns are
# passed on to scaling.
# errors='ignore' keeps this cell re-runnable: because the result is assigned
# back to the same name, a second execution would otherwise raise KeyError
# for columns that have already been dropped.
us_covid_data = us_covid_data.drop(
    columns=['Province/State', 'Country/Region', 'Lat', 'Long'],
    errors='ignore',
)

* Because the number of Corona cases grows very large over time, feeding the raw counts to the model can make training very slow. We avoid this by rescaling the data with sklearn's MinMaxScaler.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Transpose so that each original column becomes a row (one sample per row),
# which is the orientation the scaler is fitted and applied on.
cases_by_day = us_covid_data.values.T

# NOTE(review): the scaler is fitted on the full series, i.e. before the
# train/test split that happens later in the notebook.
scaler = MinMaxScaler().fit(cases_by_day)

# From here on `us_covid_data` is the scaled NumPy array, not a DataFrame.
us_covid_data = scaler.transform(cases_by_day)

### Split Into X and y
* We will set up our X and y in such a way that X[n] will contain the cases for a certain amount of previous days (time_steps), and y[n] will then contain the reading for the next day.
* This way our model will be trained to predict the number of cases on a certain day based on the trend in the number of cases within the previous time_steps number of days.
* After some testing, I have found that using the data from the previous 30 days allowed our model to make fairly accurate predictions on the 31st day.

In [None]:
import numpy as np

# Number of consecutive previous days the model sees for each prediction.
time_steps = 30

# Column 0 of the scaled array as a 1-D series, plus every valid window start.
scaled_cases = us_covid_data[:, 0]
window_starts = range(len(us_covid_data) - time_steps)

# X[n] holds the values for days n .. n+time_steps-1;
# y[n] is the value on the day immediately after that window.
X = np.array([scaled_cases[start:start + time_steps] for start in window_starts])
y = np.array([scaled_cases[start + time_steps] for start in window_starts])

# Data Partitioning
* The data set must stay in chronological order because it is a timeline of Corona cases, so we simply take the first 80% of the samples as the training set and the remaining 20% as the test set.
* We also need to reshape the X partitions to 3-D, (samples, 1, time_steps), so our model can process them properly.

In [None]:
# Chronological split point: everything before `split` trains, the rest tests.
train_fraction = 0.8
split = int(len(X) * train_fraction)

X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Insert a length-1 middle axis: (samples, time_steps) -> (samples, 1, time_steps),
# matching the Input(shape=(1, time_steps)) layer used by the model.
X_train = X_train[:, np.newaxis, :]
X_test = X_test[:, np.newaxis, :]

# Model Architecture
* We create our model using a recurrent neural network architecture.
* Model consists of an input layer, followed by three LSTM layers which utilize dropout to prevent our model from overfitting.
* The output is a Dense layer with a single neuron using the ReLU activation function: since we are predicting the number of Corona cases, the output should be a non-negative number in $[0, \infty)$.

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop

# Fix TensorFlow's global random seed so that weight initialisation and the
# dropout masks are repeatable across "Restart & Run All" executions.
# (Some GPU kernels may still introduce small run-to-run differences.)
tf.random.set_seed(42)

# Stacked LSTM regressor. Each sample is one step carrying `time_steps`
# features, as declared by the Input layer.
model = Sequential([
    Input(shape=(1, time_steps)),
    # The first two LSTM layers return full sequences so the next LSTM can consume them.
    LSTM(48, return_sequences=True),
    Dropout(0.4),
    LSTM(48, return_sequences=True),
    Dropout(0.2),
    # The last LSTM returns only its final output, which feeds the Dense head.
    LSTM(48),
    Dropout(0.2),
    # Single ReLU unit: the prediction is constrained to be >= 0.
    Dense(1, activation='relu'),
])

# MSE is both the training loss and the reported metric; the metric name
# 'mean_squared_error' is the key later cells read from the training history.
model.compile(loss='mean_squared_error',
              optimizer=RMSprop(),
              metrics=['mean_squared_error'])

model.summary()

# Train the Model
* Now we can train our model, holding out 20% of the training data as the validation set.
* The ReduceLROnPlateau callback halves the learning rate whenever the validation MSE has not improved for three epochs, which helps the model reach the best accuracy.

In [None]:
# Import the callback from tensorflow.keras so it comes from the same Keras
# implementation as the layers, model and optimizer used to build `model`.
# Mixing the standalone `keras` package with `tensorflow.keras` objects can
# fail when the two installed versions differ.
from tensorflow.keras.callbacks import ReduceLROnPlateau

batchsize = 100
epochs = 100

# Multiply the learning rate by 0.5 when 'val_mean_squared_error' has not
# improved for 3 epochs; never go below 1e-10. verbose=1 logs each reduction.
learning_rate_reduction = ReduceLROnPlateau(monitor='val_mean_squared_error',
                                            patience=3,
                                            verbose=1,
                                            factor=0.5,
                                            min_lr=1e-10)

# validation_split=0.2 reserves 20% of the training arrays for validation;
# shuffle=False keeps the samples in their original (chronological) order.
history = model.fit(X_train,
                    y_train,
                    batch_size=batchsize,
                    epochs=epochs,
                    validation_split=0.2,
                    shuffle=False,
                    callbacks=[learning_rate_reduction])

* Plot the model's loss and MSE values throughout training.

In [None]:
import matplotlib.pyplot as plt

# One entry per figure: (training key, validation key, title, y-axis label).
training_curves = [
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
    ('mean_squared_error', 'val_mean_squared_error', 'Model Error', 'Mean Squared Error'),
]

# Draw the training and validation curves per epoch, one figure per entry.
for train_key, val_key, title, ylabel in training_curves:
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epochs')
    plt.legend(['train', 'val'])
    plt.show()

# Plot Model Predictions
* In order to see the accuracy of our model, we first use it to predict the output of our X_test data.
* We then rescale our prediction and y_test data back to the original bounds of the data set in order to accurately plot their values.
* Finally, we can plot the actual Covid cases compared to our predicted Covid cases to see the overall accuracy of our model.

In [None]:
# Predict on the held-out windows and map the predictions back from the
# scaled range to the original case counts.
y_pred = scaler.inverse_transform(model.predict(X_test))

# inverse_transform expects a 2-D array, hence the reshape to a single column.
# NOTE: y_test is overwritten with the rescaled values, so re-running only
# this cell would apply the inverse transform to it a second time.
y_test = scaler.inverse_transform(y_test.reshape(-1, 1))

# Plot order matches the legend order: predicted first (red), actual second (blue).
plt.plot(y_pred, color='red')
plt.plot(y_test, color='blue')
plt.title('Actual vs. Predicted Covid Cases (Test Data)')
plt.ylabel('Number of Cases')
plt.xlabel('Day')
plt.legend(['predicted', 'actual'])
# Render the figure explicitly, like the other plotting cells, so the cell
# does not echo the Legend object's repr as its output.
plt.show()