# RNN Models

### Imports and set seed

In [56]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error 

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN, LSTM

In [2]:
# Fix the seed of NumPy's global random generator.
# NOTE(review): only NumPy is seeded here; whether this also makes the Keras
# training further down reproducible depends on how TensorFlow/Keras seed their
# own generators - confirm before relying on identical results between runs.
np.random.seed(10)

### Load data

In [3]:
# Load the WHO global COVID-19 data from the relative path ../datasets.
# Later cells read the 'Country', 'Date_reported' and 'Cumulative_cases' columns.
covid = pd.read_csv("../datasets/WHO-COVID-19-global-data.csv")

### Create mini datasets of each country with the first day having >= 100 cases as the start date

In [4]:
# keep only the rows in which a country has at least 100 cumulative cases,
# so every per-country series below starts at its first kept row
covid_shifted = covid[covid['Cumulative_cases'] >= 100]

# one (country name, dataframe) tuple per country; each dataframe holds just
# the date, the cumulative cases and a day counter, on a fresh 0-based index
lst = []

for country in covid_shifted['Country'].unique():
    country_rows = covid_shifted['Country'] == country
    temp_df = covid_shifted.loc[country_rows, ['Date_reported', 'Cumulative_cases']].reset_index(drop=True)
    # day 0 is the first row kept for this country
    temp_df['Days_since_100'] = range(len(temp_df))
    # stored as a tuple to keep the country name next to the reduced dataframe
    lst.append((country, temp_df))

# view the first three rows of the first four country reports
# for tup in lst[0:4]:
#     print(tup[0])
#     display(tup[1].head(3))

## Simple RNN
The following code works, but it only served as a test recreation of the datatechnotes.com [guide](https://www.datatechnotes.com/2018/12/rnn-example-with-keras-simplernn-in.html), so it is left commented out. \
For the code actually used, see the Automation section below, which is organized in a more modular, reusable manner.

In [5]:
# # test country
# country = lst[1]

# step = 4

# X = [val for val in country[1]['Cumulative_cases']]
# # add "step" extra to be used to make the final pred
# X = np.append(X, X[-1::]*step)

# # convert into dataset matrix
# # credit to https://www.datatechnotes.com/2018/12/rnn-example-with-keras-simplernn-in.html
# def convertToMatrix(data, step):
#  X, Y =[], []
#  for i in range(len(data)-step):
#   d=i+step  
#   X.append(data[i:d,])
#   Y.append(data[d,])
#  return np.array(X), np.array(Y)

# X_train, y_train =convertToMatrix(X,step)

# X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))

# # SimpleRNN model
# model = Sequential()
# model.add(SimpleRNN(units=32, input_shape=(1,step), activation='relu'))
# model.add(Dense(16, activation='relu'))
# model.add(Dense(1))
# model.compile(loss='mean_squared_error', optimizer='rmsprop')
# model.summary()

# model.fit(X_train, y_train, epochs=244, batch_size=16)

# preds = model.predict(X_train)

# df = pd.DataFrame(preds)

# df.size

# df['orig'] = X[0:468]

# df[0] = [int(i) for i in df[0]]

# df

---
### Automation of SimpleRNN

In [6]:
# function to convert into dataset matrix
# credit to https://www.datatechnotes.com/2018/12/rnn-example-with-keras-simplernn-in.html
def convertToMatrix(data, step):
    """Split a sequence into sliding windows and the value following each one.

    Parameters
    ----------
    data : array-like
        The series to window. It is converted with ``np.asarray`` so plain
        Python lists are accepted as well as NumPy arrays.
    step : int
        Number of consecutive values in each window.

    Returns
    -------
    tuple of np.ndarray
        ``(X, Y)`` where ``X[i]`` is ``data[i:i + step]`` and ``Y[i]`` is
        ``data[i + step]``. There are ``len(data) - step`` windows; when that
        is not positive both arrays are empty.
    """
    # Coerce once up front: list inputs do not support the array-style
    # indexing used below and would otherwise raise a TypeError.
    data = np.asarray(data)
    X, Y = [], []
    for i in range(len(data) - step):
        d = i + step
        X.append(data[i:d])
        Y.append(data[d])
    return np.array(X), np.array(Y)

In [7]:
# function to turn one country's case counts into windowed model input
# accepts a tuple (country name, dataset) and the step (window) size
def make_predictable(country_tuple, step):
    """Build the windowed inputs and targets for one country.

    Parameters
    ----------
    country_tuple : tuple
        ``(country name, dataframe)``; only the dataframe's
        ``'Cumulative_cases'`` column is read.
    step : int
        Number of consecutive values per window. The argument is honoured;
        it is no longer overwritten with a hard-coded 4 inside the function.

    Returns
    -------
    tuple of np.ndarray
        ``(X_a, y_a)`` as produced by ``convertToMatrix``, with ``X_a``
        reshaped to ``(number of windows, 1, step)``.
    """
    # get the dataframe
    country = country_tuple[1]
    X = list(country['Cumulative_cases'])
    # pad with `step` copies of the last value so that the number of windows
    # equals the number of rows in the original series
    X = np.append(X, X[-1:] * step)

    X_a, y_a = convertToMatrix(X, step)
    # insert a middle axis of length 1: (windows, step) -> (windows, 1, step)
    X_a = np.reshape(X_a, (X_a.shape[0], 1, X_a.shape[1]))
    return X_a, y_a

In [8]:
# accepts a tuple (country name, dataset)
def make_model(country_tuple):
    """Fit a SimpleRNN regressor on one country's windowed case counts.

    The training data comes from ``make_predictable`` with a window of 4.
    The network is SimpleRNN(32, relu) -> Dense(16, relu) -> Dense(1),
    compiled with mean squared error loss and the rmsprop optimizer, and is
    fitted silently for 210 epochs with batch size 16.

    Returns the pair ``(country_tuple, fitted model)`` so the model stays
    associated with the country it was trained on.
    """
    step = 4

    # windowed inputs and next-value targets for this country
    windows, targets = make_predictable(country_tuple, step)

    # the layers are handed to the constructor in the order they are stacked
    model = Sequential([
        SimpleRNN(units=32, input_shape=(1, step), activation='relu'),
        Dense(16, activation='relu'),
        Dense(1),
    ])
    model.compile(loss='mean_squared_error', optimizer='rmsprop')

    model.fit(windows, targets, epochs=210, batch_size=16, verbose=0)
    print(model.history)

    return country_tuple, model

In [9]:
# # a small sample (2) to test
# # the next cell will run for all countries
# model_temp = []

# for country in lst[0:2]:
#     print(country[0])
#     model_temp.append(make_model(country))

In [None]:
# Train one model per country in lst and collect the results in model_ls.
# Each entry is the (country_tuple, model) pair returned by make_model.
# WARNING: it is highly recommended to leave this cell commented out unless you
# want to let it run for a long time (over an hour on Strix's machine).
# The cell was run, but its output was cleared to keep the notebook tidy.
model_ls = []

for country in lst:
    # appending one model at a time keeps the models fitted so far in model_ls
    # if the run is interrupted part-way
    model_ls.append(make_model(country))
#     print(f'{country[0]} completed')

### Making Predictions

In [11]:
# predict a target country's series with a model fitted on some country
def make_preds(country_model, target_country):
    """Run ``country_model`` on the windowed data of ``target_country``.

    ``target_country`` is a ``(country name, dataframe)`` tuple; it is turned
    into model input by ``make_predictable`` with a window of 4 (the targets
    that function also returns are not needed here). The raw predictions are
    returned wrapped in a DataFrame.
    """
    windows, _ = make_predictable(target_country, step=4)
    return pd.DataFrame(country_model.predict(windows))

In [12]:
# # small test
# for ctry in lst[0:2]:
# #     print(f'--------------------------------------------------------')
# #     print(f'Country: {ctry[0]}, Samples: {len(ctry[1])}')
#     for model in model_temp:
#         tempreds = make_preds(model[1], ctry)
# #         display(tempreds)
#         ctry[1][f'{model[0][0]}_mp'] = tempreds

In [None]:
# whole set: run every fitted model against every country
# cell was run but output was cleared so that it didn't take up space
# ccount / mcount only feed the optional progress print below;
# mcount keeps counting across countries (it is never reset)
ccount = 1
mcount = 1
for ctry in lst:
    target_frame = ctry[1]
    # each model_ls entry is ((model country name, its dataframe), fitted model)
    for (model_country, _), fitted_model in model_ls:
        # print(f'Country: {ccount}, Model: {mcount}')
        # one prediction column per model, named after the model's country
        target_frame[f'{model_country}_mp'] = make_preds(fitted_model, ctry)
        mcount += 1
    ccount += 1
    

In [None]:
# clean up the dataframes, save as a final list of (country name, dataframe)
fin_lst = []

for country_name, frame in lst:
    # astype returns a new dataframe, so the frames in lst are left untouched;
    # errors='ignore' is passed so a failed cast does not raise
    tidy = frame.astype('int32', errors='ignore')
    # lower-case headers with underscores instead of spaces
    tidy.columns = [header.lower().replace(' ', '_') for header in tidy.columns]
    fin_lst.append((country_name, tidy))

In [66]:
# display the cleaned dataframe of the entry at index 6 of fin_lst
fin_lst[6][1]

Unnamed: 0,date_reported,cumulative_cases,days_since_100,afghanistan_mp,albania_mp,algeria_mp,andorra_mp,angola_mp,anguilla_mp,antigua_and_barbuda_mp,...,united_states_of_america_mp,united_states_virgin_islands_mp,uruguay_mp,uzbekistan_mp,venezuela_(bolivarian_republic_of)_mp,viet_nam_mp,wallis_and_futuna_mp,yemen_mp,zambia_mp,zimbabwe_mp
0,2020-09-28,101,0,100,108,107,109,107,99,104,...,103,106,110,108,106,94,105,101,100,102
1,2020-09-29,101,1,100,108,107,109,107,99,104,...,103,106,110,108,106,94,105,101,100,102
2,2020-09-30,101,2,104,112,113,112,111,99,108,...,107,111,114,111,110,97,109,103,105,107
3,2020-10-01,101,3,106,114,116,115,111,102,109,...,110,112,116,113,113,98,109,105,106,110
4,2020-10-02,101,4,108,116,115,117,115,103,112,...,112,112,117,116,115,101,113,107,107,111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,2021-07-02,1264,277,1259,1277,1288,1279,1282,1241,1272,...,1236,1290,1292,1277,1292,1298,1299,1232,1250,1254
278,2021-07-03,1264,278,1259,1277,1288,1279,1282,1241,1272,...,1236,1290,1292,1277,1292,1298,1299,1232,1250,1254
279,2021-07-04,1264,279,1259,1277,1288,1279,1282,1241,1272,...,1236,1290,1292,1277,1292,1298,1299,1232,1250,1254
280,2021-07-05,1264,280,1259,1277,1288,1279,1282,1241,1272,...,1236,1290,1292,1277,1292,1298,1299,1232,1250,1254


In [148]:
# returns the best predictor given the actual column and the df of predictions (by column = model)
def best_predictor(actual, preds_frame):
    """Pick the prediction column with the lowest mean squared error.

    Parameters
    ----------
    actual : array-like
        The observed values.
    preds_frame : pandas.DataFrame
        One column of predictions per model; every column must have the same
        length as ``actual``.

    Returns
    -------
    tuple
        ``(column name, RMSE, accuracy)``. RMSE is the square root of the
        winning column's MSE and accuracy is
        ``1 - |last prediction - last actual| / last actual``; both are
        rounded to 4 decimals. On ties the first column wins. A column whose
        MSE is NaN is never selected.

    Raises
    ------
    ValueError
        If a column's length differs from ``actual``, or if no column could
        be selected (for example when ``preds_frame`` has no columns).
    """
    # Work in float64: squaring differences of narrow integer columns (such as
    # int32) can wrap around and yield a meaningless, even negative, "MSE".
    actual_values = np.asarray(actual, dtype='float64')

    best_name = None
    # an infinite starting point guarantees that any finite MSE is accepted,
    # however large the case counts are
    best_mse = np.inf
    pred_acc = np.nan

    for col in preds_frame:
        pred_values = np.asarray(preds_frame[col], dtype='float64')
        if pred_values.shape != actual_values.shape:
            raise ValueError(
                f"column {col!r} has shape {pred_values.shape}, "
                f"expected {actual_values.shape}"
            )
        mse = np.mean((actual_values - pred_values) ** 2)
        if mse < best_mse:
            best_name = col
            best_mse = mse
            pred_acc = 1 - np.abs(pred_values[-1] - actual_values[-1]) / actual_values[-1]

    if best_name is None:
        raise ValueError("preds_frame contains no usable prediction column")

    return (best_name, np.round(np.sqrt(best_mse), 4), np.round(pred_acc, 4))

In [149]:
# Report, for every country, which model's prediction column fits it best.
for country in fin_lst:
    frame = country[1]
    name, rmse, acc = best_predictor(
        actual=frame['cumulative_cases'],
        preds_frame=frame.drop(columns=['date_reported', 'cumulative_cases', 'days_since_100']),
    )
    # best_predictor returns the square root of the MSE and the accuracy as a
    # fraction, so the value is labelled RMSE and formatted as a percentage.
    print(f"Best predictor of {country[0]}'s cov rate: {name}s with RMSE of {rmse} and {acc:.2%} accuracy on 2021-07-06")

Best predictor of Afghanistan's cov rate: anguilla_mps with MSE of 673.5013 and 0.9823 % accuracy on 2021-07-06''
Best predictor of Albania's cov rate: nicaragua_mps with MSE of 607.8185 and 0.9949 % accuracy on 2021-07-06''
Best predictor of Algeria's cov rate: nicaragua_mps with MSE of 518.1064 and 0.9949 % accuracy on 2021-07-06''
Best predictor of Andorra's cov rate: brunei_darussalam_mps with MSE of 64.0492 and 0.9943 % accuracy on 2021-07-06''
Best predictor of Angola's cov rate: nicaragua_mps with MSE of 169.1523 and 0.9949 % accuracy on 2021-07-06''
Best predictor of Anguilla's cov rate: new_caledonia_mps with MSE of 0.3273 and 1.0 % accuracy on 2021-07-06''
Best predictor of Antigua and Barbuda's cov rate: brunei_darussalam_mps with MSE of 14.9351 and 0.9937 % accuracy on 2021-07-06''
Best predictor of Argentina's cov rate: uruguay_mps with MSE of 5256.2645 and 0.983 % accuracy on 2021-07-06''
Best predictor of Armenia's cov rate: brunei_darussalam_mps with MSE of 1087.7318 an

---
TODO:

## LSTM
TODO: make an LSTM model with backpropagation or more layers

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Build the stack in one go, in the order the layers are applied:
#   - an Embedding layer expecting input vocab of size 1000, with
#     output embedding dimension of size 64
#   - an LSTM layer with 32 internal units
#   - a Dense layer with 10 units
model = keras.Sequential([
    layers.Embedding(input_dim=1000, output_dim=64),
    layers.LSTM(32),
    layers.Dense(10),
])

model.summary()