In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import numpy as np
from google.colab import drive


data_frame = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv")

data_frame = data_frame[5:] #ignore puerto rico and stuff
dates = pd.to_datetime(data_frame.columns[11:])#Dates

#Group by state
grouped = data_frame.groupby("Province_State", as_index=False)

#Stores all of the data by state instead
state_data = pd.DataFrame()


for group_name, group in grouped:
    #Get cases for the state
    cases = group[11:]
    #Sum the cases for each day (making it from city count per day to state amount per day)
    
    summed_cases = cases.sum(axis=0)[11:] 
    
    state_data[group_name] = summed_cases

state_data["Date"] = dates

display(state_data)

state_data.to_csv("state_data.csv", index=True)

Unnamed: 0,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,Diamond Princess,District of Columbia,Florida,Georgia,Grand Princess,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New Hampshire,New Jersey,New Mexico,New York,North Carolina,North Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode Island,South Carolina,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Date
1/22/20,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,1,0,0,0,2020-01-22
1/23/20,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,1,0,0,0,2020-01-23
1/24/20,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,1,0,0,0,2020-01-24
1/25/20,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,1,0,0,0,2020-01-25
1/26/20,0,0,0,0,2,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,1,0,0,0,2020-01-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7/10/20,45252,416,16517,22584,290236,23745,0.0,0.0,0.0,0.0,202297,106080,0.0,0.0,5091,150806,43644,29376,17964,17244,61465,695,50910,41989,73563,36903,32250,25116,1361,20224,3859,2,85491,9489,395945,77895,1260,59246,17747,9938,81196,0.0,39218,6140,56370,240957,26500,208,64263,32260,3081,31021,842,2020-07-10
7/11/20,46515,423,16900,23553,297434,24020,0.0,0.0,0.0,0.0,210470,109109,0.0,0.0,5459,151954,44361,29811,18347,17628,63280,706,51271,42084,74209,37616,33005,25614,1431,20440,3922,2,85615,9573,396607,79703,1305,60501,18387,10276,81623,0.0,40849,6184,57703,250440,27061,209,65069,32260,3228,31873,862,2020-07-11
7/12/20,47992,458,17219,23977,302778,24257,0.0,0.0,0.0,0.0,223266,111573,0.0,0.0,5661,152895,44851,30522,18641,17854,64335,711,51695,42140,74599,38240,33829,25882,1501,20605,3994,6,85792,9681,397258,81467,1352,61785,18797,10541,81926,0.0,42226,6221,58629,258691,27638,209,65898,33324,3317,32586,874,2020-07-12
7/13/20,49781,469,17353,24502,311143,24493,0.0,0.0,0.0,0.0,233708,115071,0.0,0.0,5832,153751,45240,30720,19271,18096,65698,710,51939,42222,74991,38666,34196,26255,1580,20823,4065,5,85894,9767,397797,83214,1406,62955,19284,10781,82341,0.0,43218,6245,61846,265607,28121,209,66837,34216,3377,33056,904,2020-07-13


In [6]:

# General algorithmic functions I wrote for data preprocessing
def normalize(data, maxnum):
  oned = np.squeeze(data)
  min_data = min(oned)
  max_data = max(oned)
  normalized_data = (oned - min_data) / (maxnum - min_data)
  return normalized_data, min_data, max_data

def denormalize(norm_data, max_d, min_d):
  denormalized_d = np.squeeze(norm_data) * (max_d - min_d) + min_d
  return denormalized_d

#Plot cases for a given state
def get_cases(state_name:str):
  return pd.to_datetime(state_data["Date"]), state_data[state_name]

batch_size = 5

#The net is trained on individual states
curr_state = "New Mexico"
epoch_num = 3500
time, cases = get_cases(curr_state)

cases = cases.values

phase_time = np.arange(len(time)).reshape(len(time), 1)
cases = cases.reshape(len(cases), 1)
print(max(cases))
maxn = 11000000
maxp = 575
#cases = tf.keras.utils.normalize(np.squeeze(cases))[0]
#phase_time = tf.keras.utils.normalize(np.squeeze(phase_time))[0]
cases, minc, maxc = normalize(cases, maxn)
phase_time, minphase, maxphase = normalize(phase_time, maxp)
print(max(denormalize(cases, maxn, 0)))



print(len(cases), len(phase_time))
#If you want to run the network yourself, use this code if you want to use manual train/test divisions
'''X_train, X_valid, y_train, y_valid = [], [], [], []

for i in range(len(cases)-batch_size):
  X_train.append(np.array(phase_time[i:i+5], dtype=np.float32))
  y_train.append(np.array(cases[i:i+5],dtype=np.float32))

for i in range(len(cases)-25, len(cases)-batch_size):
  X_valid.append(np.array(phase_time[i:i+5],dtype=np.float32))
  y_valid.append(np.array(cases[i:i+5],dtype=np.float32))
print(np.array(X_train).shape, np.array(y_train).shape, np.array(X_valid).shape, np.array(y_valid).shape)



X_train = np.reshape(X_train,(161, 5, 1))
#X_train= tf.keras.utils.normalize(X_train, axis=1, order=2)


#X_valid = np.array(phase_time[151:],dtype=np.int32)
X_valid = np.reshape(X_valid, (20, 5, 1))
#X_valid= tf.keras.utils.normalize(X_valid, axis=1, order=2)


#y_train = np.array(cases[:150],dtype=np.int32)
y_train = np.reshape(y_train,(161, 5, 1))
#y_train = tf.keras.utils.normalize(y_train, axis=1, order=2)


#y_valid = np.array(cases[151:],dtype=np.int32)
y_valid = np.reshape(y_valid,(20, 5, 1))
#y_valid = tf().keras.utils.normalize(y_valid, axis=1, order=2)
plt.plot(np.squeeze(X_train), np.squeeze(y_train))
plt.show()'''




#print(time)
#print(np.array(cases).reshape((len(cases), )))

from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense
from keras.preprocessing.sequence import TimeseriesGenerator
batch_size = 5
n_input = 5
np.random.seed(42)
tf.random.set_seed(42)
#print(cases)
cases = cases.reshape((len(cases), 1))
phase_time = phase_time.reshape(len(phase_time), 1)

#print(cases)
data_gen = TimeseriesGenerator(phase_time, cases,
                               length=n_input,
                               batch_size=batch_size)

model = Sequential()
model.add(LSTM(250, activation='relu', input_shape=(n_input,1)))
model.add(Dropout(0.5))
model.add(Dense(1))
model.compile(loss="mse", optimizer="adam")
print(model.summary())
model.fit_generator(data_gen, steps_per_epoch=1, epochs=epoch_num, verbose=0)

import datetime
filename = f"{curr_state}_{datetime.datetime.now()}"
model.save(f"/content/drive/My Drive/{filename}")
print(f"Model saved as {filename}")
day = 0
phase_time = np.squeeze(phase_time)

day = day*phase_time[1]
print(phase_time[1])
print(day)


predictions = []
x_axis = np.arange((day),(day+200*phase_time[1]),phase_time[1])
x_axis2 = np.array([x_axis[i:i+n_input] for i in range(0, len(x_axis) - n_input)])
x_axis3 = [x_axis[i:i+n_input].reshape((1, n_input, 1)) for i in range(0, len(x_axis)-n_input)]

predictions = [model.predict(i) for i in x_axis3]
#print(np.squeeze(predictions))
fig, ax = plt.subplots()
xes = np.squeeze([i[-1]+phase_time[1] for i in x_axis2])
yaxes = np.squeeze(predictions)
plt.figure()
plt.plot(xes, yaxes)
plt.plot(phase_time, cases)


#Runs a single prediction on a given time to determine what the model's
#Estimated cases during that time are
def single_predict(value, given_model):
  value = (value-1)*phase_time[1]
  inputs = np.arange(value, value-phase_time[1]*(n_input-0.5),  -phase_time[1])
  inputs = inputs.reshape((1, n_input, 1))
  prediction = given_model.predict(inputs)
  #print(f"Value is {value}, prediction is {prediction}, {phase_time[50]}")
  return float(denormalize(np.squeeze(prediction), maxn, 0))
                         
n = int(input("What day past January 22nd do you want to predict coronavirus cases for the state of {?"))
print(f"Prediction for day {n} of coronavirus outbreak returned approximately {int(single_predict(n, model))} cases")
ax.plot(denormalize(xes, maxp, 0), denormalize(yaxes, maxn, 0))
ax.plot(denormalize(phase_time, maxp, 0), denormalize(cases, maxn, 0))

[9851]
9851.0
175 175
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 250)               252000    
_________________________________________________________________
dropout_2 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 251       
Total params: 252,251
Trainable params: 252,251
Non-trainable params: 0
_________________________________________________________________
None


OSError: ignored

In [None]:
wfrom google.colab import drive
drive.mount('/content/drive')

In [None]:
state_data

In [None]:
predictslist = [single_predict(i, model) for i in range(0, 201)]
print(len(predictslist))
new_data[f'{curr_state} Predict'] = predictslist 



In [None]:
new_data

In [None]:
new_data.to_csv('/content/drive/My Drive/predicted_states.csv')