In [1]:
# import packages
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
from datetime import datetime
from github import Github
import os

In [2]:
state_names = ["Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona", "California", "Colorado", "Connecticut", "District ", "of Columbia", "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire", "New Jersey", "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Virginia", "Virgin Islands", "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming"]

In [3]:
len(state_names)

56

In [4]:
# Create dictionary for FIPS code
url = 'https://raw.githubusercontent.com/reichlab/covid19-forecast-hub/master/data-locations/locations.csv'
df_fips = pd.read_csv(url, error_bad_lines=False)
us_state_fips = {}
for index, row in df_fips.iterrows():
    if index < 58 and index >= 0:
        us_state_fips[row["location"]] = row["location_name"]
us_state_fips

{'US': 'US',
 '01': 'Alabama',
 '02': 'Alaska',
 '04': 'Arizona',
 '05': 'Arkansas',
 '06': 'California',
 '08': 'Colorado',
 '09': 'Connecticut',
 '10': 'Delaware',
 '11': 'District of Columbia',
 '12': 'Florida',
 '13': 'Georgia',
 '15': 'Hawaii',
 '16': 'Idaho',
 '17': 'Illinois',
 '18': 'Indiana',
 '19': 'Iowa',
 '20': 'Kansas',
 '21': 'Kentucky',
 '22': 'Louisiana',
 '23': 'Maine',
 '24': 'Maryland',
 '25': 'Massachusetts',
 '26': 'Michigan',
 '27': 'Minnesota',
 '28': 'Mississippi',
 '29': 'Missouri',
 '30': 'Montana',
 '31': 'Nebraska',
 '32': 'Nevada',
 '33': 'New Hampshire',
 '34': 'New Jersey',
 '35': 'New Mexico',
 '36': 'New York',
 '37': 'North Carolina',
 '38': 'North Dakota',
 '39': 'Ohio',
 '40': 'Oklahoma',
 '41': 'Oregon',
 '42': 'Pennsylvania',
 '44': 'Rhode Island',
 '45': 'South Carolina',
 '46': 'South Dakota',
 '47': 'Tennessee',
 '48': 'Texas',
 '49': 'Utah',
 '50': 'Vermont',
 '51': 'Virginia',
 '53': 'Washington',
 '54': 'West Virginia',
 '55': 'Wisconsin',
 '

<h1> Getting predictions from CDC COVID forecast Github <h1>
<h6> This will clean data as the format we want later for visualizations and measurements purposes. This will generate 3 csv files for each model: model_cum_death.csv, model_inc_death.csv, model_inc_case.csv. They will be stored under each model's own folder under "./Predictions/".<h6>


In [5]:
#get all filenames under a specific model from CDC COVID forecast Github
def get_filenames(model):
    #plese put your Github account username and password here to use Github API
    g = Github("","")
    repo = g.get_repo("reichlab/covid19-forecast-hub")
    contents = repo.get_contents("/data-processed/" + model)
    filenames = []
    for content_file in contents:
        filenames.append(content_file.name)
    
    #preserve only .csv files
    to_remove = []
    for filename in filenames:
        if not filename[-3:] == 'csv':
            to_remove.append(filename)
    for item in to_remove:
        filenames.remove(item)
    
    return filenames

In [6]:
#get all models names from CDC COVID forecast Github
def get_modelnames():
    #plese put your Github account username and password here to use Github API
    g = Github("","")
    repo = g.get_repo("reichlab/covid19-forecast-hub")
    contents = repo.get_contents("/data-processed/")
    filenames = []
    for content_file in contents:
        filenames.append(content_file.name)
    
    #preserve only .csv files
    to_remove = []
    for filename in filenames:
        if '.' in filename:
            to_remove.append(filename)
    
    for item in to_remove:
        filenames.remove(item)
    
    return filenames

In [7]:
def get_prediction(model):
    filenames = get_filenames(model)
    df_cum_death = pd.DataFrame(columns = ["state", "# weeks ahead", "forecast_date", "target_end_date", "mean"])
    df_inc_death = pd.DataFrame(columns = ["state", "# weeks ahead", "forecast_date", "target_end_date", "mean"])
    df_inc_case = pd.DataFrame(columns = ["state", "# weeks ahead", "forecast_date", "target_end_date", "mean"])
    for filename in filenames:
        try:
            url = 'https://raw.githubusercontent.com/reichlab/covid19-forecast-hub/master/data-processed/' + model + '/' + filename
            df = pd.read_csv(url, error_bad_lines=False, dtype={"location":str})
            df.set_index(["location", "type"],inplace=True)

            new_cum_death = {}
            new_inc_death = {}
            new_inc_case = {}

            state_list_temp = list(dict.fromkeys(df.index.get_level_values(0)))
            state_list = []
            #remove county level "fips" code (zip code)
            for state in state_list_temp:
                if state in list(us_state_fips.keys()):
                    state_list.append(state)

            for state in state_list:
                for index, row in df.loc[(state,"point")].iterrows():
                    if row["target"][-9:] == "cum death":
                        week_ahead = row["target"][:2]
                        new_cum_death[week_ahead] = {}
                        new_cum_death[week_ahead]["state"] = us_state_fips[index[0]]
                        new_cum_death[week_ahead]["target_end_date"] = row["target_end_date"]
                        new_cum_death[week_ahead]["forecast_date"] = row["forecast_date"]
                        new_cum_death[week_ahead]["mean"] = row["value"]
                        new_cum_death[week_ahead]["# weeks ahead"] = week_ahead
                    if row["target"][-9:] == "inc death":
                        week_ahead = row["target"][:2]
                        new_inc_death[week_ahead] = {}
                        new_inc_death[week_ahead]["state"] = us_state_fips[index[0]]
                        new_inc_death[week_ahead]["target_end_date"] = row["target_end_date"]
                        new_inc_death[week_ahead]["forecast_date"] = row["forecast_date"]
                        new_inc_death[week_ahead]["mean"] = row["value"]
                        new_inc_death[week_ahead]["# weeks ahead"] = week_ahead
                    if row["target"][-9:] == " inc case":
                        week_ahead = row["target"][:2]
                        new_inc_case[week_ahead] = {}
                        new_inc_case[week_ahead]["state"] = us_state_fips[index[0]]
                        new_inc_case[week_ahead]["target_end_date"] = row["target_end_date"]
                        new_inc_case[week_ahead]["forecast_date"] = row["forecast_date"]
                        new_inc_case[week_ahead]["mean"] = row["value"]
                        new_inc_case[week_ahead]["# weeks ahead"] = week_ahead

                for key in new_cum_death:
                    df_cum_death = df_cum_death.append(new_cum_death[key],ignore_index=True)
                for key in new_inc_death:
                    df_inc_death = df_inc_death.append(new_inc_death[key],ignore_index=True)
                for key in new_inc_case:
                    df_inc_case = df_inc_case.append(new_inc_case[key],ignore_index=True)
        except:
                print(filename + " failed.")
                    
    df_cum_death.set_index(['state',"forecast_date", "# weeks ahead"], inplace=True)
    df_cum_death.sort_index(inplace=True)
    df_inc_death.set_index(['state',"forecast_date", "# weeks ahead"], inplace=True)
    df_inc_death.sort_index(inplace=True)
    df_inc_case.set_index(['state',"forecast_date", "# weeks ahead"], inplace=True)
    df_inc_case.sort_index(inplace=True)
    
    outdir = './Predictions/' + model
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    
    if not df_cum_death.empty:
        df_cum_death.to_csv(os.path.join(outdir, model + "_cum_death.csv") )
    if not df_inc_death.empty:
        df_inc_death.to_csv(os.path.join(outdir, model + "_inc_death.csv") )
    if not df_inc_case.empty:
        df_inc_case.to_csv(os.path.join(outdir, model + "_inc_case.csv") )
        
                    

In [None]:
#run this cell to get all data cleaned and store locally.
models = get_modelnames()
for model in models:
    get_prediction(model)

  for index, row in df.loc[(state,"point")].iterrows():


2020-05-24-Auquan-SEIR.csv failed.
2020-06-07-Auquan-SEIR.csv failed.


<h1> Getting ground truth data from JHU COVID Github <h1>

In [8]:
def jhu():
    #cum confirmed
    url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv'
    df_c = pd.read_csv(url, error_bad_lines=False)
    df_cGrouped = df_c.groupby('Province_State')
    df_c = df_cGrouped.sum()
    column_names = list(df_c.columns)
    to_delete = column_names[0:44]
    df_c.drop(to_delete,axis=1,inplace=True)
    
    url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    df_us = pd.read_csv(url, error_bad_lines=False)
    new_row = pd.Series(df_us.iloc[-24,:])
    to_remove = list(new_row.index[:43])
    new_row.drop(labels=to_remove,inplace=True)
    new_row.name="US"
    df_c = df_c.append(new_row,ignore_index=False)
    
    column_names = [datetime.strptime(d[:-3]+"/2020", '%m/%d/%Y').date().strftime("%Y-%m-%d") for d in column_names[44:]]
    prev_column_names = list(df_c.columns)
    dic = {}
    for i in range(len(column_names)):
        dic[prev_column_names[i]] = column_names[i]
    df_c.rename(columns=dic,inplace=True)

    
    #cum death
    url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv'
    df_d = pd.read_csv(url, error_bad_lines=False)
    df_dGrouped = df_d.groupby('Province_State')
    df_d = df_dGrouped.sum()
    column_names = list(df_d.columns)
    to_delete = column_names[0:45]
    df_d.drop(to_delete,axis=1,inplace=True)
    
    url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
    df_us = pd.read_csv(url, error_bad_lines=False)
    new_row = pd.Series(df_us.iloc[-24,:])
    to_remove = list(new_row.index[:43])
    new_row.drop(labels=to_remove,inplace=True)
    new_row.name="US"
    df_d = df_d.append(new_row,ignore_index=False)
    
    column_names = [datetime.strptime(d[:-3]+"/2020", '%m/%d/%Y').date().strftime("%Y-%m-%d") for d in column_names[45:]]
    prev_column_names = list(df_d.columns)
    dic = {}
    for i in range(len(column_names)):
        dic[prev_column_names[i]] = column_names[i]
    df_d.rename(columns=dic,inplace=True)
    
    #calculation inc death and inc case
    df_inc_d = df_d.copy()
    df_inc_c = df_c.copy()
    df_inc_d.drop(["2020-03-01"],axis=1,inplace=True)
    df_inc_c.drop(["2020-03-01"],axis=1,inplace=True)
    
    for state in df_c.index:
        for i in range(1,len(df_c.columns)):
            df_inc_c.loc[state][df_c.columns[i]] = df_c.loc[state][df_c.columns[i]] - df_c.loc[state][df_c.columns[i-1]]
            df_inc_d.loc[state][df_d.columns[i]] = df_d.loc[state][df_d.columns[i]] - df_d.loc[state][df_d.columns[i-1]]
    
    df_inc_d[df_inc_d<0]=0
    df_inc_c[df_inc_c<0]=0
    
    df_d.to_csv("./Ground Truth/cum_death.csv")
    df_inc_d.to_csv("./Ground Truth/inc_death.csv")
    df_inc_c.to_csv("./Ground Truth/inc_case.csv")
    df_c.to_csv("./Ground Truth/cum_case.csv")




In [9]:
#run this cell to get ground truth data cleaned and store locally
jhu()

<h2> This part is just our efforts to reformat our model prediction so that it can match cdc's format. Please ignore.<h2>

In [38]:
filename = os.listdir("./Predictions_OurModels/LSTM")
for file in filename:
    if file[-9:] == 'daily.csv':
        os.remove("./Predictions_OurModels/LSTM/" + file)


In [39]:
import shutil
filename = os.listdir("./Predictions_OurModels/SIRD")
for file in filename:
    if file[-11:] == 'rt-live.csv':
        shutil.move("./Predictions_OurModels/SIRD/" + file,'./Predictions_OurModels/SIRD-rt-live')


In [76]:
l = ["LSTM", "SIR", "SIRD", "RandomForest", "SIRD-rt-live", "ARIMA", "ARGOnet"]
for name in l:
    filename = os.listdir("./Predictions_OurModels/" + name)
    to_remove = []
    for i in filename:
        if not i[-3:] == 'csv':
            to_remove.append(i)
    for i in to_remove:
        filename.remove(i)
    
    for file in filename:
        df = pd.read_csv("./Predictions_OurModels/" + name + "/" + file)
        if name in ["LSTM", "SIR", "SIRD", "RandomForest", "SIRD-rt-live"]:
            df["type"] = "point"
            df.rename(columns={"point":"value"},inplace=True)
            df["quantile"] = "NA"
        for index, row in df.iterrows():
            num = (datetime.strptime(row["target_end_date"],"%Y-%m-%d").date() - datetime.strptime(row["forecast_date"],"%Y-%m-%d").date()).days
            if row["target"][-9:] == " cum case" or row["target"] == "Cumulative Cases":
                df.at[index,"target"] = str(num) + " day ahead cum case"
            if row["target"][-9:] == " inc case" or row["target"] == "Daily Cases":
                df.at[index,"target"] = str(num) + " day ahead inc case"
            if row["target"][-9:] == "cum death":
                df.at[index,"target"] = str(num) + " day ahead cum death"
            if row["target"][-9:] == "inc death":
                df.at[index,"target"] = str(num) + " day ahead inc death"
            
        df.to_csv("./Predictions_OurModels/" + name + "/" + file,index=False)
    
    


In [77]:
def get_prediction_GITIDEAS(model):
    filenames = os.listdir("./Predictions_OurModels/" + model)
    df_cum_death = pd.DataFrame(columns = ["state", "# days ahead", "forecast_date", "target_end_date", "mean"])
    df_inc_death = pd.DataFrame(columns = ["state", "# days ahead", "forecast_date", "target_end_date", "mean"])
    df_inc_case = pd.DataFrame(columns = ["state", "# days ahead", "forecast_date", "target_end_date", "mean"])
    df_cum_case = pd.DataFrame(columns = ["state", "# days ahead", "forecast_date", "target_end_date", "mean"])
    for filename in filenames:
        try:
            url = './Predictions_OurModels/' + model + '/' + filename
            df = pd.read_csv(url, error_bad_lines=False, dtype={"location":str})
            df.set_index(["location", "type"],inplace=True)

            new_cum_death = {}
            new_inc_death = {}
            new_inc_case = {}
            new_cum_case = {}

            state_list = list(dict.fromkeys(df.index.get_level_values(0)))
            
            for state in state_list:
                for index, row in df.loc[(state,"point")].iterrows():
                    if row["target"][0] == "0":
                        continue
                    if row["target"][-9:] == "cum death":
                        week_ahead = row["target"][:2]
                        new_cum_death[week_ahead] = {}
                        new_cum_death[week_ahead]["state"] = us_state_fips[index[0]]
                        new_cum_death[week_ahead]["target_end_date"] = row["target_end_date"]
                        new_cum_death[week_ahead]["forecast_date"] = row["forecast_date"]
                        new_cum_death[week_ahead]["mean"] = row["value"]
                        new_cum_death[week_ahead]["# days ahead"] = week_ahead
                    if row["target"][-9:] == "inc death":
                        week_ahead = row["target"][:2]
                        new_inc_death[week_ahead] = {}
                        new_inc_death[week_ahead]["state"] = us_state_fips[index[0]]
                        new_inc_death[week_ahead]["target_end_date"] = row["target_end_date"]
                        new_inc_death[week_ahead]["forecast_date"] = row["forecast_date"]
                        new_inc_death[week_ahead]["mean"] = row["value"]
                        new_inc_death[week_ahead]["# days ahead"] = week_ahead
                    if row["target"][-9:] == " inc case":
                        week_ahead = row["target"][:2]
                        new_inc_case[week_ahead] = {}
                        new_inc_case[week_ahead]["state"] = us_state_fips[index[0]]
                        new_inc_case[week_ahead]["target_end_date"] = row["target_end_date"]
                        new_inc_case[week_ahead]["forecast_date"] = row["forecast_date"]
                        new_inc_case[week_ahead]["mean"] = row["value"]
                        new_inc_case[week_ahead]["# days ahead"] = week_ahead
                    if row["target"][-9:] == " cum case":
                        week_ahead = row["target"][:2]
                        new_cum_case[week_ahead] = {}
                        new_cum_case[week_ahead]["state"] = us_state_fips[index[0]]
                        new_cum_case[week_ahead]["target_end_date"] = row["target_end_date"]
                        new_cum_case[week_ahead]["forecast_date"] = row["forecast_date"]
                        new_cum_case[week_ahead]["mean"] = row["value"]
                        new_cum_case[week_ahead]["# days ahead"] = week_ahead

                for key in new_cum_death:
                    df_cum_death = df_cum_death.append(new_cum_death[key],ignore_index=True)
                for key in new_inc_death:
                    df_inc_death = df_inc_death.append(new_inc_death[key],ignore_index=True)
                for key in new_inc_case:
                    df_inc_case = df_inc_case.append(new_inc_case[key],ignore_index=True)
                for key in new_cum_case:
                    df_cum_case = df_cum_case.append(new_cum_case[key],ignore_index=True)
        except:
                print(filename + " failed.")
                    
    df_cum_death.set_index(['state',"forecast_date", "# days ahead"], inplace=True)
    df_cum_death.sort_index(inplace=True)
    df_inc_death.set_index(['state',"forecast_date", "# days ahead"], inplace=True)
    df_inc_death.sort_index(inplace=True)
    df_inc_case.set_index(['state',"forecast_date", "# days ahead"], inplace=True)
    df_inc_case.sort_index(inplace=True)
    
    outdir = './Predictions/' + model
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    
    if not df_cum_death.empty:
        df_cum_death.to_csv(os.path.join(outdir, model + "_cum_death.csv") )
    if not df_inc_death.empty:
        df_inc_death.to_csv(os.path.join(outdir, model + "_inc_death.csv") )
    if not df_inc_case.empty:
        df_inc_case.to_csv(os.path.join(outdir, model + "_inc_case.csv") )
        
                    

In [78]:
models = ["LSTM", "SIR", "SIRD", "RandomForest", "SIRD-rt-live"]
for model in models:
    get_prediction_GITIDEAS(model)

.DS_Store failed.
.DS_Store failed.
.ipynb_checkpoints failed.
.DS_Store failed.
.ipynb_checkpoints failed.
.DS_Store failed.
.ipynb_checkpoints failed.
.DS_Store failed.


In [89]:
models = ["ARIMA","ARGOnet"]
for model in models:
    filename = os.listdir("./Predictions_OurModels/" + model)
    filename.remove('.DS_Store')
    df = pd.read_csv('./Predictions_OurModels/' + model + '/' + filename[0], error_bad_lines=False, dtype={"location":str})
    df.set_index(["location", "type"],inplace=True)
    
    df_cum_death = pd.DataFrame(columns = ["state", "# days ahead", "forecast_date", "target_end_date", "mean"])
    df_inc_death = pd.DataFrame(columns = ["state", "# days ahead", "forecast_date", "target_end_date", "mean"])
    df_inc_case = pd.DataFrame(columns = ["state", "# days ahead", "forecast_date", "target_end_date", "mean"])
    df_cum_case = pd.DataFrame(columns = ["state", "# days ahead", "forecast_date", "target_end_date", "mean"])

    state_list = list(dict.fromkeys(df.index.get_level_values(0)))
    
    for state in state_list:
        for index, row in df.loc[(state,"point")].iterrows():
            if row["target"][-9:] == "cum death":
                new_cum_death = {}
                new_cum_death["state"] = us_state_fips[index[0]]
                new_cum_death["target_end_date"] = row["target_end_date"]
                new_cum_death["forecast_date"] = row["forecast_date"]
                new_cum_death["mean"] = row["value"]
                new_cum_death["# days ahead"] = row["target"][:2]
                df_cum_death = df_cum_death.append(new_cum_death,ignore_index=True)
            if row["target"][-9:] == "inc death":
                new_inc_death = {}
                new_inc_death["state"] = us_state_fips[index[0]]
                new_inc_death["target_end_date"] = row["target_end_date"]
                new_inc_death["forecast_date"] = row["forecast_date"]
                new_inc_death["mean"] = row["value"]
                new_inc_death["# days ahead"] = row["target"][:2]
                df_inc_death = df_inc_death.append(new_inc_death,ignore_index=True)
            if row["target"][-9:] == " inc case":
                new_inc_case = {}
                new_inc_case["state"] = us_state_fips[index[0]]
                new_inc_case["target_end_date"] = row["target_end_date"]
                new_inc_case["forecast_date"] = row["forecast_date"]
                new_inc_case["mean"] = row["value"]
                new_inc_case["# days ahead"] = row["target"][:2]
                df_inc_case = df_inc_case.append(new_inc_case,ignore_index=True)
            if row["target"][-9:] == " cum case":
                new_cum_case = {}
                new_cum_case["state"] = us_state_fips[index[0]]
                new_cum_case["target_end_date"] = row["target_end_date"]
                new_cum_case["forecast_date"] = row["forecast_date"]
                new_cum_case["mean"] = row["value"]
                new_cum_case["# days ahead"] = row["target"][:2]
                df_cum_case = df_cum_case.append(new_cum_case,ignore_index=True)

    df_cum_death.set_index(['state',"forecast_date", "# days ahead"], inplace=True)
    df_cum_death.sort_index(inplace=True)
    df_inc_death.set_index(['state',"forecast_date", "# days ahead"], inplace=True)
    df_inc_death.sort_index(inplace=True)
    df_inc_case.set_index(['state',"forecast_date", "# days ahead"], inplace=True)
    df_inc_case.sort_index(inplace=True)
    df_cum_case.set_index(['state',"forecast_date", "# days ahead"], inplace=True)
    df_cum_case.sort_index(inplace=True)
    
    outdir = './Predictions/' + model
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    
    if not df_cum_death.empty:
        df_cum_death.to_csv(os.path.join(outdir, model + "_cum_death.csv") )
    if not df_inc_death.empty:
        df_inc_death.to_csv(os.path.join(outdir, model + "_inc_death.csv") )
    if not df_inc_case.empty:
        df_inc_case.to_csv(os.path.join(outdir, model + "_inc_case.csv") )
    if not df_cum_case.empty:
        df_cum_case.to_csv(os.path.join(outdir, model + "_cum_case.csv") )
    
    
    
    

  for index, row in df.loc[(state,"point")].iterrows():
