# IHME PROJECTIONS

IHME's COVID-19 projections were developed in response to requests from the University of Washington School of Medicine and other US hospital systems and state governments working to determine when COVID-19 would overwhelm their ability to care for patients. The forecasts show demand for hospital services, including the availability of ventilators, general hospital beds, and ICU beds, as well as daily and cumulative deaths due to COVID-19.

In [None]:
import pandas as pd
import requests
import pycountry
import zipfile
from functools import reduce
import io
import re
from datetime import datetime
from csv import QUOTE_NONNUMERIC

In [None]:
# papermill parameters
# Directory prefix for the exported CSV. The export step appends the file
# name with plain string concatenation, so the trailing slash is required.
output_folder = "../output/"

In [None]:
# get .zip file content

response = requests.get("https://ihmecovid19storage.blob.core.windows.net/latest/ihme-covid19.zip", stream=True)
# Compare by value: `is 200` tests object identity, which only happens to work
# because CPython caches small ints (and emits a SyntaxWarning on 3.8+).
# Raising explicitly also keeps the check alive under `python -O`, which
# strips `assert` statements.
if response.status_code != 200:
    raise RuntimeError(f"IHME download failed with HTTP status {response.status_code}")

# parse .zip (the whole archive is held in memory)
z = zipfile.ZipFile(io.BytesIO(response.content))

In [None]:
# create df from csv in zip

# Use the first archive member whose name ends in ".csv". Fail with a clear
# message instead of an opaque "NoneType is not subscriptable" TypeError when
# the archive contains no CSV at all.
csv_name = next((name for name in z.namelist() if name.endswith(".csv")), None)
if csv_name is None:
    raise FileNotFoundError("no .csv file found in the downloaded IHME archive")

# Close the archive member as soon as pandas has finished reading it.
with z.open(csv_name) as csv_file:
    df = pd.read_csv(csv_file)

In [None]:
# stamp every row with the load time and flag the rows that carry the
# latest value found in the 'date' column

latest_date = df['date'].max()
df['Last_Update_Date'] = datetime.now()
df['Last_Reported_Flag'] = df['date'].eq(latest_date)

In [None]:
# Manual fallback consulted by resolve_country() when its pycountry lookups
# raise LookupError for a location name.
# Maps location_name -> (country code, country name, subdivision code); the
# subdivision code is stored with its country prefix (e.g. "ES-IB"), which
# resolve_country() strips before returning it.
regions_manual_dict = {
    "Balearic Islands": ("ES", "Spain", "ES-IB"),
    "Basque Country": ("ES", "Spain", "ES-PV"),
    "Bavaria": ("DE", "Germany", "DE-BY"),
    "Canary Islands": ("ES", "Spain", "ES-CN"),
    "Castile and Leon": ("ES", "Spain", "ES-CL"),
    "Catalonia": ("ES", "Spain", "ES-CT"),
    "Community of Madrid": ("ES", "Spain", "ES-MD"),
    "King and Snohomish Counties (excluding Life Care Center), WA": ("US", "United States", "US-WA"),
    "Life Care Center, Kirkland, WA": ("US", "United States", "US-WA"),
    "Lower Saxony": ("DE", "Germany", "DE-NI"),
    "Navarre": ("ES", "Spain", "ES-NA"),
    "North Rhine-Westphalia": ("DE", "Germany", "DE-NW"),
    "Other Counties, WA": ("US", "United States", "US-WA"),
    "Provincia autonoma di Bolzano": ("IT", "Italy", "IT-BZ"),
    "Provincia autonoma di Trento": ("IT", "Italy", "IT-TN"),
    "Rhineland-Palatinate": ("DE", "Germany", "DE-RP"),
    "Saxony-Anhalt": ("DE", "Germany", "DE-ST"),
    "Saxony": ("DE", "Germany", "DE-SN"),
    "Thuringia": ("DE", "Germany", "DE-TH"),
    "Valencian Community": ("ES", "Spain", "ES-VC")
}

# Cache filled lazily by resolve_country():
# country code -> {subdivision name: subdivision code without country prefix}.
subdivisions = {}

In [None]:
# create country resolver helper func

def resolve_country(location_name):
    """Resolve a location name to country and subdivision identifiers.

    Resolution order:
      1. Exact country-name match in pycountry; no subdivision is reported.
      2. Fuzzy pycountry search; the location name is then looked up among
         the matched country's subdivision names (cached per country in the
         module-level ``subdivisions`` dict).
      3. If step 2 raises ``LookupError`` (which includes the ``KeyError``
         from a missing subdivision name), ``regions_manual_dict`` overrides
         the result when it has an entry for the location.

    Returns:
        Tuple ``(country_name, country_code, subdiv_code)``. Any element may
        be ``None``; ``subdiv_code`` never carries the country prefix.
    """
    exact_match = pycountry.countries.get(name=location_name)
    if exact_match:
        return exact_match.name, exact_match.alpha_2, None

    country_code, country_name, subdiv_code = None, None, None
    try:
        best_guess = pycountry.countries.search_fuzzy(location_name)[0]
        country_name = best_guess.name
        country_code = best_guess.alpha_2

        # Build the name -> code table for this country once and reuse it.
        if country_code not in subdivisions:
            prefix = f"{country_code}-"
            subdivisions[country_code] = {
                entry.name: entry.code.replace(prefix, "")
                for entry in pycountry.subdivisions.get(country_code=country_code)
            }
        subdiv_code = subdivisions[country_code][location_name]
    except LookupError:
        # A failed subdivision lookup leaves the fuzzy country values in
        # place unless the manual table has an entry for this location.
        if location_name in regions_manual_dict:
            country_code, country_name, subdiv_code = regions_manual_dict[location_name]
            subdiv_code = subdiv_code.replace(f"{country_code}-", "")

    return country_name, country_code, subdiv_code


In [None]:
df['COUNTRY_REGION'] = None
df['ISO3166_1'] = None
df['ISO3166_2'] = None

# get distinct locations list
distinct_locations = list(df['location_name'].unique())

# iterate distinct_locations
for c in distinct_locations:
    country_name, country_code, subdiv_code = resolve_country(c)

    # set value where location_name == c
    # Select rows and column in ONE .loc call. The chained form
    # df[col].loc[mask] = value writes through an intermediate Series, which
    # triggers SettingWithCopyWarning and does not update df at all when
    # pandas copy-on-write is enabled.
    is_location = df['location_name'] == c
    df.loc[is_location, 'COUNTRY_REGION'] = country_name
    df.loc[is_location, 'ISO3166_1'] = country_code
    df.loc[is_location, 'ISO3166_2'] = subdiv_code
    

In [None]:
# fix some subdivisions manually

# location_name -> subdivision code, applied only to rows whose ISO3166_2 is
# still missing at this point.
manual_subdivision_fixes = {
    'Aragon': "AR",
    'Andalucia': "AN",
    'Baden-Wurttemberg': "BW",
    'Hesse': "HE",
}

for fix_location, fix_code in manual_subdivision_fixes.items():
    # Single .loc call with (row mask, column): the chained form
    # df[col].loc[mask] = value is not guaranteed to write back into df.
    needs_fix = df['ISO3166_2'].isna() & (df['location_name'] == fix_location)
    df.loc[needs_fix, 'ISO3166_2'] = fix_code

In [None]:
# convert the 'date' column from YYYY-MM-DD text into pandas datetimes
date_strings = df['date'].astype(str)
df['date'] = pd.to_datetime(date_strings, format='%Y-%m-%d')

In [None]:
# set province_state where subdivisions code exists

df['PROVINCE_STATE'] = None
# Rows that resolved to a subdivision keep their original location name as
# the province/state. Assign through a single .loc call; the chained form
# df[col].loc[mask] = ... is not guaranteed to write back into df.
has_subdivision = df['ISO3166_2'].notna()
df.loc[has_subdivision, 'PROVINCE_STATE'] = df.loc[has_subdivision, 'location_name']

In [None]:
# drop helper columns; the dataset's column set varies between releases, so
# only the candidates that are actually present are dropped
cols = list(df.columns)
drop_cols = [name for name in ('location_name', 'V1', 'location_id') if name in cols]

df = df.drop(columns=drop_cols)

In [None]:
# display the dtype of every remaining column (notebook cell output)
df.dtypes

In [None]:
# display five randomly sampled rows as a sanity check (notebook cell output)
df.sample(5)

In [None]:
# Export a fixed, ordered set of columns to <output_folder>IHME_COVID_19.csv.
# The ISO columns are created on the frame as 'ISO3166_1' / 'ISO3166_2'
# (no underscore after "ISO"); asking to_csv for 'ISO_3166_1' / 'ISO_3166_2'
# selects columns that do not exist and raises KeyError.
export_columns = [
    "date",
    "allbed_mean", "allbed_lower", "allbed_upper",
    "ICUbed_mean", "ICUbed_lower", "ICUbed_upper",
    "InvVen_mean", "InvVen_lower", "InvVen_upper",
    "deaths_mean", "deaths_lower", "deaths_upper",
    "admis_mean", "admis_lower", "admis_upper",
    "newICU_mean", "newICU_lower", "newICU_upper",
    "totdea_mean", "totdea_lower", "totdea_upper",
    "bedover_mean", "bedover_lower", "bedover_upper",
    "icuover_mean", "icuover_lower", "icuover_upper",
    "Last_Update_Date", "Last_Reported_Flag",
    "COUNTRY_REGION", "ISO3166_1", "ISO3166_2", "PROVINCE_STATE",
]

df.to_csv(output_folder + "IHME_COVID_19.csv", columns=export_columns,
          index=False, sep=",", quoting=QUOTE_NONNUMERIC)