# Center for Systems Science and Engineering (CSSE) at Johns Hopkins University (JHU): COVID-19 Dashboard

Data from https://gisanddata.maps.arcgis.com/apps/opsdashboard/index.html, retrieved using the ArcGIS REST API

In [None]:
import pandas as pd
import requests
import json
import pycountry
from datetime import datetime
from functools import reduce

In [None]:
# papermill parameters
# output_folder: location prefix that the final CSV export at the end of this notebook writes to
output_folder = "../output"

#### Fetch the record count from the global feature server and set the REST API paging parameters

In [None]:
# Query URL of feature layer 1; later requests append extra query parameters to it.
endpoint = "https://services9.arcgis.com/N9p5hsImWXAccRNI/arcgis/rest/services/Nc2JKvYFoAEOFCG5JSI6/FeatureServer/1/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&returnGeodetic=false&outFields=*&returnGeometry=true&returnCentroid=false&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pjson"
# Every request is sent with the dashboard page as Referer.
headers = {"Referer": "https://gisanddata.maps.arcgis.com/apps/opsdashboard/index.html"}

# Ask only for the number of matching rows.
response = requests.get(endpoint + "&returnCountOnly=true", headers=headers)

# check request status is HTTP_OK
# (compared by value; an explicit raise also survives `python -O`, unlike assert)
if response.status_code != 200:
    raise RuntimeError(f"Record count request failed with HTTP status {response.status_code}")

# load to json
data = json.loads(response.text)

# set query fetch params
record_count = data['count']   # total number of rows to download
offset = 0                     # index of the first row of the next page
record_fetch_count = 1000      # rows requested per page
fields = None                  # column names, filled in from the first page
records = []                   # downloaded rows

print(record_count)

#### Fetch records from the global feature server using the ArcGIS REST API and create a DataFrame from the records

In [None]:
# Download the rows page by page until the number reported by the count query is reached.
while offset < record_count:
    response = requests.get(
        endpoint + f"&resultRecordCount={record_fetch_count}&resultOffset={offset}",
        headers=headers,
    )
    # check if request status code is HTTP_OK
    # (compared by value; an explicit raise also survives `python -O`, unlike assert)
    if response.status_code != 200:
        raise RuntimeError(
            f"Record request at offset {offset} failed with HTTP status {response.status_code}"
        )

    # load to json
    data = json.loads(response.text)

    # set fields (column names) once, from the first page
    if fields is None:
        fields = [field['name'] for field in data['fields']]

    # Read every attribute by field name so the values always line up with the
    # columns, instead of relying on the key order of the attributes dict.
    page = [
        [feature['attributes'].get(name) for name in fields]
        for feature in data['features']
    ]

    # An empty page would never advance the offset; stop and let the check below report it.
    if not page:
        break

    # add to records
    records += page

    # Advance by the rows actually received, which may be fewer than requested.
    offset += len(page)

if len(records) != record_count:
    raise RuntimeError(f"Expected {record_count} records but fetched {len(records)}")

# One row per record, default 0..n-1 index, columns named after the layer fields.
global_df = pd.DataFrame(records, columns=fields)

In [None]:
#### Fetch the record count from the testing feature server and set the REST API paging parameters

In [None]:
# Query URL of feature layer 3; later requests append extra query parameters to it.
endpoint = "https://services9.arcgis.com/N9p5hsImWXAccRNI/arcgis/rest/services/Nc2JKvYFoAEOFCG5JSI6/FeatureServer/3/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&returnGeodetic=false&outFields=*&returnGeometry=true&returnCentroid=false&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pjson"
# Every request is sent with the dashboard page as Referer.
headers = {"Referer": "https://gisanddata.maps.arcgis.com/apps/opsdashboard/index.html"}

# Ask only for the number of matching rows.
response = requests.get(endpoint + "&returnCountOnly=true", headers=headers)

# check request status is HTTP_OK
# (compared by value; an explicit raise also survives `python -O`, unlike assert)
if response.status_code != 200:
    raise RuntimeError(f"Record count request failed with HTTP status {response.status_code}")

# load to json
data = json.loads(response.text)

# set query fetch params
record_count = data['count']   # total number of rows to download
offset = 0                     # index of the first row of the next page
record_fetch_count = 1000      # rows requested per page
fields = None                  # column names, filled in from the first page
records = []                   # downloaded rows

print(record_count)

#### Fetch records from the testing feature server using the ArcGIS REST API and create a DataFrame from the records

In [None]:
# Download the rows page by page until the number reported by the count query is reached.
while offset < record_count:
    response = requests.get(
        endpoint + f"&resultRecordCount={record_fetch_count}&resultOffset={offset}",
        headers=headers,
    )
    # check if request status code is HTTP_OK
    # (compared by value; an explicit raise also survives `python -O`, unlike assert)
    if response.status_code != 200:
        raise RuntimeError(
            f"Record request at offset {offset} failed with HTTP status {response.status_code}"
        )

    # load to json
    data = json.loads(response.text)

    # set fields (column names) once, from the first page
    if fields is None:
        fields = [field['name'] for field in data['fields']]

    # Read every attribute by field name so the values always line up with the
    # columns, instead of relying on the key order of the attributes dict.
    page = [
        [feature['attributes'].get(name) for name in fields]
        for feature in data['features']
    ]

    # An empty page would never advance the offset; stop and let the check below report it.
    if not page:
        break

    # add to records
    records += page

    # Advance by the rows actually received, which may be fewer than requested.
    offset += len(page)

if len(records) != record_count:
    raise RuntimeError(f"Expected {record_count} records but fetched {len(records)}")

# One row per record, default 0..n-1 index, columns named after the layer fields.
test_df = pd.DataFrame(records, columns=fields)

#### Parse date, drop unnecessary columns, rename columns 

In [None]:
# Global layer: drop the identifier columns, shorten the column names and
# turn the epoch-millisecond update time into a datetime.
global_df = (
    global_df
    .drop(columns=["OBJECTID", "UID", "Combined_Key"])
    .rename(columns={"Long_": "Long", "Last_Update": "Date", "Admin2": "County"})
)
global_df["Date"] = pd.to_datetime(global_df["Date"], unit="ms")
# Add these three rate columns to the global frame as empty placeholders.
for rate_column in ("Hospitalization_Rate", "Testing_Rate", "Mortality_Rate"):
    global_df[rate_column] = None

# Testing layer: same clean-up; it gets an empty County column instead.
test_df = (
    test_df
    .drop(columns=["OBJECTID", "UID"])
    .rename(columns={"Long_": "Long", "Last_Update": "Date"})
)
test_df["Date"] = pd.to_datetime(test_df["Date"], unit="ms")
test_df["County"] = None

#### Append dataframes, reset index

In [None]:
# Stack the testing rows under the global rows. pd.concat replaces
# DataFrame.append, which was deprecated and then removed in pandas 2.0;
# ignore_index renumbers the rows and sort=True orders the columns by name.
global_df = pd.concat([global_df, test_df], ignore_index=True, sort=True)
# reset_index() keeps the old row numbers as an extra "index" column.
global_df = global_df.reset_index()

#### Get pycountry subdivisions for the US, Canada, the UK, China, the Netherlands and Australia

In [None]:
# Fragments removed, in this order, from Chinese subdivision names before
# they are used as lookup keys.
cn_replace = [
    " Sheng",
    " Zizhiqu",
    " SAR (see also separate country code entry under HK)",
    " Shi",
    " (see also separate country code entry under TW)",
    " Uygur",
    " SAR (see also separate country code entry under MO)",
    " Huizi",
    " Zhuangzu",
    " Huizi",
]

# Fragments removed from Canadian subdivision names.
ca_replace = [
    " Territory"
]


def _subdivision_codes(country_code, strip_fragments=()):
    # Map each subdivision name of one country (minus the given fragments)
    # to its code with the "<country_code>-" text removed.
    code_prefix = country_code + "-"
    name_to_code = {}
    for sd in pycountry.subdivisions.get(country_code=country_code):
        cleaned_name = sd.name
        for fragment in strip_fragments:
            cleaned_name = cleaned_name.replace(fragment, "")
        name_to_code[cleaned_name] = sd.code.replace(code_prefix, "")
    return name_to_code


us_subdivisions = _subdivision_codes("US")
ca_subdivisions = _subdivision_codes("CA", ca_replace)
uk_subdivisions = _subdivision_codes("GB")
cn_subdivisions = _subdivision_codes("CN", cn_replace)
nl_subdivisions = _subdivision_codes("NL")
au_subdivisions = _subdivision_codes("AU")

# Merge into one name -> code table; on duplicate names the later country wins.
# NOTE(review): nl_subdivisions is built above but is not part of this merge - confirm that is intended.
subdivisions = {}
for country_table in (
    us_subdivisions,
    ca_subdivisions,
    uk_subdivisions,
    cn_subdivisions,
    au_subdivisions,
):
    subdivisions.update(country_table)

#### create pycountry data resolve helper functions

In [None]:
def resolve_iso3166_1_by_iso3(iso):
    """Return the alpha-2 country code for an alpha-3 country code.

    Args:
        iso: alpha-3 code to look up; may also be None, NaN or empty.

    Returns:
        The ``alpha_2`` attribute of the matching pycountry country, or an
        empty string when ``iso`` is not a non-empty string or no country
        matches.
    """
    # Missing values reach this function as None or NaN; only a non-empty
    # string can be looked up (len() on a float NaN would raise TypeError).
    if not isinstance(iso, str) or not iso:
        return ""

    # Look the country up once and reuse the result.
    country = pycountry.countries.get(alpha_3=iso)
    if country is None:
        return ""
    return country.alpha_2

def resolve_iso3166_1_by_name(name):
    """Return the alpha-2 country code of the best fuzzy match for a name.

    The first result of ``pycountry.countries.search_fuzzy`` is used; per the
    pycountry docs the search prefers matches on country names over
    subdivision names and lists countries with more matches first.

    Args:
        name: country name to search for; may also be None, NaN or empty.

    Returns:
        The ``alpha_2`` attribute of the top match, or an empty string when
        ``name`` is not a non-empty string or the search finds nothing.
        alpha-2 is returned (not alpha-3) so the value agrees with
        resolve_iso3166_1_by_iso3 and can be used for ``alpha_2`` lookups.
    """
    # Always answer with a string so callers can test for "" (unresolved).
    if not isinstance(name, str) or not name:
        return ""

    try:
        matches = pycountry.countries.search_fuzzy(name)
    except LookupError:
        # search_fuzzy raises LookupError when the result is empty,
        # so an additional IndexError handler would be redundant.
        return ""
    return matches[0].alpha_2
        
def resolve_iso3166_2(state):
    """Look up the subdivision code for a state/province name.

    Returns the entry of the module-level ``subdivisions`` table for
    ``state``, or None when ``state`` is falsy or has no entry.
    """
    if not state:
        return None
    return subdivisions.get(state)
        
def resolve_name(row):
    """Overwrite ``row["Country_Region"]`` with the pycountry country name.

    The country is looked up by ``row["ISO3166_1"]`` as an alpha-2 code.
    When the lookup fails (LookupError, KeyError or AttributeError) the row
    is left untouched. The same row object is returned in both cases.
    """
    try:
        row["Country_Region"] = pycountry.countries.get(alpha_2=row["ISO3166_1"]).name
    except (LookupError, KeyError, AttributeError):
        # No usable match: keep the name that came with the data.
        pass
    return row

#### resolve pycountry specific fields

In [None]:
# All masked writes below use a single global_df.loc[mask, column] = ... step.
# The chained form global_df[column].loc[mask] = ... may write to a temporary
# copy and does nothing under pandas Copy-on-Write.
global_df['ISO3166_1'] = global_df['ISO3'].apply(resolve_iso3166_1_by_iso3)

# if iso3166_3 cannot resolve, try using fuzzy search, select top match
unresolved = global_df['ISO3166_1'] == ''
global_df.loc[unresolved, 'ISO3166_1'] = (
    global_df.loc[unresolved, 'Country_Region'].apply(resolve_iso3166_1_by_name)
)

# when fuzzy search cannot resolve iso3166_1 either, the entity is treated as a cruise ship
global_df.loc[global_df['ISO3166_1'] == '', 'ISO3166_1'] = 'Cruise Ship'

# drop iso3166_3
global_df = global_df.drop(columns=["ISO3"])

# resolve names for data consistency
global_df = global_df.apply(resolve_name, axis=1)

# resolve iso3166_2 if given
global_df['ISO3166_2'] = global_df['Province_State'].apply(resolve_iso3166_2)

# cruise ship rows carry the same marker in the subdivision column
global_df.loc[global_df['ISO3166_1'] == 'Cruise Ship', 'ISO3166_2'] = 'Cruise Ship'

#### Normalize unassigned County values and the FIPS format

In [None]:
# Replace every case variant of "unassigned" in County with the lower-case spelling.
global_df['County'] = global_df['County'].replace(r'(?i)unassigned', 'unassigned', regex=True)

# Left-pad non-empty FIPS values with zeros to five characters. A single
# .loc[mask, column] write is used because the chained form
# global_df['FIPS'].loc[mask] = ... may write to a temporary copy and does
# nothing under pandas Copy-on-Write.
has_fips = global_df['FIPS'] != ''
global_df.loc[has_fips, 'FIPS'] = global_df.loc[has_fips, 'FIPS'].str.zfill(5)

# A value made of three or more zeros followed by two digits is rewritten with
# the two digits first (e.g. "00012" -> "12000").
# NOTE(review): presumably moves a two-digit state code in front of the padding - confirm.
global_df['FIPS'] = global_df['FIPS'].replace(r'^(0{3,})(\d{2})$', r'\g<2>\g<1>', regex=True)
global_df = global_df.astype({
    'FIPS': 'object'
})

#### add Last_Reported_Flag and Last_Update_Date

In [None]:
# Flag the rows whose Date equals the newest Date in the dataset.
# Date is updated per row, so possibly only one record is "last reported".
global_df['Last_Reported_Flag'] = global_df["Date"] == global_df["Date"].max()

# Current UTC time as a naive timestamp, the same value datetime.utcnow()
# produced; utcnow() is deprecated since Python 3.12.
global_df["Last_Update_Date"] = pd.Timestamp.now(tz="UTC").tz_localize(None)

In [None]:
# Columns kept in the final dataset, in output order; every other column is dropped.
output_columns = [
    "Country_Region",
    "Province_State",
    "County",
    "FIPS",
    "Date",
    "Active",
    "People_Tested",
    "Confirmed",
    "People_Hospitalized",
    "Deaths",
    "Recovered",
    "Incident_Rate",
    "Testing_Rate",
    "Hospitalization_Rate",
    "Mortality_Rate",
    "Long",
    "Lat",
    "ISO3166_1",
    "ISO3166_2",
    "Last_Update_Date",
    "Last_Reported_Flag",
]
global_df = global_df[output_columns]

In [None]:
# Show the dtype of every column of the final dataframe.
global_df.dtypes

In [None]:
# Display the final dataframe.
global_df

In [None]:
import os

# Build the path with os.path.join so the file lands inside output_folder
# whether or not the configured folder ends with a path separator. Plain
# string concatenation with "../output" produced "../outputJHU_DASHBOARD_...csv".
global_df.to_csv(os.path.join(output_folder, "JHU_DASHBOARD_COVID_19_GLOBAL.csv"), index=False)

In [None]:
# Display only the rows where People_Hospitalized has a value.
global_df.loc[global_df['People_Hospitalized'].notna()]