# Center for Systems Science and Engineering (CSSE) at Johns Hopkins University (JHU): COVID-19 Dashboard

Data from https://gisanddata.maps.arcgis.com/apps/opsdashboard/index.html, fetched using the ArcGIS REST API

In [None]:
import json
import os
from datetime import datetime, timezone

import pandas as pd
import pycountry
import requests

In [None]:
# papermill parameters
# Folder the final CSV export is written to (used by the df.to_csv call at the end of the notebook).
output_folder = "../output"

#### Fetch the record count and set the REST API query parameters

In [None]:
# Base query URL of the feature layer (all fields, pretty JSON); paging/count parameters are appended per request.
endpoint = "https://services9.arcgis.com/N9p5hsImWXAccRNI/arcgis/rest/services/Nc2JKvYFoAEOFCG5JSI6/FeatureServer/2/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&returnGeodetic=false&outFields=*&returnGeometry=true&returnCentroid=false&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pjson"
# Requests are sent with the dashboard page as the Referer.
headers = {"Referer": "https://gisanddata.maps.arcgis.com/apps/opsdashboard/index.html"}

# First ask only for the total number of records so the paging loop knows when to stop.
response = requests.get(endpoint + "&returnCountOnly=true", headers=headers)

# check request status is HTTP_OK
# (compared by value and raised explicitly: `is 200` tests object identity, and
# `assert` statements are removed when Python runs with -O)
if response.status_code != 200:
    raise RuntimeError(f"Record count request failed with HTTP status {response.status_code}")

# load to json
data = json.loads(response.text)

# NOTE(review): ArcGIS services can report failures as an "error" object inside an
# HTTP 200 response; surface that instead of a bare KeyError on 'count'.
if "error" in data:
    raise RuntimeError(f"ArcGIS query returned an error: {data['error']}")

# set query fetch params
record_count = data['count']   # total rows reported by the service
offset = 0                     # index of the next row to request
record_fetch_count = 2000      # requested page size
fields = None                  # column names, filled from the first page
records = []                   # accumulated rows

print(record_count)

#### Fetch records using the ArcGIS REST API and create a DataFrame from them

In [None]:
# Page through the layer until every record reported by the count query has been fetched.
while offset < record_count:
    response = requests.get(endpoint + f"&resultRecordCount={record_fetch_count}&resultOffset={offset}", headers=headers)

    # check if request status code is HTTP_OK
    # (value comparison + explicit raise instead of `assert ... is 200`)
    if response.status_code != 200:
        raise RuntimeError(f"Record request at offset {offset} failed with HTTP status {response.status_code}")

    # load to json
    data = json.loads(response.text)

    # NOTE(review): ArcGIS services can report failures as an "error" object inside an
    # HTTP 200 response; fail with that message rather than a KeyError below.
    if "error" in data:
        raise RuntimeError(f"ArcGIS query returned an error at offset {offset}: {data['error']}")

    # set fields (column names come from the first page only)
    if not fields:
        fields = [field['name'] for field in data['fields']]

    features = data['features']
    if not features:
        # An empty page means nothing more can be fetched; stop instead of looping forever.
        break

    # add to records, picking each value by field name so columns cannot shift
    # if the attribute order ever differs from the order of `fields`
    records += [[feature['attributes'].get(name) for name in fields] for feature in features]

    # set params: advance by the number of rows actually returned, because the
    # server may return fewer rows per page than requested
    offset += len(features)

# every record announced by the count query must have been fetched
if len(records) != record_count:
    raise RuntimeError(f"Expected {record_count} records but fetched {len(records)}")

# one row per record, default 0..n-1 index
df = pd.DataFrame(records, columns=fields)

#### Parse date, drop unnecessary columns, rename columns 

In [None]:
# Remove the service's internal id columns, normalise column names and turn the
# epoch-millisecond update stamp into a proper datetime column.
df = (
    df.drop(columns=["OBJECTID", "UID"])
    .rename(columns={"Long_": "Long", "Last_Update": "Date"})
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], unit="ms"))
)

In [None]:
# Preview the first rows after dropping/renaming columns and parsing the date.
df.head()

#### Create pycountry helper functions for resolving country codes and names

In [None]:
def resolve_iso3166_1_by_iso3(iso):
    """Return the ISO 3166-1 alpha-2 code for an ISO 3166-1 alpha-3 code.

    Args:
        iso: Three-letter country code, or an empty/None value.

    Returns:
        The two-letter code, or "" when `iso` is empty or pycountry has no
        country with that alpha-3 code.
    """
    if not iso:
        return ""

    # Look the country up once and reuse the result.
    country = pycountry.countries.get(alpha_3=iso)
    return country.alpha_2 if country else ""

def resolve_iso3166_1_by_name(name):
    """Return the ISO 3166-1 alpha-2 code of the best fuzzy match for a country name.

    Args:
        name: Country name to search for, or an empty/None value.

    Returns:
        The two-letter code of the top-ranked match, or "" when `name` is
        empty or the fuzzy search finds nothing.
    """
    # Always return a string: an implicit None would not be caught by callers
    # that test for the empty-string "unresolved" marker.
    if not name:
        return ""

    try:
        # according to docs: "...There's also a bit of prioritization included to prefer
        # matches on country names before subdivision names and have countries with more
        # matches be listed before ones with fewer matches..."
        matches = pycountry.countries.search_fuzzy(name)
    except LookupError:
        # LookupError is raised by pycountry's search_fuzzy when the result is empty
        return ""

    # alpha_2 (not alpha_3) keeps the result consistent with resolve_iso3166_1_by_iso3
    # and with the alpha_2 lookup performed in resolve_name.
    return matches[0].alpha_2 if matches else ""
        
def resolve_name(row):
    """Overwrite row["Country_Region"] with pycountry's name for row["ISO3166_1"].

    The row is returned in every case; when the code cannot be looked up
    (missing key, unknown code, lookup without a result) it is returned
    without modification.
    """
    try:
        row["Country_Region"] = pycountry.countries.get(alpha_2=row["ISO3166_1"]).name
    except (LookupError, KeyError, AttributeError):
        # best effort: keep the original name when the code does not resolve
        pass
    return row

In [None]:
# derive the ISO 3166-1 alpha-2 code from the alpha-3 code delivered in the ISO3 column
df['ISO3166_1'] = df['ISO3'].apply(resolve_iso3166_1_by_iso3)

# if the alpha-3 code cannot be resolved, try fuzzy search on the country name, select top match
# (single df.loc[mask, column] assignment: the chained form df[col].loc[mask] = ... may write
# to a temporary copy and does nothing under pandas Copy-on-Write)
unresolved = df['ISO3166_1'] == ''
df.loc[unresolved, 'ISO3166_1'] = df.loc[unresolved, 'Country_Region'].apply(resolve_iso3166_1_by_name)

# when fuzzy search cannot resolve the code either, the entity is treated as a cruise ship
df.loc[df['ISO3166_1'] == '', 'ISO3166_1'] = 'Cruise Ship'

# drop the alpha-3 column, it is no longer needed
df = df.drop(columns=["ISO3"])

# resolve names for data consistency
df = df.apply(resolve_name, axis=1)

In [None]:
# Flag the row(s) whose source timestamp equals the newest one in the dataset.
# Date is updated per row, so possibly only a single record is flagged as "last reported".
df['Last_Reported_Flag'] = df["Date"] == df["Date"].max()

# Load timestamp as naive UTC, the same value datetime.utcnow() produced;
# utcnow() is deprecated since Python 3.12, so take an aware "now" and drop the tzinfo.
df["Last_Update_Date"] = datetime.now(timezone.utc).replace(tzinfo=None)

In [None]:
# Preview the first rows of the final frame before it is written to CSV.
df.head()

In [None]:
# Build the path with os.path.join so the file lands inside output_folder whether or not
# the configured folder ends with a separator; plain string concatenation with the
# default "../output" produced "../outputJHU_DASHBOARD_COVID_19_GLOBAL.csv".
df.to_csv(os.path.join(output_folder, "JHU_DASHBOARD_COVID_19_GLOBAL.csv"), index=False)