## WHO Coronavirus disease (COVID-2019) situation reports

This notebook converts the PDF situation reports published at https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports into tabular format.


In [None]:
import os
import re
from datetime import datetime, timezone

import numpy
import pandas as pd
import pycountry
import requests
import tabula

In [None]:
# papermill parameters
# Folder that receives the generated WHO_SITUATION_REPORTS.csv.
# Keep the trailing slash when overriding this value.
output_folder = "../output/"


In [None]:
# Discover the available situation reports by scraping the PDF links from the
# WHO index page. The capture group is the report name without ".pdf".
# The dot is escaped so that only a literal ".pdf" suffix matches.
pattern = r'/docs/default-source/coronaviruse/situation-reports/([^/]+)\.pdf'

# A timeout keeps the notebook from hanging forever on a stalled connection,
# and an HTTP error is surfaced here instead of silently producing an empty
# report list further down.
r = requests.get(
    'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports',
    timeout=60,
)
r.raise_for_status()

# The same report can be linked several times on the page; keep each name once.
reports_to_fetch = list(set(re.findall(pattern, r.text)))



In [None]:
# Keep only the report names that compare (as plain strings) greater than
# '20200229', ordered descending.
reports_to_fetch = sorted(
    (name for name in reports_to_fetch if name > '20200229'),
    reverse=True,
)

# On CI only the first two entries of the descending list are processed.
if os.environ.get("ENVIRONMENT") == "CI":
    reports_to_fetch = reports_to_fetch[:2]

#reports_to_fetch = reports_to_fetch[0:7]
reports_to_fetch

In [None]:
# Download every selected report and let tabula extract all tables from all
# pages. The result maps report name -> tables returned by tabula.read_pdf.
# header=None keeps the first row of each table as data instead of column names.
all_reports = {
    report: tabula.read_pdf(
        "https://www.who.int/docs/default-source/coronaviruse/situation-reports/" + report + ".pdf",
        pages='all',
        pandas_options={'header': None},
        silent=True,
    )
    for report in reports_to_fetch
}


In [None]:
# Build one long table with a row per country and report.
#
# Each extracted table is normalised to the 7-column layout of the WHO country
# table; tables that do not end up with exactly 7 columns are ignored.
# Frames are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole
# accumulated frame on every call.
sitrep_column_names = {
    0: 'Country',
    1: 'Total_Cases',
    2: 'Cases_New',
    3: 'Deaths',
    4: 'Deaths_New',
    5: 'Transmission_Classification',
    6: 'Days_Since_Last_Reported_Case',
}
country_frames = []

for report, all_tables in all_reports.items():

    for df in all_tables:

        # Data quality issue, sometimes we got an extra null column
        if len(df.columns) > 7:
            # df.columns is evaluated once, so inserting/dropping columns
            # inside the loop does not disturb the iteration itself.
            for i in df.columns:

                # Only non-numeric columns can hold two numbers merged into a
                # single cell (e.g. "123 45"); numeric columns have no .str.
                if not pd.api.types.is_numeric_dtype(df[i]):
                    if df[i].str.match(r"^\d+ (\d{4,8}|\d{1,2})$").any():
                        # Split the merged cell: second number goes to a new
                        # column, first number stays in the current one.
                        df.insert(i, str(i) + "_", value=df[i].str.split(' ').str[1])
                        df[i] = df[i].str.split(' ').str[0]

                # '§' is treated as a missing value.
                df[i] = df[i].replace('§', numpy.nan)
                # Drop columns that contain nothing but missing values.
                if df[i].isnull().all():
                    df = df.drop(labels=i, axis=1)

            # Renumber the surviving columns 0..n-1 by position. Assigning all
            # labels at once avoids the duplicate labels that one-by-one
            # renaming produces when a new label collides with an old one.
            df.columns = range(len(df.columns))

        if len(df.columns) == 7:
            df = df.rename(columns=sitrep_column_names)
            # Filled in later by the ISO 3166-1 resolution step.
            df["ISO3166-1"] = ""
            df['Country/Region'] = ""
            # The report date is parsed from the first 8 characters of the name.
            df["Date"] = datetime.strptime(report[0:8], '%Y%m%d')
            df["Situation_Report_name"] = report
            df["Situation_Report_URL"] = "https://www.who.int/docs/default-source/coronaviruse/situation-reports/"+ report +".pdf"
            country_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
if country_frames:
    country_data = pd.concat(country_frames, ignore_index=True)
else:
    country_data = pd.DataFrame([], columns=[])

In [None]:
# Remove rows with a null country or a null total-cases value.
country_data = country_data[country_data['Country'].notnull()]
country_data = country_data[country_data['Total_Cases'].notnull()]

# Remove header rows: because the tables were read with header=None, pieces of
# the column title show up as values in the last column.
header_fragments = [
    'reported case',
    'Days since last reported case',
    'last reported',
    'Days since last',
]
country_data = country_data[~country_data.Days_Since_Last_Reported_Case.isin(header_fragments)]
# Take an explicit copy so the assignments below modify an independent frame
# rather than a filtered view (avoids chained-assignment warnings).
country_data = country_data[country_data['Days_Since_Last_Reported_Case'].notnull()].copy()

# Remove `*` from numbers. regex=False makes this a literal replacement on
# every pandas version (an unescaped '*' is not a valid regular expression and
# '\*' is only stripped when the pattern is interpreted as a regex).
days_since_last_case = (
    country_data["Days_Since_Last_Reported_Case"]
    .astype("str")
    .str.replace('*', '', regex=False)
)
country_data["Days_Since_Last_Reported_Case"] = pd.to_numeric(days_since_last_case).astype("int32")

# Numbers may contain spaces; strip them. These columns stay as text.
for count_column in ("Deaths", "Deaths_New", "Cases_New"):
    country_data[count_column] = (
        country_data[count_column].astype("str").str.replace(' ', '', regex=False)
    )

# Total_Cases is converted to text first, like the columns above: without it
# the .str accessor yields NaN for cells that are already numeric, and those
# totals would then be replaced by 0.
country_data["Total_Cases"] = (
    country_data["Total_Cases"]
    .astype("str")
    .str.replace(' ', '', regex=False)
    .astype('float')
    .fillna(0)
    .astype('int32')
)



In [None]:
#country_data[country_data['Country']=='United States of America']
#country_data[country_data['Country']=='Spain']
#all_reports['20200319-sitrep-59-covid-19']

In [None]:
# Fix for 20200515-covid-19-sitrep-116, where the Diamond Princess row carries
# "0-" in the new-deaths column:
# International conveyance (Diamond Princess),712,0,13,0-,,60,,International conveyance (Diamond Princess),2020-05-15,20200515-covid-19-sitrep-116,https://www.who.int/docs/default-source/coronaviruse/situation-reports/20200515-covid-19-sitrep-116.pdf,2020-05-19 12:07:09.219221,False
# Every occurrence of "0-" in Deaths_New is rewritten to "0".
deaths_new_text = country_data["Deaths_New"].astype("str")
country_data["Deaths_New"] = deaths_new_text.str.replace('0-', '0')


In [None]:
# Maps country names exactly as they come out of the PDFs (including footnote
# markers and fragments of names broken across lines) to the name that is
# looked up in pycountry.
changed_names = {
    "The United Kingdom": "United Kingdom",
    "Serbia††": "Serbia",
    "Iran (Islamic Republic of)": "Iran",
    "occupied Palestinian territory": "Palestine",
    "occupied Palestinian Territory": "Palestine",
    "Occupied Palestinian Territory": "Palestine",
    "Venezuela (Bolivarian Republic of)": "Venezuela",
    "of)": "Venezuela", # 20200501-covid-19-sitrep
    "Bolivia (Plurinational State of)": "Bolivia",
    "State of)": "Bolivia", # 20200314-sitrep-54-covid-19
    "Republic of)": "Venezuela", # 20200314-sitrep-54-covid-19
    "Côte d’Ivoire": "Côte d'Ivoire",
    "Cote d’Ivoire": "Côte d'Ivoire",
    "Cote d Ivoire": "Côte d'Ivoire",
    "International": "International conveyance (Diamond Princess)",
    "conveyance": "International conveyance (Diamond Princess)",
    "Other*": "International conveyance (Diamond Princess)",
    "Kosovo[1]": "Kosovo",
    "United States Virgin Islands": "Virgin Islands",
    "Democratic Republic of the Congo": "Congo, The Democratic Republic of the",
    "Kingdom¶": "United Kingdom",
    "the United Kingdom": "United Kingdom",
    "the)": "Northern Mariana Islands",
    "Lao People’s": "Lao People's Democratic Republic",
    "wealth of the)": "Northern Mariana Islands",
    "(Commonwealth of the)": "Northern Mariana Islands",
}

# Carriage returns inside a cell become spaces.
country_data["Country"] = country_data["Country"].str.replace('\r', ' ', regex=False)
# Strip the literal '^' character. regex=False is required: interpreted as a
# regular expression, '^' is the start-of-string anchor and removes nothing.
country_data["Country"] = country_data["Country"].str.replace('^', '', regex=False)
# Whole-value replacement using the table above.
country_data["Country"] = country_data["Country"].replace(changed_names)

countries = country_data["Country"].unique()

# Maps each distinct country name to its pycountry record, or to None for the
# "conveyance" entries, which are not looked up.
country_dict = {}

for country in countries:
    if "conveyance" not in country:
        try:
            # The first fuzzy-search result is used as the match.
            country_dict[country] = pycountry.countries.search_fuzzy( country )[0]
        except LookupError as err:
            # Same exception type as before, but it now names the offending
            # value so it can be added to changed_names.
            raise LookupError(
                "No pycountry match for country name {!r}; "
                "consider adding it to changed_names".format(country)
            ) from err
    else:
        country_dict[country] = None

        
#country_dict

In [None]:
def resolve_iso3166_1_row(row):
    """Fill the ISO code and display name of one row from ``country_dict``.

    ``row["Country"]` is looked up in the module-level ``country_dict``.
    When the entry is truthy, its ``alpha_2`` and ``name`` attributes are
    written to ``ISO3166-1`` and ``Country/Region``. Otherwise the code is
    left empty and the raw ``Country`` value is reused as ``Country/Region``.

    The row is modified in place and returned. A ``KeyError`` is raised when
    the country is not a key of ``country_dict``.
    """
    match = country_dict[row["Country"]]
    if not match:
        row["ISO3166-1"] = ""
        row['Country/Region'] = row["Country"]
        return row

    row["ISO3166-1"] = match.alpha_2
    row['Country/Region'] = match.name
    return row
    

# Resolve the ISO 3166-1 code and display name row by row (axis 1 = one call per row).
data = country_data.apply(resolve_iso3166_1_row, axis=1)
        


## Adding Metadata

Before we save the file locally, we add the `Last_Update_Date` column, set to the current time in the `UTC` time zone.


In [None]:
# Current UTC time, stamped on every row. datetime.utcnow() is deprecated since
# Python 3.12; the tzinfo is dropped again so the stored value stays a naive
# UTC timestamp and the CSV format does not change.
data["Last_Update_Date"] = datetime.now(timezone.utc).replace(tzinfo=None)

In [None]:
# Flag the rows that belong to the latest report date present in the data.
latest_report_date = data['Date'].max()
data['Last_Reported_Flag'] = data['Date'].eq(latest_report_date)

In [None]:
# Write the final table without the index column. os.path.join produces the
# same path as before for a folder ending in a separator, and also works when
# output_folder is supplied without one.
data.to_csv(os.path.join(output_folder, "WHO_SITUATION_REPORTS.csv"), index=False)