# Coronavirus (COVID-19) England Cases Scraper
## David Beavan @DavidBeavan
## Licence: MIT. Sources: see below
## Notes
* This is liable to break, especially if the official sources change
* One problem is that the official csv data has no date info
* We try to fudge it: when we scrape today and the data looks the same as yesterday's, we do not add it

In [1]:
# load libraries
from pathlib import Path
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import datetime
import shutil

In [2]:
# Prep data
# Data is not packaged with the code: it is downloaded from the sources and
# saved under this directory for future runs
data_base_dir = Path('data/secondary_sources')
data_sub_dir = data_base_dir / 'utla_cases_table'
# parents=True also creates data_base_dir when it is missing
data_sub_dir.mkdir(parents=True, exist_ok=True)

utla_cases_file = data_sub_dir / 'utla_cases.csv'

# Load existing data if present, otherwise start with a blank dataframe
# (a real DataFrame rather than a list, so the variable has one type
# whichever branch is taken)
if utla_cases_file.exists():
    utla_cases_df = pd.read_csv(utla_cases_file, index_col='utla')
else:
    utla_cases_df = pd.DataFrame()

In [3]:
def add_new_data(df, existing_df=None):
    """Merge a freshly scraped single-date column into the accumulated table.

    The label of the first column of ``df`` is taken as the scrape date
    (an ISO ``YYYY-MM-DD`` string).

    * No (or empty) ``existing_df``: ``df`` is returned unchanged.
    * Date already present: the column is overwritten in place when its
      values differ, and ``existing_df`` is returned.
    * Date not present: the new column is outer-joined onto
      ``existing_df``, unless it is identical to the previous day's
      column, in which case ``existing_df`` is returned untouched.
    """
    new_label = list(df)[0]

    # Nothing to merge into: the new data is the whole table
    if existing_df is None or len(existing_df) == 0:
        return df

    # Same date scraped again: refresh the column only if the values changed
    if new_label in existing_df:
        if not existing_df[new_label].equals(df[new_label]):
            existing_df[new_label] = df[new_label]
        return existing_df

    # The primary source carries no date, so the data is dated by when it
    # was scraped. A scrape early in the day may therefore just be
    # yesterday's figures under today's date; compare with yesterday.
    previous_day = datetime.date.fromisoformat(new_label) - datetime.timedelta(days=1)
    previous_label = previous_day.isoformat()

    repeats_yesterday = (
        previous_label in existing_df.columns
        and existing_df[previous_label].equals(df[new_label])
    )
    if repeats_yesterday:
        return existing_df

    return existing_df.join(df, how='outer')

In [4]:
# Retired as data now comes as csv

# def scrape_page(url, existing_df=None):

#     # Fetch url, it is a web page
#     r = requests.get(url)
#     soup = BeautifulSoup(r.content, 'html.parser')

#     # Find paragraph that matches the date of the data
#     date_paras = soup.find_all('p', text=re.compile('These data are as of'))
#     if len(date_paras) != 1:
#         print('Error: too many possible dates')

#     date_para = date_paras[0].text

#     # Find the date, parse it and keep the iso data format as text
#     date_match = re.search('These data are as of .+? on (.+)\.', date_para)
#     date_string = date_match.group(1)
#     datetime_object = datetime.datetime.strptime(date_string, '%d  %B %Y')
#     scrape_date = datetime_object.date().isoformat()

#     # Find the data table and make a dataframe from it
#     table = soup.find_all('table')[0]
#     df = pd.read_html(str(table), index_col=0)
#     df = df[0]

#     # Neaten up the dataframe
#     df.index.rename('utla', inplace=True)
#     df.columns = [scrape_date]

#     # Remove those with no location, later datasets do not give awaiting info
#     df.drop('Awaiting confirmation', inplace=True, errors='ignore')

#     # As of 2020-03-09 The following areas were merged, we will retrospectivaly do this for early data
#     df.rename(index={'Cornwall': 'Cornwall and Isles of Scilly',
#                      'Hackney': 'Hackney and City of London'}, inplace=True)

#     # Add this column to the existing dataframe
#     df = add_new_data(df, existing_df)

#     return df

In [5]:
# Remove whitespace and thousand separators
def str_to_int(str):
    """Convert a case count such as ``' 1,234 '`` to an ``int``.

    Surrounding whitespace and comma thousand separators are removed before
    conversion. Values that are already numeric (no ``strip`` method) are
    accepted as long as they are whole numbers.

    The parameter keeps its original name ``str`` so existing keyword
    callers continue to work, even though it shadows the builtin here.

    Raises:
        ValueError: if the text is not an integer, or a numeric value is
            not a whole number.
    """
    try:
        trimmed = str.strip()
    except AttributeError:
        # Not text: already a number, so there is nothing to clean up
        whole = int(str)
        if whole != str:
            raise ValueError(f'not a whole number: {str!r}')
        return whole
    return int(trimmed.replace(',', ''))


def scrape_csv(url, existing_df=None, override_date=None):
    """Download the cases CSV at ``url`` and merge it into ``existing_df``.

    The CSV is indexed on its ``GSS_CD`` column (renamed ``utla``), the
    ``GSS_NM`` column is dropped and ``TotalCases`` is converted to integers
    with ``str_to_int``. The remaining column is then labelled with
    ``override_date`` when given, otherwise with today's ISO date, and
    handed to ``add_new_data``.

    NOTE(review): relabelling with a one-element list presumably relies on
    ``TotalCases`` being the only column left; confirm against the source.
    """
    # Fetch url, it is a csv; index it by area code
    cases = pd.read_csv(url, index_col='GSS_CD').rename_axis('utla')

    # Area names are not needed
    cases = cases.drop(columns=['GSS_NM'])

    # 2020-04-05 raw data had spaces and thousand separators: convert to int
    cases['TotalCases'] = cases['TotalCases'].apply(str_to_int)

    # The source has no date info, so date the data by hand or by today
    column_label = (
        override_date
        if override_date is not None
        else datetime.date.today().isoformat()
    )
    cases.columns = [column_label]

    # Add this column to the existing dataframe
    return add_new_data(cases, existing_df)

In [6]:
# Retired as we have transitioned from text indexes to utla codes when data source moved to csv

# def reindex(df):
#     # Fetch url of most recent data, it is a csv
#     data_df = pd.read_csv(
#         'https://www.arcgis.com/sharing/rest/content/items/b684319181f94875a6879bbc833ca3a6/data')

#     # Select code and name colmuns
#     data_df = data_df[['GSS_CD', 'GSS_NM']]

#     # Index on name, to match the dataframe
#     data_df.set_index('GSS_NM', inplace=True)
#     data_df.index.name = 'utla'

#     # Join to dataframe, so we now have code accessable
#     df = df.join(data_df)

#     # Change index to code
#     df.set_index('GSS_CD', inplace=True)
#     df.index.name = 'utla'

#     # Add in names in index 0
#     df.insert(0, 'GSS_NM', data_df.index.tolist())

#     return df

In [7]:
# Retired as data now comes as csv

# 8 March 2020
# utla_cases_df = scrape_page('https://web.archive.org/web/20200308150435/https://www.gov.uk/government/publications/coronavirus-covid-19-number-of-cases-in-england/coronavirus-covid-19-number-of-cases-in-england', utla_cases_df)

# 9 March 2020
# utla_cases_df = scrape_page('https://web.archive.org/web/20200309190503/https://www.gov.uk/government/publications/coronavirus-covid-19-number-of-cases-in-england/coronavirus-covid-19-number-of-cases-in-england', utla_cases_df)

# 10 March 2020
# utla_cases_df = scrape_page('https://web.archive.org/web/20200310222310/https://www.gov.uk/government/publications/coronavirus-covid-19-number-of-cases-in-england/coronavirus-covid-19-number-of-cases-in-england', utla_cases_df)

# 11 March 2020
# utla_cases_df = scrape_page('https://web.archive.org/web/20200311173829/https://www.gov.uk/government/publications/coronavirus-covid-19-number-of-cases-in-england/coronavirus-covid-19-number-of-cases-in-england', utla_cases_df)

In [8]:
# More recent case data: it is not archived and is replaced daily at the
# source, but then keeping each day's copy is what we are doing here

# To record the data under a specific date, pass it explicitly:
# utla_cases_df = scrape_csv('https://www.arcgis.com/sharing/rest/content/items/b684319181f94875a6879bbc833ca3a6/data', utla_cases_df, '2020-03-14')
latest_cases_url = 'https://www.arcgis.com/sharing/rest/content/items/b684319181f94875a6879bbc833ca3a6/data'
utla_cases_df = scrape_csv(latest_cases_url, utla_cases_df)

In [9]:
# Preview data: display the first rows of the merged table
utla_cases_df.head()

Unnamed: 0_level_0,GSS_NM,2020-03-08,2020-03-09,2020-03-10,2020-03-11,2020-03-12,2020-03-13,2020-03-14,2020-03-15,2020-03-16,...,2020-03-31,2020-04-01,2020-04-02,2020-04-03,2020-04-04,2020-04-05,2020-04-06,2020-04-07,2020-04-08,2020-04-09
utla,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E06000001,Hartlepool,0,0,0,0,0,0,0,0,0,...,12,15,17,23,23,29,33,36,49,55
E06000002,Middlesbrough,0,0,0,0,0,0,0,0,0,...,44,59,74,92,111,126,151,169,196,213
E06000003,Redcar and Cleveland,0,0,0,0,0,0,1,1,0,...,43,51,55,64,75,84,100,112,119,128
E06000004,Stockton-on-Tees,0,0,0,2,2,2,2,2,2,...,47,60,70,77,81,108,117,123,143,152
E06000005,Darlington,0,0,0,0,0,0,2,2,2,...,23,24,30,32,32,45,52,55,77,95


In [10]:
# Column totals over all rows, i.e. one total per date column
# (a text column such as GSS_NM is concatenated rather than added)
utla_cases_df.sum()

GSS_NM        HartlepoolMiddlesbroughRedcar and ClevelandSto...
2020-03-08                                                  224
2020-03-09                                                  253
2020-03-10                                                  309
2020-03-11                                                  357
2020-03-12                                                  434
2020-03-13                                                  535
2020-03-14                                                  764
2020-03-15                                                  975
2020-03-16                                                 1109
2020-03-17                                                 1421
2020-03-18                                                 2065
2020-03-19                                                 2544
2020-03-20                                                 3246
2020-03-21                                                 3995
2020-03-22                              

In [11]:
# Write the merged table back out for future runs
utla_cases_df.to_csv(utla_cases_file)

# Backups are kept one directory per day, named after today's date
backup_stamp = datetime.date.today().isoformat()
backup_dir = Path('data/secondary_sources_bak/secondary_sources_bak_' + backup_stamp)

# copytree needs a fresh destination, so clear any backup made earlier today
if backup_dir.exists():
    shutil.rmtree(backup_dir)

# Snapshot the whole secondary sources directory, including the file just saved
shutil.copytree(Path('data/secondary_sources'), backup_dir)

PosixPath('data/secondary_sources_bak/secondary_sources_bak_2020-04-09')