# Coronavirus (COVID-19) England Cases Scraper
## David Beavan @DavidBeavan
## Licence: MIT. Sources: see below
## Notes
* This is liable to break, often, if the official sources change (and they do)

In [1]:
# load libraries
from pathlib import Path
import requests
from lxml import etree
import re
import pandas as pd
import datetime
import shutil
import json

In [2]:
# Fetch the raw rest response from listing azure container


def blob_container_list_rest_response(blob_service_endpoint, container_name):
    """Fetch the raw REST response from listing an Azure blob container.

    Args:
        blob_service_endpoint: Base URI of the blob service. It is
            concatenated as-is with ``container_name``, so it must already
            end with any separator that is needed.
        container_name: Name of the container, appended to the endpoint.

    Returns:
        The body of the HTTP response, as bytes.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.Timeout: If the service does not respond in time.
    """
    uri = blob_service_endpoint + container_name + '?restype=container&comp=list'

    # Without a timeout requests can wait forever on an unresponsive server
    r = requests.get(uri, timeout=60)

    # Fail here rather than hand an error document to the XML parsing step
    r.raise_for_status()

    return r.content

In [3]:
# List all blobs in azure container


def blob_container_list(blob_service_endpoint, container_name):
    """Return the full URI of every blob listed in an Azure container.

    The container listing is fetched, parsed as XML, and each
    ``/EnumerationResults/Blobs/Blob/Name`` entry is prefixed with the
    endpoint and container name.
    """
    listing_xml = blob_container_list_rest_response(
        blob_service_endpoint, container_name)
    tree = etree.XML(listing_xml)
    name_nodes = tree.xpath('/EnumerationResults/Blobs/Blob/Name')

    return [
        blob_service_endpoint + container_name + node.text
        for node in name_nodes
    ]

In [4]:
# Return most recent blob given dated list


def most_recent_data(blobs):
    """Return the blob carrying the latest ``data_<digits>.json`` stamp.

    Args:
        blobs: Iterable of blob names or URIs (strings).

    Returns:
        The entry whose numeric stamp is largest. When several entries share
        the largest stamp, the first one encountered is returned.

    Raises:
        ValueError: If no entry matches ``data_<digits>.json``.
    """
    pattern = re.compile(r'.*?data_(\d+)\.json')

    # Keep only names that really match; a name that merely contains 'data_'
    # (e.g. 'metadata_x.txt') is skipped instead of crashing on a None match.
    dated_blobs = []
    for blob in blobs:
        match = pattern.match(blob)
        if match:
            dated_blobs.append((int(match[1]), blob))

    if not dated_blobs:
        raise ValueError('no blob named data_<digits>.json was found')

    # max() returns the first maximal item, so ties keep the earliest entry
    return max(dated_blobs, key=lambda pair: pair[0])[1]

In [5]:
# Get uri of most recent PHE Covid-19 data

blobs = blob_container_list(
    blob_service_endpoint='https://publicdashacc.blob.core.windows.net/',
    container_name='publicdata/',
)
# Pick the newest dated data file out of the container listing
most_recent_data_uri = most_recent_data(blobs=blobs)

In [6]:
# Preview data
# most_recent_data_uri

In [7]:
# Prepare local storage
# The data is not packaged with the code; it is downloaded from the sources
# and saved here for future runs.
data_base_dir = Path('data/secondary_sources')
data_sub_dir = data_base_dir / 'utla_cases_table'
utla_cases_file = data_sub_dir / 'utla_cases.csv'

# Make sure both directories exist (no error if they already do)
for directory in (data_base_dir, data_sub_dir):
    directory.mkdir(parents=True, exist_ok=True)

In [8]:
# Fetch most recent cases data
r = requests.get(most_recent_data_uri, timeout=60)
# Stop on an HTTP error rather than trying to parse an error page as data
r.raise_for_status()
# Parse as json
content = json.loads(r.content)
# Select utla cases
utlas = content['utlas']

# Build one row (a dict) per area, then create the dataframe in one go.
# Concatenating a one-row dataframe per area copies the whole table on
# every iteration.
rows = []
for value in utlas.values():
    # Area name goes in first, so GSS_NM is the first column
    entry = {'GSS_NM': value['name']['value']}

    # One column per daily report
    for k in value['dailyTotalConfirmedCases']:
        entry[k['date']] = int(k['value'])

    rows.append(entry)

# Index by area code; dates missing for an area are left as NaN
utla_cases_df = pd.DataFrame(rows, index=list(utlas))
utla_cases_df.index.name = 'utla'

In [9]:
# Put date columns in chronological order
cols = utla_cases_df.columns.tolist()

# Pick the area-name column by label rather than by position, so this step
# does not depend on the column order of the incoming dataframe
non_date_cols = ['GSS_NM']
# Date labels look like '2020-01-30', so a plain string sort is chronological
date_cols = sorted(col for col in cols if col not in non_date_cols)
cols = non_date_cols + date_cols

utla_cases_non_date_cols_df = utla_cases_df[non_date_cols]
utla_cases_date_cols_df = utla_cases_df[date_cols]

# Insert new columns even if no case reports were made on that day (e.g.
# early on in the pandemic)
min_date = min(date_cols)
max_date = max(date_cols)
date_range = pd.date_range(min_date, max_date).strftime('%Y-%m-%d')
utla_cases_date_cols_df = utla_cases_date_cols_df.reindex(columns=date_range)

# Forward fill cases, duplicating case numbers where no new report was made,
# to make a full, non-sparse dataframe
utla_cases_df = pd.concat(
    [utla_cases_non_date_cols_df, utla_cases_date_cols_df.ffill(axis=1)], axis=1)

# Start all cases at zero, unless otherwise given
utla_cases_df = utla_cases_df.fillna(0)

# Set cases back to int from float, as int could not have handled the NaNs
# we just removed. One astype call converts every date column at once
# instead of copying the whole dataframe once per column.
utla_cases_df = utla_cases_df.astype({col: int for col in date_range})

In [10]:
# Preview the first five rows of the cleaned table
utla_cases_df.head(n=5)

Unnamed: 0_level_0,GSS_NM,2020-01-30,2020-01-31,2020-02-01,2020-02-02,2020-02-03,2020-02-04,2020-02-05,2020-02-06,2020-02-07,...,2020-04-25,2020-04-26,2020-04-27,2020-04-28,2020-04-29,2020-04-30,2020-05-01,2020-05-02,2020-05-03,2020-05-04
utla,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E09000002,Barking and Dagenham,0,0,0,0,0,0,0,0,0,...,446,447,450,452,454,454,459,460,461,462
E09000003,Barnet,0,0,0,0,0,0,0,0,0,...,1171,1176,1182,1194,1206,1215,1219,1223,1224,1224
E08000016,Barnsley,0,0,0,0,1,1,1,1,1,...,608,622,639,661,675,684,688,691,701,705
E06000022,Bath and North East Somerset,0,0,0,0,0,0,0,0,0,...,203,206,208,210,211,212,215,217,217,217
E06000055,Bedford,0,0,0,0,0,0,0,0,0,...,434,439,449,459,469,470,471,471,471,471


In [11]:
# Preview the column totals (one total per date, summed over all areas)
utla_cases_df.sum(axis=0)

GSS_NM        Barking and DagenhamBarnetBarnsleyBath and Nor...
2020-01-30                                                    1
2020-01-31                                                    2
2020-02-01                                                    2
2020-02-02                                                    2
                                    ...                        
2020-04-30                                               119817
2020-05-01                                               121382
2020-05-02                                               122285
2020-05-03                                               122670
2020-05-04                                               122758
Length: 97, dtype: object

In [12]:
# Write the cleaned table to disk
utla_cases_df.to_csv(utla_cases_file)

# Today's backup location; remove a backup already made today, if any
today_stamp = datetime.date.today().isoformat()
backup_dir = Path(
    'data/secondary_sources_bak/secondary_sources_bak_' + today_stamp)
if backup_dir.exists():
    shutil.rmtree(backup_dir)

# Copy the whole secondary sources directory into the new backup
shutil.copytree(Path('data/secondary_sources'), backup_dir)

PosixPath('data/secondary_sources_bak/secondary_sources_bak_2020-05-05')