# Scrape Data from the NHS Quality and Outcomes Framework

The purpose of these functions is to:
- Find and read the most recent NHS Quality and Outcomes Framework (QOF) file
- Read each of the sheets in the file into a dataframe
- Collect the unique values of the non-numeric columns in those dataframes

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| hide
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
#| export
def get_NHS_qualityOutcomes():
    """
    Download the most recent national-level NHS Quality and Outcomes data.

    Scrapes the publication landing page for the link to the latest release,
    follows it, downloads the first file linked in that page's "resources"
    section and reads every sheet except the first into a dataframe.

    Returns:
        dict: maps each sheet's table title (the cell at row position 6,
        column 0 of the parsed sheet) to a dataframe built from row
        positions 10-18 of that sheet; the first of those rows supplies the
        column headers and the remaining rows are the data.

    Raises:
        requests.HTTPError: if any of the HTTP requests returns an error
            status.
        requests.Timeout: if a server does not respond within the timeout.
    """
    # Local imports keep this block self-contained
    from io import BytesIO
    from urllib.parse import urljoin

    base_url = 'https://digital.nhs.uk'
    url = base_url + '/data-and-information/publications/statistical/quality-and-outcomes-framework-achievement-prevalence-and-exceptions-data'
    timeout = 60  # seconds; without it a stalled connection would hang forever

    # Find link to latest dataset
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # fail loudly instead of scraping an error page
    soup = BeautifulSoup(response.text, 'html.parser')
    latest_section = soup.find_all('div', {'id': 'latest-statistics'})[0]
    latest_dataset_href = latest_section.find_all('a')[0]['href']
    # urljoin resolves both site-relative and absolute hrefs correctly
    latest_dataset_url = urljoin(url, latest_dataset_href)

    # Find latest dataset file
    response = requests.get(latest_dataset_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    file_href = soup.find('div', {'id': 'resources'}).find_all('a')[0]['href']
    file_download_link = urljoin(latest_dataset_url, file_href)

    r = requests.get(file_download_link, timeout=timeout)
    r.raise_for_status()
    # Wrap the payload in a file-like object; passing raw bytes is deprecated in pandas
    file = pd.ExcelFile(BytesIO(r.content))

    dfs = {}
    # Ignore the first sheet as it just contains a list of titles for the other sheets
    for sheet in file.sheet_names[1:]:
        raw = file.parse(sheet)  # parse each sheet once and reuse the result
        title = raw.iloc[6, 0]  # table title sits at row position 6 in every table

        # Each table has the same number of rows, which makes this much
        # easier; however, this only works for the national level excel file
        # (rows correspond to regions).
        table = raw.iloc[10:19].reset_index(drop=True)
        table.columns = table.iloc[0]  # first row of the slice holds the headers
        dfs[title] = table.iloc[1:].reset_index(drop=True)  # save as df in dictionary

    return dfs

In [None]:
NHS_quality_outcomes = get_NHS_qualityOutcomes()
# Print the first table title with its first nine characters dropped
first_title = list(NHS_quality_outcomes)[0]
print(f'Title: {first_title[9:]}')

Title: Prevalence, achievement and personalised care adjustments, cardiovascular group, atrial fibrillation, 2021-22, region and national level


In [None]:
# Preview the first three rows of the first table in the dictionary
list(NHS_quality_outcomes.values())[0].head(3)

Unnamed: 0,Region ODS code,Region ONS code,Region name,Number of practices,List size,Register,Prevalence (%),Number of practices.1,List size.1,Register.1,...,Denominator plus PCAs,Patients receiving Intervention (%),Achievement Score (max 25),Numerator,Denominator,Underlying Achievement net of PCAs (%),PCAs,PCA Rate (%),Denominator plus PCAs.1,Patients receiving Intervention (%).1
0,ENG,E92000001,England,6460,60037519,1230460,2.049485,6470,61604213,1288599,...,557688,86.154086,161245.58,948937,1027297,92.372216,38850,3.643963,1066147,89.006206
1,Y56,E40000003,London,1174,10239440,113999,1.113332,1178,10644873,119035,...,51689,86.927586,29209.21,84343,93253,90.445348,3680,3.796437,96933,87.011647
2,Y58,E40000006,South West,552,5823756,154294,2.64939,552,5999796,163087,...,66873,83.695961,13800.0,119527,129371,92.390876,5437,4.033143,134808,88.664619


In [None]:
#| export
def get_qualityOutcomes_uniqueColumnValues(data):
    """
    Return the unique values of every string-valued column of a dataframe.

    A column is treated as string-valued when the value in its first row is
    a `str`; all other columns are skipped.

    Args:
        data: the pandas dataframe to inspect.

    Returns:
        dict: maps each string-valued column's name to the array returned by
        calling `unique()` on that column. Empty if `data` has no rows or no
        columns.
    """
    # Nothing to inspect; also avoids indexing the first row of an empty frame
    if data.empty:
        return {}

    unq_cols = {}
    # Iterate over the number of columns (not rows) since we index columns
    for i in range(data.shape[1]):
        column = data.iloc[:, i]
        # .iloc[0] is positional, so this works whatever the index labels are
        if isinstance(column.iloc[0], str):
            unq_cols[column.name] = column.unique()
    return unq_cols

In [None]:
# Take the first table in the dictionary and list the unique values of its
# non-numeric columns
sheet = list(NHS_quality_outcomes.values())[0]
get_qualityOutcomes_uniqueColumnValues(sheet)

{'Region ODS code': array(['ENG', 'Y56', 'Y58', 'Y59', 'Y60', 'Y61', 'Y62', 'Y63'],
       dtype=object),
 'Region ONS code': array(['E92000001', 'E40000003', 'E40000006', 'E40000005', 'E40000011',
        'E40000007', 'E40000010', 'E40000012'], dtype=object),
 'Region name': array(['England', 'London', 'South West', 'South East', 'Midlands',
        'East of England', 'North West', 'North East and Yorkshire'],
       dtype=object)}

In [None]:
#| hide
# Export the notebook's `#| export` cells to the package module
import nbdev

nbdev.nbdev_export()