In [1]:
from gssutils import *

scraper = Scraper('https://digital.nhs.uk/data-and-information/publications/statistical/statistics-on-alcohol')
scraper

## Statistics on Alcohol

This is a catalog of datasets; choose one from the following:

* Statistics on Alcohol, England 2019 [PAS]
* Statistics on Alcohol, England, 2018 [PAS]
* Statistics on Alcohol, England, 2017
* Statistics on Alcohol, England, 2016
* Statistics on Alcohol, England, 2015
* Statistics on Alcohol - England, 2014; Additional Tables
* Statistics on Alcohol - England, 2014
* Statistics on Alcohol - England, 2013
* Statistics on Alcohol: England, 2012
* Statistics on Alcohol: England, 2011
* Statistics on Alcohol: England, 2010
* Statistics on Alcohol: England, 2009
* Statistics on Alcohol: England, 2008
* Statistics on Alcohol: England, 2007
* Statistics on Alcohol: England, 2006

In [2]:
scraper.select_dataset(latest=True)
scraper

## Statistics on Alcohol, England 2019 [PAS]

### Distributions

1. Statistics on Alcohol, England, 2018 - Appendices ([application/pdf](https://files.digital.nhs.uk/12/E3B2A2/alc-eng-2019-app.pdf))
1. Statistics on Alcohol, England, 2018 - Data Quality Statement ([application/pdf](https://files.digital.nhs.uk/39/E3B952/alc-eng-2019-qual.pdf))
1. Statistics on Alcohol, England, 2018 - Tables ([application/vnd.openxmlformats-officedocument.spreadsheetml.sheet](https://files.digital.nhs.uk/A1/9FE2A6/alc-eng-2019-tab.xlsx))
1. Statistics on Alcohol, England, 2018 - CSV Data Pack ([application/zip](https://files.digital.nhs.uk/4F/DC124A/Statistics%20on%20Alcohol%20England%2C%202019.zip))
1. Pre Release Access List ([application/pdf](https://files.digital.nhs.uk/81/52497E/Pre%20Release%20Access%20List.pdf))


In [3]:
dist = scraper.distribution(mediaType='application/zip')
dist

In [4]:
from zipfile import ZipFile
from io import BytesIO
from chardet import detect
from IPython.core.display import HTML

tables = {}

with dist.open() as csv_pack:
    # need to read the file as ZipFile does seek()
    csv_pack_data = BytesIO(csv_pack.read())
    with ZipFile(csv_pack_data) as zf:
        for csv_file in zf.namelist():
            if csv_file.endswith('.csv'):
                with zf.open(csv_file) as csv_stream:
                    # also need to read the data as there are some different character
                    # encodings we need to guess
                    raw_csv_data = csv_stream.read()
                    detected = detect(raw_csv_data)
                    table = pd.read_csv(BytesIO(raw_csv_data), encoding=detected['encoding'])
                    table_name = csv_file[:-4]
                    display(HTML(f'<h2>{table_name}</h2>'))
                    display(table)
                    tables[table_name] = table

Unnamed: 0,Year,Metric,Value
0,1987,Alcohol price index,101.7
1,1988,Alcohol price index,106.9
2,1989,Alcohol price index,112.9
3,1990,Alcohol price index,123.8
4,1991,Alcohol price index,139.2
5,1992,Alcohol price index,148.1
6,1993,Alcohol price index,154.7
7,1994,Alcohol price index,158.5
8,1995,Alcohol price index,164.5
9,1996,Alcohol price index,169.2


Unnamed: 0,Year,ICD10_Code,ICD10_Description,Metric,Value
0,2017,Total,Total,All persons,5843
1,2017,E24.4,Alcohol-induced pseudo-Cushing's syndrome,All persons,0
2,2017,F10,Mental and behavioural disorders due to the us...,All persons,544
3,2017,G31.2,Degeneration of nervous system due to alcohol,All persons,10
4,2017,G62.1,Alcoholic polyneuropathy,All persons,0
5,2017,G72.1,Alcoholic myopathy,All persons,1
6,2017,I42.6,Alcoholic cardiomyopathy,All persons,112
7,2017,K29.2,Alcoholic gastritis,All persons,16
8,2017,K70,Alcoholic liver disease,All persons,4694
9,2017,K85.2,Alcohol-induced acute pancreatitis,All persons,80


Unnamed: 0,Year,Metric,Value,Unnamed: 3,Unnamed: 4
0,1987,Household expenditure on alcohol,9732,,
1,1988,Household expenditure on alcohol,10068,,
2,1989,Household expenditure on alcohol,10172,,
3,1990,Household expenditure on alcohol,10303,,
4,1991,Household expenditure on alcohol,9962,,
5,1992,Household expenditure on alcohol,9800,,
6,1993,Household expenditure on alcohol,9940,,
7,1994,Household expenditure on alcohol,10695,,
8,1995,Household expenditure on alcohol,10219,,
9,1996,Household expenditure on alcohol,11386,,


Unnamed: 0,Year,Metric_Primary,Metric_Secondary,Value
0,2017,Prescription Items - All Settings,Acamprosate Calcium,134673
1,2017,Prescription Items - All Settings,Disulfiram,35807
2,2017,Prescription Items - All Settings,Nalmefene,2497
3,2017,Prescription Items - All Settings,Total,172977
4,2017,Prescribed in primary care,Acamprosate Calcium,131348
5,2017,Prescribed in primary care,Disulfiram,34718
6,2017,Prescribed in primary care,Nalmefene,2337
7,2017,Prescribed in primary care,Total,168403
8,2017,Prescribed in NHS hospitals,Acamprosate Calcium,3325
9,2017,Prescribed in NHS hospitals,Disulfiram,1089


Unnamed: 0,ONS_Code,Org_Code,Org_Name,Year,Metric_Primary,Metric_Secondary,Value
0,E92000001,921,England,2017,Prescription items,Total,168403
1,XXXXXXXX,XXX,Unknown,2017,Prescription items,Total,800
2,,,Non-CCG,2017,Prescription items,Total,15462
3,E40000001,Y54,North of England,2017,Prescription items,Total,74935
4,E39000029,Q72,NHS England North (Yorkshire and Humber),2017,Prescription items,Total,23833
5,E38000001,02N,"NHS Airedale, Wharfedale and Craven",2017,Prescription items,Total,318
6,E38000006,02P,NHS Barnsley,2017,Prescription items,Total,1717
7,E38000008,02Q,NHS Bassetlaw,2017,Prescription items,Total,164
8,E38000018,02W,NHS Bradford City,2017,Prescription items,Total,56
9,E38000019,02R,NHS Bradford Districts,2017,Prescription items,Total,362


In [5]:
import re
title_years = re.compile(r'(.*?)([0-9]+(_to_[0-9]+)?)')
titles = {}
for (filename, table) in tables.items():
    match = title_years.match(filename)
    if match:
        titles[filename] = match.group(1).replace('_', ' ')
        years = match.group(2)
        display(HTML(f'<h2>{titles[filename]}</h2>'))
        for col in table:
            if col not in ['Year', 'Value']:
                table[col] = table[col].astype('category')
                display(HTML(f'<h3>{col}</h3>'))
                display(table[col].cat.categories)
                                           

Index(['Affordability of alcohol index on a per capita basis (revised)',
       'Alcohol price index',
       'Alcohol price index relative to Retail price index (all items)',
       'Real disposable income per adult (18+) (revised)',
       'Real household disposable income (revised)',
       'Retail prices index (all items)'],
      dtype='object')

Index(['E24.4', 'F10', 'G31.2', 'G62.1', 'G72.1', 'I42.6', 'K29.2', 'K70',
       'K85.2', 'K86.0', 'Q86.0', 'R78.0', 'Total', 'X45', 'X65', 'Y15'],
      dtype='object')

Index(['Accidental poisoning by and exposure to alcohol',
       'Alcohol-induced acute pancreatitis',
       'Alcohol-induced chronic pancreatitis',
       'Alcohol-induced pseudo-Cushing's syndrome', 'Alcoholic cardiomyopathy',
       'Alcoholic gastritis', 'Alcoholic liver disease', 'Alcoholic myopathy',
       'Alcoholic polyneuropathy',
       'Degeneration of nervous system due to alcohol',
       'Excess alcohol blood levels', 'Fetal alcohol syndrome (dysmorphic)',
       'Intentional self-poisoning by and exposure to alcohol',
       'Mental and behavioural disorders due to the use of alcohol',
       'Poisoning by and exposure to alcohol, undetermined intent', 'Total'],
      dtype='object')

Index(['All persons', 'Female', 'Male'], dtype='object')

Index(['Expenditure on alcohol as a percentage of expenditure',
       'Household expenditure Total', 'Household expenditure on alcohol'],
      dtype='object')

Float64Index([], dtype='float64')

Float64Index([], dtype='float64')

Index(['Average Net Ingredient Cost per item (£)',
       'Net Ingredient Cost (£ 000s)', 'Prescribed in NHS hospitals',
       'Prescribed in primary care', 'Prescription Items - All Settings'],
      dtype='object')

Index(['Acamprosate Calcium', 'Disulfiram', 'Nalmefene', 'Total'], dtype='object')

Index(['E38000001', 'E38000002', 'E38000003', 'E38000004', 'E38000005',
       'E38000006', 'E38000007', 'E38000008', 'E38000009', 'E38000010',
       ...
       'E39000035', 'E39000036', 'E39000037', 'E39000038', 'E40000001',
       'E40000002', 'E40000003', 'E40000004', 'E92000001', 'XXXXXXXX'],
      dtype='object', length=227)

Index(['00C', '00D', '00J', '00K', '00L', '00M', '00N', '00P', '00Q', '00R',
       ...
       'Q80', 'Q81', 'Q82', 'Q83', 'Q84', 'XXX', 'Y54', 'Y55', 'Y56', 'Y57'],
      dtype='object', length=227)

Index(['England', 'London', 'Midlands and East of England',
       'NHS Airedale, Wharfedale and Craven ', 'NHS Ashford ',
       'NHS Aylesbury Vale ', 'NHS Barking & Dagenham ', 'NHS Barnet ',
       'NHS Barnsley ', 'NHS Basildon and Brentwood',
       ...
       'NHS Wiltshire ', 'NHS Windsor, Ascot and Maidenhead ', 'NHS Wirral',
       'NHS Wokingham ', 'NHS Wolverhampton ', 'NHS Wyre Forest ', 'Non-CCG',
       'North of England', 'South of England', 'Unknown'],
      dtype='object', length=228)

Index(['Prescription items', 'Prescription items per 100000 population'], dtype='object')

Index(['Acamprosate Calcium', 'Disulfiram', 'Nalmefene', 'Total'], dtype='object')