In [1]:
from gssutils import *

# Point the scraper at the NISRA landing page for this publication.
scraper = Scraper('https://www.nisra.gov.uk/publications/alcohol-related-deaths-2007-2017')

# Last expression of the cell: shows the download URL of the distribution with this title.
scraper.distribution(title='Alcohol Related Deaths Tables 2007- 2017').downloadURL

'https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/Alcohol_Tables_17.xls'

In [2]:
# Download the source workbook (through an on-disk HTTP cache) and load 'Table 2'.
# NOTE(review): inputFile and tab are only defined when is_interactive() is true,
# yet later cells use them unconditionally - confirm how non-interactive runs
# are expected to supply them.
if is_interactive():
    import requests
    from cachecontrol import CacheControl
    from cachecontrol.caches.file_cache import FileCache
    from cachecontrol.heuristics import LastModified
    from pathlib import Path

    # Requests session wrapped with a file-backed cache stored in '.cache'.
    session = CacheControl(requests.Session(),
                           cache=FileCache('.cache'),
                           heuristic=LastModified())

    sourceFolder = Path('in')
    sourceFolder.mkdir(exist_ok=True)

    inputURL = 'https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/Alcohol_Tables_17.xls'
    inputFile = sourceFolder / 'Alcohol_Tables_17.xls'
    response = session.get(inputURL)
    # Fail loudly on an HTTP error instead of saving an error page as the workbook.
    response.raise_for_status()
    with open(inputFile, 'wb') as f:
        f.write(response.content)
    # loadxlstabs returns a list of tabs; only 'Table 2' is requested, so take the first.
    tab = loadxlstabs(inputFile, sheetids='Table 2')[0]

Loading in\Alcohol_Tables_17.xls which has size 900608 bytes
Table names: ['Table 2']


In [3]:
# Empty accumulator; the processed tables are appended to it with pd.concat in later cells.
tidy = pd.DataFrame()

In [4]:
# Table 2: deaths by age group and registration year.
# Every selection is derived from the 'Registration Year' header cell.
cell = tab.filter('Registration Year')

# Column headers are spread over two rows: the anchor's own row and the row below it.
age = (
    cell.fill(RIGHT).is_not_blank().is_not_whitespace()
    | cell.shift(0, 1).fill(RIGHT).is_not_blank().is_not_whitespace()
)

# Row labels run down from the anchor; observations are the cells to their right.
Year = cell.fill(DOWN).is_not_blank().is_not_whitespace()
observations = Year.fill(RIGHT).is_not_blank().is_not_whitespace()

# Two dimensions are read from the sheet; the rest are constant for this table.
Dimensions = [
    HDim(Year, 'Year', DIRECTLY, LEFT),
    HDim(age, 'Age', DIRECTLY, ABOVE),
]
Dimensions += [
    HDimConst(name, value)
    for name, value in (
        ('Measure Type', 'Count'),
        ('Unit', 'People'),
        ('Sex', 'T'),
        ('Underlying Cause of Death', 'all-alcohol-related-deaths'),
        ('Health and Social Care Trust', 'all'),
    )
]

c1 = ConversionSegment(observations, Dimensions, processTIMEUNIT=True)
if is_interactive():
    savepreviewhtml(c1)
table = c1.topandas()

0,1,2
OBS,Year,Age

0,1,2,3,4,5,6,7,8,9,10,11
"Table 2: Number of alcohol related deaths by age and registration year, 2007-2017",,,,,,,,,,,
,,,,,,,,,,,
Registration Year,All Ages,Age Group,,,,,,,Median Age,,
,,Under 25,25-34,35-44,45-54,55-64,65-74,75 and over,,,
2007.0,238.0,1.0,9.0,62.0,77.0,56.0,25.0,8.0,50.0,,
2008.0,243.0,0.0,6.0,33.0,94.0,69.0,34.0,7.0,53.0,,
2009.0,249.0,0.0,8.0,43.0,92.0,71.0,27.0,8.0,53.0,,
2010.0,260.0,0.0,12.0,33.0,102.0,76.0,30.0,7.0,52.5,,
2011.0,228.0,0.0,6.0,52.0,73.0,62.0,27.0,8.0,52.0,,
2012.0,244.0,1.0,5.0,51.0,77.0,75.0,30.0,5.0,53.0,,





In [5]:
import numpy as np

# Turn empty observations into NaN so they can be dropped. The result is assigned
# back to the column: calling replace(..., inplace=True) on a column selection is
# a chained in-place operation that pandas warns about and that has no effect on
# the frame once Copy-on-Write is enabled.
table['OBS'] = table['OBS'].replace('', np.nan)
table = table.dropna(subset=['OBS']).rename(columns={'OBS': 'Value'})
# Counts are stored as integers.
table['Value'] = table['Value'].astype(int)

In [6]:
# Build the period identifier from the first four characters of the Year label
# (the preview shows labels such as 2007.0).
table['Period'] = 'year/' + table['Year'].astype(str).str.slice(0, 4)

In [7]:
# A label starting with 'Tota' (presumably a 'Total' row - it is not visible in
# the truncated preview) ends up as 'year/Tota' after the four-character slice.
# Map it to an interval covering the table's full span: 2007 to 2017 inclusive
# is eleven calendar years, so the duration is P11Y (P10Y would stop short of 2017).
table['Period'] = table['Period'].map(
    lambda x: {
        'year/Tota' : 'gregorian-interval/2007-01-01T00:00:00/P11Y'
        }.get(x, x))

In [8]:
# 'Median Age' is a statistic rather than an age group, so its column is dropped.
# .copy() detaches the filtered frame so the column assignment in the next cell
# does not raise SettingWithCopyWarning.
table = table[table['Age'] != 'Median Age'].copy()

In [9]:
# Spreadsheet age-band labels -> codes used in the output; unknown labels pass through unchanged.
age_codes = {
    'All Ages': 'all',
    'Under 25': 'nisra5/under-25',
    '25-34': 'nisra5/25-34',
    '35-44': 'nisra5/35-44',
    '45-54': 'nisra5/45-54',
    '55-64': 'nisra5/55-64',
    '65-74': 'nisra5/65-74',
    '75 and over': 'nisra5/75-plus',
}
table['Age'] = table['Age'].map(lambda label: age_codes.get(label, label))

In [10]:
# Keep only the output columns, in the order shared by all four tables.
table = table[[
    'Period',
    'Age',
    'Sex',
    'Underlying Cause of Death',
    'Health and Social Care Trust',
    'Measure Type',
    'Value',
    'Unit',
]]

In [11]:
# Append the Table 2 observations to the accumulator.
# Re-running this cell on its own appends the same rows again.
tidy = pd.concat([tidy,table])

In [12]:
# Load the 'Table 1' sheet from the downloaded workbook (first and only tab returned).
# NOTE(review): inputFile is only assigned inside the is_interactive() guard above.
tab1 = loadxlstabs(inputFile, sheetids='Table 1')[0]

Loading in\Alcohol_Tables_17.xls which has size 900608 bytes
Table names: ['Table 1']


In [13]:
# Table 1: deaths by sex and registration year.
# Every selection is derived from the 'Registration Year' header cell.
cell1 = tab1.filter('Registration Year')

# Column headers are spread over two rows: the anchor's own row and the row below it.
sex = (
    cell1.fill(RIGHT).is_not_blank().is_not_whitespace()
    | cell1.shift(0, 1).fill(RIGHT).is_not_blank().is_not_whitespace()
)

# Row labels run down from the anchor; observations are the cells to their right.
Year = cell1.fill(DOWN).is_not_blank().is_not_whitespace()
observations1 = Year.fill(RIGHT).is_not_blank().is_not_whitespace()

# Two dimensions are read from the sheet; the rest are constant for this table.
Dimensions1 = [
    HDim(Year, 'Year', DIRECTLY, LEFT),
    HDim(sex, 'Sex', DIRECTLY, ABOVE),
]
Dimensions1 += [
    HDimConst(name, value)
    for name, value in (
        ('Measure Type', 'Count'),
        ('Unit', 'People'),
        ('Age', 'all'),
        ('Underlying Cause of Death', 'all-alcohol-related-deaths'),
        ('Health and Social Care Trust', 'all'),
    )
]

c2 = ConversionSegment(observations1, Dimensions1, processTIMEUNIT=True)
if is_interactive():
    savepreviewhtml(c2)
table1 = c2.topandas()

0,1,2
OBS,Year,Sex

0,1,2,3
"Table 1: Number of alcohol related deaths by sex and registration year, 2001-2017",,,
,,,
Registration Year,All Persons,Sex,
,,Male,Female
2001.0,178.0,117.0,61.0
2002.0,194.0,141.0,53.0
2003.0,175.0,112.0,63.0
2004.0,204.0,142.0,62.0
2005.0,217.0,155.0,62.0
2006.0,210.0,153.0,57.0





In [14]:
# Turn empty observations into NaN so they can be dropped. The result is assigned
# back to the column: calling replace(..., inplace=True) on a column selection is
# a chained in-place operation that pandas warns about and that has no effect on
# the frame once Copy-on-Write is enabled.
table1['OBS'] = table1['OBS'].replace('', np.nan)
table1 = table1.dropna(subset=['OBS']).rename(columns={'OBS': 'Value'})
# Counts are stored as integers.
table1['Value'] = table1['Value'].astype(int)

In [15]:
# Build the period identifier from the first four characters of the Year label
# (the preview shows labels such as 2001.0).
table1['Period'] = 'year/' + table1['Year'].astype(str).str.slice(0, 4)

In [16]:
# A label starting with 'Tota' (presumably a 'Total' row - it is not visible in
# the truncated preview) ends up as 'year/Tota' after the four-character slice.
# Table 1 covers 2001 to 2017 inclusive, so the interval starts at 2001-01-01 and
# lasts seventeen years. The previous value started in 2007 and ran for P15Y,
# which matches neither end of the table's range.
table1['Period'] = table1['Period'].map(
    lambda x: {
        'year/Tota' : 'gregorian-interval/2001-01-01T00:00:00/P17Y'
        }.get(x, x))

In [17]:
# Spreadsheet sex labels -> single-letter codes; unknown labels pass through unchanged.
sex_codes = {
    'All Persons': 'T',
    'Male': 'M',
    'Female': 'F',
}
table1['Sex'] = table1['Sex'].map(lambda label: sex_codes.get(label, label))

In [18]:
# Keep only the output columns, in the order shared by all four tables.
table1 = table1[[
    'Period',
    'Age',
    'Sex',
    'Underlying Cause of Death',
    'Health and Social Care Trust',
    'Measure Type',
    'Value',
    'Unit',
]]

In [19]:
# Append the Table 1 observations to the accumulator.
# Re-running this cell on its own appends the same rows again.
tidy = pd.concat([tidy,table1])

In [20]:
# Load the 'Table 3' sheet (note: the variable is tab2 although the sheet is Table 3).
# NOTE(review): inputFile is only assigned inside the is_interactive() guard above.
tab2 = loadxlstabs(inputFile, sheetids='Table 3')[0]

Loading in\Alcohol_Tables_17.xls which has size 900608 bytes
Table names: ['Table 3']


In [21]:
# Table 3: deaths by underlying cause and registration year.
# Here the years run across the top and the causes run down the side.
cell2 = tab2.filter('Underlying Cause (ICD-10 codes)')

# Year headers are spread over two rows: the anchor's own row (which holds the
# 'Total (2007-2017)' label) and the row below it (the individual years).
Year1 = (
    cell2.fill(RIGHT).is_not_blank().is_not_whitespace()
    | cell2.shift(0, 1).fill(RIGHT).is_not_blank().is_not_whitespace()
)

# Cause labels run down from the anchor; observations sit below the year headers.
cd = cell2.fill(DOWN).is_not_blank().is_not_whitespace()
observations3 = Year1.fill(DOWN).is_not_blank().is_not_whitespace()

# Two dimensions are read from the sheet; the rest are constant for this table.
Dimensions3 = [
    HDim(Year1, 'Year', DIRECTLY, ABOVE),
    HDim(cd, 'Underlying Cause of Death', DIRECTLY, LEFT),
]
Dimensions3 += [
    HDimConst(name, value)
    for name, value in (
        ('Measure Type', 'Count'),
        ('Unit', 'People'),
        ('Age', 'all'),
        ('Sex', 'T'),
        ('Health and Social Care Trust', 'all'),
    )
]

c3 = ConversionSegment(observations3, Dimensions3, processTIMEUNIT=True)
if is_interactive():
    savepreviewhtml(c3)
table2 = c3.topandas()

0,1,2
OBS,Year,Underlying Cause of Death

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
"Table 3: Number of alcohol related deaths by underlying cause of death and registration year, 2007-2017",,,,,,,,,,,,,,
,,,,,,,,,,,,,,
Underlying Cause (ICD-10 codes),,,,,,,,,,,,Total (2007-2017),,
,2007.0,2008.0,2009.0,2010.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0,,,
Mental and behavioural disorders due to use of alcohol (F10),94.0,46.0,39.0,48.0,36.0,53.0,33.0,27.0,45.0,51.0,70.0,542.0,,
Accidental poisoning by and exposure to alcohol (X45),2.0,26.0,26.0,44.0,27.0,29.0,25.0,36.0,28.0,12.0,21.0,276.0,,
"Intentional self-poisoning by and exposure to alcohol or poisoning by and exposure to alcohol, undetermined intent (X65, Y15)",1.0,-,1.0,-,0.0,0.0,0.0,0.0,0.0,10.0,7.0,19.0,,
"All other alcohol related deaths (E24.4, G31.2, G62.1, G72.1, I42.6, K29.2, K70, K85.2, Q86.0, R78.0, K86.0)",141.0,171.0,183.0,168.0,165.0,162.0,148.0,156.0,209.0,216.0,205.0,1924.0,,
All alcohol related deaths,238.0,243.0,249.0,260.0,228.0,244.0,206.0,219.0,282.0,289.0,303.0,2761.0,,
,,,,,,,,,,,,,,





In [22]:
# Turn empty observations into NaN so they can be dropped. The result is assigned
# back to the column: calling replace(..., inplace=True) on a column selection is
# a chained in-place operation that pandas warns about and that has no effect on
# the frame once Copy-on-Write is enabled.
table2['OBS'] = table2['OBS'].replace('', np.nan)
table2 = table2.dropna(subset=['OBS']).rename(columns={'OBS': 'Value'})
# Counts are stored as integers.
table2['Value'] = table2['Value'].astype(int)
# Build the period identifier from the first four characters of the Year label.
table2['Period'] = 'year/' + table2['Year'].astype(str).str[0:4]

In [23]:
# Remove the aggregate rows so only the individual causes remain.
# .copy() detaches the filtered frame so the column assignments in the next cell
# do not raise SettingWithCopyWarning.
excluded_causes = ['Total deaths from all causes', 'All alcohol related deaths']
table2 = table2[~table2['Underlying Cause of Death'].isin(excluded_causes)].copy()

In [24]:
# The 'Total (2007-2017)' column header becomes 'year/Tota' after the four-character
# slice used to build Period. Map it to an interval covering the whole span:
# 2007 to 2017 inclusive is eleven calendar years, so the duration is P11Y
# (P10Y would stop short of 2017).
table2['Period'] = table2['Period'].map(
    lambda x: {
        'year/Tota' : 'gregorian-interval/2007-01-01T00:00:00/P11Y'
        }.get(x, x))
# Spreadsheet cause labels -> codes used in the output; unknown labels pass through unchanged.
table2['Underlying Cause of Death'] = table2['Underlying Cause of Death'].map(
    lambda x: {
        'Mental and behavioural disorders due to use of alcohol (F10)' : 'f10',
       'Accidental poisoning by and exposure to alcohol (X45)' : 'x45',
       'Intentional self-poisoning by and exposure to alcohol or poisoning by and exposure to alcohol, undetermined intent (X65, Y15)':'x65-y15',
       'All other alcohol related deaths (E24.4, G31.2, G62.1, G72.1, I42.6, K29.2, K70, K85.2, Q86.0, R78.0, K86.0)' : 'all-other-alcohol-related-deaths',
       'All alcohol related deaths': 'all-alcohol-related-deaths'
        }.get(x, x))

In [25]:
# Keep only the output columns, in the order shared by all four tables.
table2 = table2[[
    'Period',
    'Age',
    'Sex',
    'Underlying Cause of Death',
    'Health and Social Care Trust',
    'Measure Type',
    'Value',
    'Unit',
]]

In [26]:
# Append the Table 3 observations (held in table2) to the accumulator.
# Re-running this cell on its own appends the same rows again.
tidy = pd.concat([tidy,table2])

In [27]:
# Load the 'Table 4' sheet (note: the variable is tab3 although the sheet is Table 4).
# NOTE(review): inputFile is only assigned inside the is_interactive() guard above.
tab3 = loadxlstabs(inputFile, sheetids='Table 4')[0]

Loading in\Alcohol_Tables_17.xls which has size 900608 bytes
Table names: ['Table 4']


In [28]:
# Table 4: deaths by Health and Social Care Trust and registration year.
# Every selection is derived from the 'Registration Year' header cell.
cell3 = tab3.filter('Registration Year')

# Column headers are spread over two rows: the anchor's own row and the row below it.
hs = (
    cell3.fill(RIGHT).is_not_blank().is_not_whitespace()
    | cell3.shift(0, 1).fill(RIGHT).is_not_blank().is_not_whitespace()
)

# Row labels run down from the anchor; observations are the cells to their right.
Year = cell3.fill(DOWN).is_not_blank().is_not_whitespace()
observations4 = Year.fill(RIGHT).is_not_blank().is_not_whitespace()

# Two dimensions are read from the sheet; the rest are constant for this table.
Dimensions4 = [
    HDim(Year, 'Year', DIRECTLY, LEFT),
    HDim(hs, 'Health and Social Care Trust', DIRECTLY, ABOVE),
]
Dimensions4 += [
    HDimConst(name, value)
    for name, value in (
        ('Measure Type', 'Count'),
        ('Unit', 'People'),
        ('Age', 'all'),
        ('Underlying Cause of Death', 'all-alcohol-related-deaths'),
        ('Sex', 'T'),
    )
]

c4 = ConversionSegment(observations4, Dimensions4, processTIMEUNIT=True)
if is_interactive():
    savepreviewhtml(c4)
table3 = c4.topandas()

0,1,2
OBS,Year,Health and Social Care Trust

0,1,2,3,4,5,6
"Table 4: Number of alcohol related deaths by Health and Social Care Trust and registration year, 2007-2017",,,,,,
,,,,,,
Registration Year,Health and Social Care Trust,,,,,Total
,Belfast,Northern,South Eastern,Southern,Western,
2007.0,74.0,49.0,36.0,38.0,41.0,238.0
2008.0,75.0,63.0,46.0,29.0,30.0,243.0
2009.0,67.0,52.0,45.0,37.0,48.0,249.0
2010.0,71.0,45.0,38.0,44.0,62.0,260.0
2011.0,67.0,45.0,47.0,33.0,36.0,228.0
2012.0,73.0,38.0,43.0,41.0,49.0,244.0





In [29]:
# Turn empty observations into NaN so they can be dropped. The result is assigned
# back to the column: calling replace(..., inplace=True) on a column selection is
# a chained in-place operation that pandas warns about and that has no effect on
# the frame once Copy-on-Write is enabled.
table3['OBS'] = table3['OBS'].replace('', np.nan)
table3 = table3.dropna(subset=['OBS']).rename(columns={'OBS': 'Value'})
# Counts are stored as integers.
table3['Value'] = table3['Value'].astype(int)
# Build the period identifier from the first four characters of the Year label.
table3['Period'] = 'year/' + table3['Year'].astype(str).str[0:4]

In [30]:
# A label starting with 'Tota' (presumably a 'Total' row - it is not visible in
# the truncated preview) ends up as 'year/Tota' after the four-character slice.
# Map it to an interval covering the table's full span: 2007 to 2017 inclusive
# is eleven calendar years, so the duration is P11Y (P10Y would stop short of 2017).
table3['Period'] = table3['Period'].map(
    lambda x: {
        'year/Tota' : 'gregorian-interval/2007-01-01T00:00:00/P11Y'
        }.get(x, x))

In [31]:
# Drop the 'Total' column so only the individual trusts remain.
# .copy() detaches the filtered frame so the column assignment in the next cell
# does not raise SettingWithCopyWarning.
table3 = table3[table3['Health and Social Care Trust'] != 'Total'].copy()

In [32]:
# Spreadsheet trust labels -> codes used in the output; unknown labels pass through unchanged.
# The lookup key was previously ' Southern' (leading space), which never matches the
# 'Southern' header shown in the preview. Labels are now stripped of surrounding
# whitespace before the lookup, so both spellings resolve to 'southern'.
trust_codes = {
    'Belfast': 'belfast',
    'Northern': 'northern',
    'South Eastern': 'south-eastern',
    'Southern': 'southern',
    'Western': 'western',
}
table3['Health and Social Care Trust'] = table3['Health and Social Care Trust'].map(
    lambda x: trust_codes.get(x.strip() if isinstance(x, str) else x, x))

In [33]:
# Keep only the output columns, in the order shared by all four tables.
table3 = table3[['Period','Age','Sex','Underlying Cause of Death','Health and Social Care Trust','Measure Type','Value','Unit']]

# Append the Table 4 observations to the accumulator. This step existed for the
# other three tables but was missing here, so the trust-level rows never reached
# observations.csv. Re-running this cell on its own appends the same rows again.
tidy = pd.concat([tidy, table3])

In [34]:
# Write the combined observations to out/observations.csv (interactive runs only).
if is_interactive():
    destinationFolder = Path('out')
    destinationFolder.mkdir(parents=True, exist_ok=True)
    tidy.to_csv(destinationFolder / 'observations.csv', index=False)

In [35]:
from pathlib import Path

# Make sure the output folder exists before writing the dataset metadata.
out = Path('out')
out.mkdir(parents=True, exist_ok=True)

# Tag the dataset with its family, then write the scraper's TriG output as bytes.
scraper.dataset.family = 'health'
with open(out / 'dataset.trig', mode='wb') as metadata:
    metadata.write(scraper.generate_trig())