Table 1a: Number of alcohol-related deaths by sex and registration year, 2001-2016 - (new definition)

In [1]:
from gssutils import *

if is_interactive():
    import requests
    from cachecontrol import CacheControl
    from cachecontrol.caches.file_cache import FileCache
    from cachecontrol.heuristics import LastModified
    from pathlib import Path

    # HTTP session whose responses are cached on disk under ./.cache.
    session = CacheControl(requests.Session(),
                           cache=FileCache('.cache'),
                           heuristic=LastModified())

    # Local folder that holds the downloaded workbook.
    sourceFolder = Path('in')
    sourceFolder.mkdir(exist_ok=True)

    inputURL = 'https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/Alcohol_Tables_16_0.xls'
    inputFile = sourceFolder / 'Alcohol_Tables_16_0.xls'
    response = session.get(inputURL)
    # Stop here on an HTTP error (4xx/5xx) rather than saving the error page
    # to disk as if it were the spreadsheet.
    response.raise_for_status()
    with open(inputFile, 'wb') as f:
        f.write(response.content)
    # Only the 'Table 1a' sheet is needed; loadxlstabs returns a list of tabs.
    tab = loadxlstabs(inputFile, sheetids='Table 1a')[0]

Loading in/Alcohol_Tables_16_0.xls which has size 969216 bytes
Table names: ['Table 1a']


In [2]:
# Observations: every non-blank cell in the region spanning down and to the
# right of B5.
observations = (
    tab.excel_ref('B5')
    .expand(DOWN)
    .expand(RIGHT)
    .is_not_blank()
)

In [3]:
# Show the selected observation cells as a visual check of the selection.
observations

{<B12 243.0>, <D13 84.0>, <C20 184.0>, <B21 3636.0>, <C14 175.0>, <D8 62.0>, <B11 238.0>, <D21 1151.0>, <B13 249.0>, <C5 117.0>, <D19 97.0>, <D18 64.0>, <C19 185.0>, <B20 289.0>, <B14 260.0>, <D12 78.0>, <B16 244.0>, <D14 85.0>, <D15 67.0>, <B5 178.0>, <D6 53.0>, <B8 204.0>, <B18 219.0>, <C6 141.0>, <C21 2485.0>, <B9 217.0>, <D7 63.0>, <C9 155.0>, <C7 112.0>, <B7 175.0>, <D9 62.0>, <C8 142.0>, <C15 161.0>, <D11 73.0>, <C18 155.0>, <C11 165.0>, <C17 151.0>, <C16 159.0>, <D16 85.0>, <C12 165.0>, <D17 55.0>, <B6 194.0>, <D20 105.0>, <B10 210.0>, <D10 57.0>, <B19 282.0>, <C10 153.0>, <D5 61.0>, <B15 228.0>, <B17 206.0>, <C13 165.0>}

In [4]:
# Sex labels: the non-blank cells of row 4 from column B rightwards.
# B4 itself is blank in this sheet, so only the Male/Female headers match.
sex = (
    tab.excel_ref('B4')
    .expand(RIGHT)
    .is_not_blank()
)
sex

{<C4 'Male'>, <D4 'Female'>}

In [5]:
# Year labels: column A from row 5 downwards, with everything from row 22
# downwards removed. This keeps the row-21 total label.
Year = (
    tab.excel_ref('A5').expand(DOWN)
    - tab.excel_ref('A22').expand(DOWN)
)
Year

{<A17 2013.0>, <A21 'Total (2001-2016)'>, <A5 2001.0>, <A12 2008.0>, <A14 2010.0>, <A8 2004.0>, <A10 2006.0>, <A13 2009.0>, <A20 2016.0>, <A7 2003.0>, <A11 2007.0>, <A18 2014.0>, <A16 2012.0>, <A9 2005.0>, <A6 2002.0>, <A19 2015.0>, <A15 2011.0>}

In [6]:
# Dimensions attached to each observation:
#   Year - the label directly to the left, in column A
#   Sex  - the label directly above, in row 4
# plus two constants that are the same for every observation.
Dimensions = [
    HDim(Year, 'Year', DIRECTLY, LEFT),
    HDim(sex, 'Sex', DIRECTLY, ABOVE),
    HDimConst('Measure Type', 'Count'),
    HDimConst('Unit', 'People'),
]

In [7]:
# Pair the observations with their dimensions; when running interactively,
# also render the HTML preview of the selection.
c1 = ConversionSegment(
    observations,
    Dimensions,
    processTIMEUNIT=True,
)
if is_interactive():
    savepreviewhtml(c1)

0,1,2
OBS,Year,Sex

0,1,2,3
"Table 1a: Number of alcohol related deaths by sex and registration year, 2001-2016 - (new definition)",,,
,,,
Registration Year,All Persons,Sex,
,,Male,Female
2001.0,178.0,117.0,61.0
2002.0,194.0,141.0,53.0
2003.0,175.0,112.0,63.0
2004.0,204.0,142.0,62.0
2005.0,217.0,155.0,62.0
2006.0,210.0,153.0,57.0


In [8]:
# Flatten the conversion segment into a pandas DataFrame with one row per
# observation (columns: OBS, Year, Sex, Measure Type, Unit).
new_table = c1.topandas()
new_table




Unnamed: 0,OBS,Year,Sex,Measure Type,Unit
0,178.0,2001.0,,Count,People
1,117.0,2001.0,Male,Count,People
2,61.0,2001.0,Female,Count,People
3,194.0,2002.0,,Count,People
4,141.0,2002.0,Male,Count,People
5,53.0,2002.0,Female,Count,People
6,175.0,2003.0,,Count,People
7,112.0,2003.0,Male,Count,People
8,63.0,2003.0,Female,Count,People
9,204.0,2004.0,,Count,People


In [9]:
# Column B has no label in the Sex header row, so its observations arrive
# with a missing Sex; label them 'Persons'.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form does not update the frame when
# pandas Copy-on-Write is active.
new_table['Sex'] = new_table['Sex'].fillna('Persons')

In [10]:
# Display the frame to confirm the missing Sex values are now 'Persons'.
new_table

Unnamed: 0,OBS,Year,Sex,Measure Type,Unit
0,178.0,2001.0,Persons,Count,People
1,117.0,2001.0,Male,Count,People
2,61.0,2001.0,Female,Count,People
3,194.0,2002.0,Persons,Count,People
4,141.0,2002.0,Male,Count,People
5,53.0,2002.0,Female,Count,People
6,175.0,2003.0,Persons,Count,People
7,112.0,2003.0,Male,Count,People
8,63.0,2003.0,Female,Count,People
9,204.0,2004.0,Persons,Count,People


In [11]:
# Coerce Year to numeric. Values that cannot be parsed as a number (here the
# 'Total (2001-2016)' row label) become NaN and are then set to the
# placeholder 0, which user_perc later turns into the period label.
new_table['Year'] = pd.to_numeric(new_table['Year'], errors='coerce').fillna(0)

In [12]:
# Cast Year to integer so that 2001.0 becomes 2001.
new_table = new_table.astype({'Year': int})

In [13]:
# Rename the observation column from 'OBS' to 'Value'; other columns keep
# their names.
new_table = new_table.rename(columns={'OBS': 'Value'})

In [14]:
# Store Year as text, since the total row will carry a period label rather
# than a single year.
new_table = new_table.astype({'Year': str})

In [15]:
# The observations are counts; cast them from float to integer.
new_table = new_table.astype({'Value': int})

In [16]:
# Check the column dtypes after the conversions above.
new_table.dtypes

Value            int64
Year            object
Sex             object
Measure Type    object
Unit            object
dtype: object

In [17]:
# Inspect the last rows: the total rows still carry the placeholder Year 0
# at this point.
new_table.tail(5)

Unnamed: 0,Value,Year,Sex,Measure Type,Unit
46,184,2016,Male,Count,People
47,105,2016,Female,Count,People
48,3636,0,Persons,Count,People
49,2485,0,Male,Count,People
50,1151,0,Female,Count,People


In [18]:
def user_perc(x):
    """Replace the placeholder year ``0`` with the period label '2001-2016'.

    Any value whose string form is not ``'0'`` is returned unchanged.
    """
    return '2001-2016' if str(x) == '0' else x
    
# Replace the placeholder 0 left on the total rows with its period label.
# Series.map transforms the Year column element by element, so no row object
# is built per record as with DataFrame.apply(axis=1), and the result stays a
# Series even when the frame is empty.
new_table['Year'] = new_table['Year'].map(user_perc)


In [19]:
# Keep the output columns in their final order.
new_table = new_table.loc[:, ['Year', 'Sex', 'Measure Type', 'Value', 'Unit']]

In [20]:
# Preview the first rows in the final column layout.
new_table.head(5)

Unnamed: 0,Year,Sex,Measure Type,Value,Unit
0,2001,Persons,Count,178,People
1,2001,Male,Count,117,People
2,2001,Female,Count,61,People
3,2002,Persons,Count,194,People
4,2002,Male,Count,141,People


In [21]:
# When running interactively, write the tidy table to out/tab1a.csv without
# the DataFrame index.
if is_interactive():
    destinationFolder = Path('out')
    destinationFolder.mkdir(parents=True, exist_ok=True)
    new_table.to_csv(destinationFolder / 'tab1a.csv', index=False)