Table 1 - Number of prescription items1, net ingredient cost2 and average net ingredient cost per item of drugs prescribed,3,4 for the treatment of alcohol dependence, dispensed in the community

In [1]:
from gssutils import *

if is_interactive():
    import requests
    from cachecontrol import CacheControl
    from cachecontrol.caches.file_cache import FileCache
    from cachecontrol.heuristics import LastModified
    from pathlib import Path

    session = CacheControl(requests.Session(),
                           cache=FileCache('.cache'),
                           heuristic=LastModified())

    sourceFolder = Path('in')
    sourceFolder.mkdir(exist_ok=True)

    inputURL = 'https://webarchive.nationalarchives.gov.uk/20180328130416/http://digital.nhs.uk/media/30889/Statistics-on-Alcohol-England-2017-Tables/Any/alc-eng-2017-tab'
    inputFile = sourceFolder / 'alc-eng-2017-tab'
    response = session.get(inputURL)
    with open(inputFile, 'wb') as f:
      f.write(response.content)    

In [2]:
tab = loadxlstabs(inputFile, sheetids='Table 1')[0]

Loading in\alc-eng-2017-tab which has size 126229 bytes
Table names: ['Table 1']


In [3]:
observations = tab.excel_ref('C8').expand(DOWN).expand(RIGHT).is_not_blank()
observations

{<F15 41652.0>, <E28 '-'>, <F17 112267.0>, <O16 3113.0>, <I32 15.812645915912698>, <L35 17.024458680158855>, <M27 841.37384>, <M14 132744.0>, <O23 4908.0>, <N20 3901.0>, <J23 10176.0>, <G33 15.200159992069787>, <N21 2749.0>, <O27 2156.6508>, <H29 2380.1972699999997>, <M32 18.100775313235854>, <C10 '-'>, <L21 4493.0>, <H26 1589.06936>, <N10 4359.0>, <I11 160181.0>, <O20 2986.0>, <C23 10120.0>, <O33 43.42745413906284>, <G8 83983.0>, <L26 2384.6333899999995>, <H27 791.12791>, <K35 16.452641503082802>, <N8 139193.0>, <I26 1623.6266699999999>, <D29 1960.23622>, <N17 189005.0>, <C15 36651.0>, <K8 117405.0>, <J15 55052.0>, <J8 107389.0>, <K22 '-'>, <G21 5097.0>, <C29 1515.73007>, <H34 '-'>, <E23 12531.0>, <E10 '-'>, <H15 49533.0>, <G22 '-'>, <F26 1532.29648>, <G28 '-'>, <D11 109112.0>, <C34 '-'>, <N22 363.0>, <K9 60842.0>, <F10 '-'>, <J11 167764.0>, <N28 273.45128000000005>, <G14 79708.0>, <K17 167468.0>, <H22 '-'>, <G35 17.857434144454448>, <G32 19.45338699498708>, <K32 18.43856496742047>, <

In [4]:
category = tab.excel_ref('A').expand(DOWN).by_index([7,13,19,25,31])
category

{<A25 'Net Ingredient Cost (£ 000s)'>, <A7 'Prescription items'>, <A31 'Average Net Ingredient Cost per item (£)'>, <A13 'Prescribed in primary care'>, <A19 'Prescribed in NHS hospitals'>}

In [5]:
treatment = tab.excel_ref('A8').expand(DOWN).is_not_blank() - category
treatment

{<A40 2.0>, <A8 'Acamprosate Calcium'>, <A34 'Nalmefene6'>, <A33 'Disulfiram'>, <A16 '   Nalmefene6'>, <A27 'Disulfiram'>, <A49 'Copyright © 2017, re-used with the permission of NHS Prescription Services'>, <A21 'Disulfiram'>, <A29 'Total'>, <A43 5.0>, <A35 'Total'>, <A47 'Prescribing Analysis and Cost (PACT) from NHS Prescription Services of the Business Service Authority. Health and Social Care Information Centre'>, <A15 'Disulfiram'>, <A26 'Acamprosate Calcium'>, <A17 'Total'>, <A9 'Disulfiram'>, <A32 'Acamprosate Calcium'>, <A22 '   Nalmefene6'>, <A14 'Acamprosate Calcium'>, <A44 6.0>, <A50 'Copyright © 2017. Health and Social Care Information Centre, Lifestyles Statistics. All rights reserved.'>, <A42 4.0>, <A23 'Total'>, <A46 'Source'>, <A10 'Nalmefene6'>, <A39 1.0>, <A20 'Acamprosate Calcium'>, <A11 'Total'>, <A41 3.0>, <A28 'Nalmefene6'>, <A38 'Footnotes'>}

In [6]:
period = tab.excel_ref('C5').expand(RIGHT).is_not_blank() 
period

{<K5 2012.0>, <L5 '20136'>, <O5 2016.0>, <C5 2004.0>, <I5 2010.0>, <D5 2005.0>, <H5 2009.0>, <F5 '20075'>, <G5 '20085'>, <J5 2011.0>, <E5 2006.0>, <M5 '2014'>, <N5 2015.0>}

In [7]:
Dimensions = [
            HDim(category,'Category',CLOSEST,ABOVE),
            HDim(treatment,'Treatment of alcohol dependence',DIRECTLY,LEFT),
            HDim(period,'Period',DIRECTLY,ABOVE),
            HDimConst('Unit','People'),
            HDimConst('Measure Type','Count')
            ]

In [8]:
c1 = ConversionSegment(observations, Dimensions, processTIMEUNIT=True)
# if is_interactive():
#     savepreviewhtml(c1)

In [9]:
new_table = c1.topandas()
new_table




Unnamed: 0,OBS,DATAMARKER,Category,Treatment of alcohol dependence,Period,Unit,Measure Type
0,66863,,Prescription items,Acamprosate Calcium,2004.0,People,Count
1,66851,,Prescription items,Acamprosate Calcium,2005.0,People,Count
2,70216,,Prescription items,Acamprosate Calcium,2006.0,People,Count
3,75842,,Prescription items,Acamprosate Calcium,20075,People,Count
4,83983,,Prescription items,Acamprosate Calcium,20085,People,Count
5,94921,,Prescription items,Acamprosate Calcium,2009.0,People,Count
6,102679,,Prescription items,Acamprosate Calcium,2010.0,People,Count
7,107389,,Prescription items,Acamprosate Calcium,2011.0,People,Count
8,117405,,Prescription items,Acamprosate Calcium,2012.0,People,Count
9,126558,,Prescription items,Acamprosate Calcium,20136,People,Count


In [10]:
new_table = new_table[new_table['OBS'] != '' ]

In [11]:
new_table.columns = ['Value' if x=='OBS' else x for x in new_table.columns]

In [12]:
new_table.head()

Unnamed: 0,Value,DATAMARKER,Category,Treatment of alcohol dependence,Period,Unit,Measure Type
0,66863,,Prescription items,Acamprosate Calcium,2004.0,People,Count
1,66851,,Prescription items,Acamprosate Calcium,2005.0,People,Count
2,70216,,Prescription items,Acamprosate Calcium,2006.0,People,Count
3,75842,,Prescription items,Acamprosate Calcium,20075.0,People,Count
4,83983,,Prescription items,Acamprosate Calcium,20085.0,People,Count


In [13]:
new_table['Treatment of alcohol dependence'] = new_table['Treatment of alcohol dependence'].map(
    lambda x: {
        'Total' : 'All' 
        }.get(x, x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
new_table.head()

Unnamed: 0,Value,DATAMARKER,Category,Treatment of alcohol dependence,Period,Unit,Measure Type
0,66863,,Prescription items,Acamprosate Calcium,2004.0,People,Count
1,66851,,Prescription items,Acamprosate Calcium,2005.0,People,Count
2,70216,,Prescription items,Acamprosate Calcium,2006.0,People,Count
3,75842,,Prescription items,Acamprosate Calcium,20075.0,People,Count
4,83983,,Prescription items,Acamprosate Calcium,20085.0,People,Count


In [15]:
new_table.tail()

Unnamed: 0,Value,DATAMARKER,Category,Treatment of alcohol dependence,Period,Unit,Measure Type
255,16.4526,,Average Net Ingredient Cost per item (£),All,2012.0,People,Count
256,17.0245,,Average Net Ingredient Cost per item (£),All,20136.0,People,Count
257,17.5885,,Average Net Ingredient Cost per item (£),All,2014.0,People,Count
258,20.027,,Average Net Ingredient Cost per item (£),All,2015.0,People,Count
259,25.8412,,Average Net Ingredient Cost per item (£),All,2016.0,People,Count


In [16]:
new_table.dtypes

Value                              object
DATAMARKER                         object
Category                           object
Treatment of alcohol dependence    object
Period                             object
Unit                               object
Measure Type                       object
dtype: object

In [17]:
new_table['Value'] = new_table['Value'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
new_table['Period'] = pd.to_numeric(new_table['Period'], errors='coerce').fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
new_table['Period'] = new_table['Period'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [20]:
new_table.head(3)

Unnamed: 0,Value,DATAMARKER,Category,Treatment of alcohol dependence,Period,Unit,Measure Type
0,66863,,Prescription items,Acamprosate Calcium,2004,People,Count
1,66851,,Prescription items,Acamprosate Calcium,2005,People,Count
2,70216,,Prescription items,Acamprosate Calcium,2006,People,Count


In [21]:
def user_perc(x,y):
    
    if (str(x) ==  'Net Ingredient Cost (£ 000s)') | (str(x) == 'Average Net Ingredient Cost per item (£)'): 
        
        return 'GBP Total'
    else:
        return y
    
new_table['Measure Type'] = new_table.apply(lambda row: user_perc(row['Category'], row['Measure Type']), axis = 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [22]:
def user_perc(x,y):
    
    if str(x) ==  'Net Ingredient Cost (£ 000s)': 
        
        return '£ 000s'
    else:
        return y
    
new_table['Unit'] = new_table.apply(lambda row: user_perc(row['Category'], row['Unit']), axis = 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [23]:
def user_perc(x,y):
    
    if str(x) ==  'Average Net Ingredient Cost per item (£)': 
        
        return '£'
    else:
        return y
    
new_table['Unit'] = new_table.apply(lambda row: user_perc(row['Category'], row['Unit']), axis = 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [24]:
new_table = new_table[['Period','Category','Treatment of alcohol dependence','Measure Type','Value','Unit']]

In [25]:
if is_interactive():
    SubstancetinationFolder = Path('out')
    SubstancetinationFolder.mkdir(exist_ok=True, parents=True)
    new_table.to_csv(SubstancetinationFolder / ('table1.csv'), index = False)

In [26]:
new_table.head()

Unnamed: 0,Period,Category,Treatment of alcohol dependence,Measure Type,Value,Unit
0,2004,Prescription items,Acamprosate Calcium,Count,66863,People
1,2005,Prescription items,Acamprosate Calcium,Count,66851,People
2,2006,Prescription items,Acamprosate Calcium,Count,70216,People
3,20075,Prescription items,Acamprosate Calcium,Count,75842,People
4,20085,Prescription items,Acamprosate Calcium,Count,83983,People
