In [1]:
from gssutils import *

# Build the scraper from the ONS dataset landing page. Leaving `scraper` as the
# cell's last expression renders its summary (title, description, distributions).
scraper = Scraper(
    'https://www.ons.gov.uk/businessindustryandtrade/internationaltrade/datasets/'
    'internationaltradeinservicesreferencetables'
)
scraper



## International trade in services

Detailed breakdown of annual trade in UK services estimates, analysing data by country, product and industry.

### Distributions

1. International trade in services ([MS Excel Spreadsheet](https://www.ons.gov.uk/file?uri=/businessindustryandtrade/internationaltrade/datasets/internationaltradeinservicesreferencetables/alltables2017/internationaltradeinservices2017.xls))


In [2]:
# Load the distribution's sheets as databaker tabs, then show the sheet names
# as a single string so they can be checked at a glance.
tabs = scraper.distribution().as_databaker()
repr([sheet.name for sheet in tabs])

"['Contents', 'Table A0', 'Table B1', 'Table B2', 'Table B3', 'Table C0', 'Table C1 2009-2012', 'Table C1 2013-2017', 'Table C2 2009-2012', 'Table C2 2013-2017', 'Table C3 2009-2012', 'Table C3 2013-2017', 'Table C4 2009-2012', 'Table C4 2013-2017', 'Table C5 2009-2012', 'Table C5 2013-2017', 'Table C6 2009-2012', ' Table C6 2013-2017', 'Table C7 2009-2012', 'Table C7 2013-2017', 'Table D1', 'Table D2']"

In [3]:
def fix_service(row):
    """Work out the service code for one observation row of a 'C' table.

    `row` must provide the 'H1' (group heading) and 'H2' (item label) values;
    both are passed through `pathify` before being compared.

    * Item present, heading is a "total-..." label: the item on its own.
    * Item present, any other heading: "<heading>-<item>".
    * Item empty, heading is the overall total: 'all'.
    * Item empty, heading is another "total-..." label: the heading without
      its "total-" prefix.
    * Item empty and heading is not a total: assertion failure.
    """
    total_prefix = 'total-'
    item = pathify(row['H2'])
    heading = pathify(row['H1'])
    heading_is_total = heading.startswith(total_prefix)

    if item != '':
        # A named line: only qualify it with the heading when the heading is a real group.
        return item if heading_is_total else heading + '-' + item

    # No item label, so this must be one of the "total" rows.
    if heading == 'total-international-trade-in-services':
        return 'all'
    if heading_is_total:
        return heading[len(total_prefix):]
    assert False, 'Service label is empty, expecting some "total" grouping.'
    return item

def fix_title(s):
    """Reduce a sheet title to the code for the thing it describes.

    The title is pathified, then cut at the first "-analysed-by-". For the
    title that is worded "... industry by product ..." instead, everything
    after "-industry" is dropped. Titles matching neither pattern are returned
    pathified but otherwise unchanged.
    """
    slug = pathify(s)
    # Keep only the part before "analysed by ..." (the whole slug if absent).
    slug = slug.partition('-analysed-by-')[0]
    # one title doesn't use "analysed by"
    head, matched, _ = slug.partition('-industry-by-product-')
    if matched:
        slug = head + '-industry'
    return slug

def fix_area(row):
    """Work out the "itis/..." trade-area code for one observation row.

    The pathified 'H2' label is used when it is non-empty, otherwise the
    pathified 'H1' label. The overall total row maps to 'itis/world'; any
    other "total-..." label loses its "total-" prefix.
    """
    slug = pathify(row['H2']) or pathify(row['H1'])
    if slug == 'total-international-trade-in-services':
        return 'itis/world'
    if slug.startswith('total-'):
        slug = slug[len('total-'):]
    return 'itis/' + slug

def process_tab(tab):
    """Convert one reference-table sheet into a tidy observations DataFrame.

    The two-character table code taken from the sheet name (e.g. 'A0', 'C1',
    'D2') decides how the row labels are interpreted:

    * 'A' / 'B' tables: rows are trade areas; the service comes from the sheet
      title and the industry is 'all'.
    * 'C' tables: rows are services; the area is always 'itis/world' and the
      industry comes from the sheet title (except 'C1', which is 'all').
    * any other table (the 'D' tables in this workbook): rows are trade areas;
      the industry comes from the sheet title and the service is the overall
      total.

    Args:
        tab: a databaker tab, as produced by
            ``scraper.distribution().as_databaker()``.

    Returns:
        A DataFrame with the columns 'ONS Trade Areas ITIS', 'Year', 'Flow',
        'ITIS Service', 'ITIS Industry', 'International Trade Basis',
        'Measure Type', 'Value' and 'Unit'. Observations whose cell value is
        not numeric are dropped.

    Note:
        ``bottom_left.assert_one()`` is expected to fail if the
        "total international trade in services" row label is not found
        exactly once on the sheet.
    """
    # 'Table C1 2009-2012' -> 'C1'; strip() copes with stray whitespace in sheet names.
    tab_group = tab.name.strip()[:len('Table XX')][-2:]
    tab_title = tab.excel_ref('A1').fill(RIGHT).is_not_blank().by_index(1).value.strip()
    display(f"Processing '{tab.name}' ({tab_group}) '{tab_title}'")
    # not doing C0 which is a bit different
    # First non-blank cell below A1 marks the top of the row labels.
    top_left = tab.excel_ref('A1').fill(DOWN).is_not_blank().by_index(1)
    # The grand-total row marks the bottom; its capitalisation differs on 'C' tables.
    if tab_group[0] == 'C':
        bottom_left = tab.filter('Total International Trade in Services')
    else:
        bottom_left = tab.filter('TOTAL INTERNATIONAL TRADE IN SERVICES')
    bottom_left.assert_one()
    # Group headings: non-blank cells in the first two columns between top and bottom.
    h1_labels = (top_left.expand(DOWN) & bottom_left.expand(UP)).filter(lambda c: c.value.strip() != '') | \
        (top_left.shift(RIGHT).expand(DOWN) & bottom_left.shift(RIGHT).expand(UP)).filter(lambda c: c.value.strip() != '')
    # Item labels: the same rows, two columns to the right of the first column.
    h2_labels = (top_left.expand(DOWN) & bottom_left.expand(UP)).shift(RIGHT).shift(RIGHT)
    # Years sit in the row immediately above the labels.
    year = top_left.shift(UP).fill(RIGHT).is_not_blank()
    # some flow labels are in a strange place as cells have been merged inconsistently
    flow = top_left.shift(UP).shift(UP).fill(RIGHT).is_not_blank()
    observations = (h2_labels.fill(RIGHT) & year.fill(DOWN)).is_not_blank()
    h1_dim = HDim(h1_labels, 'H1', CLOSEST, ABOVE) # can this be DIRECTLY?
    # Replace selected heading texts with the wording used in the output codes
    # (presumably so the same heading is coded identically across sheets).
    h1_dim.AddCellValueOverride('Total European Union', 'Total European Union (EU)')
    h1_dim.AddCellValueOverride('Total Information Services', 'Total Telecommunication Computer and Information Services Information Services')
    h1_dim.AddCellValueOverride('Total Construction Goods and Services', 'Total Construction Services')
    h2_dim = HDim(h2_labels, 'H2', DIRECTLY, LEFT)
    # Correct a misspelt label in the source workbook.
    h2_dim.AddCellValueOverride('Other techincal services', 'Other technical services')
    cs = ConversionSegment(observations, [
        HDim(year, 'Year', DIRECTLY, ABOVE),
        h1_dim,
        h2_dim,
        HDim(flow, 'Flow', CLOSEST, LEFT),
    ])
    obs = cs.topandas()
    # Non-numeric cells become NaN and are then discarded.
    obs['Value'] = pd.to_numeric(obs['OBS'], errors='coerce')
    obs.dropna(subset=['Value'], inplace=True)
    obs.drop(columns=['OBS'], inplace=True)
    if 'DATAMARKER' in obs:
        obs.drop(columns=['DATAMARKER'], inplace=True)
    # Years may arrive as float-like text (e.g. '2013.0'), hence the float() step.
    obs['Year'] = obs['Year'].apply(lambda y: int(float(y)))
    if tab_group[0] in ['A', 'B']:
        obs['ITIS Industry'] = 'all'
        obs['ITIS Service'] = fix_title(tab_title)
        obs['ONS Trade Areas ITIS'] = obs.apply(fix_area, axis='columns')
    elif tab_group[0] == 'C':
        if tab_group == 'C1':
            obs['ITIS Industry'] = 'all'
        else:
            obs['ITIS Industry'] = fix_title(tab_title)
        obs['ITIS Service'] = obs.apply(fix_service, axis='columns')
        obs['ONS Trade Areas ITIS'] = 'itis/world'
    else:
        # Table D2 has 'Exports' in the wrong place
        if tab_group == 'D2':
            # Assign the result back: an in-place fillna on the column selection
            # does not update `obs` when pandas copy-on-write is active.
            obs['Flow'] = obs['Flow'].fillna('exports')
        obs['ITIS Industry'] = fix_title(tab_title)
        obs['ITIS Service'] = 'total-international-trade-in-services'
        obs['ONS Trade Areas ITIS'] = obs.apply(fix_area, axis='columns')
    obs.drop(columns=['H1', 'H2'], inplace=True)
    obs['Flow'] = obs['Flow'].apply(lambda x: pathify(x.strip()))
    obs['International Trade Basis'] = 'BOP'
    obs['Measure Type'] = 'GBP Total'
    obs['Unit'] = 'gbp-million'
    return obs[['ONS Trade Areas ITIS', 'Year', 'Flow', 'ITIS Service', 'ITIS Industry',
                'International Trade Basis','Measure Type','Value','Unit']]

# Process every sheet except the contents page and Table C0, then stack the
# results. Sheet names are stripped before comparison because the workbook
# contains at least one name with stray leading whitespace
# (' Table C6 2013-2017'); process_tab strips names the same way.
observations = pd.concat(
    process_tab(t)
    for t in tabs
    if t.name.strip() not in ['Contents', 'Table C0']
)

"Processing 'Table A0' (A0) 'Total International Trade in Services (excluding travel, transport and banking) analysed by continents and countries 2013 - 2017'"




"Processing 'Table B1' (B1) 'Technical, trade-related, operational leasing & other business services analysed by continents and countries 2013 - 2017'"




"Processing 'Table B2' (B2) 'Professional, management consulting & R&D services analysed by continents and countries 2013 - 2017'"




"Processing 'Table B3' (B3) 'Merchanting, Other Trade-related and Services between related enterprises analysed by continents and countries 2013 - 2017'"




"Processing 'Table C1 2009-2012' (C1) 'Total International Trade in Services all industries (excluding travel, transport and banking) analysed by product 2009 - 2012'"




"Processing 'Table C1 2013-2017' (C1) 'Total International Trade in Services all industries (excluding travel, transport and banking) analysed by product 2013-2017'"




"Processing 'Table C2 2009-2012' (C2) 'Manufacturing industry analysed by products 2009 - 2012'"




"Processing 'Table C2 2013-2017' (C2) 'Manufacturing industry analysed by products 2013 - 2017'"




"Processing 'Table C3 2009-2012' (C3) 'Wholesale & Retail industry analysed by product 2009 - 2012'"




"Processing 'Table C3 2013-2017' (C3) 'Wholesale & Retail industry analysed by product 2013 - 2017'"




"Processing 'Table C4 2009-2012' (C4) 'Information and Communication industry analysed by products 2009 - 2012'"




"Processing 'Table C4 2013-2017' (C4) 'Information and Communication industry analysed by product 2013 - 2017'"




"Processing 'Table C5 2009-2012' (C5) 'Professional, Scientific and Technical Support industry analysed by products 2009 - 2012'"




"Processing 'Table C5 2013-2017' (C5) 'Professional, Scientific and Technical Support industry analysed by product 2013 - 2017'"




"Processing 'Table C6 2009-2012' (C6) 'Administrative and Support Service Activities industry analysed by products 2009 - 2012'"




"Processing ' Table C6 2013-2017' (C6) 'Administrative and Support Service Activities industry by product 2013 - 2017'"




"Processing 'Table C7 2009-2012' (C7) 'Arts, Entertainment, Recreation and Other Service Activities industry analysed by products 2009 - 2012'"




"Processing 'Table C7 2013-2017' (C7) 'Arts, Entertainment, Recreation and Other Service Activities industry analysed by product 2013 - 2017'"




"Processing 'Table D1' (D1) 'Film Industry (excluding other services) analysed by continents and countries 2013 - 2017'"




"Processing 'Table D2' (D2) 'Television Industry (excluding other services) analysed by continents and countries 2013 - 2017'"




In [4]:
# Store the coded dimensions as pandas categoricals and show the distinct
# codes of each one as a quick sanity check of the label clean-up.
for col in ['ONS Trade Areas ITIS', 'Flow', 'ITIS Service', 'ITIS Industry']:
    as_category = observations[col].astype('category')
    observations[col] = as_category
    display(as_category.cat.categories)

Index(['itis/africa', 'itis/africa-unallocated', 'itis/america',
       'itis/america-unallocated', 'itis/asia', 'itis/asia-unallocated',
       'itis/australasia-and-oceania',
       'itis/australasia-and-oceania-and-total-unallocated',
       'itis/australasia-oceania-and-others',
       'itis/australasia-oceania-and-total-unallocated', 'itis/australia',
       'itis/austria', 'itis/belgium', 'itis/brazil', 'itis/bulgaria',
       'itis/canada', 'itis/channel-islands', 'itis/china', 'itis/croatia',
       'itis/cyprus', 'itis/czech-republic', 'itis/denmark', 'itis/efta',
       'itis/estonia', 'itis/eu-institutions', 'itis/europe',
       'itis/europe-unallocated', 'itis/european-union-eu', 'itis/finland',
       'itis/france', 'itis/germany', 'itis/greece', 'itis/hong-kong',
       'itis/hungary', 'itis/iceland', 'itis/india', 'itis/indonesia',
       'itis/international-organisations', 'itis/irish-republic',
       'itis/isle-of-man', 'itis/israel', 'itis/italy', 'itis/japan',
    

Index(['balance', 'exports', 'imports'], dtype='object')

Index(['agricultural-and-mining-services',
       'agricultural-and-mining-services-agricultural-forestry-and-fishing',
       'agricultural-and-mining-services-mining-and-oil-and-gas-extraction-services',
       'agricultural-mining-and-on-site-processing-services',
       'agricultural-mining-and-on-site-processing-services-agricultural',
       'agricultural-mining-and-on-site-processing-services-mining',
       'agricultural-mining-and-on-site-processing-services-other-on-site-processing-services',
       'agricultural-mining-and-on-site-processing-services-waste-treatment-and-depollution',
       'all', 'business-and-professional-services',
       ...
       'telecommunication-computer-and-information-services-computer-services',
       'telecommunication-computer-and-information-services-information-services',
       'telecommunication-computer-and-information-services-news-agency-services',
       'telecommunication-computer-and-information-services-postal-and-courier',
       '

Index(['administrative-and-support-service-activities-industry', 'all',
       'arts-entertainment-recreation-and-other-service-activities-industry',
       'film-industry-excluding-other-services',
       'information-and-communication-industry', 'manufacturing-industry',
       'professional-scientific-and-technical-support-industry',
       'television-industry-excluding-other-services',
       'wholesale-retail-industry'],
      dtype='object')

In [5]:
# Write the de-duplicated observations to out/observations.csv, creating the
# output directory on first run.
out = Path('out')
out.mkdir(exist_ok=True)
csv_path = out / 'observations.csv'
observations.drop_duplicates().to_csv(csv_path, index=False)

In [6]:
from gssutils.metadata import THEME
# Set the dataset's family and theme, then write the generated TriG metadata
# next to the observations file.
scraper.dataset.family = 'Trade'
scraper.dataset.theme = THEME['business-industry-trade-energy']

with (out / 'dataset.trig').open('wb') as metadata:
    metadata.write(scraper.generate_trig())