In [1]:
from gssutils import *

scraper = Scraper('https://www.ons.gov.uk/businessindustryandtrade/internationaltrade/datasets/' + \
                  'internationaltradeinservicesreferencetables')
scraper



## International trade in services

Detailed breakdown of annual trade in UK services estimates, analysing data by country, product and industry.

### Distributions

1. International trade in services ([MS Excel Spreadsheet](https://www.ons.gov.uk/file?uri=/businessindustryandtrade/internationaltrade/datasets/internationaltradeinservicesreferencetables/alltables2017/internationaltradeinservices2017.xls))


In [2]:
tabs = scraper.distribution().as_databaker()
str([tab.name for tab in tabs])

"['Contents', 'Table A0', 'Table B1', 'Table B2', 'Table B3', 'Table C0', 'Table C1 2009-2012', 'Table C1 2013-2017', 'Table C2 2009-2012', 'Table C2 2013-2017', 'Table C3 2009-2012', 'Table C3 2013-2017', 'Table C4 2009-2012', 'Table C4 2013-2017', 'Table C5 2009-2012', 'Table C5 2013-2017', 'Table C6 2009-2012', ' Table C6 2013-2017', 'Table C7 2009-2012', 'Table C7 2013-2017', 'Table D1', 'Table D2']"

In [3]:
def fix_service(s):
    service = pathify(s)
    if service == 'total-internationl-trade-in-services':
        service = 'all'
    elif service.startswith('total-'):
        service = service[len('total-'):]
    return service

def fix_title(s):
    service = pathify(s)
    pos = service.find('-analysed-by-')
    if pos != -1:
        service = service[:pos]
    # one title doesn't use "analysed by"
    pos = service.find('-industry-by-product-')
    if pos != -1:
        service = service[:pos + len('-industry')]
    return service    

def fix_area(a):
    area = pathify(a)
    if area == 'total-international-trade-in-services':
        area = 'world'
    elif area.startswith('total-'):
        area = area[len('total-'):]
    return f"itis/{area}"

def process_tab(tab):
    tab_group = tab.name.strip()[:len('Table XX')][-2:]
    tab_title = tab.excel_ref('A1').fill(RIGHT).is_not_blank().by_index(1).value.strip()
    display(f"Processing '{tab.name}' ({tab_group}) '{tab_title}'")
    if tab_group in ['C0']:
        top_left = tab.filter('Industry Description').fill(DOWN).is_not_blank().by_index(1)
        bottom_left = tab.filter('Total International Trade in Services')
        row_labels = (top_left.expand(LEFT).expand(DOWN) & bottom_left.expand(RIGHT).expand(UP)).is_not_blank()
    elif tab_group[0] == 'C':
        top_left = tab.excel_ref('A1').fill(DOWN).is_not_blank().by_index(1)
        bottom_left = tab.filter('Total International Trade in Services')
        row_labels = top_left.expand(DOWN) & bottom_left.expand(UP)
        row_labels = (row_labels | row_labels.shift(RIGHT) | row_labels.shift(RIGHT).shift(RIGHT)).is_not_blank()
    else:
        top_left = tab.excel_ref('A1').fill(DOWN).is_not_blank().by_index(1)
        bottom_left = tab.filter('TOTAL INTERNATIONAL TRADE IN SERVICES')
        row_labels = top_left.expand(DOWN) & bottom_left.expand(UP)
        row_labels = (row_labels | row_labels.shift(RIGHT) | row_labels.shift(RIGHT).shift(RIGHT)).is_not_blank()
    assert len(row_labels) > 0
    year = top_left.shift(UP).fill(RIGHT).is_not_blank()
    bottom_left.assert_one()
    flow = year.shift(UP).is_not_blank()
    observations = (row_labels.fill(RIGHT) & year.fill(DOWN)).is_not_blank()
    
    #if tab_group in ['A0', 'B1', 'B2', 'B3', 'D1', 'D2']:
    #    dim_label = 'ONS Trade Areas ITIS'
    #elif tab_group[0] == 'C':
    #    dim_label = 'ITIS Service'
    cs = ConversionSegment(observations, [
        HDim(year, 'Year', DIRECTLY, ABOVE),
        HDim(row_labels, 'Row', DIRECTLY, LEFT),
        HDim(flow, 'Flow', CLOSEST, LEFT),
    ])
    if tab.name == 'Table C1 2013-2017':
        savepreviewhtml(cs)
    obs = cs.topandas()
    obs['Value'] = pd.to_numeric(obs['OBS'], errors='coerce')
    obs.dropna(subset=['Value'], inplace=True)
    obs.drop(columns=['OBS'], inplace=True)
    if 'DATAMARKER' in obs:
        obs.drop(columns=['DATAMARKER'], inplace=True)    
    obs['Year'] = obs['Year'].apply(lambda y: int(float(y)))
    if tab_group[0] in ['A', 'B']:
        obs['ITIS Industry'] = 'all'
        obs['ITIS Service'] = fix_title(tab_title)
        obs['ONS Trade Areas ITIS'] = obs['Row'].apply(fix_area)
    elif tab_group[0] == 'C':
        if tab_group == 'C1':
            obs['ITIS Industry'] = 'all'
        else:
            obs['ITIS Industry'] = fix_title(tab_title)
        obs['ITIS Service'] = obs['Row'].apply(fix_service)
        obs['ONS Trade Areas ITIS'] = 'itis/world'
    else:
        # Table D2 has 'Exports' in the wrong place
        if tab_group == 'D2':
            obs['Flow'].fillna('exports', inplace=True)
        obs['ITIS Industry'] = fix_title(tab_title)
        obs['ITIS Service'] = 'total-international-trade-in-services'
        obs['ONS Trade Areas ITIS'] = obs['Row'].apply(fix_area)
    obs.drop(columns=['Row'], inplace=True)
    obs['Flow'] = obs['Flow'].apply(lambda x: pathify(x.strip()))
    obs['International Trade Basis'] = 'BOP'
    obs['Measure Type'] = 'GBP Total'
    obs['Unit'] = 'gbp-million'
    return obs[['ONS Trade Areas ITIS', 'Year', 'Flow', 'ITIS Service', 'ITIS Industry',
                'International Trade Basis','Measure Type','Value','Unit']]

observations = pd.concat(process_tab(t) for t in tabs if t.name not in ['Contents', 'Table C0'])

"Processing 'Table A0' (A0) 'Total International Trade in Services (excluding travel, transport and banking) analysed by continents and countries 2013 - 2017'"




"Processing 'Table B1' (B1) 'Technical, trade-related, operational leasing & other business services analysed by continents and countries 2013 - 2017'"




"Processing 'Table B2' (B2) 'Professional, management consulting & R&D services analysed by continents and countries 2013 - 2017'"




"Processing 'Table B3' (B3) 'Merchanting, Other Trade-related and Services between related enterprises analysed by continents and countries 2013 - 2017'"




"Processing 'Table C1 2009-2012' (C1) 'Total International Trade in Services all industries (excluding travel, transport and banking) analysed by product 2009 - 2012'"




"Processing 'Table C1 2013-2017' (C1) 'Total International Trade in Services all industries (excluding travel, transport and banking) analysed by product 2013-2017'"

0,1,2,3
OBS,Year,Row,Flow

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38
C1,,"Total International Trade in Services all industries (excluding travel, transport and banking) analysed by product 2013-2017",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,£ million,,,,,,,,,,,,,,,,,,
,,,,Exports,,,,,,Imports,,,,,,Balance,,,,,,,,,,,,,,,,,,,,,,
,,,,2013.0,2014.0,2015.0,2016.0,2017.0,,2013.0,2014.0,2015.0,2016.0,2017.0,,2013.0,2014.0,2015.0,2016.0,2017.0,,,,,,,,,,,,,,,,,,
Agricultural and Mining Services,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,"Agricultural, forestry and fishing",,15.403925574999999,55.231428806,14.7770562240968,15.0,19.0,,25.1651064254,116.04579591,10.39931648642013,12.0,17.0,,-9.761180850400002,-60.81436710399999,4.37773973767667,3.0,2.0,,,,,,,,,,,,,,,,,,
,,Mining and oil and gas extraction services,,1180.2113078504,1459.4531761,1678.5973797551237,1157.0,955.0,,145.05000046109998,69.699277753,168.10593304398992,236.0,204.0,,1035.1613073893002,1389.753898347,1510.4914467111334,921.0,751.0,,,,,,,,,,,,,,,,,,
,Total Agricultural and Mining services,,,1195.6152334254,1514.684604906,1693.3744359792204,1172.0,974.0,,170.2151068865,185.745073663,178.50524953041003,248.0,221.0,,1025.4001265389,1328.939531243,1514.8691864488103,924.0,753.0,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"Manufacturing, Maintenance and On-site Processing Services",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,





"Processing 'Table C2 2009-2012' (C2) 'Manufacturing industry analysed by products 2009 - 2012'"




"Processing 'Table C2 2013-2017' (C2) 'Manufacturing industry analysed by products 2013 - 2017'"




"Processing 'Table C3 2009-2012' (C3) 'Wholesale & Retail industry analysed by product 2009 - 2012'"




"Processing 'Table C3 2013-2017' (C3) 'Wholesale & Retail industry analysed by product 2013 - 2017'"




"Processing 'Table C4 2009-2012' (C4) 'Information and Communication industry analysed by products 2009 - 2012'"




"Processing 'Table C4 2013-2017' (C4) 'Information and Communication industry analysed by product 2013 - 2017'"




"Processing 'Table C5 2009-2012' (C5) 'Professional, Scientific and Technical Support industry analysed by products 2009 - 2012'"




"Processing 'Table C5 2013-2017' (C5) 'Professional, Scientific and Technical Support industry analysed by product 2013 - 2017'"




"Processing 'Table C6 2009-2012' (C6) 'Administrative and Support Service Activities industry analysed by products 2009 - 2012'"




"Processing ' Table C6 2013-2017' (C6) 'Administrative and Support Service Activities industry by product 2013 - 2017'"




"Processing 'Table C7 2009-2012' (C7) 'Arts, Entertainment, Recreation and Other Service Activities industry analysed by products 2009 - 2012'"




"Processing 'Table C7 2013-2017' (C7) 'Arts, Entertainment, Recreation and Other Service Activities industry analysed by product 2013 - 2017'"




"Processing 'Table D1' (D1) 'Film Industry (excluding other services) analysed by continents and countries 2013 - 2017'"




"Processing 'Table D2' (D2) 'Television Industry (excluding other services) analysed by continents and countries 2013 - 2017'"




In [4]:
for col in ['ONS Trade Areas ITIS', 'Flow', 'ITIS Service', 'ITIS Industry']:
    observations[col] = observations[col].astype('category')
    display(observations[col].cat.categories)

Index(['itis/africa', 'itis/africa-unallocated', 'itis/america',
       'itis/america-unallocated', 'itis/asia', 'itis/asia-unallocated',
       'itis/australasia-and-oceania',
       'itis/australasia-and-oceania-and-total-unallocated',
       'itis/australasia-oceania-and-others',
       'itis/australasia-oceania-and-total-unallocated', 'itis/australia',
       'itis/austria', 'itis/belgium', 'itis/brazil', 'itis/bulgaria',
       'itis/canada', 'itis/channel-islands', 'itis/china', 'itis/croatia',
       'itis/cyprus', 'itis/czech-republic', 'itis/denmark', 'itis/efta',
       'itis/estonia', 'itis/eu-institutions', 'itis/europe',
       'itis/europe-unallocated', 'itis/european-union',
       'itis/european-union-eu', 'itis/finland', 'itis/france', 'itis/germany',
       'itis/greece', 'itis/hong-kong', 'itis/hungary', 'itis/iceland',
       'itis/india', 'itis/indonesia', 'itis/international-organisations',
       'itis/irish-republic', 'itis/isle-of-man', 'itis/israel', 'itis/ita

Index(['balance', 'exports', 'imports'], dtype='object')

Index(['accountancy-auditing-bookkeeping-and-tax-consulting-services',
       'advertising',
       'advertising-market-research-and-public-opinion-polling-services',
       'agricultural', 'agricultural-and-mining-services',
       'agricultural-forestry-and-fishing',
       'agricultural-mining-and-on-site-processing-services', 'architectural',
       'architectural-services', 'audio-visual-and-related-services',
       'business-and-professional-services',
       'business-management-and-management-consulting-services',
       'charges-or-payments-for-the-use-of',
       'communcations-and-computer-information-services',
       'communication-computer-and-information-services',
       'communications-services', 'computer-and-information-services',
       'computer-services', 'computers', 'construction-goods-and-services',
       'construction-in-the-uk', 'construction-outside-the-uk',
       'construction-services', 'engineering', 'engineering-services',
       'financial', 'health-

Index(['administrative-and-support-service-activities-industry', 'all',
       'arts-entertainment-recreation-and-other-service-activities-industry',
       'film-industry-excluding-other-services',
       'information-and-communication-industry', 'manufacturing-industry',
       'professional-scientific-and-technical-support-industry',
       'television-industry-excluding-other-services',
       'wholesale-retail-industry'],
      dtype='object')

In [5]:
out = Path('out')
out.mkdir(exist_ok=True)
observations.drop_duplicates().to_csv(out / 'observations.csv', index = False)

In [6]:
from gssutils.metadata import THEME
scraper.dataset.family = 'Trade'
scraper.dataset.theme = THEME['business-industry-trade-energy']

with open(out / 'dataset.trig', 'wb') as metadata:
     metadata.write(scraper.generate_trig())