In [1]:
from gssutils import *

# One scraper per direction of investment, each pointed at the ONS landing page
# for the corresponding "Foreign direct investment involving UK companies" dataset.
inward_scraper = Scraper(
    'https://www.ons.gov.uk/businessindustryandtrade/business/businessinnovation/datasets/'
    'foreigndirectinvestmentinvolvingukcompanies2013inwardtables'
)
outward_scraper = Scraper(
    'https://www.ons.gov.uk/businessindustryandtrade/business/businessinnovation/datasets/'
    'foreigndirectinvestmentinvolvingukcompaniesoutwardtables'
)

# Render what each scraper found (title, description, distributions).
display(inward_scraper)
display(outward_scraper)



## Foreign direct investment involving UK companies: inward

Annual statistics on the investment of foreign companies into the UK, including for investment flows, positions and earnings.

### Distributions

1. Foreign direct investment involving UK companies: inward ([MS Excel Spreadsheet](https://www.ons.gov.uk/file?uri=/businessindustryandtrade/business/businessinnovation/datasets/foreigndirectinvestmentinvolvingukcompanies2013inwardtables/current/annualforeigndirectinvestment2017inward.xls))


## Foreign direct investment involving UK companies: outward

Annual statistics on the investment of UK companies abroad, including for investment flows, positions and earnings.

### Distributions

1. Foreign direct investment involving UK companies: outward ([MS Excel Spreadsheet](https://www.ons.gov.uk/file?uri=/businessindustryandtrade/business/businessinnovation/datasets/foreigndirectinvestmentinvolvingukcompaniesoutwardtables/current/annualforeigndirectinvestment2017outward.xls))


Collect all tabs from both spreadsheets into one dictionary that maps `(tab name, direction)` to the tab.

In [2]:
# Key every tab by (stripped tab name, direction); inward tabs are inserted
# first, then outward tabs, each in the order the spreadsheet yields them.
sheets = {
    (sheet.name.strip(), direction): sheet
    for direction, scraper in (('inward', inward_scraper), ('outward', outward_scraper))
    for sheet in scraper.distribution().as_databaker()
}
print(list(sheets.keys()))

[('Contents', 'inward'), ('1.1', 'inward'), ('1.2', 'inward'), ('1.3', 'inward'), ('2.1', 'inward'), ('2.2', 'inward'), ('2.3', 'inward'), ('3.1', 'inward'), ('3.2', 'inward'), ('3.3', 'inward'), ('4.1', 'inward'), ('4.2', 'inward'), ('4.3', 'inward'), ('Geography', 'inward'), ('SIC', 'inward'), ('Contents', 'outward'), ('1.1', 'outward'), ('1.2', 'outward'), ('1.3', 'outward'), ('2.1', 'outward'), ('2.2', 'outward'), ('2.3', 'outward'), ('3.1', 'outward'), ('3.2', 'outward'), ('3.3', 'outward'), ('4.1', 'outward'), ('4.2', 'outward'), ('4.3', 'outward'), ('Geography', 'outward'), ('SIC', 'outward')]


A common issue is that a dimension label is split over more than one cell.
The following function does a rudimentary search for these splits in a bag. It returns a list of
`(cell, replacement label)` pairs, along with the extraneous continuation cells to remove
from the bag (`None` if no split was found).

In [3]:
def split_overrides(bag, splits):
    """Find labels that have been split vertically over several cells.

    For every cell in ``bag`` and every candidate in ``splits``, check whether
    the cell holds the first fragment and the cells directly below it hold the
    remaining fragments, in order.

    Parameters
    ----------
    bag : iterable of cells
        Each cell must expose ``.value`` and ``.shift(DOWN)``, and cells must
        support ``|`` to be combined into a single selection.
    splits : iterable of tuple of str
        Each tuple lists the fragments of one label, top to bottom.
        Empty tuples are ignored.

    Returns
    -------
    (overrides, to_remove)
        ``overrides`` is a list of ``(first_cell, joined_label)`` pairs, where
        the joined label is the fragments separated by single spaces.
        ``to_remove`` is the ``|``-union of all continuation cells (every
        fragment cell except the first), or ``None`` if nothing matched.
    """
    overrides = []
    to_remove = None
    for split in splits:
        if not split:
            # An empty split would trivially "match" every cell.
            continue
        for cell in bag:
            current = cell
            continuation = []
            found = True
            for position, fragment in enumerate(split):
                if position > 0:
                    # Only step down when there is another fragment to check,
                    # so the cell *below* the last fragment is never touched
                    # and a label ending on the bottom row can still match.
                    try:
                        current = current.shift(DOWN)
                    except Exception:
                        # No cell below (e.g. edge of the sheet): not a match.
                        found = False
                        break
                    continuation.append(current)
                value = current.value
                # Non-text cells (e.g. numbers) can never be a label fragment.
                if not isinstance(value, str) or value.strip() != fragment:
                    found = False
                    break
            if found:
                overrides.append((cell, ' '.join(split)))
                for extra in continuation:
                    to_remove = extra if to_remove is None else to_remove | extra
    return (overrides, to_remove)

In [4]:
# Build one tidy DataFrame per data tab (2.x, 3.x, 4.x, for both directions)
# and concatenate them into `observations`.
tables = []
for (name, direction), sheet in sheets.items():
    # Only tabs named "<major>.<minor>" with major 2, 3 or 4 are processed;
    # everything else (e.g. 'Contents', 'Geography', 'SIC', '1.x') is skipped.
    if '.' not in name:
        continue
    major, minor = name.split('.')
    if major not in ['2', '3', '4']:
        continue
    display(f'Processing tab {name}: {direction}')
    dims = []
    # Anchor cells: the single '£ million' cell and a cell matching 'EUROPE'
    # (by_index(1) presumably picks the first match - TODO confirm indexing base).
    top_right = sheet.filter('£ million')
    top_right.assert_one()
    left_top = sheet.filter('EUROPE').by_index(1)
    # Column headers: non-blank cells in the region selected by filling up and
    # right from `left_top`, intersected with the region below the cells at and
    # to the left of the '£ million' cell.
    top_row = (left_top.fill(UP).fill(RIGHT) & top_right.expand(LEFT).fill(DOWN)).is_not_blank().is_not_whitespace()
    dims.append(HDim(top_row, 'top', DIRECTLY, ABOVE))
    # The rounding footnote marks the end of the data; everything from the row
    # above it downwards is excluded from the selections below.
    bottom = sheet.filter('The sum of constituent items may not always agree exactly with the totals shown due to rounding.')
    bottom.assert_one()
    bottom_block = bottom.shift(UP).expand(RIGHT).expand(DOWN)
    # Area labels: non-blank cells in the column of `left_top` and the two
    # columns to its right, from `left_top` downwards, minus the footnote block.
    left_col = (left_top | left_top.shift(RIGHT) | left_top.shift(RIGHT) \
                .shift(RIGHT)).expand(DOWN).is_not_blank().is_not_whitespace() - bottom_block
    # 'of which' cells are not area labels.
    left_col = left_col - left_col.filter('of which')
    # Fix up labels that have been split over vertically adjacent cells: the
    # first cell gets the joined label, the continuation cells are removed.
    overrides, to_remove = split_overrides(left_col, [
        ('OTHER', 'EUROPEAN', 'COUNTRIES'),
        ('OTHER EUROPEAN', 'COUNTRIES'),
        ('UK OFFSHORE', 'ISLANDS'),
        ('NEAR & MIDDLE EAST', 'COUNTRIES'),
        ('NEAR & MIDDLE', 'EAST COUNTRIES'),
        ('AUSTRALASIA &', 'OCEANIA'),
        ('AUSTRALASIA', '& OCEANIA'),
        ('CENTRAL & EASTERN', 'EUROPE'),
        ('CENTRAL &', 'EASTERN', 'EUROPE'),
        ('GULF ARABIAN', 'COUNTRIES'),
        ('OTHER ASIAN', 'COUNTRIES')
    ])
    # `to_remove` is None when no split label was found in this tab.
    if to_remove:
        left_col = left_col - to_remove
    left_dim = HDim(left_col, 'ONS FDI Area', CLOSEST, UP)
    for cell, replace in overrides:
        left_dim.AddCellValueOverride(cell, replace)
    # Also, "IRELAND" should be "IRISH REPUBLIC"
    left_dim.AddCellValueOverride('IRELAND', 'IRISH REPUBLIC')
    dims.append(left_dim)
    if minor != '1':
        # x.2 and x.3 tabs carry the year in a column: take the first numeric
        # cell > 1900 to the right of `left_top` and the numeric cells below it.
        year_col = left_top.fill(RIGHT).is_number().filter(lambda x: x.value > 1900).by_index(1).expand(DOWN).is_number() - bottom_block
        dims.append(HDim(year_col, 'Year', DIRECTLY, LEFT))
        obs = year_col.fill(RIGHT) & top_row.fill(DOWN)
    else:
        # x.1 tabs have no year column; the header row holds the years (see the
        # rename of 'top' to 'Year' below).
        obs = left_col.fill(RIGHT) & top_row.fill(DOWN)
    cs = ConversionSegment(obs, dims, includecellxy=True)
    table = cs.topandas()
    # Drop every row that has a DATAMARKER value (presumably non-numeric
    # observations - TODO confirm), then the marker column itself.
    table.drop(table[table['DATAMARKER'].notna()].index, inplace=True)
    table.drop(columns=['DATAMARKER'], inplace=True)
    table.rename(columns={'OBS': 'Value'}, inplace=True)
    # Area labels become pathified codes prefixed with 'fdi/'.
    table['ONS FDI Area'] = table['ONS FDI Area'].map(lambda x: 'fdi/' + pathify(x.strip()))
    if minor == '1':
        # top header row is year
        table.rename(columns={'top': 'Year'}, inplace=True)
    # Years are converted via float so that values such as '2008.0' parse.
    table['Year'] = table['Year'].map(lambda x: int(float(x)))
    if minor != '2':
        # x.1 and x.3 tabs report a single total component, which depends on
        # the major tab number and the investment direction.
        table['FDI Component'] = {
            '2': {'outward': 'total-net-fdi-abroad',
                  'inward': 'total-net-fdi-in-the-uk'},
            '3': {'outward': 'total-net-fdi-international-investment-position-abroad-at-end-period',
                  'inward': 'total-net-fdi-international-investment-position-in-the-uk-at-end-period'},
            '4': {'outward': 'total-net-fdi-earnings-abroad',
                  'inward': 'total-net-fdi-earnings-in-the-uk'}
        }.get(major).get(direction)
    else:
        # x.2 tabs: the header row holds the FDI component.
        table.rename(columns={'top': 'FDI Component'}, inplace=True)
        table['FDI Component'] = table['FDI Component'].map(pathify)
    if minor == '3':
        # x.3 tabs: the header row holds the industry; 'Total' maps to 'all-activities'.
        table.rename(columns={'top': 'SIC Industry'}, inplace=True)
        table['SIC Industry'] = table['SIC Industry'].map(
            lambda x: pathify(x) if x != 'Total' else 'all-activities'
        )
    else:
        table['SIC Industry'] = 'all-activities'
    # Disambiguate FDI Component between tabs 2.2 and 4.2: 2.2 components get
    # an 'fdi-' prefix and 4.2 components an 'earnings-fdi-' prefix, while
    # totals have 'foreign-direct-investment' shortened to 'fdi'.
    if name == '2.2':
        table['FDI Component'] = table['FDI Component'].map(
            lambda x: 'fdi-' + x if not x.startswith('total-net-foreign-direct-investment-') else
            'total-net-fdi-' + x[len('total-net-foreign-direct-investment-'):]
        )
    elif name == '4.2':
        table['FDI Component'] = table['FDI Component'].map(
            lambda x: 'earnings-fdi-' + x if not x.startswith('total-net-') else
            'total-net-' + x[len('total-net-'):].replace('foreign-direct-investment', 'fdi')
        )
    # Years before 2012 are labelled 'BPM5'; 2012 onwards 'BPM6'.
    table['International Trade Basis'] = table['Year'].map(lambda year: 'BPM5' if year < 2012 else 'BPM6')
    table['Investment Direction'] = direction
    tables.append(table)

# Stack all per-tab tables; columns are kept in first-seen order (sort=False).
observations = pd.concat(tables, sort=False)
observations

'Processing tab 2.1: inward'




'Processing tab 2.2: inward'




'Processing tab 2.3: inward'




'Processing tab 3.1: inward'




'Processing tab 3.2: inward'




'Processing tab 3.3: inward'




'Processing tab 4.1: inward'




'Processing tab 4.2: inward'




'Processing tab 4.3: inward'




'Processing tab 2.1: outward'




'Processing tab 2.2: outward'




'Processing tab 2.3: outward'




'Processing tab 3.1: outward'




'Processing tab 3.2: outward'




'Processing tab 3.3: outward'




'Processing tab 4.1: outward'




'Processing tab 4.2: outward'




'Processing tab 4.3: outward'




Unnamed: 0,Value,Year,ONS FDI Area,__x,__y,__tablename,FDI Component,SIC Industry,International Trade Basis,Investment Direction
0,22115,2008,fdi/europe,3,7,2.1,total-net-fdi-in-the-uk,all-activities,BPM5,inward
1,22584,2009,fdi/europe,4,7,2.1,total-net-fdi-in-the-uk,all-activities,BPM5,inward
2,8957,2010,fdi/europe,5,7,2.1,total-net-fdi-in-the-uk,all-activities,BPM5,inward
3,-28258,2011,fdi/europe,6,7,2.1,total-net-fdi-in-the-uk,all-activities,BPM5,inward
4,22830,2012,fdi/europe,7,7,2.1,total-net-fdi-in-the-uk,all-activities,BPM6,inward
5,9456,2013,fdi/europe,8,7,2.1,total-net-fdi-in-the-uk,all-activities,BPM6,inward
6,-8,2014,fdi/europe,9,7,2.1,total-net-fdi-in-the-uk,all-activities,BPM6,inward
7,-12305,2015,fdi/europe,10,7,2.1,total-net-fdi-in-the-uk,all-activities,BPM6,inward
8,135463,2016,fdi/europe,11,7,2.1,total-net-fdi-in-the-uk,all-activities,BPM6,inward
9,36203,2017,fdi/europe,12,7,2.1,total-net-fdi-in-the-uk,all-activities,BPM6,inward


In [5]:
# HTML is imported from the public IPython.display module rather than the
# internal IPython.core.display path.
from IPython.display import HTML

# Convert every dimension column (everything except 'Value') to a categorical
# and list its distinct values under a heading, as a quick sanity check of the
# codes produced above.
for col in observations:
    if col != 'Value':
        observations[col] = observations[col].astype('category')
        # The closing tag was previously emitted as '</h3' (missing '>').
        display(HTML(f'<h3>{col}</h3>'))
        display(observations[col].cat.categories)

Int64Index([2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017], dtype='int64')

Index(['fdi/africa', 'fdi/asia', 'fdi/australasia-oceania', 'fdi/australia',
       'fdi/austria', 'fdi/belgium', 'fdi/bermuda', 'fdi/brazil',
       'fdi/bulgaria', 'fdi/canada', 'fdi/central-eastern-europe', 'fdi/chile',
       'fdi/china', 'fdi/colombia', 'fdi/croatia', 'fdi/cyprus',
       'fdi/czech-republic', 'fdi/denmark', 'fdi/efta', 'fdi/estonia',
       'fdi/eu', 'fdi/europe', 'fdi/finland', 'fdi/france', 'fdi/germany',
       'fdi/greece', 'fdi/gulf-arabian-countries', 'fdi/hong-kong',
       'fdi/hungary', 'fdi/india', 'fdi/indonesia', 'fdi/irish-republic',
       'fdi/italy', 'fdi/japan', 'fdi/kenya', 'fdi/latvia', 'fdi/lithuania',
       'fdi/luxembourg', 'fdi/malaysia', 'fdi/malta', 'fdi/mexico',
       'fdi/near-middle-east-countries', 'fdi/netherlands', 'fdi/new-zealand',
       'fdi/nigeria', 'fdi/norway', 'fdi/oecd', 'fdi/other-asian-countries',
       'fdi/other-european-countries', 'fdi/panama', 'fdi/poland',
       'fdi/portugal', 'fdi/romania', 'fdi/russia', 'fdi

Int64Index([3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
            21, 22, 23],
           dtype='int64')

Int64Index([  5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
            ...
            268, 269, 270, 271, 272, 273, 274, 275, 276, 277],
           dtype='int64', length=273)

Index(['2.1', '2.2', '2.3', '3.1', '3.2', '3.2 ', '3.3', '4.1', '4.1 ', '4.2',
       '4.2 ', '4.3'],
      dtype='object')

Index(['amount-due-to-foreign-parent-companies-on-branch-head-office-account-at-end-period',
       'amount-due-to-foreign-parent-companies-on-inter-company-account-at-end-period',
       'amount-due-to-uk-parent-companies-on-branch-head-office-account-at-end-period',
       'amount-due-to-uk-parent-companies-on-inter-company-account-at-end-period',
       'earnings-fdi-foreign-companies-share-of-uk-companies-net-profits',
       'earnings-fdi-net-interest-accrued-by-uk-companies',
       'earnings-fdi-net-interest-accrued-to-foreign-parent-companies',
       'earnings-fdi-net-profits',
       'earnings-fdi-uk-companies-share-of-foreign-companies-net-profits',
       'fdi-acquisition-of-foreign-companies-share-loan-capital',
       'fdi-acquisition-of-uk-companies-share-loan-capital',
       'fdi-disposal-of-foreign-companies-share-loan-capital',
       'fdi-disposal-of-uk-companies-share-loan-capital',
       'fdi-foreign-parent-companies-share-of-uk-companies-net-profits',
       'fd

Index(['administrative-and-support-service-activities',
       'agriculture-forest-fishing', 'all-activities',
       'computer-electronic-and-optical-products', 'construction',
       'electricity-gas-water-and-waste', 'financial-services',
       'food-products-beverages-tobacco-products',
       'information-and-communication', 'metal-and-machinery-products',
       'mining-quarrying', 'other-manufacturing', 'other-services',
       'petroleum-chemicals-pharmaceuticals-rubber-plastic-products',
       'professional-scientific-technical-services',
       'retails-wholesale-trade-repair-of-motor-vehicles-motor-cycles',
       'textiles-wood-activities', 'transport-equipment',
       'transportation-storage'],
      dtype='object')

Index(['BPM5', 'BPM6'], dtype='object')

Index(['inward', 'outward'], dtype='object')

In [6]:
# Attach the constant unit / measure-type columns, then put the columns in
# their output order: dimensions, value and attributes, then the cell
# annotation columns.
observations = observations.assign(
    **{'Unit': 'gbp-million', 'Measure Type': 'GBP Total'}
).loc[:, [
    'Investment Direction', 'Year', 'International Trade Basis',
    'ONS FDI Area', 'FDI Component', 'SIC Industry',
    'Value', 'Unit', 'Measure Type',
    '__x', '__y', '__tablename',
]]

In [7]:
# Make sure the output directory exists.
out = Path('out')
out.mkdir(parents=True, exist_ok=True)

# Plain observations: annotation columns removed, then de-duplicated.
(
    observations
    .drop(columns=['__x', '__y', '__tablename'])
    .drop_duplicates()
    .to_csv(out / 'observations.csv', index=False)
)
# Annotated observations: same rows, keeping the source cell position and tab name.
(
    observations
    .drop_duplicates()
    .to_csv(out / 'observations-annotated.csv', index=False)
)

In [8]:
# Re-display the metadata summary of both scrapers as currently scraped.
display(inward_scraper)
display(outward_scraper)

## Foreign direct investment involving UK companies: inward

Annual statistics on the investment of foreign companies into the UK, including for investment flows, positions and earnings.

### Distributions

1. Foreign direct investment involving UK companies: inward ([MS Excel Spreadsheet](https://www.ons.gov.uk/file?uri=/businessindustryandtrade/business/businessinnovation/datasets/foreigndirectinvestmentinvolvingukcompanies2013inwardtables/current/annualforeigndirectinvestment2017inward.xls))


## Foreign direct investment involving UK companies: outward

Annual statistics on the investment of UK companies abroad, including for investment flows, positions and earnings.

### Distributions

1. Foreign direct investment involving UK companies: outward ([MS Excel Spreadsheet](https://www.ons.gov.uk/file?uri=/businessindustryandtrade/business/businessinnovation/datasets/foreigndirectinvestmentinvolvingukcompaniesoutwardtables/current/annualforeigndirectinvestment2017outward.xls))


In [9]:
from gssutils.metadata import THEME

# The inward scraper's dataset metadata is reused for the combined output, so
# its title and description are widened to cover both directions.
inward_scraper.dataset.title = inward_scraper.dataset.title.replace(': inward', '')
# Guard the description rewrite: the replacement text itself contains
# 'into the UK', so re-running this cell without the guard would append
# ' and of UK companies abroad' once more on every run.
if 'into the UK and of UK companies abroad' not in inward_scraper.dataset.comment:
    inward_scraper.dataset.comment = inward_scraper.dataset.comment.replace(
        'into the UK', 'into the UK and of UK companies abroad')

inward_scraper.dataset.theme = THEME['business-industry-trade-energy']
inward_scraper.dataset.family = 'trade'

# Write the dataset metadata next to the CSV output (binary mode: the
# generated trig is written as bytes).
with open(out / 'dataset.trig', 'wb') as metadata:
    metadata.write(inward_scraper.generate_trig())