In [1]:
from gssutils import *

# Landing page of the ONS dataset; later cells read the distribution and the
# dataset metadata through this scraper object.
landing_page = (
    'https://www.ons.gov.uk/businessindustryandtrade/business/businessinnovation/datasets/'
    'foreigndirectinvestmentinvolvingukcompanies2013inwardtables'
)
scraper = Scraper(landing_page)
scraper



## Foreign direct investment involving UK companies: inward

Annual statistics on the investment of foreign companies into the UK, including for investment flows, positions and earnings.

### Distributions

1. Foreign direct investment involving UK companies: inward ([MS Excel Spreadsheet](https://www.ons.gov.uk/file?uri=/businessindustryandtrade/business/businessinnovation/datasets/foreigndirectinvestmentinvolvingukcompanies2013inwardtables/current/annualforeigndirectinvestment2017inward.xls))


In [2]:
# Index every worksheet of the spreadsheet by its tab name so that later cells
# can pick a sheet directly, e.g. sheets['2.1'].
tabs = scraper.distribution().as_databaker()
sheets = dict((tab.name, tab) for tab in tabs)
sheets.keys()

dict_keys(['Contents', '1.1', '1.2', '1.3', '2.1', '2.2', '2.3', '3.1', '3.2', '3.3', '4.1', '4.2', '4.3', 'Geography', 'SIC'])

Sheets 1.1 to 1.3 are summary tables that share the same layout, so all three are processed by the same code.

In [3]:
from IPython.display import display, HTML

def toint(s):
    """Convert a cell value to an int, returning None when it is not numeric.

    The value is parsed with float() first so that inputs such as '2008.0'
    are accepted; int() then truncates any fractional part.

    Args:
        s: the value to convert (string or number).

    Returns:
        The integer value, or None if `s` cannot be converted (wrong type,
        non-numeric text, NaN or infinity).
    """
    try:
        return int(float(s))
    # Only conversion failures mean "not a number"; anything else (for example
    # KeyboardInterrupt) must propagate instead of being silently swallowed.
    except (TypeError, ValueError, OverflowError):
        return None

# Tidy each summary sheet with the same steps and keep the resulting frame
# under the label found in the first non-blank cell below A4.
summary_tables = {}
for sheet_name in ['1.1', '1.2', '1.3']:
    summary = sheets[sheet_name]
    anchor = summary.excel_ref('A4')

    # Years run along the row to the right of A4, labels down the column below it.
    years = anchor.fill(RIGHT).is_not_blank()
    components = anchor.fill(DOWN).is_not_blank()

    # The first label names the table itself; it and everything below the
    # "Total..." row are not components.
    component_label = components.by_index(1)
    below_total = components.regex('^Total').fill(DOWN)
    components = components - below_total - component_label

    obs = components.fill(RIGHT).is_not_blank()
    dimensions = [
        HDim(components, 'Component', DIRECTLY, LEFT),
        HDim(years, 'Year', DIRECTLY, ABOVE),
    ]
    cs = ConversionSegment(obs, dimensions)

    table = cs.topandas()
    table['Year'] = table['Year'].map(toint)
    table['Value'] = table['OBS'].map(toint)
    table['Unit'] = '£ million'
    # DATAMARKER is dropped only when the conversion produced that column.
    unwanted = ['OBS'] + (['DATAMARKER'] if 'DATAMARKER' in table else [])
    table = table.drop(columns=unwanted)

    title = component_label.value
    summary_tables[title] = table
    display(HTML(f'<h2>{title}</h2>'))
    display(table)




Unnamed: 0,Component,Year,Value,Unit
0,Foreign companies’ share of UK subsidiaries' a...,2008,44050,£ million
1,Foreign companies’ share of UK subsidiaries' a...,2009,41061,£ million
2,Foreign companies’ share of UK subsidiaries' a...,2010,37679,£ million
3,Foreign companies’ share of UK subsidiaries' a...,2011,42878,£ million
4,Foreign companies’ share of UK subsidiaries' a...,2012,41516,£ million
5,Foreign companies’ share of UK subsidiaries' a...,2013,47560,£ million
6,Foreign companies’ share of UK subsidiaries' a...,2014,45074,£ million
7,Foreign companies’ share of UK subsidiaries' a...,2015,45651,£ million
8,Foreign companies’ share of UK subsidiaries' a...,2016,47038,£ million
9,Foreign companies’ share of UK subsidiaries' a...,2017,53637,£ million





Unnamed: 0,Component,Year,Value,Unit
0,Foreign companies’ share of UK companies’ shar...,2008,552981.0,£ million
1,Foreign companies’ share of UK companies’ shar...,2009,543846.0,£ million
2,Foreign companies’ share of UK companies’ shar...,2010,606659.0,£ million
3,Foreign companies’ share of UK companies’ shar...,2011,651758.0,£ million
4,Foreign companies’ share of UK companies’ shar...,2012,781127.0,£ million
5,Foreign companies’ share of UK companies’ shar...,2013,812739.0,£ million
6,Foreign companies’ share of UK companies’ shar...,2014,892250.0,£ million
7,Foreign companies’ share of UK companies’ shar...,2015,917519.0,£ million
8,Foreign companies’ share of UK companies’ shar...,2016,1076221.0,£ million
9,Foreign companies’ share of UK companies’ shar...,2017,1207732.0,£ million





Unnamed: 0,Component,Year,Value,Unit
0,Foreign companies’ share of UK subsidiaries’ a...,2008,34875.0,£ million
1,Foreign companies’ share of UK subsidiaries’ a...,2009,33310.0,£ million
2,Foreign companies’ share of UK subsidiaries’ a...,2010,30381.0,£ million
3,Foreign companies’ share of UK subsidiaries’ a...,2011,33784.0,£ million
4,Foreign companies’ share of UK subsidiaries’ a...,2012,35297.0,£ million
5,Foreign companies’ share of UK subsidiaries’ a...,2013,40756.0,£ million
6,Foreign companies’ share of UK subsidiaries’ a...,2014,41538.0,£ million
7,Foreign companies’ share of UK subsidiaries’ a...,2015,41335.0,£ million
8,Foreign companies’ share of UK subsidiaries’ a...,2016,43185.0,£ million
9,Foreign companies’ share of UK subsidiaries’ a...,2017,51018.0,£ million


Sheet 2.1 breakdown by country/area. First of all we need to get a breakdown of the geographic and economic areas from the `Geography` sheet, noting that the breakdown is not a strict hierarchy (e.g. Norway is in Europe, EFTA & OECD).

In [4]:
# Classify the label cells of the 'Geography' sheet into areas, sub areas and
# countries. The groups are told apart by cell formatting (borders, bold) and
# by their position relative to known area labels.
geog = sheets['Geography']
# Area labels: non-blank cells in column B from B3 downwards that pass
# is_not_any_border() (e.g. 'Europe' in the preview).
areas = geog.excel_ref('B3').expand(DOWN).is_not_any_border().is_not_blank()
# Drop cells starting with 'Information' or 'From 2013' -- presumably footnotes
# rather than area names; confirm against the sheet.
areas = areas - areas.regex('^Information') - areas.regex('^From 2013')
# Sub-area labels: non-blank cells in column B that pass is_any_border()
# (e.g. 'EU'). NOTE(review): nothing in this selection restricts it to Europe
# despite the name -- confirm that only European sub areas are bordered in column B.
europe_areas = geog.excel_ref('B3').expand(DOWN).is_any_border().is_not_blank()
# Countries of those sub areas: bordered, non-blank cells to the right of / below
# the sub-area labels, minus everything from the 'The Americas' label downwards
# and rightwards.
europe_countries = europe_areas.fill(RIGHT).expand(DOWN).is_any_border().is_not_blank() \
    - areas.regex('The Americas').expand(DOWN).expand(RIGHT)
# Everything below the 'Asia' label and to the right of column B, stopping
# before the 'Australasia' label.
asia_countries_and_areas = areas.regex('Asia').fill(DOWN).fill(RIGHT).is_not_blank() \
    - areas.regex('Australasia').expand(DOWN).expand(RIGHT)
# Within the Asia block, non-bold cells are treated as countries and the
# remaining (bold) cells as sub areas.
asia_countries = asia_countries_and_areas.is_not_bold()
asia_areas = asia_countries_and_areas - asia_countries
# Remaining countries: bordered cells right of / below 'The Americas' that have
# not already been classified above.
other_countries = areas.regex('The Americas').fill(RIGHT).fill(DOWN).is_any_border().is_not_blank() \
  - europe_countries - asia_countries - asia_areas
# Preview all six selections to check the classification visually.
savepreviewhtml([areas, europe_areas, europe_countries, asia_countries, asia_areas, other_countries])

0,1,2,3,4,5
item 0,item 1,item 2,item 3,item 4,item 5

0,1,2,3,4,5,6,7
Definitions of geographic and economic areas,,,,,,,
,,,,,,,
,Europe,,,,,,
,EU,Austria,Belgium,Bulgaria,Croatia,,
,,Cyprus,Czech Republic,Denmark,Estonia,,
,,Finland,France,Germany,Greece,,
,,Hungary,Irish Republic,Italy,Latvia,,
,,Lithuania,Luxembourg,Malta,Netherlands,,
,,Poland,Portugal,Romania,Slovakia,,
,,Slovenia,Spain,Sweden,,,


In [5]:
def _country_lookup(segment):
    """Return the segment as a frame whose 'Country' column holds the cell text.

    The selected cells contain country names, which the conversion places in
    the DATAMARKER column; OBS is dropped and DATAMARKER renamed to 'Country'.
    """
    frame = segment.topandas()
    frame = frame.drop(columns=['OBS'])
    return frame.rename(columns={'DATAMARKER': 'Country'})


# European countries sit under a sub area (e.g. EU) as well as an area.
cs_europe = ConversionSegment(europe_countries, [
    HDim(europe_areas, 'Sub area', CLOSEST, ABOVE),
    HDim(areas, 'Area', CLOSEST, ABOVE),
])
europe = _country_lookup(cs_europe)

# Asian countries are grouped the same way, using the Asia sub-area labels.
cs_asia = ConversionSegment(asia_countries, [
    HDim(asia_areas, 'Sub area', CLOSEST, ABOVE),
    HDim(areas, 'Area', CLOSEST, ABOVE),
])
asia = _country_lookup(cs_asia)

# The remaining countries only have an area, no sub area.
cs_others = ConversionSegment(other_countries, [
    HDim(areas, 'Area', CLOSEST, ABOVE),
])
others = _country_lookup(cs_others)

all_areas = pd.concat([europe, asia, others], sort=False)
all_areas






Unnamed: 0,Country,Sub area,Area
0,Austria,EU,Europe
1,Belgium,EU,Europe
2,Bulgaria,EU,Europe
3,Croatia,EU,Europe
4,Cyprus,EU,Europe
5,Czech Republic,EU,Europe
6,Denmark,EU,Europe
7,Estonia,EU,Europe
8,Finland,EU,Europe
9,France,EU,Europe


In [6]:
by_area = sheets['2.1']

# Year headers are the non-empty cells of row 5.
header_row = by_area.excel_ref('A5').expand(RIGHT)
years = header_row.is_not_blank().is_not_whitespace()

# Column A holds the area labels; anything below 'WORLD TOTAL' is discarded.
column_a = by_area.excel_ref('A5').expand(DOWN).is_not_blank()
areas = column_a - column_a.regex('WORLD TOTAL').fill(DOWN)

# Column B holds the sub-area labels, ignoring the 'of which' separator cells
# and anything below 'CENTRAL & EASTERN EUROPE'.
column_b = by_area.excel_ref('B5').expand(DOWN).is_not_blank()
column_b = column_b.filter(lambda cell: cell.value.strip() != 'of which')
sub_areas = column_b - column_b.regex('CENTRAL & EASTERN EUROPE').fill(DOWN)

# Column C holds the country labels.
countries = by_area.excel_ref('C5').expand(DOWN).is_not_blank().is_not_whitespace()

# Observations are the cells under a year header that lie on a labelled row.
labelled_rows = (sub_areas | areas | countries).expand(RIGHT)
obs = years.fill(DOWN)
obs = obs & labelled_rows

dimensions = [
    HDim(years, 'Year', DIRECTLY, ABOVE),
    HDim(areas, 'Area', CLOSEST, ABOVE),
    HDim(sub_areas, 'Sub area', CLOSEST, ABOVE),
    HDim(sub_areas, 'Direct sub area', DIRECTLY, LEFT),
    HDim(countries, 'Country', DIRECTLY, LEFT),
]
cs = ConversionSegment(obs, dimensions)
savepreviewhtml(cs)

0,1,2,3,4,5
OBS,Year,Area,Sub area,Direct sub area,Country

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
,,,,,,,,,,,,,,,,,,,,,
"2.1 Net foreign direct investment flows into the United Kingdom analysed by area and main country, 2008 to 2017 (Directional)",,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,£ million,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,
,,,2008.0,2009.0,2010.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,
EUROPE,,,22115.0,22584.0,8957.0,-28258.0,22830.0,9456.0,-8.0,-12305.0,135463.0,36203.0,,,,,,,,,
,EU,,21267.0,15181.0,-922.0,-23682.0,15381.0,-466.0,2911.0,-13807.0,129768.0,24291.0,,,,,,,,,
,,AUSTRIA,75.0,89.0,170.0,876.0,-74.0,21.0,-87.0,32.0,32.0,-40.0,,,,,,,,,


Some of the labels don't match the hierarchy: where a row has no sub area of its own, the closest sub area above is looked up, and that lookup can go past the area label into the previous area.

In [7]:
t_by_area = cs.topandas()

# Rows with neither a country nor a sub area on their own row are area-level
# rows. Their 'Sub area' came from the CLOSEST/ABOVE lookup, which may have
# reached into the previous area, so it is cleared.
is_area_row = t_by_area['Country'].isnull() & t_by_area['Direct sub area'].isnull()
t_by_area.loc[is_area_row, 'Sub area'] = None


def _join_area_labels(row):
    """Join the area, sub-area and country labels present on a row with ':'."""
    labels = [row['Area'], row['Sub area'], row['Country']]
    # pd.notna() skips a missing label whether pandas stored it as None or NaN;
    # the previous `!= None` test would let NaN through and fail on .strip().
    return ':'.join(label.strip() for label in labels if pd.notna(label))


t_by_area['Area'] = t_by_area.apply(_join_area_labels, axis=1)
t_by_area.drop(columns=['DATAMARKER', 'Sub area', 'Direct sub area', 'Country'], inplace=True)
t_by_area.rename(columns={'OBS': 'Value'}, inplace=True)

# ignore non-numeric values for now. TODO: figure out how to represent "data markers"
# float('nan') replaces pd.np.nan: the pd.np alias was deprecated in pandas 1.0
# and removed in pandas 2.0.
t_by_area.replace('', float('nan'), inplace=True)
t_by_area.dropna(subset=['Value'], inplace=True)

# Values and years arrive as text or floats (e.g. 2008.0); store both as integers.
t_by_area[['Value', 'Year']] = t_by_area[['Value', 'Year']].astype(float).astype(int)

t_by_area




Unnamed: 0,Value,Year,Area
0,22115,2008,EUROPE
1,22584,2009,EUROPE
2,8957,2010,EUROPE
3,-28258,2011,EUROPE
4,22830,2012,EUROPE
5,9456,2013,EUROPE
6,-8,2014,EUROPE
7,-12305,2015,EUROPE
8,135463,2016,EUROPE
9,36203,2017,EUROPE


Sheet 2.2 is "Foreign direct investment flows into the United Kingdom analysed by area & main country and by component, 2014 to 2017 (Directional)"

In [8]:
# Build the conversion segment for sheet 2.2. Each label row is followed by
# further rows for the other years, so area / sub area / country are all
# looked up CLOSEST/ABOVE, and the label selections are padded with
# (possibly blank) cells so a lookup cannot run past the enclosing label.
by_area_and_component = sheets['2.2']
# Area labels: non-empty cells in column A from A5 down, ignoring everything
# below 'WORLD TOTAL'.
areas = by_area_and_component.excel_ref('A5').expand(DOWN).is_not_blank().is_not_whitespace()
areas = areas - areas.regex('WORLD TOTAL').fill(DOWN)
# A label cell directly below another label cell is the second line of a
# two-line label; keep only the first line in the selection.
areas = areas - areas.shift(DOWN) # remove double lines

# this dimension's labels can be split over two lines; detect this by taking any non-blank line
# underneath another label and overriding the line above with the concatenation of the two.
dim_areas = HDim(areas, 'Area', CLOSEST, ABOVE)
for next_row in areas.shift(DOWN).is_not_blank():
    override = next_row.shift(UP).value + ' ' + next_row.value
    dim_areas.AddCellValueOverride(next_row.shift(UP), override)

# Sub-area labels: non-empty cells in column B, excluding the 'of which'
# separator cells and everything below 'CENTRAL & EASTERN'.
sub_areas = by_area_and_component.excel_ref('B5').expand(DOWN).is_not_blank().is_not_whitespace().filter(lambda c: c.value.strip() != 'of which')
sub_areas = sub_areas - sub_areas.regex('CENTRAL & EASTERN').fill(DOWN)
sub_areas = sub_areas - sub_areas.shift(DOWN) # remove double lines
# add potentially blank labels to the right of area so that the eventual lookup doesn't go
# above these labels.
sub_areas = sub_areas | areas.shift(RIGHT)

# same double-line problem with this dimension, so use the same trick
# NOTE(review): this loop and the area loop filter with is_not_blank() only,
# while the country loop below also applies is_not_whitespace() -- confirm
# whether whitespace-only cells can occur underneath these labels.
dim_sub_areas = HDim(sub_areas, 'Sub area', CLOSEST, ABOVE)
for next_row in sub_areas.shift(DOWN).is_not_blank():
    override = next_row.shift(UP).value + ' ' + next_row.value
    dim_sub_areas.AddCellValueOverride(next_row.shift(UP), override)

# Country labels: non-empty cells in column C, first line of each label only.
countries = by_area_and_component.excel_ref('C5').expand(DOWN).is_not_blank().is_not_whitespace()
countries = countries - countries.shift(DOWN)
# add potentially blank labels to the right of the sub-area labels so that eventual lookup
# doesn't go above them
countries = countries | sub_areas.shift(RIGHT)

# Two-line country labels are merged with the same override approach.
dim_countries = HDim(countries, 'Country', CLOSEST, ABOVE)
for next_row in countries.shift(DOWN).is_not_blank().is_not_whitespace():
    override = next_row.shift(UP).value + ' ' + next_row.value
    dim_countries.AddCellValueOverride(next_row.shift(UP), override)

# Years are listed down column D, one row per year; the observations are the
# non-blank cells to their right.
years = by_area_and_component.excel_ref('D5').expand(DOWN).is_not_blank().is_not_whitespace()
obs = years.fill(RIGHT).is_not_blank()
# Row 6 holds the component headers and row 5 the trading-entity headers above them.
components = by_area_and_component.excel_ref('A6').fill(RIGHT).is_not_blank().is_not_whitespace()
trading_entities = by_area_and_component.excel_ref('A5').fill(RIGHT).is_not_blank().is_not_whitespace()

# A trading-entity header covers several component columns, hence the
# CLOSEST/LEFT lookup for that dimension.
cs = ConversionSegment(
    obs, [
        HDim(years, 'Year', DIRECTLY, LEFT),
        dim_areas,
        dim_sub_areas,
        HDim(components, 'Component', DIRECTLY, ABOVE),
        HDim(trading_entities, 'Trading Entity', CLOSEST, LEFT),
        dim_countries
    ]
)
savepreviewhtml(cs)

0,1,2,3,4,5,6
OBS,Year,Area,Sub area,Component,Trading Entity,Country

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"2.2 Foreign direct investment flows into the United Kingdom analysed by area & main country and by component, 2014 to 2017 (Directional)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,£ million,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,UK subsidiaries and associates,,,,,,,UK branches,Total net foreign direct investment in the UK,,,,,,,,,,,,,,,,,,
,,,,Foreign parent companies' share of UK companies' net profits,Less dividends paid to foreign parent companies,Unremitted profits (reinvested earnings),Acquisition of UK companies' share & loan capital,Disposal of UK companies' share & loan capital,Increase in amounts due to foreign parents on inter-company account,,Increase in amounts due to foreign parents on branch head-office account,,,,,,,,,,,,,,,,,,,
EUROPE,,,2014.0,24822.0,26410.0,-1587.0,12108.0,-5539.0,-5200.0,,210.0,-8.0,,,,,,,,,,,,,,,,,,
,,,2015.0,20759.0,29119.0,-8360.0,11948.0,-2092.0,..,,..,-12305.0,,,,,,,,,,,,,,,,,,
,,,2016.0,19540.0,20938.0,-1398.0,136377.0,-8108.0,8882.0,,-290.0,135463.0,,,,,,,,,,,,,,,,,,
,,,2017.0,22059.0,22106.0,-47.0,27994.0,-1971.0,9619.0,,608.0,36203.0,,,,,,,,,,,,,,,,,,


Todo:

* `trading_entities` has a merged cell in "Total net foreign direct investment in the UK" which needs a special case.

In [9]:
t_by_area_and_component = cs.topandas()


def _join_nonblank_area_labels(row):
    """Join the non-empty area, sub-area and country labels of a row with ':'."""
    labels = [row['Area'], row['Sub area'], row['Country']]
    # Labels can be missing (None/NaN) or blank: the sub-area and country
    # selections are padded with potentially blank cells. pd.notna() covers both
    # kinds of missing value, which the previous `!= None` test did not.
    return ':'.join(
        label.strip() for label in labels
        if pd.notna(label) and label.strip() != '')


t_by_area_and_component['Area'] = t_by_area_and_component.apply(_join_nonblank_area_labels, axis=1)
t_by_area_and_component.drop(columns=['DATAMARKER', 'Sub area', 'Country'], inplace=True)
t_by_area_and_component.rename(columns={'OBS': 'Value'}, inplace=True)

# The total column's only header is in the trading-entity row, so the lookup
# reports that header as the trading entity. Use it as the component instead
# and mark the trading entity as 'All'.
total_label = 'Total net foreign direct investment in the UK'
is_total = t_by_area_and_component['Trading Entity'] == total_label
t_by_area_and_component.loc[is_total, 'Component'] = total_label
t_by_area_and_component.loc[is_total, 'Trading Entity'] = 'All'

# ignore non-numeric values for now. TODO: figure out how to represent "data markers"
# float('nan') replaces pd.np.nan: the pd.np alias was deprecated in pandas 1.0
# and removed in pandas 2.0.
t_by_area_and_component.replace('', float('nan'), inplace=True)
t_by_area_and_component.dropna(subset=['Value'], inplace=True)

# Values and years arrive as text or floats (e.g. 2014.0); store both as integers.
t_by_area_and_component[['Value', 'Year']] = t_by_area_and_component[['Value', 'Year']].astype(float).astype(int)

t_by_area_and_component




Unnamed: 0,Value,Year,Area,Component,Trading Entity
0,24822,2014,EUROPE,Foreign parent companies' share of UK companie...,UK subsidiaries and associates
1,26410,2014,EUROPE,Less dividends paid to foreign parent companies,UK subsidiaries and associates
2,-1587,2014,EUROPE,Unremitted profits (reinvested earnings),UK subsidiaries and associates
3,12108,2014,EUROPE,Acquisition of UK companies' share & loan capital,UK subsidiaries and associates
4,-5539,2014,EUROPE,Disposal of UK companies' share & loan capital,UK subsidiaries and associates
5,-5200,2014,EUROPE,Increase in amounts due to foreign parents on ...,UK subsidiaries and associates
6,210,2014,EUROPE,Increase in amounts due to foreign parents on ...,UK branches
7,-8,2014,EUROPE,Total net foreign direct investment in the UK,All
8,20759,2015,EUROPE,Foreign parent companies' share of UK companie...,UK subsidiaries and associates
9,29119,2015,EUROPE,Less dividends paid to foreign parent companies,UK subsidiaries and associates


In [10]:
from pathlib import Path
from datetime import datetime

# Write every tidied table as a CSV file under ./out, followed by the dataset
# metadata as dataset.trig in the same folder.
destinationFolder = Path('out')
destinationFolder.mkdir(exist_ok=True, parents=True)

# One file per summary table, named after the pathified table label.
for label, frame in summary_tables.items():
    csv_name = f'{pathify(label.strip())}.csv'
    frame.to_csv(destinationFolder / csv_name, index=False)

breakdowns = [
    ('fdi-net-by-area.csv', t_by_area),
    ('fdi-net-by-area-and-component.csv', t_by_area_and_component),
]
for csv_name, frame in breakdowns:
    frame.to_csv(destinationFolder / csv_name, index=False)

# Fill in the metadata that is set by hand here before serialising it.
scraper.dataset.family = 'trade'
scraper.dataset.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'

trig_path = destinationFolder / 'dataset.trig'
with open(trig_path, 'wb') as trig_file:
    trig_file.write(scraper.generate_trig())