In [1]:
%run scrape_ons.ipynb

metadata = scrape('https://www.ons.gov.uk/businessindustryandtrade/business/businessinnovation/datasets/' \
                  'foreigndirectinvestmentinvolvingukcompanies2013inwardtables')
metadata

{'title': 'Foreign direct investment involving UK companies: Inward tables',
 'releaseDate': datetime.date(2017, 12, 1),
 'mailto': 'mailto:fdi@ons.gov.uk',
 'fileURL': 'https://www.ons.gov.uk/file?uri=/businessindustryandtrade/business/businessinnovation/datasets/foreigndirectinvestmentinvolvingukcompanies2013inwardtables/current/annualforeigndirectinvestmentinward2016.xls',
 'about': 'Inward datasets including data for flows, positions and earnings.'}

In [2]:
# Download the spreadsheet named in the scraped metadata to in/data.xls and
# index its tabs by sheet name.
sourceFolder = Path('in')
sourceFolder.mkdir(exist_ok=True)
inputFile = sourceFolder / 'data.xls'
response = session.get(metadata['fileURL'])
# Fail loudly on an HTTP error instead of saving an error page as data.xls and
# hitting a confusing parse failure later in loadxlstabs.
# NOTE(review): assumes `session` returns requests-style responses -- confirm.
response.raise_for_status()
with open(inputFile, 'wb') as f:
    f.write(response.content)
sheets = {sheet.name: sheet for sheet in loadxlstabs(inputFile)}

Loading in/data.xls which has size 724992 bytes
Table names: ['Contents', '1.1', '1.2', '1.3', '2.1', '2.2', '2.3', '3.1', '3.2', '3.3', '4.1', '4.2', '4.3', 'Geography', 'SIC']


Sheets 1.1 to 1.3 are summary tables that all share the same layout, so they are processed in the same way.

In [3]:
from IPython.display import display, HTML

def toint(s):
    """Convert ``s`` to an ``int`` by way of ``float``, or return ``None``.

    Going through ``float`` first lets values such as ``2007.0`` or the string
    ``'2007.0'`` convert; any fractional part is truncated towards zero.

    Returns ``None`` when ``s`` cannot be converted: empty or unparsable text,
    ``None`` and other non-numeric objects, NaN, or infinity.
    """
    try:
        return int(float(s))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are swallowed; a bare ``except`` would also
        # hide KeyboardInterrupt/SystemExit and genuine programming errors.
        return None

# Sheets 1.1, 1.2 and 1.3 are handled by one pass each; every resulting frame
# is stored in `summary_tables` under the label taken from its own sheet.
summary_tables = {}
for summary in [sheets[tab] for tab in ('1.1', '1.2', '1.3')]:
    # Year headers are taken from row 4 (right of A4); row labels from
    # column A (below A4).
    years = summary.excel_ref('A4').fill(RIGHT).is_not_blank()
    components = summary.excel_ref('A4').fill(DOWN).is_not_blank()

    # The first non-blank label is used as the table's heading, not as a
    # component.
    component_label = components.by_index(1)
    # Remove the heading and the labels below any cell matching "^Total".
    components = components - components.regex('^Total').fill(DOWN) - component_label

    obs = components.fill(RIGHT).is_not_blank()
    cs = ConversionSegment(
        obs,
        [
            HDim(components, 'Component', DIRECTLY, LEFT),
            HDim(years, 'Year', DIRECTLY, ABOVE),
        ],
    )

    # Build the typed columns, then discard the raw ones they replace.
    table = cs.topandas()
    table['Year'] = table['Year'].map(toint)
    table['Value'] = table['OBS'].map(toint)
    table['Unit'] = '£ million'
    # DATAMARKER is not always present, so only drop the columns that exist.
    to_drop = [column for column in ('OBS', 'DATAMARKER') if column in table]
    table = table.drop(columns=to_drop)

    summary_tables[component_label.value] = table
    display(HTML(f'<h2>{component_label.value}</h2>'))
    display(table)




Unnamed: 0,Component,Year,Value,Unit
0,Foreign companies’ share of UK subsidiaries' a...,2007,47521,£ million
1,Foreign companies’ share of UK subsidiaries' a...,2008,44050,£ million
2,Foreign companies’ share of UK subsidiaries' a...,2009,41061,£ million
3,Foreign companies’ share of UK subsidiaries' a...,2010,37679,£ million
4,Foreign companies’ share of UK subsidiaries' a...,2011,42878,£ million
5,Foreign companies’ share of UK subsidiaries' a...,2012,41516,£ million
6,Foreign companies’ share of UK subsidiaries' a...,2013,47560,£ million
7,Foreign companies’ share of UK subsidiaries' a...,2014,45074,£ million
8,Foreign companies’ share of UK subsidiaries' a...,2015,45651,£ million
9,Foreign companies’ share of UK subsidiaries' a...,2016,49029,£ million





Unnamed: 0,Component,Year,Value,Unit
0,Foreign companies’ share of UK companies’ shar...,2007,494638.0,£ million
1,Foreign companies’ share of UK companies’ shar...,2008,552981.0,£ million
2,Foreign companies’ share of UK companies’ shar...,2009,543846.0,£ million
3,Foreign companies’ share of UK companies’ shar...,2010,606659.0,£ million
4,Foreign companies’ share of UK companies’ shar...,2011,651758.0,£ million
5,Foreign companies’ share of UK companies’ shar...,2012,781127.0,£ million
6,Foreign companies’ share of UK companies’ shar...,2013,812739.0,£ million
7,Foreign companies’ share of UK companies’ shar...,2014,892250.0,£ million
8,Foreign companies’ share of UK companies’ shar...,2015,917519.0,£ million
9,Foreign companies’ share of UK companies’ shar...,2016,1092732.0,£ million





Unnamed: 0,Component,Year,Value,Unit
0,Foreign companies’ share of UK subsidiaries’ a...,2007,43324.0,£ million
1,Foreign companies’ share of UK subsidiaries’ a...,2008,34875.0,£ million
2,Foreign companies’ share of UK subsidiaries’ a...,2009,33310.0,£ million
3,Foreign companies’ share of UK subsidiaries’ a...,2010,30381.0,£ million
4,Foreign companies’ share of UK subsidiaries’ a...,2011,33784.0,£ million
5,Foreign companies’ share of UK subsidiaries’ a...,2012,35297.0,£ million
6,Foreign companies’ share of UK subsidiaries’ a...,2013,40756.0,£ million
7,Foreign companies’ share of UK subsidiaries’ a...,2014,41538.0,£ million
8,Foreign companies’ share of UK subsidiaries’ a...,2015,41335.0,£ million
9,Foreign companies’ share of UK subsidiaries’ a...,2016,45184.0,£ million


Sheet 2.1: breakdown by area and main country

In [4]:
by_area = sheets['2.1']

# Header selections: years run along row 5; columns A, B and C (from row 5
# down) supply the continent, area and country labels respectively.
years = by_area.excel_ref('A5').expand(RIGHT).is_not_blank().is_not_whitespace()

continents = by_area.excel_ref('A5').expand(DOWN).is_not_blank()
# Discard the column-A labels below the "WORLD TOTAL" cell.
continents = continents - continents.regex('WORLD TOTAL').fill(DOWN)

areas = (
    by_area.excel_ref('B5')
    .expand(DOWN)
    .is_not_blank()
    .filter(lambda c: c.value.strip() != 'of which')  # skip "of which" cells
)
# Discard the column-B labels below the "CENTRAL & EASTERN EUROPE" cell.
areas = areas - areas.regex('CENTRAL & EASTERN EUROPE').fill(DOWN)

countries = by_area.excel_ref('C5').expand(DOWN).is_not_blank().is_not_whitespace()

# Observations are the cells below a year header that also lie on the row of
# one of the selected continent/area/country labels.
obs = years.fill(DOWN) & (continents | areas | countries).expand(RIGHT)

# NOTE(review): with CLOSEST/ABOVE, a continent-level row has no area of its
# own and may pick up whichever area label lies above it -- confirm this is
# handled when the table is post-processed.
cs = ConversionSegment(
    obs,
    [
        HDim(years, 'Year', DIRECTLY, ABOVE),
        HDim(continents, 'Continent', CLOSEST, ABOVE),
        HDim(areas, 'Area', CLOSEST, ABOVE),
        HDim(countries, 'Country', DIRECTLY, LEFT),
    ],
)
savepreviewhtml(cs)

0,1,2,3,4
OBS,Year,Continent,Area,Country

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"2.1 Net foreign direct investment flows into the United Kingdom analysed by area and main country, 2007 to 2016 (Directional)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,£ million,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,2007.0,2008.0,2009.0,2010.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
EUROPE,,,38565.0,22115.0,22584.0,8957.0,-28258.0,22830.0,9456.0,-8.0,-12305.0,97807.0,,,,,,,,,,,,,,,,,,
,EU,,32508.0,21267.0,15181.0,-922.0,-23682.0,15381.0,-466.0,2911.0,-13807.0,92022.0,,,,,,,,,,,,,,,,,,
,,AUSTRIA,223.0,75.0,89.0,170.0,876.0,-74.0,21.0,-87.0,32.0,-75.0,,,,,,,,,,,,,,,,,,


In [5]:
# Flatten the sheet 2.1 conversion segment into a pandas table and display it.
t_by_area = cs.topandas()
t_by_area




Unnamed: 0,OBS,DATAMARKER,Year,Continent,Area,Country
0,38565,,2007.0,EUROPE,,
1,22115,,2008.0,EUROPE,,
2,22584,,2009.0,EUROPE,,
3,8957,,2010.0,EUROPE,,
4,-28258,,2011.0,EUROPE,,
5,22830,,2012.0,EUROPE,,
6,9456,,2013.0,EUROPE,,
7,-8,,2014.0,EUROPE,,
8,-12305,,2015.0,EUROPE,,
9,97807,,2016.0,EUROPE,,
