In [1]:
from gssutils import *

scraper = Scraper('https://www.ons.gov.uk/businessindustryandtrade/business/businessinnovation/datasets/' \
                  'foreigndirectinvestmentinvolvingukcompanies2013inwardtables')
scraper



## Foreign direct investment involving UK companies: inward

Annual statistics on the investment of foreign companies into the UK, including for investment flows, positions and earnings.

### Distributions

1. Foreign direct investment involving UK companies: inward ([MS Excel Spreadsheet](https://www.ons.gov.uk/file?uri=/businessindustryandtrade/business/businessinnovation/datasets/foreigndirectinvestmentinvolvingukcompanies2013inwardtables/current/annualforeigndirectinvestment2017inward.xls))


In [2]:
sheets = {sheet.name: sheet for sheet in scraper.distribution().as_databaker()}
sheets.keys()

dict_keys(['Contents', '1.1', '1.2', '1.3', '2.1', '2.2', '2.3', '3.1', '3.2', '3.3', '4.1', '4.2', '4.3', 'Geography', 'SIC'])

In [3]:
sh = ['2.2','3.2','4.2']

In [4]:
table = pd.DataFrame()

In [5]:
for sh_id in sh:
    by_area = sheets[sh_id]
    years = by_area.excel_ref('D6').expand(DOWN).is_not_blank().is_not_whitespace()
    areas = by_area.excel_ref('A5').expand(DOWN).is_not_blank()
    sub_areas = by_area.excel_ref('B5').expand(DOWN).is_not_blank().filter(lambda c: c.value.strip() != 'of which')
    countries = by_area.excel_ref('C').expand(DOWN).by_index([7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,67,71,75,79,83,87,
                                                            91,95,99,103,107,111,115,119,123,127,131,135,139,143,147,151,155,159,
                                                             163,167,171,175,179,183,187,191,195,199,203,207,211,215,219,223,227])
    components = by_area.excel_ref('E5:M6').expand(RIGHT).is_not_blank().is_not_whitespace()
    components = components - by_area.excel_ref('E5') - by_area.excel_ref('H5') - by_area.excel_ref('L5')                     
    obs = by_area.excel_ref('E7').expand(DOWN).expand(RIGHT).is_not_blank().is_not_whitespace() 
    cs = ConversionSegment(
        obs, [
            HDim(years, 'Year', DIRECTLY, LEFT),
            HDim(areas, 'Area', CLOSEST,ABOVE),
            HDim(sub_areas, 'Sub area', CLOSEST,ABOVE),
            HDim(countries, 'Country', CLOSEST,ABOVE),
            HDimConst('Investment Direction', 'inward'),
            HDim(components,'FDI Component', DIRECTLY, ABOVE),
            HDimConst('SIC Industry', 'all-activities'),
            HDimConst('International Trade Basis', 'BPM5')
        ]
    )
    # savepreviewhtml(cs)
    t_by_area = cs.topandas()    
    
    def user_perc(x,y,z):

        if x.strip() == '':
            if (y == '') | (y == None) | (y == ' '):
                return z
            else :
                return y
        else:
            return x

    t_by_area['ONS FDI Area'] = 'fdi/' + t_by_area.apply(lambda row: user_perc(row['Country'],row['Sub area'],row['Area']), axis = 1)
    t_by_area['ONS FDI Area'] = t_by_area['ONS FDI Area'].str.strip()
    t_by_area.drop(columns=['DATAMARKER','Area', 'Sub area','Country'], inplace=True)
    t_by_area.rename(columns={'OBS': 'Value'}, inplace=True)

    t_by_area.replace('', pd.np.nan, inplace=True)
    t_by_area.dropna(subset=['Value'], inplace=True)

    t_by_area[['Value', 'Year']] = t_by_area[['Value', 'Year']].astype(float).astype(int)

    table = pd.concat([table, t_by_area])






In [6]:
sr = ['2.3','3.3','4.3']

In [7]:
fc = ['total-net-fdi-uk','total-net-fdi-position-uk','total-net-fdi-earnings-uk']

In [8]:
i = 0
for sh_id in sr:
    by_area = sheets[sh_id]
    years = by_area.excel_ref('E6').expand(DOWN).is_not_blank().is_not_whitespace()
    areas = by_area.excel_ref('A6').expand(DOWN).is_not_blank()
    sub_areas = by_area.excel_ref('B6').expand(DOWN).is_not_blank().filter(lambda c: c.value.strip() != 'of which')
    countries = by_area.excel_ref('C').expand(DOWN).by_index([7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,67,71,75,79,83,87,
                                                            91,95,99,103,107,111,115,119,123,127,131,135,139,143,147,151,155,159,
                                                             163,167,171,175,179,183,187,191,195,199,203,207,211,215,219,223,227])
    components = by_area.excel_ref('F5').expand(RIGHT).is_not_blank().is_not_whitespace()
    obs = by_area.excel_ref('F6').expand(DOWN).expand(RIGHT).is_not_blank().is_not_whitespace()
    cs = ConversionSegment(
        obs, [
            HDim(years, 'Year', DIRECTLY, LEFT),
            HDim(areas, 'Area', CLOSEST,ABOVE),
            HDim(sub_areas, 'Sub area', CLOSEST,ABOVE),
            HDim(countries, 'Country', CLOSEST,ABOVE),
            HDimConst('Investment Direction', 'inward'),
            HDim(components,'SIC Industry', DIRECTLY, ABOVE),
            HDimConst('International Trade Basis', 'BPM5')
        ]
    )
    # savepreviewhtml(cs)
    t_by_area = cs.topandas()


    def user_perc(x,y,z):

        if (x == '') | (x == ' ') | (x == None) :
            if (y == '') | (y == None) | (y == ' '):
                return z
            else :
                return y
        else:
            return x

    t_by_area['ONS FDI Area'] = 'fdi/' + t_by_area.apply(lambda row: user_perc(row['Country'],row['Sub area'],row['Area']), axis = 1)
    t_by_area['ONS FDI Area'] = t_by_area['ONS FDI Area'].str.strip()
    t_by_area.drop(columns=['DATAMARKER','Area', 'Sub area','Country'], inplace=True)
    t_by_area.rename(columns={'OBS': 'Value'}, inplace=True)

    t_by_area.replace('', pd.np.nan, inplace=True)
    t_by_area.dropna(subset=['Value'], inplace=True)

    t_by_area[['Value', 'Year']] = t_by_area[['Value', 'Year']].astype(float).astype(int)
    t_by_area['FDI Component'] = fc[i]
    i = i + 1

    table = pd.concat([table, t_by_area])




of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.








In [9]:
st = ['2.1','3.1','4.1']

In [10]:
fc = ['total-net-fdi-uk','total-net-fdi-position-uk','total-net-fdi-earnings-uk']

In [11]:
i = 0
for sh_id in st:
    by_area = sheets[sh_id]
    years = by_area.excel_ref('A5').expand(RIGHT).is_not_blank().is_not_whitespace()
    areas = by_area.excel_ref('A5').expand(DOWN).is_not_blank()
    sub_areas = by_area.excel_ref('B5').expand(DOWN).is_not_blank().filter(lambda c: c.value.strip() != 'of which')
    countries = by_area.excel_ref('C5').expand(DOWN).is_not_blank().is_not_whitespace()
    obs = years.fill(DOWN)
    obs = obs & (sub_areas | areas | countries).expand(RIGHT)
    cs = ConversionSegment(
        obs, [
            HDim(years, 'Year', DIRECTLY, ABOVE),
            HDim(areas, 'Area', DIRECTLY, LEFT),
            HDim(sub_areas, 'Sub area', DIRECTLY, LEFT),
            HDim(countries, 'Country', DIRECTLY, LEFT),
            HDimConst('Investment Direction', 'inward'),
            HDimConst('SIC Industry', 'all-activities'),
            HDimConst('International Trade Basis', 'BPM5')
        ]
    )
    # savepreviewhtml(cs)
    t_by_area = cs.topandas()
    t_by_area['ONS FDI Area'] = t_by_area.apply(
        lambda row: ''.join(
            [s.strip() for s in [row['Area'], row['Sub area'], row['Country']] if s != None]), axis=1)
    t_by_area['ONS FDI Area'] = 'fdi/' + t_by_area['ONS FDI Area']
    t_by_area.drop(columns=['DATAMARKER','Area', 'Sub area','Country'], inplace=True)
    t_by_area.rename(columns={'OBS': 'Value'}, inplace=True)

    t_by_area.replace('', pd.np.nan, inplace=True)
    t_by_area.dropna(subset=['Value'], inplace=True)

    t_by_area[['Value', 'Year']] = t_by_area[['Value', 'Year']].astype(float).astype(int)
    t_by_area['FDI Component'] = fc[i]
    i = i + 1

    table = pd.concat([table, t_by_area])




of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.








In [12]:
table['International Trade Basis'] = ['BPM6' if x>=2012 else 'BPM5' for x in table['Year']]

In [13]:
table['ONS FDI Area'] = table['ONS FDI Area'].str.lower()

In [14]:
table['SIC Industry'] = table['SIC Industry'].map(
    lambda x: {
        'Total' : 'all-activities'
        }.get(x, x))

In [15]:
table['FDI Component'] = table['FDI Component'].map(
    lambda x: {
        "Foreign companies' share of UK companies' net profits" : "Foreign parent companies' share of UK companies' net profits"
        }.get(x, x))

In [16]:
table['ONS FDI Area'] = table['ONS FDI Area'].map(
    lambda x: {
        'fdi/other european' : 'fdi/other european countries',
        'fdi/countries' : 'fdi/other european countries',
        'fdi/uk offshore' : 'fdi/uk offshore islands',
        'fdi/near & middle east' : 'fdi/near & middle east countries',
        'fdi/other asian' : 'fdi/other asian countries',
        'fdi/central &' : 'fdi/central & eastern europe',
        'fdi/eastern' : 'fdi/central & eastern europe',
        'fdi/other' : 'fdi/other european countries',
        'fdi/european' : 'fdi/other european countries',
        'fdi/islands' : 'fdi/uk offshore islands',
        'fdi/near & middle' : 'fdi/near & middle east countries',
        'fdi/east countries' : 'fdi/near & middle east countries',
        'fdi/gulf arabian' : 'fdi/gulf arabian countries'
        }.get(x, x))

In [17]:
table.drop_duplicates(inplace = True)