TRADE IN GOODS STATISTICS: Employee size_Businesses

In [1]:
from databaker.framework import *
import pandas as pd

In [2]:
import requests
from pathlib import Path

# Local cache directory for the source spreadsheet; created on first run.
sourceFolder = Path('in')
sourceFolder.mkdir(exist_ok=True)

# Source workbook URL and the local path it is cached at.
inputURL = 'https://www.uktradeinfo.com/statistics/documents/idbr_ots_tables_2015.xls'
inputFile = sourceFolder / 'idbr_ots_tables_2015.xls'

# Download only when no cached copy exists (is_file() is False for a missing path).
if not inputFile.is_file():
    # Bound the wait so a stalled server cannot hang the notebook indefinitely.
    response = requests.get(inputURL, timeout=60)
    # Fail loudly on 4xx/5xx so an HTTP error page is never written to disk and
    # then silently reused as if it were the spreadsheet on later runs.
    response.raise_for_status()
    inputFile.write_bytes(response.content)

In [3]:
# Load the sheets of the downloaded workbook; loadxlstabs prints the sheet names.
tab = loadxlstabs(inputFile)

Loading in\idbr_ots_tables_2015.xls which has size 174080 bytes
Table names: ['Notes and Contents', 'Industry Group', 'Age Group', 'Employee Size', 'Industry_Age', 'Industry_EmployeeSize', 'EmployeeSize_Age', 'Metadata']


In [4]:
# Select the 'Industry_EmployeeSize' sheet by name rather than by position (it was
# index 5 in the list of table names printed by loadxlstabs), so a reordered
# workbook cannot silently switch the sheet being processed. The isinstance guard
# keeps this cell re-runnable: once `tab` has been narrowed to a single sheet it
# is left alone instead of being indexed a second time.
if isinstance(tab, (list, tuple)):
    matching_sheets = [sheet for sheet in tab if sheet.name == 'Industry_EmployeeSize']
    if not matching_sheets:
        raise ValueError("Sheet 'Industry_EmployeeSize' not found in the workbook")
    tab = matching_sheets[0]

In [5]:
# Render a preview of the selected sheet so the cell layout can be inspected.
savepreviewhtml(tab)

0,1,2,3,4,5,6,7,8,9,10,11,12,13
TRADE IN GOODS STATISTICS MATCHED WITH REGISTERED BUSINESSES FROM THE INTER-DEPARTMENTAL BUSINESS REGISTER,,,,,,,,,,,,,
,,,,,,,,,,,,,
Release Date 24/11/2016,,,,,,,,,,,,,
,,,,,,,,,,,,,
"Total value of UK Trade, Business count and Employee count, by Industry group and Employee Size group for Imports and Exports",,,,,,,,,,,,,
,,,Exports,,,,Imports,,,,,,
,,,2015.0,,,,2015.0,,,,,,
Industry Group,Employees,,Value £ m,Business Count,Employee Count,,Value £ m,Business Count,Employee Count,,,,
,,,,,,,,,,,,,
Group 1 Agriculture and Food,0.0,,121.0,647.0,0.0,,588.0,1841.0,0.0,,,,


In [6]:
# Start from every non-blank cell at or below row 10, from column E rightwards.
# Unwanted columns are removed from this set in the following cells.
observations = (
    tab.excel_ref('E10')
    .expand(DOWN)
    .expand(RIGHT)
    .is_not_blank()
)

In [7]:
# Remove column H (the Imports "Value £ m" column in the sheet preview).
imports_value_cells = tab.excel_ref('H10').expand(DOWN).is_not_blank()
observations = observations - imports_value_cells

In [8]:
# Remove column F (the Exports "Employee Count" column in the sheet preview).
exports_employee_cells = tab.excel_ref('F10').expand(DOWN).is_not_blank()
observations = observations - exports_employee_cells

In [9]:
# Remove column J (the Imports "Employee Count" column in the sheet preview).
imports_employee_cells = tab.excel_ref('J10').expand(DOWN).is_not_blank()
observations = observations - imports_employee_cells

In [10]:
# Industry group labels: the non-blank cells of column A from row 10 downwards.
Industrygroup = (
    tab.excel_ref('A10')
    .expand(DOWN)
    .is_not_blank()
)

In [11]:
# Employee size bands: column B from row 10 downwards (blank cells are kept).
first_band_cell = tab.excel_ref('B10')
employees = first_band_cell.expand(DOWN)

In [12]:
# Flow headers: the non-blank cells of row 6 from column D rightwards
# ("Exports" and "Imports" in the sheet preview).
Flow = (
    tab.excel_ref('D6')
    .expand(RIGHT)
    .is_not_blank()
)

In [13]:
# Dimensions attached to every observation: four constants, followed by three
# values looked up from the sheet relative to each observation cell.
Dimensions = [
            HDimConst('Geography', 'K02000001'),
            HDimConst('Year','2015'),
            HDimConst('Unit', 'Counts'), 
            # NOTE(review): the observations kept by the preceding cells are the
            # "Business Count" columns (E and I), yet the measure is labelled
            # 'Employees' — confirm this label is intended.
            HDimConst('Measure Type','Employees'),            
            # Industry group label from column A, closest at or above the observation's row.
            HDim(Industrygroup, 'Industry', CLOSEST, ABOVE),
            # Exports/Imports header from row 6, closest to the left of the observation.
            HDim(Flow, 'Flow', CLOSEST, LEFT),
            # Employee size band from column B on the observation's own row.
            HDim(employees, 'Total Employees', DIRECTLY, LEFT)
]

In [14]:
# Bundle the observation cells with their dimension definitions into one segment.
# NOTE(review): processTIMEUNIT=True is passed although Dimensions defines no TIME
# dimension — presumably it has no effect here; confirm against databaker.
c1 = ConversionSegment(observations, Dimensions, processTIMEUNIT=True)

In [15]:
# Preview the conversion segment to check the observation and dimension selections.
savepreviewhtml(c1)

0,1,2,3
OBS,Industry,Flow,Total Employees

0,1,2,3,4,5,6,7,8,9,10,11,12,13
TRADE IN GOODS STATISTICS MATCHED WITH REGISTERED BUSINESSES FROM THE INTER-DEPARTMENTAL BUSINESS REGISTER,,,,,,,,,,,,,
,,,,,,,,,,,,,
Release Date 24/11/2016,,,,,,,,,,,,,
,,,,,,,,,,,,,
"Total value of UK Trade, Business count and Employee count, by Industry group and Employee Size group for Imports and Exports",,,,,,,,,,,,,
,,,Exports,,,,Imports,,,,,,
,,,2015.0,,,,2015.0,,,,,,
Industry Group,Employees,,Value £ m,Business Count,Employee Count,,Value £ m,Business Count,Employee Count,,,,
,,,,,,,,,,,,,
Group 1 Agriculture and Food,0.0,,121.0,647.0,0.0,,588.0,1841.0,0.0,,,,


In [16]:
# Convert the conversion segment into a pandas DataFrame for the clean-up steps below.
new_table = c1.topandas()




In [17]:
# Stringify the index labels and expose the observation column under the name 'Value'.
new_table = new_table.rename(index=str, columns={'OBS': 'Value'})

In [18]:
# Services is whatever follows the first 8 characters of the industry label
# (for "Group 1 Agriculture and Food" that is "Agriculture and Food").
industry_text = new_table['Industry'].map(str)
new_table['Services'] = industry_text.str.slice(8)

In [19]:
# Product is the first 8 characters of the industry label
# (for "Group 1 Agriculture and Food" that is "Group 1 ", trimmed in a later cell).
industry_text = new_table['Industry'].map(str)
new_table['Product'] = industry_text.str.slice(0, 8)

In [20]:
# Trim leading and trailing whitespace from every text column, one column at a time.
text_columns = [
    'Services',
    'Product',
    'Year',
    'Flow',
    'Unit',
    'Measure Type',
    'Geography',
    'Total Employees',
]
for column_name in text_columns:
    new_table[column_name] = new_table[column_name].str.strip()

In [21]:
# Keep only the output columns, in the order they should appear in the CSV.
output_columns = [
    'Geography',
    'Year',
    'Product',
    'Services',
    'Flow',
    'Total Employees',
    'Measure Type',
    'Value',
    'Unit',
]
new_table = new_table[output_columns]

In [22]:
# Product holds only the first 8 characters of the industry label, so the
# "Grand Total" row arrives truncated to 'Grand To'. Restore it with an exact-value
# replacement: a substring replace would turn an already-fixed 'Grand Total' into
# 'Grand Totaltal' if this cell were run a second time. Because Product is at most
# 8 characters long, matching the whole value gives the same first-run result.
new_table['Product'] = new_table['Product'].replace('Grand To', 'Grand Total')

# The leftover 'tal9' in Services is presumably the tail of that same label plus a
# footnote marker (TODO confirm against the sheet); it carries no service name.
new_table['Services'] = new_table['Services'].map(lambda cell: cell.replace('tal9', 'NA'))

In [23]:
def user_perc(x):
    """Substitute 'NA' for a string that is empty or made up solely of spaces.

    Only the space character is stripped, so a string containing tabs or
    newlines counts as having content. Any other string is returned unchanged.
    """
    has_content = bool(x.strip(' '))
    return x if has_content else 'NA'
    
# Replace blank entries with 'NA' in the two columns that can legitimately be empty.
# Series.map applies user_perc to each value of the one column directly; the earlier
# row-wise DataFrame.apply(axis=1) built a full row object per record just to read a
# single field, and on an empty frame it returns a DataFrame rather than a Series.
new_table['Services'] = new_table['Services'].map(user_perc)
new_table['Total Employees'] = new_table['Total Employees'].map(user_perc)

In [24]:
# Discard rows whose Value is the empty string.
has_value = new_table['Value'].ne('')
new_table = new_table.loc[has_value]

In [25]:
# Discard rows whose Value is the marker 'S' (presumably a suppressed figure — TODO confirm).
not_marker = new_table['Value'].ne('S')
new_table = new_table.loc[not_marker]

In [26]:
# Spot-check the first rows of the cleaned table.
new_table.head(4)

Unnamed: 0,Geography,Year,Product,Services,Flow,Total Employees,Measure Type,Value,Unit
0,K02000001,2015,Group 1,Agriculture and Food,Exports,0.0,Employees,647,Counts
1,K02000001,2015,Group 1,Agriculture and Food,Imports,0.0,Employees,1841,Counts
2,K02000001,2015,Group 1,Agriculture and Food,Exports,1 to 9,Employees,2725,Counts
3,K02000001,2015,Group 1,Agriculture and Food,Imports,1 to 9,Employees,4846,Counts


In [27]:
# Spot-check the last rows of the cleaned table (the Unknown and Grand Total rows).
new_table.tail(4)

Unnamed: 0,Geography,Year,Product,Services,Flow,Total Employees,Measure Type,Value,Unit
120,K02000001,2015,Unknown,,Exports,Unknown,Employees,5970,Counts
121,K02000001,2015,Unknown,,Imports,Unknown,Employees,16316,Counts
122,K02000001,2015,Grand Total,,Exports,,Employees,143358,Counts
123,K02000001,2015,Grand Total,,Imports,,Employees,219302,Counts


In [28]:
# Write the cleaned table to out/employeessizebusinesses.csv without the index column,
# creating the output folder first if it does not exist yet.
out = Path('out')
out.mkdir(exist_ok=True)
destination = out / 'employeessizebusinesses.csv'
new_table.to_csv(destination, index=False)