In [2]:
from gssutils import *
from gssutils.metadata import THEME

# Landing page listing the HMRC uktradeinfo OTS reports
OTS_REPORTS_URL = 'https://www.uktradeinfo.com/Statistics/OverseasTradeStatistics/AboutOverseastradeStatistics/Pages/OTSReports.aspx'

scraper = Scraper(OTS_REPORTS_URL)
# Leave the scraper as the last expression so the notebook renders it
scraper



## HM Revenue & Customs uktradeinfo - OTS Reports

This is a catalog of datasets; choose one from the following:

* Midlands Regional Trade Statistics data 2014-2016
* 2015 UK VAT Registered Importer and Exporter Population
* Overseas Trade Statistics broken down by English Growth Hub areas - EU and non-EU Exports 2015
* UK Trade in Goods by Business Characteristics - Experimental Statistics
* 2010 Quinquennial Review of the UK Ancillary Cost Survey (ACS) full report
* Quality Report
* UK Statistics article
* 2009 Intrastat Triennial Final Report
* EDICOM Technical Implementation Report
* 2005 Intrastat Triennial Review V2.0
* EDICOM Report
* EDICOM Report

In [3]:
def _is_business_characteristics(title):
    """Return True for titles beginning 'UK Trade in Goods by Business Characteristics'."""
    return title.startswith('UK Trade in Goods by Business Characteristics')

# Narrow the catalogue down to the dataset whose title satisfies the predicate
scraper.select_dataset(title=_is_business_characteristics)
scraper

## UK Trade in Goods by Business Characteristics - Experimental Statistics

### Distributions

1. UK Trade in Goods by Business Characteristics - Experimental Statistics ([application/pdf](https://www.uktradeinfo.com/Statistics/Documents/Reports/IDBR_OTS_2015.pdf))
1. IDBR OTS tables 2015 ([MS Excel Spreadsheet](https://www.uktradeinfo.com/Statistics/Documents/Reports/IDBR_OTS_tables_2015.xls))
1. IDBR OTS Country data (expert users) 2015 ([MS Excel Spreadsheet](https://www.uktradeinfo.com/Statistics/Documents/Reports/IDBR_OTS_%20Country_data_expert_2015.xls))
1. UK Trade in Goods by Business Characteristics - Experimental Statistics ([application/pdf](https://www.uktradeinfo.com/Statistics/Documents/Reports/IDBR_OTS_2014.pdf))
1. IDBR OTS tables 2014 ([MS Excel Spreadsheet](https://www.uktradeinfo.com/Statistics/Documents/Reports/IDBR_OTS_tables_2014.xls))
1. IDBR OTS Country data (expert users) 2014 ([MS Excel Spreadsheet](https://www.uktradeinfo.com/Statistics/Documents/Reports/IDBR_OTS_%20Country_data_expert_2014.xls))
1. UK Trade in Goods by Business Characteristics - Experimental Statistics ([application/pdf](https://www.uktradeinfo.com/Statistics/Documents/Reports/IDBR_OTS_2013.pdf))
1. IDBR OTS tables 2013 ([MS Excel Spreadsheet](https://www.uktradeinfo.com/Statistics/Documents/Reports/IDBR_OTS_tables_2013.xls))
1. IDBR OTS Country data (expert users) 2013 ([MS Excel Spreadsheet](https://www.uktradeinfo.com/Statistics/Documents/Reports/IDBR_OTS_Country_data%20_expert_2013.xls))
1. UK Trade in Goods by Business Characteristics - Experimental Statistics ([application/pdf](https://www.uktradeinfo.com/Statistics/OverseasTradeStatistics/AboutOverseastradeStatistics/Documents/IDBR_OTS_2012.pdf))
1. IDBR OTS tables 2009 - 2012 ([MS Excel Spreadsheet](https://www.uktradeinfo.com/Statistics/OverseasTradeStatistics/AboutOverseastradeStatistics/Documents/IDBR_OTS_tables_2009_to_2012.xls))
1. IDBR OTS Country data (expert users) 2009 - 2012 ([MS Excel Spreadsheet](https://www.uktradeinfo.com/Statistics/OverseasTradeStatistics/AboutOverseastradeStatistics/Documents/IDBR_OTS_Country_2009_to_2012.xls))


In [4]:
# Gather every distribution titled "IDBR OTS tables ..." and order them by
# title, descending (plain string comparison), so the first entry is the one
# whose title sorts last.
idbrs = [dist for dist in scraper.distributions if dist.title.startswith('IDBR OTS tables')]
idbrs.sort(key=lambda d: d.title, reverse=True)
idbr = idbrs[0]
display(idbr.title)

# Index the workbook's tabs by their name for lookup by tab name
tabs = dict((tab.name, tab) for tab in idbr.as_databaker())
tabs.keys()

'IDBR OTS tables 2015'

dict_keys(['Notes and Contents', 'Industry Group', 'Age Group', 'Employee Size', 'Industry_Age', 'Industry_EmployeeSize', 'EmployeeSize_Age', 'Metadata'])

In [5]:
%%capture

processors = [
    "Business count by Age of Business.ipynb",
    "Business count by Employee Size.ipynb",
    "Business count by Industry Group.ipynb",
    "Employee count for Businesses by Age of Business.ipynb",
    "Employee count for Businesses by Employee Size.ipynb",
    "Employee count for Businesses by Industry Group.ipynb",
    "Total value of UK trade by Age of Business.ipynb",
    "Total value of UK trade by Employee Size.ipynb",
    "Total value of UK trade by Industry Group.ipynb",
    "TRADE IN GOODS STATISTICS -Business Count.ipynb",
    "TRADE IN GOODS STATISTICS -Employee Count and age business.ipynb",
    "TRADE IN GOODS STATISTICS -Employee Count and age employee count.ipynb",
    "TRADE IN GOODS STATISTICS -Employee Count and age.ipynb",
    "TRADE IN GOODS STATISTICS -Employee Count.ipynb",
    "TRADE IN GOODS STATISTICS -total value of UK Trade.ipynb",
    "TRADE IN GOODS STATISTICS_Employee size_Businesses.ipynb",
    "TRADE IN GOODS STATISTICS_Employee size_Employee count.ipynb",
    "TRADE IN GOODS STATISTICS_Employee size_value.ipynb"
]

def create_table(proc):
    %run "$proc"
    return new_table

final_table = pd.concat(create_table(p) for p in processors)

In [6]:
# Replace every missing cell in the combined table with the label 'Total'
# (this applies to all columns of the frame).
final_table = final_table.fillna('Total')
final_table

Unnamed: 0,Age of Business,Employment,Flow,Geography,HMRC Industry,Measure Type,Unit,Value,Year
0,0 to 1 years,Total,Export,K02000001,Total,Count,Businesses,10515,2015
1,0 to 1 years,Total,Import,K02000001,Total,Count,Businesses,21541,2015
2,2 to 3 years,Total,Export,K02000001,Total,Count,Businesses,13308,2015
3,2 to 3 years,Total,Import,K02000001,Total,Count,Businesses,22043,2015
4,4 to 5 years,Total,Export,K02000001,Total,Count,Businesses,12088,2015
5,4 to 5 years,Total,Import,K02000001,Total,Count,Businesses,18430,2015
6,6 to 9 years,Total,Export,K02000001,Total,Count,Businesses,20136,2015
7,6 to 9 years,Total,Import,K02000001,Total,Count,Businesses,29117,2015
8,10 to 20 years,Total,Export,K02000001,Total,Count,Businesses,36292,2015
9,10 to 20 years,Total,Import,K02000001,Total,Count,Businesses,49196,2015


Rationalise the codes used in each dimension

In [7]:
# List the distinct values of every column except the observation value
for column in final_table:
    if column == 'Value':
        continue
    display(column)
    display(final_table[column].unique())

'Age of Business'

array(['0 to 1 years', '2 to 3 years', '4 to 5 years', '6 to 9 years',
       '10 to 20 years', '20 + years', 'Unknown years', 'Total years',
       'Total', ' years', '6 to 9  years', 'years'], dtype=object)

'Employment'

array(['Total', '0 employees', '1 to 9 employees', '10 to 49 employees',
       '50 to 249 employees', '250 + employees', 'Unknown employees',
       'Total employees', 'Grand Total employees', 'No employees'],
      dtype=object)

'Flow'

array(['Export', 'Import'], dtype=object)

'Geography'

array(['K02000001'], dtype=object)

'HMRC Industry'

array(['Total', 'group-1', 'group-2', 'group-3', 'group-4', 'group-5',
       'group-6', 'group-7', 'group-8', 'group-9', 'group-10', 'Unknown'],
      dtype=object)

'Measure Type'

array(['Count', 'Total Turnover'], dtype=object)

'Unit'

array(['Businesses', 'Employees', '£ Million'], dtype=object)

'Year'

array(['2015'], dtype=object)

In [8]:
# Pass the Employment and HMRC Industry labels through pathify
for dimension in ('Employment', 'HMRC Industry'):
    final_table[dimension] = final_table[dimension].map(pathify)

# Per-column lookup of {current value: replacement}. The Employment keys are
# the already-pathified forms, so this must run after the step above.
code_fixes = {
    'Employment': {
        'total': 'total-employees',
        'grand-total-employees': 'total-employees',
        'no-employees': '0-employees'
    },
    'Flow': {
        'Export': 'Exports',
        'Import': 'Imports'
    },
    'Unit': {
        '£ Million': 'GBP Million'
    },
    'Age of Business': {
        'Total years': 'Total',
        '20 + years': '20+ years',
        'Unknown years': 'Unknown',
        '6 to 9  years': '6 to 9 years',
        ' years': 'Total',
        'years': 'Total'
    }
}
final_table = final_table.replace(code_fixes)

# Re-list the distinct values per column to check the result
for column in final_table:
    if column == 'Value':
        continue
    display(column)
    display(final_table[column].unique())

'Age of Business'

array(['0 to 1 years', '2 to 3 years', '4 to 5 years', '6 to 9 years',
       '10 to 20 years', '20+ years', 'Unknown', 'Total'], dtype=object)

'Employment'

array(['total-employees', '0-employees', '1-to-9-employees',
       '10-to-49-employees', '50-to-249-employees', '250-employees',
       'unknown-employees'], dtype=object)

'Flow'

array(['Exports', 'Imports'], dtype=object)

'Geography'

array(['K02000001'], dtype=object)

'HMRC Industry'

array(['total', 'group-1', 'group-2', 'group-3', 'group-4', 'group-5',
       'group-6', 'group-7', 'group-8', 'group-9', 'group-10', 'unknown'],
      dtype=object)

'Measure Type'

array(['Count', 'Total Turnover'], dtype=object)

'Unit'

array(['Businesses', 'Employees', 'GBP Million'], dtype=object)

'Year'

array(['2015'], dtype=object)

Distinguish measure types for business and employee counts

In [9]:
# Rows currently typed as a plain 'Count' are renamed according to their unit
is_count = final_table['Measure Type'] == 'Count'
for unit, measure in (('Businesses', 'Count of Businesses'),
                      ('Employees', 'Count of Employees')):
    final_table.loc[is_count & (final_table['Unit'] == unit), 'Measure Type'] = measure

# Move the existing row labels into an 'index' column and renumber the rows
final_table = final_table.reset_index()
final_table

Unnamed: 0,index,Age of Business,Employment,Flow,Geography,HMRC Industry,Measure Type,Unit,Value,Year
0,0,0 to 1 years,total-employees,Exports,K02000001,total,Count of Businesses,Businesses,10515,2015
1,1,0 to 1 years,total-employees,Imports,K02000001,total,Count of Businesses,Businesses,21541,2015
2,2,2 to 3 years,total-employees,Exports,K02000001,total,Count of Businesses,Businesses,13308,2015
3,3,2 to 3 years,total-employees,Imports,K02000001,total,Count of Businesses,Businesses,22043,2015
4,4,4 to 5 years,total-employees,Exports,K02000001,total,Count of Businesses,Businesses,12088,2015
5,5,4 to 5 years,total-employees,Imports,K02000001,total,Count of Businesses,Businesses,18430,2015
6,6,6 to 9 years,total-employees,Exports,K02000001,total,Count of Businesses,Businesses,20136,2015
7,7,6 to 9 years,total-employees,Imports,K02000001,total,Count of Businesses,Businesses,29117,2015
8,8,10 to 20 years,total-employees,Exports,K02000001,total,Count of Businesses,Businesses,36292,2015
9,9,10 to 20 years,total-employees,Imports,K02000001,total,Count of Businesses,Businesses,49196,2015


In [10]:
# Columns kept for the output, in their final order
output_columns = ['Geography', 'Year', 'Employment', 'Flow', 'Age of Business',
                  'HMRC Industry', 'Measure Type', 'Value', 'Unit']

# Keep only those columns, then discard rows that are exact repeats
final_table = final_table[output_columns].drop_duplicates()
final_table

Unnamed: 0,Geography,Year,Employment,Flow,Age of Business,HMRC Industry,Measure Type,Value,Unit
0,K02000001,2015,total-employees,Exports,0 to 1 years,total,Count of Businesses,10515,Businesses
1,K02000001,2015,total-employees,Imports,0 to 1 years,total,Count of Businesses,21541,Businesses
2,K02000001,2015,total-employees,Exports,2 to 3 years,total,Count of Businesses,13308,Businesses
3,K02000001,2015,total-employees,Imports,2 to 3 years,total,Count of Businesses,22043,Businesses
4,K02000001,2015,total-employees,Exports,4 to 5 years,total,Count of Businesses,12088,Businesses
5,K02000001,2015,total-employees,Imports,4 to 5 years,total,Count of Businesses,18430,Businesses
6,K02000001,2015,total-employees,Exports,6 to 9 years,total,Count of Businesses,20136,Businesses
7,K02000001,2015,total-employees,Imports,6 to 9 years,total,Count of Businesses,29117,Businesses
8,K02000001,2015,total-employees,Exports,10 to 20 years,total,Count of Businesses,36292,Businesses
9,K02000001,2015,total-employees,Imports,10 to 20 years,total,Count of Businesses,49196,Businesses


In [15]:
# Exclude rows whose measure type is 'Total Turnover'
not_turnover = final_table['Measure Type'] != 'Total Turnover'
final_table = final_table[not_turnover]

In [16]:
# Output directory, created on first run and reused afterwards
out = Path('out')
out.mkdir(exist_ok=True)

# Write the observations without the DataFrame's row index
observations_path = out / 'observations.csv'
final_table.to_csv(observations_path, index=False)

In [17]:
# Set the dataset's family and theme, then write the generated TriG metadata
# next to the observations file. THEME is imported in the notebook's first
# cell together with the other gssutils imports.
scraper.dataset.family = 'trade'
scraper.dataset.theme = THEME['business-industry-trade-energy']
# Opened in binary mode: the value returned by generate_trig() is written as-is
with open(out / 'dataset.trig', 'wb') as metadata:
    metadata.write(scraper.generate_trig())