### Annual Business Survey (ABS) to tidy data

In [1]:
from databaker.framework import *
import pandas as pd 

The ABS Excel spreadsheet is available from the [ONS website](https://www.ons.gov.uk/businessindustryandtrade/business/businessservices/datasets/annualbusinesssurveyimportersandexporters).

In [2]:
import requests
from pathlib import Path
from io import BytesIO
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import LastModified

# HTTP session with an on-disk cache (in '.cache') driven by CacheControl's
# LastModified heuristic, so repeated runs can reuse a previous download.
session = CacheControl(requests.Session(),
                       cache=FileCache('.cache'),
                       heuristic=LastModified())

# Local folder that holds the downloaded source spreadsheet.
sourceFolder = Path('in')
sourceFolder.mkdir(exist_ok=True)

filename = 'importersandexporterssummarytablesinitial.xls'
url = 'https://www.ons.gov.uk/file?uri=/businessindustryandtrade/business/businessservices/datasets/annualbusinesssurveyimportersandexporters/current/' + filename
inputFile = sourceFolder / filename
response = session.get(url)
# Fail loudly on an HTTP error instead of silently saving an error page as
# the .xls file and letting the spreadsheet loader choke on it later.
response.raise_for_status()
inputFile.write_bytes(response.content)
# databaker loads every tab of the workbook; `sheets` is indexed in tab order.
sheets = loadxlstabs(inputFile)

Loading in/importersandexporterssummarytablesinitial.xls which has size 109867 bytes
Table names: ['Content Page', '2016 Goods and Services', '2016 Goods ', '2016 Services  ', '2015 Goods and Services', '2015 Goods ', '2015 Services  ', '2014 Goods and Services', '2014 Goods', '2014 Services', 'Standard Errors']


In [3]:
import re
# Tab names look like '<4-digit year> <trade type>', e.g. '2016 Goods and Services'.
tab_name_re = re.compile(r'^([0-9]{4}) (.*)$')
# One DataFrame per (sheet, breakdown) pair, concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every call, so it is not used here.
segments = []

# Skip the first and last tabs (the contents page and the standard errors).
for sheet in sheets[1:-1]:
    name_match = tab_name_re.match(sheet.name)
    assert name_match, "sheet name doesn't match regex"
    year_label = name_match.group(1)
    trade_label = name_match.group(2).strip()
    for breakdown in ['Detailed employment', 'Employment', 'Ownership', 'Turnover', 'Age']:
        year = HDimConst('Year', year_label)
        trade = HDimConst('Trade', trade_label)

        # Header cell(s) whose text starts with the breakdown name, and the
        # column of cells directly beneath them (the category labels).
        header = sheet.filter(starts_with(breakdown))
        label_column = header.fill(DOWN)

        # Observations: non-blank cells in the label column and to its right,
        # minus the 'Total' cell with everything below/right of it, minus the
        # label column itself.
        breakdown_on_down = label_column.expand(RIGHT).is_not_blank()
        breakdown_obs = breakdown_on_down - \
            breakdown_on_down.filter(contains_string('Total')).expand(DOWN).expand(RIGHT) - \
            label_column

        # Category labels for this breakdown, excluding 'Total' and anything below it.
        classifiers = label_column.is_not_blank()
        classifiers = classifiers - classifiers.filter(contains_string('Total')).expand(DOWN)
        classifiers = HDim(classifiers, breakdown, DIRECTLY, LEFT)
        # Normalise a label that is missing a space in the source.
        classifiers.AddCellValueOverride('2 to9', '2 to 9')

        # Column headings on the same row as the breakdown header.
        import_export = header.fill(RIGHT).is_not_blank()
        import_export = HDim(import_export, 'Import/Export', DIRECTLY, UP)
        # Presumably strips trailing footnote numbers from the headings -- TODO confirm.
        import_export.AddCellValueOverride('Businesses 4', 'Businesses')
        import_export.AddCellValueOverride('Exporter and/or Importer 7', 'Exporter and/or Importer')

        # Measure headings sit one row above the breakdown header row.
        measure = header.shift(UP).fill(RIGHT).is_not_blank()
        measure = HDim(measure, 'Measure Type', CLOSEST, LEFT)
        measure.AddCellValueOverride('Number of 5', 'Count')
        measure.AddCellValueOverride('% 6', 'Proportion of all Business')

        segments.append(
            ConversionSegment(breakdown_obs, [classifiers, import_export, year, trade, measure]).topandas()
        )
        # Debugging aid: uncomment to preview the selections for one breakdown.
        #savepreviewhtml([breakdown_obs, classifiers, import_export, measure])

# sort=True keeps the columns in sorted order, as the per-iteration append did;
# fall back to an empty frame when there were no data tabs to process.
tidy = pd.concat(segments, sort=True) if segments else pd.DataFrame()

tidy
















































Unnamed: 0,Age,Detailed employment,Employment,Import/Export,Measure Type,OBS,Ownership,Trade,Turnover,Year
0,,1,,Businesses,Count,1087600.0,,Goods and Services,,2016
1,,1,,Exporters,Count,65900.0,,Goods and Services,,2016
2,,1,,Importers,Count,55800.0,,Goods and Services,,2016
3,,1,,Exporter and Importer,Count,25200.0,,Goods and Services,,2016
4,,1,,Exporter and/or Importer,Count,96500.0,,Goods and Services,,2016
5,,1,,Exporters,Proportion of all Business,6.1,,Goods and Services,,2016
6,,1,,Importers,Proportion of all Business,5.1,,Goods and Services,,2016
7,,1,,Exporter and Importer,Proportion of all Business,2.3,,Goods and Services,,2016
8,,1,,Exporter and/or Importer,Proportion of all Business,8.9,,Goods and Services,,2016
9,,2 to 9,,Businesses,Count,1019200.0,,Goods and Services,,2016


Check for duplicate rows

In [4]:
# No two extracted rows may be identical across every column.
assert not tidy.duplicated().any(), 'duplicate rows'

"Employment" is the parent of "Detailed employment".

Also, the class "250 and over" is repeated in each, so we need to drop the duplicates. However, the two copies do not match exactly: the rows displayed below appear only under "Detailed employment".

In [5]:
# The class that appears under both 'Employment' and 'Detailed employment'.
duplicate_label = '250 and over'
# The two breakdown columns are dropped so the remaining columns can be compared directly.
employment_columns = ['Employment', 'Detailed employment']
emp_250 = tidy[tidy['Employment'] == duplicate_label].drop(columns=employment_columns).reset_index(drop=True)
detailed_emp_250 = tidy[tidy['Detailed employment'] == duplicate_label].drop(columns=employment_columns).reset_index(drop=True)
assert emp_250.size > 0, f"no 'Employment' rows labelled {duplicate_label!r}"
assert detailed_emp_250.size > 0, f"no 'Detailed employment' rows labelled {duplicate_label!r}"
# The strict equality check is left disabled; the outer merge below is used
# to show which rows differ instead.
#assert emp_250.equals(detailed_emp_250)
merged = emp_250.merge(detailed_emp_250, indicator=True, how='outer')

# Rows that exist only in the 'Detailed employment' copy.
display(merged[merged['_merge'] == 'right_only'])

# Keep the 'Employment' copy of the class and drop the 'Detailed employment' one.
# Use the same label constant as above rather than repeating the literal.
tidy = tidy[tidy['Detailed employment'] != duplicate_label].reset_index(drop=True)

Unnamed: 0,Age,Import/Export,Measure Type,OBS,Ownership,Trade,Turnover,Year,_merge
81,,Exporters,Proportion of all Business,40.7,,Goods and Services,,2014,right_only
82,,Importers,Proportion of all Business,48.0,,Goods and Services,,2014,right_only
83,,Exporter and Importer,Proportion of all Business,36.5,,Goods and Services,,2014,right_only
84,,Exporter and/or Importer,Proportion of all Business,52.1,,Goods and Services,,2014,right_only


We need to merge the "Employment" and "Detailed employment" columns into a single column, and also list their values so that we can create a codelist.

In [6]:
# Show the distinct labels of each column before they are merged.
display(tidy['Employment'].unique())
display(tidy['Detailed employment'].unique())
# Vectorised coalesce: take 'Employment' where present, otherwise
# 'Detailed employment'. Replaces a row-by-row apply(), which is slow and
# does not yield a single column when the frame is empty.
tidy = (
    tidy
    .assign(Employees=tidy['Employment'].fillna(tidy['Detailed employment']))
    .drop(columns=['Employment', 'Detailed employment'])
)
tidy

array([nan, '1 to 49', '50 to 249', '250 and over'], dtype=object)

array(['1', '2 to 9', '10 to 19', '20 to 49', '50 to 99', '100 to 249',
       nan], dtype=object)

Unnamed: 0,Age,Import/Export,Measure Type,OBS,Ownership,Trade,Turnover,Year,Employees
0,,Businesses,Count,1087600.0,,Goods and Services,,2016,1
1,,Exporters,Count,65900.0,,Goods and Services,,2016,1
2,,Importers,Count,55800.0,,Goods and Services,,2016,1
3,,Exporter and Importer,Count,25200.0,,Goods and Services,,2016,1
4,,Exporter and/or Importer,Count,96500.0,,Goods and Services,,2016,1
5,,Exporters,Proportion of all Business,6.1,,Goods and Services,,2016,1
6,,Importers,Proportion of all Business,5.1,,Goods and Services,,2016,1
7,,Exporter and Importer,Proportion of all Business,2.3,,Goods and Services,,2016,1
8,,Exporter and/or Importer,Proportion of all Business,8.9,,Goods and Services,,2016,1
9,,Businesses,Count,1019200.0,,Goods and Services,,2016,2 to 9


Fill NaN with the top-level value `Any` for each dimension.

In [7]:
# A missing value in one of these columns is replaced with the label 'Any'.
top_level_defaults = dict.fromkeys(['Age', 'Ownership', 'Turnover', 'Employees'], 'Any')
tidy = tidy.fillna(value=top_level_defaults)
tidy

Unnamed: 0,Age,Import/Export,Measure Type,OBS,Ownership,Trade,Turnover,Year,Employees
0,Any,Businesses,Count,1087600.0,Any,Goods and Services,Any,2016,1
1,Any,Exporters,Count,65900.0,Any,Goods and Services,Any,2016,1
2,Any,Importers,Count,55800.0,Any,Goods and Services,Any,2016,1
3,Any,Exporter and Importer,Count,25200.0,Any,Goods and Services,Any,2016,1
4,Any,Exporter and/or Importer,Count,96500.0,Any,Goods and Services,Any,2016,1
5,Any,Exporters,Proportion of all Business,6.1,Any,Goods and Services,Any,2016,1
6,Any,Importers,Proportion of all Business,5.1,Any,Goods and Services,Any,2016,1
7,Any,Exporter and Importer,Proportion of all Business,2.3,Any,Goods and Services,Any,2016,1
8,Any,Exporter and/or Importer,Proportion of all Business,8.9,Any,Goods and Services,Any,2016,1
9,Any,Businesses,Count,1019200.0,Any,Goods and Services,Any,2016,2 to 9


Show the range of the codes and check for duplicated rows.

In [8]:
# Import from the public IPython.display module: the IPython.core.display path
# is deprecated for these names, and `display` was previously relied on as an
# implicit notebook builtin.
from IPython.display import HTML, display

# List the distinct values of every dimension column (everything except the
# observation values themselves).
for col in tidy:
    if col != 'OBS':
        display(HTML(f'<h2>{col}</h2>'))
        display(tidy[col].unique())

# Count fully duplicated rows and show them (expected: none).
dups = tidy.duplicated()
display(dups.sum())
tidy[dups]

array(['Any', '<2', '2-<4', '4-<10', '10-<20', '20+'], dtype=object)

array(['Businesses', 'Exporters', 'Importers', 'Exporter and Importer',
       'Exporter and/or Importer'], dtype=object)

array(['Count', 'Proportion of all Business'], dtype=object)

array(['Any', 'UK', 'Foreign'], dtype=object)

array(['Goods and Services', 'Goods', 'Services'], dtype=object)

array(['Any', '<1000', '1000 - 4999', '5000 - 9999', '10,000 - 24,999',
       '25,000 - 49,999', '50,000 - 99,999', '100,000 - 249,999',
       '250,000 - 499,999', '500,000 +'], dtype=object)

array(['2016', '2015', '2014'], dtype=object)

array(['1', '2 to 9', '10 to 19', '20 to 49', '50 to 99', '100 to 249',
       '1 to 49', '50 to 249', '250 and over', 'Any'], dtype=object)

0

Unnamed: 0,Age,Import/Export,Measure Type,OBS,Ownership,Trade,Turnover,Year,Employees


We need to specify the units of the observations.

In [9]:
# 'Count' observations are measured in businesses; every other measure type
# is labelled as a percentage.
is_count = tidy['Measure Type'].eq('Count')
tidy['Unit'] = is_count.map({True: 'Businesses', False: 'Percent'})

And rename some columns.

In [10]:
# Mapping from the working column names to the names used in the output file.
output_column_names = {
    'Import/Export': 'Export and Import Activity',
    'Employees': 'Employment',
    'Ownership': 'Country of Ownership',
    'Age': 'Age of Business',
    'OBS': 'Value',
    'Trade': 'ONS ABS Trade',
}
tidy = tidy.rename(columns=output_column_names)
tidy

Unnamed: 0,Age of Business,Export and Import Activity,Measure Type,Value,Country of Ownership,ONS ABS Trade,Turnover,Year,Employment,Unit
0,Any,Businesses,Count,1087600.0,Any,Goods and Services,Any,2016,1,Businesses
1,Any,Exporters,Count,65900.0,Any,Goods and Services,Any,2016,1,Businesses
2,Any,Importers,Count,55800.0,Any,Goods and Services,Any,2016,1,Businesses
3,Any,Exporter and Importer,Count,25200.0,Any,Goods and Services,Any,2016,1,Businesses
4,Any,Exporter and/or Importer,Count,96500.0,Any,Goods and Services,Any,2016,1,Businesses
5,Any,Exporters,Proportion of all Business,6.1,Any,Goods and Services,Any,2016,1,Percent
6,Any,Importers,Proportion of all Business,5.1,Any,Goods and Services,Any,2016,1,Percent
7,Any,Exporter and Importer,Proportion of all Business,2.3,Any,Goods and Services,Any,2016,1,Percent
8,Any,Exporter and/or Importer,Proportion of all Business,8.9,Any,Goods and Services,Any,2016,1,Percent
9,Any,Businesses,Count,1019200.0,Any,Goods and Services,Any,2016,2 to 9,Businesses


In [11]:
# Make sure the output folder exists, then write the tidy observations
# without the DataFrame index.
destinationFolder = Path('out')
destinationFolder.mkdir(parents=True, exist_ok=True)

observations_path = destinationFolder / 'observations.csv'
tidy.to_csv(observations_path, index=False)