## HMRC Regional Trade Statistics

Transform to Tidy Data.

The source data is available from https://www.uktradeinfo.com/Statistics/RTS/Documents/Forms/AllItems.aspx in a series of zip files, `RTS web YYYY.zip` for the years 2013 to 2016 currently.

Each zip file contains fixed-width formatted text files following a layout described in https://www.uktradeinfo.com/Statistics/RTS/Documents/RTS%20Detailed%20data%20information%20pack.pdf. Each row has two measures: net mass in tonnes and statistical value in £1000s. We're assuming each observation has one measure, so we split these out into separate files.

In [None]:
import pandas as pd
from databaker.framework import *
from pathlib import Path
import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import LastModified
from io import BytesIO, TextIOWrapper
from zipfile import ZipFile

# HTTP session wrapped by CacheControl with a file cache in '.cache' and the
# LastModified heuristic, so repeated runs can reuse earlier downloads.
session = CacheControl(requests.Session(),
                       cache=FileCache('.cache'),
                       heuristic=LastModified())

# Every generated CSV file is written below this folder.
destinationFolder = Path('out')
destinationFolder.mkdir(exist_ok=True, parents=True)

# Fixed-width record layout handed to read_fwf: (column name, width in
# characters), in file order. Keeping name and width side by side avoids the
# two parallel lists drifting out of step.
columnLayout = [
    ('Period', 6),
    ('Flow', 1),
    ('HMRC Reporter Region', 2),
    ('HMRC Partner Geography', 1),
    ('Codalpha', 3),
    ('Codseq', 2),
    ('SITC Section', 1),
    ('SITC 4', 2),
    ('Value', 9),
    ('Netmass', 9),
]

for year in range(2013, 2017):
    zipURL = f'https://www.uktradeinfo.com/Statistics/RTS/Documents/RTS%20web%20{year}.zip'
    response = session.get(zipURL)
    # Fail here with a clear HTTP error instead of a confusing BadZipFile
    # when the server answers with an error page.
    response.raise_for_status()
    # Named 'archive' rather than 'zip' so the builtin zip() stays usable.
    with ZipFile(BytesIO(response.content)) as archive:
        for member in archive.infolist():
            if member.is_dir():
                # Folder entries carry no data to parse.
                continue
            name = member.filename
            with archive.open(member, 'r') as quarterFile:
                quarterText = TextIOWrapper(quarterFile, encoding='utf-8')
                # dtype=str keeps every field as text (no numeric inference).
                table = pd.read_fwf(
                    quarterText,
                    widths=[width for _, width in columnLayout],
                    names=[column for column, _ in columnLayout],
                    dtype=str)
                # The first character is used as the quarter and everything
                # from the third character on as the year.
                # NOTE(review): presumably the raw field looks like 'Q/YYYY' -
                # confirm against the information pack.
                table['Period'] = table['Period'].map(lambda x: f'quarter/{x[2:]}-Q{x[0]}')
                # Anything other than 'E' is labelled as an import.
                table['Flow'] = table['Flow'].map(lambda x: 'Exports' if x == 'E' else 'Imports')
                # Partner geography is the sequence code, unless that code
                # starts with '#', in which case the alpha code is used.
                # Vectorised; a missing sequence code stays missing instead of
                # raising inside a row-wise lambda.
                usesAlpha = table['Codseq'].str.startswith('#', na=False)
                table['HMRC Partner Geography'] = table['Codalpha'].where(usesAlpha, table['Codseq'])
                table.drop(columns=['Codalpha', 'Codseq', 'SITC Section'], inplace=True)

                textFile = destinationFolder / name
                # Archive members may sit inside a sub-folder; make sure the
                # matching output folder exists before writing.
                textFile.parent.mkdir(exist_ok=True, parents=True)

                # One observation per row: net mass goes to its own file with
                # the measure stored in the shared 'Value' column.
                mass = table.drop(columns=['Value']).rename(columns={'Netmass': 'Value'})
                mass['Measure Type'] = 'Net Mass'
                mass['Unit'] = 'kg (thousands)'
                mass.to_csv(textFile.with_suffix('.mass.csv'), index=False)

                # Statistical value goes to a second file with the same columns.
                value = table.drop(columns=['Netmass'])
                value['Measure Type'] = 'GBP Total'
                value['Unit'] = '£ (thousands)'
                value.to_csv(textFile.with_suffix('.value.csv'), index=False)