Country by commodity imports

In [1]:
from gssutils import *

# When running interactively, build an HTTP session whose responses are cached
# on disk, so re-running the notebook does not necessarily re-download the file.
if is_interactive():
    import requests
    from cachecontrol import CacheControl
    from cachecontrol.caches.file_cache import FileCache
    from cachecontrol.heuristics import LastModified
    from pathlib import Path
    from io import BytesIO

    # Wrap a plain requests session with a file-backed cache stored in
    # ./.cache, using the LastModified caching heuristic.
    session = CacheControl(requests.Session(),
                       cache=FileCache('.cache'),
                       heuristic=LastModified())

# NOTE(review): `session` and `BytesIO` are only bound inside the
# is_interactive() branch above, yet both are used unconditionally below.
# Presumably a non-interactive runner provides them (or these two lines were
# meant to be indented) -- confirm.
inputURL = 'https://www.ons.gov.uk/file?uri=/economy/nationalaccounts/balanceofpayments/datasets/uktradecountrybycommodityimports/current/countrybycommodityimportsfinal.xlsx'
# Fetch the workbook and keep its bytes in an in-memory buffer for pandas.
data = BytesIO(session.get(inputURL).content)


In [2]:
# Parse the downloaded workbook into a DataFrame, using row 0 as the header.
tab = pd.read_excel(io=data, header=0)

In [3]:
# The DIRECTION column is not carried into the output, so discard it.
tab = tab.drop(columns=['DIRECTION'])

In [4]:
# Rename the first two columns by position.
# Assign a fresh list of labels rather than writing into `tab.columns.values`:
# an Index is meant to be immutable, and mutating its backing array in place
# can leave label lookups stale or fail outright when that array is read-only.
column_labels = tab.columns.tolist()
column_labels[0] = 'CORD SITC'
column_labels[1] = 'ONS Partner Geography'
tab.columns = column_labels

In [5]:
# Reshape from wide to long: every remaining column becomes a (Period, Value)
# row keyed by commodity and partner geography.
new_table = tab.melt(
    id_vars=['CORD SITC', 'ONS Partner Geography'],
    var_name='Period',
    value_name='Value',
)

In [6]:
import re
# Patterns for the period labels handled by time2period(), which applies them
# with fullmatch, so each must describe the entire label.
# Exactly four digits.
YEAR_RE = re.compile(r'[0-9]{4}')
# Four digits immediately followed by an upper-case three-letter month
# abbreviation; group 1 is the year, group 2 the month.
YEAR_MONTH_RE = re.compile(r'([0-9]{4})(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)')
# Four digits, at least one whitespace character, then Q1..Q4; group 1 is the
# year, group 2 the quarter.
YEAR_QUARTER_RE = re.compile(r'([0-9]{4})\s+(Q[1-4])')

# from https://stackoverflow.com/questions/597476/how-to-concisely-cascade-through-multiple-regex-statements-in-python
class Re:
    """Regex helper that remembers the result of its most recent match attempt.

    Lets an if/elif chain test a pattern in the condition and read the
    resulting match object afterwards via ``last_match``.
    """

    def __init__(self):
        # No match has been attempted yet.
        self.last_match = None

    def fullmatch(self, pattern, text):
        """Run ``re.fullmatch(pattern, text)``, store the result and return it.

        The stored/returned value is the match object, or None on no match.
        """
        outcome = re.fullmatch(pattern, text)
        self.last_match = outcome
        return outcome

# Month abbreviation -> zero-padded month number. Built once at definition
# time rather than on every call, since time2period runs once per data row.
_MONTH_NUMBERS = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04', 'MAY': '05', 'JUN': '06',
                  'JUL': '07', 'AUG': '08', 'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}


def time2period(t):
    """Convert a source period label into a period identifier string.

    The whole label must match one of:
      * YEAR_RE          -> ``year/<label>``
      * YEAR_MONTH_RE    -> ``month/<year>-<two-digit month>``
      * YEAR_QUARTER_RE  -> ``quarter/<year>-<quarter>``

    Args:
        t: The period label. Non-string labels (for example an integer year
            used as a column header) are converted with ``str()`` first, so
            they no longer make the regex match raise ``TypeError``.

    Returns:
        The period identifier, or ``None`` when the label matches none of the
        patterns (a "no match" message is printed in that case).
    """
    label = str(t)

    if YEAR_RE.fullmatch(label):
        return f"year/{label}"

    month_match = YEAR_MONTH_RE.fullmatch(label)
    if month_match:
        year, month = month_match.groups()
        # The pattern only admits the twelve abbreviations in the table.
        return f"month/{year}-{_MONTH_NUMBERS[month]}"

    quarter_match = YEAR_QUARTER_RE.fullmatch(label)
    if quarter_match:
        year, quarter = quarter_match.groups()
        return f"quarter/{year}-{quarter}"

    # Best-effort: report the unrecognised label and leave the period empty.
    print(f"no match for {t}")
    return None

# Translate every raw period label into its period identifier.
new_table['Period'] = new_table['Period'].map(time2period)

In [7]:
# Attach the dimensions that hold the same value on every row of this dataset.
new_table = new_table.assign(**{
    'Seasonal Adjustment': 'NSA',
    'Measure Type': 'GBP Total',
    'Unit': 'gbp-million',
    'Flow': 'imports',
})

In [8]:
# Coerce values to numbers; anything unparseable becomes NaN and is then
# replaced by 0.
numeric_values = pd.to_numeric(new_table['Value'], errors='coerce')
new_table['Value'] = numeric_values.fillna(0)

In [9]:
# Store values as integers.
new_table = new_table.astype({'Value': int})

In [10]:
# Keep only the first two characters of each partner geography label.
geography_text = new_table['ONS Partner Geography'].astype(str)
new_table['ONS Partner Geography'] = geography_text.str.slice(0, 2)

In [11]:
# Keep only the part of each 'CORD SITC' value that precedes the first space.
# str.partition returns a three-column frame (head, separator, tail), so
# column 0 is selected explicitly: assigning the whole frame to a single
# column is rejected by current pandas.
new_table['CORD SITC'] = new_table['CORD SITC'].str.partition(' ')[0]

In [12]:
# Discard rows whose value is zero (this includes values defaulted to 0 above).
new_table = new_table.loc[new_table['Value'].ne(0)]

In [13]:
# Select the output columns in their final order.
new_table = new_table.loc[:, [
    'ONS Partner Geography',
    'Period',
    'Flow',
    'CORD SITC',
    'Seasonal Adjustment',
    'Measure Type',
    'Value',
    'Unit',
]]

In [14]:
# When running interactively, write the tidy table to out/imports.csv.
if is_interactive():
    SubstancetinationFolder = Path('out')
    # Create the output directory (and any parents) if it does not exist yet.
    SubstancetinationFolder.mkdir(parents=True, exist_ok=True)
    new_table.to_csv(SubstancetinationFolder / 'imports.csv', index=False)