###  MRET xlsx to Tidy Data

Take the Trade in goods MRETS (all BOP - EU2013): time series dataset and convert to Tidy Data in CSV.

Fetch and cache the latest published MRETS data as an Excel spreadsheet (single sheet).

In [1]:
import requests
from pathlib import Path
from io import BytesIO
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import LastModified

# HTTP session wrapped by CacheControl with a file cache in '.cache', so
# repeated runs of the notebook can reuse previously fetched responses.
session = CacheControl(requests.Session(),
                       cache=FileCache('.cache'),
                       heuristic=LastModified())

inputURL = 'https://www.ons.gov.uk/file?uri=/economy/nationalaccounts/balanceofpayments/datasets/tradeingoodsmretsallbopeu2013timeseriesspreadsheet/current/mret.xlsx'
mretsResponse = session.get(inputURL)
# Fail here, with a clear HTTP error, rather than later when the Excel parser
# is handed the body of an error page.
mretsResponse.raise_for_status()
# In-memory, file-like copy of the workbook for pandas to read.
mretsExcel = BytesIO(mretsResponse.content)

Read in the spreadsheet as a table, naming the columns after the CDID (second row).

In [2]:
import pandas as pd
# Read the sheet without a header row, then label each column with the value
# found in row 1 (the CDID row); the first column, labelled 'CDID' by that
# row, is renamed to 'Period'.
raw_sheet = pd.read_excel(mretsExcel, header=None)
tab = (
    raw_sheet
    .rename(columns=raw_sheet.iloc[1])
    .rename(columns={'CDID': 'Period'})
)
tab

Unnamed: 0,Period,SESM,HCPC,SESX,SESQ,SESZ,SEUJ,SEUC,LKVB,HCRB,...,QALL,QALN,QALY,QALX,QALZ,QALU,SGRX,QALW,QALV,SGTK
0,Title,EU:BOP:Balance:SA:Semi-manufactures: SITC 5+6,Balance of payments: Trade in Goods: Oil Expor...,EU:BOP:EX:price index:NSA:Semi-manufactures: S...,EU:BOP:EX:SA:Semi-manufactures: SITC 5+6,EU:BOP:EX:volume index:SA:Semi-manufactures: S...,non-EU:BOP:EX:price index:NSA:Semi-manufacture...,non-EU:BOP:EX:SA:Semi-manufactures: SITC 5+6,EU(2004):BOP:IM:price index:NSA:Total trade in...,Balance of payments:Trade in Goods: North Amer...,...,BoP-consistent: Egypt: Exports: SA,BoP-consistent: Egypt: Balance: SA,Balance of payments: Trade in Goods: Ships: SI...,Balance of payments: Trade in Goods: Ships: SI...,Balance of payments: Ships: SITC 793:Balance: ...,Balance of payments: Trade in Goods: Aircraft:...,non-EU:BOP:EX:SA:Unspecified goods: SITC 9,Balance of payments: Trade in Goods: Aircraft:...,Balance of payments: Trade in Goods: Aircraft:...,non-EU:BOP:IM:SA:Unspecified goods: SITC 9
1,CDID,SESM,HCPC,SESX,SESQ,SESZ,SEUJ,SEUC,LKVB,HCRB,...,QALL,QALN,QALY,QALX,QALZ,QALU,SGRX,QALW,QALV,SGTK
2,PreUnit,,,,,,,,,,...,,,,,,,,,,
3,Unit,,,,,,,,,,...,,,,,,,,,,
4,Release Date,10-05-2018,10-05-2018,10-05-2018,10-05-2018,10-05-2018,10-05-2018,10-05-2018,10-05-2018,10-05-2018,...,10-05-2018,10-05-2018,10-05-2018,10-05-2018,10-05-2018,10-05-2018,10-05-2018,10-05-2018,10-05-2018,10-05-2018
5,Next release,11 June 2018,11 June 2018,11 June 2018,11 June 2018,11 June 2018,11 June 2018,11 June 2018,11 June 2018,11 June 2018,...,11 June 2018,11 June 2018,11 June 2018,11 June 2018,11 June 2018,11 June 2018,11 June 2018,11 June 2018,11 June 2018,11 June 2018
6,Important Notes,,,,,,,,,,...,,,,,,,,,,
7,1945,,,,,,,,,,...,,,,,,,,,,
8,1946,,,,,,,,,,...,,,,,,,,,,
9,1947,,,,,,,,,,...,,,,,,,,,,


The observations start at row 7 (zero-based); rows 0–6 hold per-series metadata such as the title, CDID and release dates.

In [3]:
# Rows 0-6 are metadata (Title, CDID, PreUnit, Unit, Release Date,
# Next release, Important Notes); everything from row 7 on is an observation.
observation_rows = tab.iloc[7:]
observations = observation_rows.rename(columns={'CDID': 'Period'})
observations.head()

Unnamed: 0,Period,SESM,HCPC,SESX,SESQ,SESZ,SEUJ,SEUC,LKVB,HCRB,...,QALL,QALN,QALY,QALX,QALZ,QALU,SGRX,QALW,QALV,SGTK
7,1945,,,,,,,,,,...,,,,,,,,,,
8,1946,,,,,,,,,,...,,,,,,,,,,
9,1947,,,,,,,,,,...,,,,,,,,,,
10,1948,,,,,,,,,,...,,,,,,,,,,
11,1949,,,,,,,,,,...,,,,,,,,,,


Each CDID corresponds to a unique time-series slice. Unpivot the table so we have one row per observation and drop any rows with no value for the observation.

In [4]:
# Wide -> long: one row per (Period, CDID) pair, keeping only the pairs that
# actually carry a value, and renumbering the rows from zero.
observations = (
    pd.melt(observations, id_vars=['Period'], var_name='CDID', value_name='Value')
    .dropna()
    .reset_index(drop=True)
)
print(len(observations))
observations.tail(5)

270682


Unnamed: 0,Period,CDID,Value
270677,2017 NOV,SGTK,1091
270678,2017 DEC,SGTK,872
270679,2018 JAN,SGTK,576
270680,2018 FEB,SGTK,490
270681,2018 MAR,SGTK,880


Each CDID is described by a colon-separated title (row 0 of the spreadsheet), so add these titles back in to describe the observations.

In [5]:
# Row 0 of the raw table holds the title for each CDID column; look up the
# title for every observation's CDID and store it alongside the value.
titles_by_cdid = tab.iloc[0]
observations['Title'] = titles_by_cdid[observations['CDID']].values
observations.head()

Unnamed: 0,Period,CDID,Value,Title
0,1998,SESM,-2766,EU:BOP:Balance:SA:Semi-manufactures: SITC 5+6
1,1999,SESM,-2568,EU:BOP:Balance:SA:Semi-manufactures: SITC 5+6
2,2000,SESM,-2441,EU:BOP:Balance:SA:Semi-manufactures: SITC 5+6
3,2001,SESM,-3157,EU:BOP:Balance:SA:Semi-manufactures: SITC 5+6
4,2002,SESM,-4703,EU:BOP:Balance:SA:Semi-manufactures: SITC 5+6


The date/time values need to be in a format that can be used to create URIs for British calendar intervals,
see https://github.com/epimorphics/IntervalServer/blob/master/interval-uris.md#british-calendar-intervals

In [6]:
# List the distinct period labels to see which formats need converting.
period_labels = observations['Period']
period_labels.unique()

array(['1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005',
       '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '1998 Q1', '1998 Q2', '1998 Q3',
       '1998 Q4', '1999 Q1', '1999 Q2', '1999 Q3', '1999 Q4', '2000 Q1',
       '2000 Q2', '2000 Q3', '2000 Q4', '2001 Q1', '2001 Q2', '2001 Q3',
       '2001 Q4', '2002 Q1', '2002 Q2', '2002 Q3', '2002 Q4', '2003 Q1',
       '2003 Q2', '2003 Q3', '2003 Q4', '2004 Q1', '2004 Q2', '2004 Q3',
       '2004 Q4', '2005 Q1', '2005 Q2', '2005 Q3', '2005 Q4', '2006 Q1',
       '2006 Q2', '2006 Q3', '2006 Q4', '2007 Q1', '2007 Q2', '2007 Q3',
       '2007 Q4', '2008 Q1', '2008 Q2', '2008 Q3', '2008 Q4', '2009 Q1',
       '2009 Q2', '2009 Q3', '2009 Q4', '2010 Q1', '2010 Q2', '2010 Q3',
       '2010 Q4', '2011 Q1', '2011 Q2', '2011 Q3', '2011 Q4', '2012 Q1',
       '2012 Q2', '2012 Q3', '2012 Q4', '2013 Q1', '2013 Q2', '2013 Q3',
       '2013 Q4', '2014 Q1', '2014 Q2', '2014 Q3', '2014

In [7]:
import re
# Patterns for the three period label formats seen in the Period column:
# a bare four-digit year ('1998'), a year followed by an upper-case
# three-letter month ('2017 NOV'), and a year followed by a quarter ('1998 Q1').
YEAR_RE = re.compile(r'[0-9]{4}')
YEAR_MONTH_RE = re.compile(r'([0-9]{4})\s+(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)')
YEAR_QUARTER_RE = re.compile(r'([0-9]{4})\s+(Q[1-4])')

# from https://stackoverflow.com/questions/597476/how-to-concisely-cascade-through-multiple-regex-statements-in-python
class Re(object):
    """Small matcher that remembers the result of its most recent match.

    This lets a cascade of ``if``/``elif`` tests read the match object of
    whichever test succeeded via ``last_match``.
    """

    def __init__(self):
        # Result of the latest fullmatch() call; None until one succeeds.
        self.last_match = None

    def fullmatch(self, pattern, text):
        """Full-match ``text`` against ``pattern``, store and return the result.

        Returns the match object, or None when ``text`` does not match.
        """
        result = re.fullmatch(pattern, text)
        self.last_match = result
        return result

# Three-letter month abbreviations, as matched by YEAR_MONTH_RE, mapped to
# two-digit month numbers. Built once rather than on every call.
_MONTH_NUMBERS = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04', 'MAY': '05', 'JUN': '06',
                  'JUL': '07', 'AUG': '08', 'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}


def time2period(t):
    """Convert a period label into an interval path fragment.

    Recognised formats and their results:

    * ``'1998'``     -> ``'year/1998'``
    * ``'2017 NOV'`` -> ``'month/2017-11'``
    * ``'1998 Q1'``  -> ``'quarter/1998-Q1'``

    Args:
        t: The period label. Non-string values (for example a year that was
            read as an integer) are converted with ``str()`` before matching.

    Returns:
        The converted string, or ``None`` when the label matches none of the
        recognised formats; in that case a "no match" message is printed.
    """
    text = t if isinstance(t, str) else str(t)

    if YEAR_RE.fullmatch(text):
        return f"year/{text}"

    month_match = YEAR_MONTH_RE.fullmatch(text)
    if month_match:
        year, month = month_match.groups()
        # The pattern only admits the twelve abbreviations in the table.
        return f"month/{year}-{_MONTH_NUMBERS[month]}"

    quarter_match = YEAR_QUARTER_RE.fullmatch(text)
    if quarter_match:
        year, quarter = quarter_match.groups()
        return f"quarter/{year}-{quarter}"

    print(f"no match for {t}")
    return None

# Replace every raw period label with its converted form.
# NOTE(review): this overwrites the column in place, so the cell is not safe
# to run twice - already-converted values would no longer match any pattern.
converted_periods = observations['Period'].apply(time2period)
observations['Period'] = converted_periods
observations

Unnamed: 0,Period,CDID,Value,Title
0,year/1998,SESM,-2766,EU:BOP:Balance:SA:Semi-manufactures: SITC 5+6
1,year/1999,SESM,-2568,EU:BOP:Balance:SA:Semi-manufactures: SITC 5+6
2,year/2000,SESM,-2441,EU:BOP:Balance:SA:Semi-manufactures: SITC 5+6
3,year/2001,SESM,-3157,EU:BOP:Balance:SA:Semi-manufactures: SITC 5+6
4,year/2002,SESM,-4703,EU:BOP:Balance:SA:Semi-manufactures: SITC 5+6
5,year/2003,SESM,-5435,EU:BOP:Balance:SA:Semi-manufactures: SITC 5+6
6,year/2004,SESM,-4851,EU:BOP:Balance:SA:Semi-manufactures: SITC 5+6
7,year/2005,SESM,-3673,EU:BOP:Balance:SA:Semi-manufactures: SITC 5+6
8,year/2006,SESM,-3746,EU:BOP:Balance:SA:Semi-manufactures: SITC 5+6
9,year/2007,SESM,-5259,EU:BOP:Balance:SA:Semi-manufactures: SITC 5+6


A CDID is an arbitrary, opaque four-character code registered by ONS that identifies one time-series slice, so each CDID fixes a value for each dimension of that slice. The dimension values for each code are listed in separate spreadsheets, currently held in Google Drive.

In [8]:
from IPython.display import display, HTML
cord_sitc_classification_table_url = 'https://drive.google.com/uc?export=download&id=1uJck_DtSgLs0XcEuKDB0swzj1UrWmauj'
cord_sitc_response = session.get(cord_sitc_classification_table_url)
# Fail here on an HTTP error instead of passing an error page to the Excel parser.
cord_sitc_response.raise_for_status()
cord_sitc_classification_table = BytesIO(cord_sitc_response.content)
# Read columns D:K of the first sheet, index on the first of those columns and
# ignore the last row. Every dimension column is forced to str so that codes
# are not converted to numbers.
# `skipfooter` is the supported spelling; the older `skip_footer` keyword was
# deprecated and later removed from pandas.read_excel.
cord_sitc_table = pd.read_excel(cord_sitc_classification_table,
                                sheet_name=0, usecols="D:K", index_col=0, skipfooter=1,
                                dtype={'COMMODITY': str, 'AREA': str, 'DIRECTION': str, 'BASIS': str,
                                       'PRICE': str, 'SEASADJ': str, 'PERIOD': str})
display(HTML('<b>CORD SITC Classification table</b>'))
display(cord_sitc_table.head())

Unnamed: 0_level_0,COMMODITY,AREA,DIRECTION,BASIS,PRICE,SEASADJ,PERIOD
cdid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
SDSX,2plus4,EU,BAL,BOP,CP,NSA,M
SGLO,5minus8,EU,BAL,BOP,CP,NSA,M
SFJC,7plus8,EU,BAL,BOP,CP,NSA,M
LKTX,TminusO,EU,BAL,BOP,CP,NSA,M
SDMS,0plus1,EU,BAL,BOP,CP,NSA,M


In [9]:
csdb_classification_tables_url = 'https://drive.google.com/uc?export=download&id=1miAzQ6s8om4Ark3BpRk3Y90OAWfWErTb'
csdb_response = session.get(csdb_classification_tables_url)
# Fail here on an HTTP error instead of passing an error page to the Excel parser.
csdb_response.raise_for_status()
csdb_classification_table = BytesIO(csdb_response.content)
# sheet_name=None loads every sheet, giving a dict of sheet name -> DataFrame.
# Every dimension column is forced to str so that codes are not converted to numbers.
csdb_sheets = pd.read_excel(csdb_classification_table, sheet_name=None, index_col=0,
                            dtype={'COMMODITY': str, 'AREA': str, 'DIRECTION': str, 'BASIS': str,
                                   'PRICE': str, 'SEASADJ': str, 'PERIOD': str, 'PRODUCT': str,
                                   'COUNTRY': str})
# Show the first few rows of each sheet under its name.
for (sheet, df) in csdb_sheets.items():
    display(HTML('<b>' + sheet + '</b>'))
    display(df.head())

Unnamed: 0_level_0,COMMODITY,AREA,DIRECTION,BASIS,PRICE,SEASADJ,PERIOD
cdid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
SDSX,2plus4,EU,BAL,BOP,CP,NSA,Q
SGLO,5minus8,EU,BAL,BOP,CP,NSA,Q
SESL,5plus6,EU,BAL,BOP,CP,NSA,Q
SFJC,7plus8,EU,BAL,BOP,CP,NSA,Q
LKTX,TminusO,EU,BAL,BOP,CP,NSA,Q


Unnamed: 0_level_0,PRODUCT,AREA,DIRECTION,BASIS,PRICE,SEASADJ,PERIOD
cdid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P42L,24.2,EU,EX,BOP,CP,NSA,Q
P483,24.2,EU,EX,BOP,CP,SA,Q
P4DJ,24.2,EU,EX,BOP,CVM,NSA,Q
P4IZ,24.2,EU,EX,BOP,CVM,SA,Q
P3EP,24.2,EU,IM,BOP,CP,NSA,Q


Unnamed: 0_level_0,COUNTRY,DIRECTION,BASIS,SEASADJ,PERIOD
cdid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KN2O,XS,BAL,BOP,NSA,Q
LGDS,V4,BAL,BOP,NSA,Q
L87P,V3,BAL,BOP,NSA,Q
L87J,V2,BAL,BOP,NSA,Q
MHN8,I7,BAL,BOP,NSA,Q


It looks as though the `cord_sitc_table` is the same as the `cord_sitc` sheet loaded above.

In [10]:
# CDIDs that appear in only one of the two tables.
cord_sitc_sheet = csdb_sheets['cord_sitc']
different_cdids = set(cord_sitc_table.index.values) ^ set(cord_sitc_sheet.index.values)
display(different_cdids)
# Cell-by-cell comparison of the two tables once both are ordered by CDID.
cord_sitc_table.sort_index().eq(cord_sitc_sheet.sort_index())

{'OCOW', 'SGKA', 'SJNK'}

Unnamed: 0_level_0,COMMODITY,AREA,DIRECTION,BASIS,PRICE,SEASADJ,PERIOD
cdid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BAFB,True,True,True,True,True,True,False
BAFC,True,True,True,True,True,True,False
BAHA,True,True,True,True,True,True,False
BAHY,True,True,True,True,True,True,False
BOBZ,True,True,True,True,True,True,False
BOGG,True,True,True,True,True,True,False
BOGH,True,True,True,True,True,True,False
BOGO,True,True,True,True,True,True,False
BOGP,True,True,True,True,True,True,False
BOGQ,True,True,True,True,True,True,False


Apparently not. Let's use `csdb_sheets` for now. __TODO: need to check whether the CDID period length matches the MRETS period length.__

Next we have another table of CDIDs not listed in the above:

In [11]:
codelist_url = 'https://drive.google.com/uc?export=download&id=161OtInylx2518gmhRu7UgUYnZZ_x9FQr'
codelist_response = session.get(codelist_url)
# Fail here on an HTTP error instead of passing an error page to the CSV parser.
codelist_response.raise_for_status()
# Index on the first column; dimension columns are read as str, and COMMODITY
# additionally has surrounding whitespace stripped.
codelist = pd.read_csv(BytesIO(codelist_response.content), index_col=0,
                       dtype={'AREA': str, 'DIRECTION': str, 'BASIS': str,
                              'PRICE': str, 'SEASADJ': str, 'PERIOD': str},
                       converters={'COMMODITY': lambda x: str(x).strip()})
codelist

Unnamed: 0_level_0,COMMODITY,AREA,DIRECTION,BASIS,PRICE,SEASADJ,PERIOD
cdid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AJFB,Canadian dollar,UK,,BE,CP,NSA,
AJFD,Swiss franc,UK,,BE,CP,NSA,
AJFI,Swedish kroner,UK,,BE,CP,NSA,
AJFJ,Norwegian kroner,UK,,BE,CP,NSA,
AJFK,Danish kroner,UK,,BE,CP,NSA,
AJFO,Japanese yen,UK,,BE,CP,NSA,
AJFP,Australian Dollar,UK,,BE,CP,NSA,
AJFU,Hong Kong Dollar,UK,,BE,CP,NSA,
AJFV,New Zealand Dollar,UK,,BE,CP,NSA,
AJFW,South African Rand,UK,,BE,CP,NSA,


This (above) list seems to be missing some values (NaN). E.g. looking up `BOQM` on https://www.ons.gov.uk/timeseriestool finds `BOP:Exports:Tons:SA:Crude oil: SITC 333`.

Check that all CDIDs used in MRETS are defined in these tables.

In [12]:
# Stack every classification sheet plus the extra codelist into one table
# indexed by CDID.
all_cdids = pd.concat([*csdb_sheets.values(), codelist])
display(all_cdids)

# Every CDID that occurs in the observations must be described somewhere.
defined_cdids = set(all_cdids.index.values)
remaining = set(observations['CDID'].unique()) - defined_cdids
assert not remaining, 'Not all CDIDs defined: ' + str(remaining)

Unnamed: 0_level_0,AREA,BASIS,COMMODITY,COUNTRY,DIRECTION,PERIOD,PRICE,PRODUCT,SEASADJ
cdid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
SDSX,EU,BOP,2plus4,,BAL,Q,CP,,NSA
SGLO,EU,BOP,5minus8,,BAL,Q,CP,,NSA
SESL,EU,BOP,5plus6,,BAL,Q,CP,,NSA
SFJC,EU,BOP,7plus8,,BAL,Q,CP,,NSA
LKTX,EU,BOP,TminusO,,BAL,Q,CP,,NSA
SDMS,EU,BOP,0plus1,,BAL,Q,CP,,NSA
SDTB,EU,BOP,2plus4,,BAL,Q,CP,,SA
SGLP,EU,BOP,5minus8,,BAL,Q,CP,,SA
SESM,EU,BOP,5plus6,,BAL,Q,CP,,SA
SFJD,EU,BOP,7plus8,,BAL,Q,CP,,SA


Inspect the unique values for these dimensions.

In [13]:
# For each dimension column, show its name followed by its distinct values.
for col in all_cdids.columns:
    heading = HTML('<b>' + col + '</b>')
    display(heading)
    display(all_cdids[col].unique())

array(['EU', 'RW', 'WW', nan, 'UK', 'W1'], dtype=object)

array(['BOP', 'OTS', 'BE', nan], dtype=object)

array(['2plus4', '5minus8', '5plus6', '7plus8', 'TminusO', '0plus1',
       'TminusE', 'TminusOE', '04plus08', '7plus8C', '7plus8I', '7plus8K',
       '84plus85', '87plus88C', '87plus88K', '68minusS', '6minusPS',
       '57plus58', '87plus88I', '87plus88', '56plus59', '792and3',
       '71minus77', 'T', '0', '1', '2', '3', '4', '8', '5', '6', '7', '9',
       '11', '12', '24', '25', '26', '28', '2OCM', '21', '22', '23', '27',
       '29', '96', '41', '42', '43', '33', '33O', '33R', '3OF', '32',
       '34', '35', '51', '52', '53', '54', '55', '57+58', '57', '58',
       '56', '59', '61', '62', '63', '64', '65', '66', '66minusP', '66P',
       '67', '68', '68S', '69', '71-77', '7M', '7MC', '71MC', '72C',
       '74C', '7MI', '71MI', '72I', '73I', '74I', '7MK', '71MK', '72K',
       '73K', '74K', '7E', '7EC', '76C', '77C', '7EI', '71EI', '75I',
       '76I', '77I', '7EK', '71EK', '75K', '76K', '77K', '78', '78M',
       '78C', '78I', '78K', '79', '791I', '791K', '792', '793', '84+85',
  

array([nan, 'XS', 'V4', 'V3', 'V2', 'I7', 'I3', 'K6', 'D5', 'K5', 'K4',
       'I8', 'A1', 'K3', 'J6', 'J3', 'J7', 'XR', 'CW', 'NW', 'V1', 'D3',
       'D2', 'AT', 'A3', 'BE', 'LU', 'DK', 'FI', 'FR', 'DE', 'GR', 'IE',
       'IT', 'NL', 'PT', 'ES', 'SE', 'CY', 'CZ', 'EE', 'HU', 'LV', 'LT',
       'MT', 'PL', 'SK', 'SI', 'BG', 'RO', 'D4', 'C80', 'A5', 'IS', 'LI',
       'NO', 'CH', 'GI', 'TR', 'C482', 'AD', 'FO', 'SM', 'VA', 'C83',
       'B1', 'CA', 'MX', 'US', 'US1', 'PR', 'C484', 'GL', 'PM', 'C76',
       'AU', 'JP', 'NZ', 'KR', 'XC', 'XL', 'C77', 'SA', 'AE', 'AE1', 'DH',
       'DU', 'HA', 'C486', 'BH', 'KW', 'IQ', 'OM', 'QA', 'C487', 'DZ',
       'BN', 'TL', 'EC', 'GA', 'ID', 'IR', 'LY', 'NG', 'TT', 'VE', 'C72',
       'C493', 'AL', 'BY', 'BA', 'HR', 'MK', 'MD', 'RU', 'XK', 'ME', 'CS',
       'UA', 'C472', 'BS', 'BB', 'JM', 'PA', 'KN', 'C473', 'BM', 'VG',
       'KY', 'MS', 'C474', 'AG', 'BZ', 'DM', 'DO', 'GD', 'HT', 'LC', 'VC',
       'C475', 'CR', 'CU', 'SV', 'GT', 'HN', 'NI', 'C

array(['BAL', 'EX', 'IM', nan], dtype=object)

array(['Q', nan], dtype=object)

array(['CP', 'DEF', 'CVM', 'KQ', 'IDEF', 'VM', 'PYP', 'KP', nan],
      dtype=object)

array([nan, '24.2', 'TOTAL', 'A', '1', '1.1', '1.2', '1.3', '1.4', '2',
       '2.1', '2.2', '2.3', '3', 'B', '5', '5.1', '5.2', '6', '6.1',
       '6.2', '7', '7.1', '7.2', '8', '8.1', '8.9', 'C', '10', '10.1',
       '10.2', '10.3', '10.4', '10.5', '10.6', '10.7', '10.8', '10.9',
       '11', '11.01', '11.02', '11.03', '11.04', '11.05', '11.06',
       '11.07', '12', '13', '13.1', '13.2', '13.9', '14', '14.1', '14.2',
       '14.3', '15', '15.1', '15.2', '16', '16.1', '16.2', '17', '17.1',
       '17.2', '18', '18.1', '19', '19.1', '19.2', '20', '20.11', '20.12',
       '20.13', '20.15', '20.14', '20.16', '20.17', '20.1', '20.2',
       '20.3', '20.4', '20.5', '20.6', '21', '21.1', '21.2', '22', '22.1',
       '22.2', '23', '23.1', '23.2', '23.3', '23.4', '23.5', '23.6',
       '23.7', '23.9', '24', '24.1', '24.3', '24.4', '24.5', '25', '25.1',
       '25.2', '25.3', '25.4', '25.7', '25.9', '26', '26.1', '26.2',
       '26.3', '26.4', '26.5', '26.6', '26.7', '26.8', '27', '27.1',
   

array(['NSA', 'SA'], dtype=object)

__TODO: Still not sure what to make of the titles provided for the CDIDs in the MRETS table.__

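__TODO: PERIOD doesn't seem to correspond to usage: it is only ever `Q` (or missing) in these tables, yet the MRETS observations are annual, quarterly and monthly.__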

In [14]:
# Keep only the series whose BASIS is 'BOP'; copy so that the columns added
# afterwards do not touch all_cdids.
is_bop_basis = all_cdids['BASIS'] == 'BOP'
bop_series = all_cdids[is_bop_basis].copy()
def area_country(row):
    """Combine the AREA and COUNTRY fields of a row into a single label.

    Returns ``'area/<AREA>'`` when AREA is set, ``'country/<COUNTRY>'`` when
    only COUNTRY is set, and ``None`` when both are missing. A row with both
    fields set trips an assertion, since the two are expected to be mutually
    exclusive.
    """
    if pd.notnull(row['AREA']):
        assert pd.isnull(row['COUNTRY'])
        return 'area/' + row['AREA']

    country = row['COUNTRY']
    if pd.isnull(country):
        return None
    return 'country/' + country
# Derive a single 'Area' label per series from its AREA / COUNTRY columns.
area_labels = bop_series.apply(area_country, axis=1)
bop_series['Area'] = area_labels
def product_commodity(row):
    """Pick whichever of the PRODUCT / COMMODITY fields of a row is set.

    Returns PRODUCT when it is set, otherwise COMMODITY, and ``None`` when
    both are missing. A row with both fields set trips an assertion, since
    the two are expected to be mutually exclusive.
    """
    if pd.notnull(row['PRODUCT']):
        assert pd.isnull(row['COMMODITY'])
        return row['PRODUCT']

    commodity = row['COMMODITY']
    if pd.isnull(commodity):
        return None
    return commodity
# Derive a single 'Product' label per series from its PRODUCT / COMMODITY columns.
bop_series['Product'] = bop_series.apply(product_commodity, axis=1)

# Drop the raw columns that are no longer needed, give the remaining
# dimensions their output names and spell out the flow codes.
bop_series = (
    bop_series
    .drop(columns=['PERIOD', 'AREA', 'COUNTRY', 'PRODUCT', 'COMMODITY', 'BASIS'])
    .rename(columns={'DIRECTION': 'Flow',
                     'PRICE': 'Price Classification',
                     'SEASADJ': 'Seasonal Adjustment'})
    .replace({'Flow': {'BAL': 'Balance', 'IM': 'Imports', 'EX': 'Exports'}})
)
# NOTE(review): the same measure type and unit are applied to every BOP
# series regardless of its price classification - confirm this is intended.
bop_series['Measure Type'] = 'GBP Total'
bop_series['Unit'] = '£ Million'

# Attach the dimensions to each observation whose CDID is a BOP series.
# NOTE(review): dropna(how='any') discards every observation whose series
# lacks any one dimension value - confirm that this loss is intended.
in_bop_series = observations['CDID'].isin(bop_series.index.values)
bop_observations = (
    observations[in_bop_series]
    .merge(bop_series, how='left', left_on='CDID', right_index=True)
    .drop(columns=['Title'])
    .dropna(how='any')
)
bop_observations

Unnamed: 0,Period,CDID,Value,Flow,Price Classification,Seasonal Adjustment,Area,Product,Measure Type,Unit
0,year/1998,SESM,-2766,Balance,CP,SA,area/EU,5plus6,GBP Total,£ Million
1,year/1999,SESM,-2568,Balance,CP,SA,area/EU,5plus6,GBP Total,£ Million
2,year/2000,SESM,-2441,Balance,CP,SA,area/EU,5plus6,GBP Total,£ Million
3,year/2001,SESM,-3157,Balance,CP,SA,area/EU,5plus6,GBP Total,£ Million
4,year/2002,SESM,-4703,Balance,CP,SA,area/EU,5plus6,GBP Total,£ Million
5,year/2003,SESM,-5435,Balance,CP,SA,area/EU,5plus6,GBP Total,£ Million
6,year/2004,SESM,-4851,Balance,CP,SA,area/EU,5plus6,GBP Total,£ Million
7,year/2005,SESM,-3673,Balance,CP,SA,area/EU,5plus6,GBP Total,£ Million
8,year/2006,SESM,-3746,Balance,CP,SA,area/EU,5plus6,GBP Total,£ Million
9,year/2007,SESM,-5259,Balance,CP,SA,area/EU,5plus6,GBP Total,£ Million


In [15]:
# Make sure the output folder exists, then write the tidy observations as CSV
# without the row index.
destinationFolder = Path('out')
destinationFolder.mkdir(parents=True, exist_ok=True)

output_csv = destinationFolder / 'bop_observations.csv'
bop_observations.to_csv(output_csv, index=False)