###  Individual country data (goods) on a monthly basis to Tidy Data

In [74]:
from databaker.framework import *
import pandas as pd 

This will be a bit nasty as the data source is an adhoc release, so there's no guarantee in terms of url convention between releases.

To get around this, we're going to hit the ons website search page and parse out the most recent version of the adhoc
from there.

To help explain the following code, this is an example of a single search result from this page.

-----
```<a href="/redir/eyJhbGciOiJIUzI1NiJ9.eyJpbmRleCI6MSwicGFnZVNpemUiOjEwLCJ0ZXJtIjoiaW5kaXZpZHVhbCBjb3VudHJ5IGRhdGEgb24gYSBtb250aGx5IGJhc2lzIGZyb20gamFudWFyeS9kYXRhIiwicGFnZSI6MSwidXJpIjoiL2Vjb25vbXkvbmF0aW9uYWxhY2NvdW50cy9iYWxhbmNlb2ZwYXltZW50cy9hZGhvY3MvMDA2Njc1aW5kaXZpZHVhbGNvdW50cnlkYXRhb25hbW9udGhseWJhc2lzZnJvbWphbnVhcnkyMDE2dG9qYW51YXJ5MjAxNyIsImxpc3RUeXBlIjoic2VhcmNoIn0.Gdw3U8ZGrtT85SftZUcHMEVqem3KmGpWRyRjS_ow77Y" data-gtm-uri="/economy/nationalaccounts/balanceofpayments/adhocs/006675individualcountrydataonamonthlybasisfromjanuary2016tojanuary2017"><strong>Individual</strong> <strong>Country</strong> <strong>data</strong> on a <strong>monthly</strong> <strong>basis</strong> <strong>from</strong> <strong>January</strong> 2016 to <strong>January</strong> 2017 </a>```

In [75]:
search_for = "https://www.ons.gov.uk/search?q=individual+country+data+on+a+monthly+basis+from+january/data"

# get the page
r = requests.get(search_for)
if r.status_code != 200:
    raise ValueError("Aborting operation. Failed to get 1st (of 2) scrapes of ONS website.")

# We're going to look for the words: country, data, monthly and individual
# since there're loose conventions, we'll look for in any order and case
find = ["individual", "country", "data", "monthly", "tojanuary"]
found_urls = []
for html_line in r.text.split("\n"): # for every line of html
    
    # if all have matched and if its a redirect (they're all redirects) and has a data url
    matched = [x for x in find if x in html_line.lower()]
    if len(matched) == len(find) and 'href="/redir' in html_line and 'data-gtm-uri=' in html_line:
        
        # prefix the boiler plate and store the new url in our list
        found_urls.append("http://www.ons.gov.uk" + html_line.split('data-gtm-uri="')[1].split("\"")[0])
        
found_urls

['http://www.ons.gov.uk/economy/nationalaccounts/balanceofpayments/adhocs/006675individualcountrydataonamonthlybasisfromjanuary2016tojanuary2017',
 'http://www.ons.gov.uk/economy/nationalaccounts/balanceofpayments/adhocs/008182individualcountrydatagoodsonamonthlybasisfromjanuary1998tojanuary2018']

-----

Now that we've found the right adhoc page, we need to get the xls url. The element we're grabbing looks like the below:

-----
```
<a href="/file?uri=/economy/nationalaccounts/balanceofpayments/adhocs/008182individualcountrydatagoodsonamonthlybasisfromjanuary1998tojanuary2018/01.allcountriesjanuary2018.xls">Individual country data (goods) on a monthly basis from January 1998 to January 2018</a>
```

In [79]:
# get the most recent one
dates_to_urls = { int(x[-4:]):x for x in found_urls}
url = dates_to_urls[max(dates_to_urls.keys())]

# around we go again...
r = requests.get(url)
if r.status_code != 200:
    raise ValueError("Aborting operation. Failed to get 2nd (of 2) scrapes of ONS website.")

# It's an adhoc, the first xls link should always contain the spreadsheet we want
download_url = [x for x in r.text.split("\n") if ".xls" in x][0]

# note - see above sample for what we're doing here
sourceUrl = "http://www.ons.gov.uk" + download_url.split('href="/file?uri=')[1].split("\">")[0]

sourceUrl

'http://www.ons.gov.uk/economy/nationalaccounts/balanceofpayments/adhocs/008182individualcountrydatagoodsonamonthlybasisfromjanuary1998tojanuary2018/01.allcountriesjanuary2018.xls'

In [77]:
import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import LastModified
from pathlib import Path
from io import BytesIO

session = CacheControl(requests.Session(),
                       cache=FileCache('.cache'),
                       heuristic=LastModified())


In [78]:
tab = pd.read_excel(BytesIO(session.get(sourceUrl).content), header = None, sheet_name = 1)
tab.iloc[2][0] = 'Dummy'
tab.columns=tab.iloc[2]
tab.rename(columns={'Dummy': 'Period'}, inplace = True)
tab

TypeError: expected str, bytes or os.PathLike object, not NoneType

In [4]:
observations = tab[3:].rename(columns={'ONS Partner Geography': 'Period'})
observations.head()

2,Period,AF Afghanistan,AL Albania,DZ Algeria,AS American Samoa,AD Andorra,AO Angola,AI Anguilla,AQ Antarctica,AG Antigua & Barbuda,...,VI US Virgin Islands,UZ Uzbekistan,VU Vanuatu,VA Vatican City,VE Venezuela,VN Vietnam,WF Wallis & Futuna,YE Yemen,ZM Zambia,ZW Zimbabwe
3,1998,12,9,110,0,14,41,6,0,26,...,16,34,0,0,232,65,0,76,34,70
4,1999,0,13,110,0,12,67,1,0,31,...,5,31,0,0,198,77,0,65,30,75
5,2000,2,7,101,0,11,76,2,2,23,...,3,17,0,0,212,106,0,53,27,43
6,2001,3,23,116,0,17,95,0,0,15,...,0,22,0,0,312,89,0,73,25,36
7,2002,2,19,130,0,8,60,1,0,14,...,2,15,0,0,299,78,0,74,18,34


In [5]:
new_table = pd.melt(observations, id_vars= ['Period'], var_name='ONS Partner Geography', value_name='OBS')
new_table.reset_index(drop=True, inplace=True)
print(len(new_table))
new_table.head(50)

4580


Unnamed: 0,Period,ONS Partner Geography,OBS
0,1998,AF Afghanistan,12
1,1999,AF Afghanistan,0
2,2000,AF Afghanistan,2
3,2001,AF Afghanistan,3
4,2002,AF Afghanistan,2
5,2003,AF Afghanistan,11
6,2004,AF Afghanistan,14
7,2005,AF Afghanistan,14
8,2006,AF Afghanistan,23
9,2007,AF Afghanistan,43


In [6]:
new_table = new_table[new_table['OBS'] != 0]

In [7]:
new_table.count()

Period                   3750
ONS Partner Geography    3750
OBS                      3750
dtype: int64

In [8]:
new_table['Period'].unique()

array(['1998', '2000', '2001', '2002', '2003', '2004', '2005', '2006',
       '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
       '2015', '2016', '2017', '1999'], dtype=object)

In [None]:
# new_table['Period'] = 'month/' + new_table['Period'].astype(str).str[0:4]+ '-' + new_table['Period'].astype(str).str[-3:]
# new_table.head()

In [9]:
import re
YEAR_RE = re.compile(r'[0-9]{4}')
YEAR_MONTH_RE = re.compile(r'([0-9]{4})\s+(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)')
YEAR_QUARTER_RE = re.compile(r'([0-9]{4})\s+(Q[1-4])')

# from https://stackoverflow.com/questions/597476/how-to-concisely-cascade-through-multiple-regex-statements-in-python
class Re(object):
  def __init__(self):
    self.last_match = None
  def fullmatch(self,pattern,text):
    self.last_match = re.fullmatch(pattern,text)
    return self.last_match

def time2period(t):
    gre = Re()
    if gre.fullmatch(YEAR_RE, t):
        return f"year/{t}"
    elif gre.fullmatch(YEAR_MONTH_RE, t):
        year, month = gre.last_match.groups()
        month_num = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04', 'MAY': '05', 'JUN': '06',
                     'JUL': '07', 'AUG': '08', 'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}.get(month)
        return f"month/{year}-{month_num}"
    elif gre.fullmatch(YEAR_QUARTER_RE, t):
        year, quarter = gre.last_match.groups()
        return f"quarter/{year}-{quarter}"
    else:
        print(f"no match for {t}")

new_table['Period'] = new_table['Period'].apply(time2period)
new_table.head(10)

Unnamed: 0,Period,ONS Partner Geography,OBS
0,year/1998,AF Afghanistan,12
2,year/2000,AF Afghanistan,2
3,year/2001,AF Afghanistan,3
4,year/2002,AF Afghanistan,2
5,year/2003,AF Afghanistan,11
6,year/2004,AF Afghanistan,14
7,year/2005,AF Afghanistan,14
8,year/2006,AF Afghanistan,23
9,year/2007,AF Afghanistan,43
10,year/2008,AF Afghanistan,71


In [10]:
new_table['Unit'] = '£ Million'
new_table['Measure Type'] = 'GBP Total'
new_table['Flow'] = 'Exports'
new_table.tail(5)

Unnamed: 0,Period,ONS Partner Geography,OBS,Unit,Measure Type,Flow
4575,year/2013,ZW Zimbabwe,55,£ Million,GBP Total,Exports
4576,year/2014,ZW Zimbabwe,42,£ Million,GBP Total,Exports
4577,year/2015,ZW Zimbabwe,43,£ Million,GBP Total,Exports
4578,year/2016,ZW Zimbabwe,36,£ Million,GBP Total,Exports
4579,year/2017,ZW Zimbabwe,36,£ Million,GBP Total,Exports


In [11]:
new_table.rename(index= str, columns= {'OBS':'Value'}, inplace = True)

In [12]:
Final_table = pd.DataFrame()

In [13]:
new_table = new_table[['ONS Partner Geography','Period','Flow','Measure Type','Value','Unit']]

In [14]:
new_table.tail(5)

Unnamed: 0,ONS Partner Geography,Period,Flow,Measure Type,Value,Unit
4575,ZW Zimbabwe,year/2013,Exports,GBP Total,55,£ Million
4576,ZW Zimbabwe,year/2014,Exports,GBP Total,42,£ Million
4577,ZW Zimbabwe,year/2015,Exports,GBP Total,43,£ Million
4578,ZW Zimbabwe,year/2016,Exports,GBP Total,36,£ Million
4579,ZW Zimbabwe,year/2017,Exports,GBP Total,36,£ Million


In [15]:
new_table['Value'] = new_table['Value'].astype(int)

In [16]:
new_table.dtypes

ONS Partner Geography    object
Period                   object
Flow                     object
Measure Type             object
Value                     int32
Unit                     object
dtype: object

In [17]:
Final_table = pd.concat([Final_table, new_table])

In [18]:
tab = pd.read_excel(BytesIO(session.get(sourceUrl).content), header = None, sheet_name = 3)
tab.iloc[2][0] = 'Dummy'
tab.columns=tab.iloc[2]
tab.rename(columns={'Dummy': 'Period'}, inplace = True)
tab

2,Period,AF Afghanistan,AL Albania,DZ Algeria,AS American Samoa,AD Andorra,AO Angola,AI Anguilla,AQ Antarctica,AG Antigua & Barbuda,...,VI US Virgin Islands,UZ Uzbekistan,VU Vanuatu,VA Vatican City,VE Venezuela,VN Vietnam,WF Wallis & Futuna,YE Yemen,ZM Zambia,ZW Zimbabwe
0,Please Note: Data up to 2016 is consistent wit...,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,Dummy,AF Afghanistan,AL Albania,DZ Algeria,AS American Samoa,AD Andorra,AO Angola,AI Anguilla,AQ Antarctica,AG Antigua & Barbuda,...,VI US Virgin Islands,UZ Uzbekistan,VU Vanuatu,VA Vatican City,VE Venezuela,VN Vietnam,WF Wallis & Futuna,YE Yemen,ZM Zambia,ZW Zimbabwe
3,1998,1,0,82,0,0,8,0,2,0,...,12,7,0,0,110,243,0,2,24,116
4,1999,0,0,166,0,0,9,0,0,0,...,0,22,0,0,145,302,0,3,15,122
5,2000,8,2,446,0,0,0,0,0,2,...,5,15,0,0,205,380,0,4,15,101
6,2001,0,0,233,0,0,71,0,0,3,...,10,26,0,0,155,424,0,2,14,89
7,2002,0,2,342,0,0,23,0,0,0,...,0,21,0,0,179,486,0,38,13,85
8,2003,0,3,257,0,0,5,1,0,8,...,2,27,0,0,110,606,0,8,16,58
9,2004,2,0,375,0,0,8,1,0,30,...,4,26,0,0,207,708,0,12,15,45


In [19]:
observations = tab[3:].rename(columns={'ONS Partner Geography': 'Period'})
observations.head()

2,Period,AF Afghanistan,AL Albania,DZ Algeria,AS American Samoa,AD Andorra,AO Angola,AI Anguilla,AQ Antarctica,AG Antigua & Barbuda,...,VI US Virgin Islands,UZ Uzbekistan,VU Vanuatu,VA Vatican City,VE Venezuela,VN Vietnam,WF Wallis & Futuna,YE Yemen,ZM Zambia,ZW Zimbabwe
3,1998,1,0,82,0,0,8,0,2,0,...,12,7,0,0,110,243,0,2,24,116
4,1999,0,0,166,0,0,9,0,0,0,...,0,22,0,0,145,302,0,3,15,122
5,2000,8,2,446,0,0,0,0,0,2,...,5,15,0,0,205,380,0,4,15,101
6,2001,0,0,233,0,0,71,0,0,3,...,10,26,0,0,155,424,0,2,14,89
7,2002,0,2,342,0,0,23,0,0,0,...,0,21,0,0,179,486,0,38,13,85


In [20]:
new_table = pd.melt(observations, id_vars= ['Period'], var_name='ONS Partner Geography', value_name='OBS')
new_table.reset_index(drop=True, inplace=True)
print(len(new_table))
new_table.head(50)

4580


Unnamed: 0,Period,ONS Partner Geography,OBS
0,1998,AF Afghanistan,1
1,1999,AF Afghanistan,0
2,2000,AF Afghanistan,8
3,2001,AF Afghanistan,0
4,2002,AF Afghanistan,0
5,2003,AF Afghanistan,0
6,2004,AF Afghanistan,2
7,2005,AF Afghanistan,0
8,2006,AF Afghanistan,10
9,2007,AF Afghanistan,0


In [21]:
new_table = new_table[new_table['OBS'] != 0]

In [22]:
new_table.count()

Period                   3212
ONS Partner Geography    3212
OBS                      3212
dtype: int64

In [23]:
new_table['Period'].unique()

array(['1998', '2000', '2004', '2006', '2010', '2012', '2016', '2002',
       '2003', '2013', '2014', '2017', '1999', '2001', '2005', '2007',
       '2008', '2009', '2011', '2015'], dtype=object)

In [24]:
import re
YEAR_RE = re.compile(r'[0-9]{4}')
YEAR_MONTH_RE = re.compile(r'([0-9]{4})\s+(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)')
YEAR_QUARTER_RE = re.compile(r'([0-9]{4})\s+(Q[1-4])')

# from https://stackoverflow.com/questions/597476/how-to-concisely-cascade-through-multiple-regex-statements-in-python
class Re(object):
  def __init__(self):
    self.last_match = None
  def fullmatch(self,pattern,text):
    self.last_match = re.fullmatch(pattern,text)
    return self.last_match

def time2period(t):
    gre = Re()
    if gre.fullmatch(YEAR_RE, t):
        return f"year/{t}"
    elif gre.fullmatch(YEAR_MONTH_RE, t):
        year, month = gre.last_match.groups()
        month_num = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04', 'MAY': '05', 'JUN': '06',
                     'JUL': '07', 'AUG': '08', 'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}.get(month)
        return f"month/{year}-{month_num}"
    elif gre.fullmatch(YEAR_QUARTER_RE, t):
        year, quarter = gre.last_match.groups()
        return f"quarter/{year}-{quarter}"
    else:
        print(f"no match for {t}")

new_table['Period'] = new_table['Period'].apply(time2period)
new_table.head(10)

Unnamed: 0,Period,ONS Partner Geography,OBS
0,year/1998,AF Afghanistan,1
2,year/2000,AF Afghanistan,8
6,year/2004,AF Afghanistan,2
8,year/2006,AF Afghanistan,10
12,year/2010,AF Afghanistan,2
14,year/2012,AF Afghanistan,3
18,year/2016,AF Afghanistan,2
22,year/2000,AL Albania,2
24,year/2002,AL Albania,2
25,year/2003,AL Albania,3


In [25]:
new_table['Unit'] = '£ Million'
new_table['Measure Type'] = 'GBP Total'
new_table['Flow'] = 'Imports'
new_table.tail(5)

Unnamed: 0,Period,ONS Partner Geography,OBS,Unit,Measure Type,Flow
4575,year/2013,ZW Zimbabwe,27,£ Million,GBP Total,Imports
4576,year/2014,ZW Zimbabwe,25,£ Million,GBP Total,Imports
4577,year/2015,ZW Zimbabwe,22,£ Million,GBP Total,Imports
4578,year/2016,ZW Zimbabwe,42,£ Million,GBP Total,Imports
4579,year/2017,ZW Zimbabwe,75,£ Million,GBP Total,Imports


In [26]:
new_table.rename(index= str, columns= {'OBS':'Value'}, inplace = True)

In [27]:
new_table = new_table[['ONS Partner Geography','Period','Flow','Measure Type','Value','Unit']]

In [28]:
new_table['Value'] = new_table['Value'].astype(int)

In [29]:
Final_table = pd.concat([Final_table, new_table])

In [30]:
tab = pd.read_excel(BytesIO(session.get(sourceUrl).content), header = None, sheet_name = 2)
tab.iloc[2][0] = 'Dummy'
tab.columns=tab.iloc[2]
tab.rename(columns={'Dummy': 'Period'}, inplace = True)
tab

2,Period,AF Afghanistan,AL Albania,DZ Algeria,AS American Samoa,AD Andorra,AO Angola,AI Anguilla,AQ Antarctica,AG Antigua & Barbuda,...,VI US Virgin Islands,UZ Uzbekistan,VU Vanuatu,VA Vatican City,VE Venezuela,VN Vietnam,WF Wallis & Futuna,YE Yemen,ZM Zambia,ZW Zimbabwe
0,Please Note: Data for 2017 has been revised th...,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,Dummy,AF Afghanistan,AL Albania,DZ Algeria,AS American Samoa,AD Andorra,AO Angola,AI Anguilla,AQ Antarctica,AG Antigua & Barbuda,...,VI US Virgin Islands,UZ Uzbekistan,VU Vanuatu,VA Vatican City,VE Venezuela,VN Vietnam,WF Wallis & Futuna,YE Yemen,ZM Zambia,ZW Zimbabwe
3,1998JAN,1,1,8,0,0,3,1,0,4,...,0,2,0,0,31,7,0,6,3,6
4,1998FEB,0,1,9,0,1,3,0,0,3,...,0,3,0,0,19,6,0,5,2,5
5,1998MAR,2,1,10,0,2,5,1,0,2,...,4,3,0,0,25,6,0,7,4,6
6,1998APR,1,0,8,0,1,5,1,0,2,...,0,6,0,0,23,6,0,4,5,6
7,1998MAY,1,0,9,0,3,2,0,0,2,...,0,6,0,0,24,6,0,5,2,6
8,1998JUN,1,1,9,0,1,4,1,0,1,...,6,4,0,0,25,7,0,7,3,7
9,1998JUL,1,1,10,0,1,4,0,0,4,...,0,5,0,0,19,6,0,5,3,7


In [31]:
observations = tab[3:].rename(columns={'ONS Partner Geography': 'Period'})
observations.head()

2,Period,AF Afghanistan,AL Albania,DZ Algeria,AS American Samoa,AD Andorra,AO Angola,AI Anguilla,AQ Antarctica,AG Antigua & Barbuda,...,VI US Virgin Islands,UZ Uzbekistan,VU Vanuatu,VA Vatican City,VE Venezuela,VN Vietnam,WF Wallis & Futuna,YE Yemen,ZM Zambia,ZW Zimbabwe
3,1998JAN,1,1,8,0,0,3,1,0,4,...,0,2,0,0,31,7,0,6,3,6
4,1998FEB,0,1,9,0,1,3,0,0,3,...,0,3,0,0,19,6,0,5,2,5
5,1998MAR,2,1,10,0,2,5,1,0,2,...,4,3,0,0,25,6,0,7,4,6
6,1998APR,1,0,8,0,1,5,1,0,2,...,0,6,0,0,23,6,0,4,5,6
7,1998MAY,1,0,9,0,3,2,0,0,2,...,0,6,0,0,24,6,0,5,2,6


In [32]:
new_table = pd.melt(observations, id_vars= ['Period'], var_name='ONS Partner Geography', value_name='OBS')
new_table.reset_index(drop=True, inplace=True)
print(len(new_table))
new_table.head(50)

55876


Unnamed: 0,Period,ONS Partner Geography,OBS
0,1998JAN,AF Afghanistan,1
1,1998FEB,AF Afghanistan,0
2,1998MAR,AF Afghanistan,2
3,1998APR,AF Afghanistan,1
4,1998MAY,AF Afghanistan,1
5,1998JUN,AF Afghanistan,1
6,1998JUL,AF Afghanistan,1
7,1998AUG,AF Afghanistan,1
8,1998SEP,AF Afghanistan,1
9,1998OCT,AF Afghanistan,1


In [33]:
new_table = new_table[new_table['OBS'] != 0]

In [34]:
new_table.count()

Period                   40493
ONS Partner Geography    40493
OBS                      40493
dtype: int64

In [35]:
new_table['Period'].unique()

array(['1998JAN', '1998MAR', '1998APR', '1998MAY', '1998JUN', '1998JUL',
       '1998AUG', '1998SEP', '1998OCT', '1998NOV', '1998DEC', '2000JAN',
       '2000APR', '2001FEB', '2001JUN', '2001JUL', '2002OCT', '2002NOV',
       '2003MAR', '2003APR', '2003MAY', '2003JUN', '2003JUL', '2003AUG',
       '2003SEP', '2003OCT', '2003NOV', '2003DEC', '2004JAN', '2004FEB',
       '2004MAR', '2004APR', '2004MAY', '2004JUN', '2004JUL', '2004AUG',
       '2004SEP', '2004OCT', '2004NOV', '2004DEC', '2005FEB', '2005MAR',
       '2005APR', '2005MAY', '2005JUN', '2005JUL', '2005AUG', '2005SEP',
       '2005OCT', '2005NOV', '2005DEC', '2006JAN', '2006FEB', '2006MAR',
       '2006APR', '2006MAY', '2006JUN', '2006JUL', '2006AUG', '2006SEP',
       '2006OCT', '2006NOV', '2006DEC', '2007JAN', '2007FEB', '2007MAR',
       '2007APR', '2007MAY', '2007JUN', '2007JUL', '2007AUG', '2007SEP',
       '2007OCT', '2007NOV', '2007DEC', '2008JAN', '2008FEB', '2008MAR',
       '2008APR', '2008MAY', '2008JUN', '2008JUL', 

In [36]:
new_table['Period'] = new_table['Period'].astype(str).str[0:4]+ ' ' + new_table['Period'].astype(str).str[-3:]
new_table.head()

Unnamed: 0,Period,ONS Partner Geography,OBS
0,1998 JAN,AF Afghanistan,1
2,1998 MAR,AF Afghanistan,2
3,1998 APR,AF Afghanistan,1
4,1998 MAY,AF Afghanistan,1
5,1998 JUN,AF Afghanistan,1


In [37]:
import re
YEAR_RE = re.compile(r'[0-9]{4}')
YEAR_MONTH_RE = re.compile(r'([0-9]{4})\s+(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)')
YEAR_QUARTER_RE = re.compile(r'([0-9]{4})\s+(Q[1-4])')

# from https://stackoverflow.com/questions/597476/how-to-concisely-cascade-through-multiple-regex-statements-in-python
class Re(object):
  def __init__(self):
    self.last_match = None
  def fullmatch(self,pattern,text):
    self.last_match = re.fullmatch(pattern,text)
    return self.last_match

def time2period(t):
    gre = Re()
    if gre.fullmatch(YEAR_RE, t):
        return f"year/{t}"
    elif gre.fullmatch(YEAR_MONTH_RE, t):
        year, month = gre.last_match.groups()
        month_num = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04', 'MAY': '05', 'JUN': '06',
                     'JUL': '07', 'AUG': '08', 'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}.get(month)
        return f"month/{year}-{month_num}"
    elif gre.fullmatch(YEAR_QUARTER_RE, t):
        year, quarter = gre.last_match.groups()
        return f"quarter/{year}-{quarter}"
    else:
        print(f"no match for {t}")

new_table['Period'] = new_table['Period'].apply(time2period)
new_table.head(10)

Unnamed: 0,Period,ONS Partner Geography,OBS
0,month/1998-01,AF Afghanistan,1
2,month/1998-03,AF Afghanistan,2
3,month/1998-04,AF Afghanistan,1
4,month/1998-05,AF Afghanistan,1
5,month/1998-06,AF Afghanistan,1
6,month/1998-07,AF Afghanistan,1
7,month/1998-08,AF Afghanistan,1
8,month/1998-09,AF Afghanistan,1
9,month/1998-10,AF Afghanistan,1
10,month/1998-11,AF Afghanistan,1


In [38]:
new_table['Unit'] = '£ Million'
new_table['Measure Type'] = 'GBP Total'
new_table['Flow'] = 'Exports'
new_table.tail(5)

Unnamed: 0,Period,ONS Partner Geography,OBS,Unit,Measure Type,Flow
55871,month/2017-12,ZW Zimbabwe,4,£ Million,GBP Total,Exports
55872,month/2018-01,ZW Zimbabwe,3,£ Million,GBP Total,Exports
55873,month/2018-02,ZW Zimbabwe,3,£ Million,GBP Total,Exports
55874,month/2018-03,ZW Zimbabwe,5,£ Million,GBP Total,Exports
55875,month/2018-04,ZW Zimbabwe,3,£ Million,GBP Total,Exports


In [39]:
new_table.rename(index= str, columns= {'OBS':'Value'}, inplace = True)

In [40]:
new_table = new_table[['ONS Partner Geography','Period','Flow','Measure Type','Value','Unit']]

In [41]:
new_table['Value'] = new_table['Value'].astype(int)

In [42]:
new_table.dtypes

ONS Partner Geography    object
Period                   object
Flow                     object
Measure Type             object
Value                     int32
Unit                     object
dtype: object

In [43]:
Final_table = pd.concat([Final_table, new_table])

In [44]:
tab = pd.read_excel(BytesIO(session.get(sourceUrl).content), header = None, sheet_name = 4)
tab.iloc[2][0] = 'Dummy'
tab.columns=tab.iloc[2]
tab.rename(columns={'Dummy': 'Period'}, inplace = True)
tab

2,Period,AF Afghanistan,AL Albania,DZ Algeria,AS American Samoa,AD Andorra,AO Angola,AI Anguilla,AQ Antarctica,AG Antigua & Barbuda,...,VI US Virgin Islands,UZ Uzbekistan,VU Vanuatu,VA Vatican City,VE Venezuela,VN Vietnam,WF Wallis & Futuna,YE Yemen,ZM Zambia,ZW Zimbabwe
0,Please Note: Data for 2017 has been revised th...,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,Dummy,AF Afghanistan,AL Albania,DZ Algeria,AS American Samoa,AD Andorra,AO Angola,AI Anguilla,AQ Antarctica,AG Antigua & Barbuda,...,VI US Virgin Islands,UZ Uzbekistan,VU Vanuatu,VA Vatican City,VE Venezuela,VN Vietnam,WF Wallis & Futuna,YE Yemen,ZM Zambia,ZW Zimbabwe
3,1998JAN,0,0,1,0,0,1,0,1,0,...,9,1,0,0,8,21,0,0,2,11
4,1998FEB,1,0,18,0,0,0,0,1,0,...,2,0,0,0,6,21,0,0,2,6
5,1998MAR,0,0,7,0,0,1,0,0,0,...,0,1,0,0,10,23,0,0,2,7
6,1998APR,0,0,3,0,0,0,0,0,0,...,0,0,0,0,9,24,0,0,1,7
7,1998MAY,0,0,1,0,0,0,0,0,0,...,0,0,0,0,7,20,0,1,3,11
8,1998JUN,0,0,8,0,0,0,0,0,0,...,0,1,0,0,21,19,0,0,2,10
9,1998JUL,0,0,7,0,0,2,0,0,0,...,1,1,0,0,9,18,0,0,2,20


In [45]:
observations = tab[3:].rename(columns={'ONS Partner Geography': 'Period'})
observations.head()

2,Period,AF Afghanistan,AL Albania,DZ Algeria,AS American Samoa,AD Andorra,AO Angola,AI Anguilla,AQ Antarctica,AG Antigua & Barbuda,...,VI US Virgin Islands,UZ Uzbekistan,VU Vanuatu,VA Vatican City,VE Venezuela,VN Vietnam,WF Wallis & Futuna,YE Yemen,ZM Zambia,ZW Zimbabwe
3,1998JAN,0,0,1,0,0,1,0,1,0,...,9,1,0,0,8,21,0,0,2,11
4,1998FEB,1,0,18,0,0,0,0,1,0,...,2,0,0,0,6,21,0,0,2,6
5,1998MAR,0,0,7,0,0,1,0,0,0,...,0,1,0,0,10,23,0,0,2,7
6,1998APR,0,0,3,0,0,0,0,0,0,...,0,0,0,0,9,24,0,0,1,7
7,1998MAY,0,0,1,0,0,0,0,0,0,...,0,0,0,0,7,20,0,1,3,11


In [46]:
new_table = pd.melt(observations, id_vars= ['Period'], var_name='ONS Partner Geography', value_name='OBS')
new_table.reset_index(drop=True, inplace=True)
print(len(new_table))
new_table.head()

55876


Unnamed: 0,Period,ONS Partner Geography,OBS
0,1998JAN,AF Afghanistan,0
1,1998FEB,AF Afghanistan,1
2,1998MAR,AF Afghanistan,0
3,1998APR,AF Afghanistan,0
4,1998MAY,AF Afghanistan,0


In [47]:
new_table = new_table[new_table['OBS'] != 0]

In [48]:
new_table.count()

Period                   33301
ONS Partner Geography    33301
OBS                      33301
dtype: int64

In [49]:
new_table['Period'].unique()

array(['1998FEB', '2000FEB', '2000APR', '2000MAY', '2000JUN', '2000AUG',
       '2000SEP', '2000NOV', '2000DEC', '2004JUL', '2004NOV', '2006JUN',
       '2010JUN', '2010SEP', '2012JAN', '2012APR', '2012MAY', '2016JUN',
       '2016NOV', '2002SEP', '2002DEC', '2003MAR', '2003APR', '2003OCT',
       '2013NOV', '2014SEP', '2014OCT', '2017JAN', '2017MAY', '2017AUG',
       '2017DEC', '2018APR', '1998JAN', '1998MAR', '1998APR', '1998MAY',
       '1998JUN', '1998JUL', '1998AUG', '1998SEP', '1998OCT', '1998NOV',
       '1998DEC', '1999JAN', '1999FEB', '1999MAR', '1999MAY', '1999JUN',
       '1999JUL', '1999AUG', '1999SEP', '1999OCT', '1999NOV', '1999DEC',
       '2000JAN', '2000MAR', '2000JUL', '2000OCT', '2001JAN', '2001FEB',
       '2001MAR', '2001APR', '2001MAY', '2001JUN', '2001JUL', '2001SEP',
       '2001OCT', '2001NOV', '2001DEC', '2002JAN', '2002FEB', '2002MAR',
       '2002APR', '2002MAY', '2002JUN', '2002JUL', '2002AUG', '2002OCT',
       '2002NOV', '2003FEB', '2003MAY', '2003JUN', 

In [50]:
new_table['Period'] = new_table['Period'].astype(str).str[0:4]+ ' ' + new_table['Period'].astype(str).str[-3:]
new_table.head()

Unnamed: 0,Period,ONS Partner Geography,OBS
1,1998 FEB,AF Afghanistan,1
25,2000 FEB,AF Afghanistan,1
27,2000 APR,AF Afghanistan,1
28,2000 MAY,AF Afghanistan,1
29,2000 JUN,AF Afghanistan,1


In [51]:
import re
YEAR_RE = re.compile(r'[0-9]{4}')
YEAR_MONTH_RE = re.compile(r'([0-9]{4})\s+(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)')
YEAR_QUARTER_RE = re.compile(r'([0-9]{4})\s+(Q[1-4])')

# from https://stackoverflow.com/questions/597476/how-to-concisely-cascade-through-multiple-regex-statements-in-python
class Re(object):
  def __init__(self):
    self.last_match = None
  def fullmatch(self,pattern,text):
    self.last_match = re.fullmatch(pattern,text)
    return self.last_match

def time2period(t):
    gre = Re()
    if gre.fullmatch(YEAR_RE, t):
        return f"year/{t}"
    elif gre.fullmatch(YEAR_MONTH_RE, t):
        year, month = gre.last_match.groups()
        month_num = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04', 'MAY': '05', 'JUN': '06',
                     'JUL': '07', 'AUG': '08', 'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}.get(month)
        return f"month/{year}-{month_num}"
    elif gre.fullmatch(YEAR_QUARTER_RE, t):
        year, quarter = gre.last_match.groups()
        return f"quarter/{year}-{quarter}"
    else:
        print(f"no match for {t}")

new_table['Period'] = new_table['Period'].apply(time2period)
new_table.head(10)

Unnamed: 0,Period,ONS Partner Geography,OBS
1,month/1998-02,AF Afghanistan,1
25,month/2000-02,AF Afghanistan,1
27,month/2000-04,AF Afghanistan,1
28,month/2000-05,AF Afghanistan,1
29,month/2000-06,AF Afghanistan,1
31,month/2000-08,AF Afghanistan,1
32,month/2000-09,AF Afghanistan,1
34,month/2000-11,AF Afghanistan,1
35,month/2000-12,AF Afghanistan,1
78,month/2004-07,AF Afghanistan,1


In [52]:
new_table['Unit'] = '£ Million'
new_table['Measure Type'] = 'GBP Total'
new_table['Flow'] = 'Imports'
new_table.tail(5)

Unnamed: 0,Period,ONS Partner Geography,OBS,Unit,Measure Type,Flow
55870,month/2017-11,ZW Zimbabwe,14,£ Million,GBP Total,Imports
55871,month/2017-12,ZW Zimbabwe,4,£ Million,GBP Total,Imports
55873,month/2018-02,ZW Zimbabwe,7,£ Million,GBP Total,Imports
55874,month/2018-03,ZW Zimbabwe,7,£ Million,GBP Total,Imports
55875,month/2018-04,ZW Zimbabwe,7,£ Million,GBP Total,Imports


In [53]:
new_table.rename(index= str, columns= {'OBS':'Value'}, inplace = True)

In [54]:
new_table = new_table[['ONS Partner Geography','Period','Flow','Measure Type','Value','Unit']]

In [55]:
new_table['Value'] = new_table['Value'].astype(int)

In [56]:
new_table.dtypes

ONS Partner Geography    object
Period                   object
Flow                     object
Measure Type             object
Value                     int32
Unit                     object
dtype: object

In [57]:
Final_table = pd.concat([Final_table, new_table])

In [58]:
Final_table.count()

ONS Partner Geography    80756
Period                   80756
Flow                     80756
Measure Type             80756
Value                    80756
Unit                     80756
dtype: int64

In [59]:
Final_table['ONS Partner Geography'].unique()

array(['AF Afghanistan', 'AL Albania', 'DZ Algeria', 'AS American Samoa',
       'AD Andorra', 'AO Angola', 'AI Anguilla', 'AQ Antarctica',
       'AG Antigua & Barbuda', 'AR Argentina', 'AM Armenia', 'AW Aruba',
       'AU Australia', 'AT Austria', 'AZ Azerbaijan', 'BS Bahamas',
       'BH Bahrain', 'BD Bangladesh', 'BB Barbados', 'BY Belarus',
       'BE Belgium', 'BZ Belize', 'BJ Benin', 'BM Bermuda', 'BT Bhutan',
       'BO Bolivia', 'BA Bosnia & Herzegovina', 'BW Botswana', 'BR Brazil',
       'VG British Virgin Islands', 'BN Brunei', 'BG Bulgaria',
       'BF Burkina Faso', 'MM Burma (Myanmar)', 'BI Burundi',
       'KH Cambodia', 'CM Cameroon', 'CA Canada', 'CV Cape Verde',
       'KY Cayman Islands', 'CF Central African Republic', 'XC Ceuta',
       'TD Chad', 'CL Chile', 'CN China', 'CX Christmas Islands',
       'CC Cocos Islands', 'CO Colombia', 'CD Congo (Democratic Republic)',
       'CG Congo (Republic)', 'CK Cook Islands', 'CR Costa Rica',
       "CI Côte d'Ivoire (Ivory

In [60]:
Final_table['ONS Partner Geography'] = Final_table['ONS Partner Geography'].astype(str).str[0:2]

In [61]:
Final_table['ONS Partner Geography'] = 'cord/' + Final_table['ONS Partner Geography']

In [62]:
Final_table.head()

Unnamed: 0,ONS Partner Geography,Period,Flow,Measure Type,Value,Unit
0,cord/AF,year/1998,Exports,GBP Total,12,£ Million
2,cord/AF,year/2000,Exports,GBP Total,2,£ Million
3,cord/AF,year/2001,Exports,GBP Total,3,£ Million
4,cord/AF,year/2002,Exports,GBP Total,2,£ Million
5,cord/AF,year/2003,Exports,GBP Total,11,£ Million


In [63]:
Final_table.tail()

Unnamed: 0,ONS Partner Geography,Period,Flow,Measure Type,Value,Unit
55870,cord/ZW,month/2017-11,Imports,GBP Total,14,£ Million
55871,cord/ZW,month/2017-12,Imports,GBP Total,4,£ Million
55873,cord/ZW,month/2018-02,Imports,GBP Total,7,£ Million
55874,cord/ZW,month/2018-03,Imports,GBP Total,7,£ Million
55875,cord/ZW,month/2018-04,Imports,GBP Total,7,£ Million


In [64]:
destinationFolder = Path('out')
destinationFolder.mkdir(exist_ok=True, parents=True)

Final_table.to_csv(destinationFolder / ('observations.csv'), index = False)