### Individual country data (goods) on a monthly basis, converted to Tidy Data

In [1]:
from databaker.framework import *
import pandas as pd 

In [2]:
import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import LastModified
from pathlib import Path
from io import BytesIO

# Build the HTTP session step by step: a requests session wrapped by
# CacheControl, using a file cache under '.cache' and the LastModified heuristic.
http_cache = FileCache('.cache')
freshness_heuristic = LastModified()
session = CacheControl(requests.Session(), cache=http_cache, heuristic=freshness_heuristic)

# Address of the source workbook (.xlsx) on www.ons.gov.uk.
sourceUrl = 'https://www.ons.gov.uk/file?uri=/economy/nationalaccounts/balanceofpayments/adhocs/007948individualcountrydatagoodsonamonthlybasisfromjanuary1998tonovember2017/11.allcountriesnovember20172.xlsx'

In [3]:
# Read the sheet at zero-based position 1 of the workbook, keeping every row as
# data (header=None) so the label row can be handled explicitly below.
tab = pd.read_excel(BytesIO(session.get(sourceUrl).content), header=None, sheet_name=1)
# Label the top-left cell with a single positional write. Chained indexing
# (`tab.iloc[0][0] = ...`) assigns into an intermediate row object, is not
# guaranteed to reach `tab`, and does nothing under pandas Copy-on-Write.
tab.iat[0, 0] = 'Dummy'
# Use the first row as the column labels, then give the first column its final name.
tab.columns = tab.iloc[0]
tab.rename(columns={'Dummy': 'Period'}, inplace=True)
tab

Unnamed: 0,Period,AF Afghanistan,AL Albania,DZ Algeria,AS American Samoa,AD Andorra,AO Angola,AI Anguilla,AQ Antarctica,AG Antigua & Barbuda,...,VI US Virgin Islands,UZ Uzbekistan,VU Vanuatu,VA Vatican City,VE Venezuela,VN Vietnam,WF Wallis & Futuna,YE Yemen,ZM Zambia,ZW Zimbabwe
0,Dummy,AF Afghanistan,AL Albania,DZ Algeria,AS American Samoa,AD Andorra,AO Angola,AI Anguilla,AQ Antarctica,AG Antigua & Barbuda,...,VI US Virgin Islands,UZ Uzbekistan,VU Vanuatu,VA Vatican City,VE Venezuela,VN Vietnam,WF Wallis & Futuna,YE Yemen,ZM Zambia,ZW Zimbabwe
1,1998JAN,1,1,8,-,-,3,1,-,4,...,-,2,-,-,31,7,-,6,3,6
2,1998FEB,-,1,9,-,1,3,-,-,3,...,-,3,-,-,19,6,-,5,2,5
3,1998MAR,2,1,10,-,2,5,1,-,2,...,4,3,-,-,25,6,-,7,4,6
4,1998APR,1,-,8,-,1,5,1,-,2,...,-,6,-,-,23,6,-,4,5,6
5,1998MAY,1,-,9,-,3,2,-,-,2,...,-,6,-,-,24,6,-,5,2,6
6,1998JUN,1,1,9,-,1,4,1,-,1,...,6,4,-,-,25,7,-,7,3,7
7,1998JUL,1,1,10,-,1,4,-,-,4,...,-,5,-,-,19,6,-,5,3,7
8,1998AUG,1,1,7,-,1,2,-,-,1,...,4,1,-,-,13,6,-,5,2,5
9,1998SEP,1,-,11,-,1,4,1,-,2,...,1,1,-,-,18,3,-,7,2,7


In [4]:
# Keep every row after the first, which merely repeats the column labels.
data_rows = tab.iloc[1:]
# Map an 'ONS Partner Geography' column to 'Period' where one exists.
# NOTE(review): the first column already appears to be called 'Period', so this
# is presumably a no-op here - verify before removing.
observations = data_rows.rename(columns={'ONS Partner Geography': 'Period'})
observations.head()

Unnamed: 0,Period,AF Afghanistan,AL Albania,DZ Algeria,AS American Samoa,AD Andorra,AO Angola,AI Anguilla,AQ Antarctica,AG Antigua & Barbuda,...,VI US Virgin Islands,UZ Uzbekistan,VU Vanuatu,VA Vatican City,VE Venezuela,VN Vietnam,WF Wallis & Futuna,YE Yemen,ZM Zambia,ZW Zimbabwe
1,1998JAN,1,1,8,-,-,3,1,-,4,...,-,2,-,-,31,7,-,6,3,6
2,1998FEB,-,1,9,-,1,3,-,-,3,...,-,3,-,-,19,6,-,5,2,5
3,1998MAR,2,1,10,-,2,5,1,-,2,...,4,3,-,-,25,6,-,7,4,6
4,1998APR,1,-,8,-,1,5,1,-,2,...,-,6,-,-,23,6,-,4,5,6
5,1998MAY,1,-,9,-,3,2,-,-,2,...,-,6,-,-,24,6,-,5,2,6


In [5]:
# Reshape from wide to long: every non-'Period' column becomes a value of
# 'ONS Partner Geography', with the cell content stored in 'OBS'.
new_table = observations.melt(id_vars=['Period'], var_name='ONS Partner Geography', value_name='OBS')
# Renumber the rows from zero, discarding the previous index.
new_table = new_table.reset_index(drop=True)
print(new_table.shape[0])
new_table.head(50)

55448


Unnamed: 0,Period,ONS Partner Geography,OBS
0,1998JAN,AF Afghanistan,1
1,1998FEB,AF Afghanistan,-
2,1998MAR,AF Afghanistan,2
3,1998APR,AF Afghanistan,1
4,1998MAY,AF Afghanistan,1
5,1998JUN,AF Afghanistan,1
6,1998JUL,AF Afghanistan,1
7,1998AUG,AF Afghanistan,1
8,1998SEP,AF Afghanistan,1
9,1998OCT,AF Afghanistan,1


In [6]:
# Drop the cells that hold the '-' placeholder instead of a figure. The explicit
# copy makes the result an independent frame, so the column assignments in the
# following cells modify it directly rather than raising SettingWithCopyWarning.
new_table = new_table[new_table['OBS'] != '-'].copy()

In [7]:
# Count the non-null entries per column after the '-' rows were removed.
new_table.count()

Period                   39613
ONS Partner Geography    39613
OBS                      39613
dtype: int64

In [8]:
# List the distinct period labels that remain in the table.
new_table['Period'].unique()

array(['1998JAN', '1998MAR', '1998APR', '1998MAY', '1998JUN', '1998JUL',
       '1998AUG', '1998SEP', '1998OCT', '1998NOV', '1998DEC', '2000JAN',
       '2000APR', '2001FEB', '2001JUN', '2001JUL', '2002OCT', '2002NOV',
       '2003MAR', '2003APR', '2003MAY', '2003JUN', '2003JUL', '2003AUG',
       '2003SEP', '2003OCT', '2003NOV', '2003DEC', '2004JAN', '2004FEB',
       '2004MAR', '2004APR', '2004MAY', '2004JUN', '2004JUL', '2004AUG',
       '2004SEP', '2004OCT', '2004NOV', '2004DEC', '2005FEB', '2005MAR',
       '2005APR', '2005MAY', '2005JUN', '2005JUL', '2005AUG', '2005SEP',
       '2005OCT', '2005NOV', '2005DEC', '2006JAN', '2006FEB', '2006MAR',
       '2006APR', '2006MAY', '2006JUN', '2006JUL', '2006AUG', '2006SEP',
       '2006OCT', '2006NOV', '2006DEC', '2007JAN', '2007FEB', '2007MAR',
       '2007APR', '2007MAY', '2007JUN', '2007JUL', '2007AUG', '2007SEP',
       '2007OCT', '2007NOV', '2007DEC', '2008JAN', '2008FEB', '2008MAR',
       '2008APR', '2008MAY', '2008JUN', '2008JUL', 

In [9]:
# Rewrite labels such as '1998JAN' as 'month/1998-JAN': the first four
# characters are kept as the year, the last three as the month abbreviation.
period_labels = new_table['Period'].astype(str)
year_part = period_labels.str.slice(0, 4)
month_part = period_labels.str.slice(-3)
new_table['Period'] = 'month/' + year_part + '-' + month_part
new_table.head()

Unnamed: 0,Period,ONS Partner Geography,OBS
0,month/1998-JAN,AF Afghanistan,1
2,month/1998-MAR,AF Afghanistan,2
3,month/1998-APR,AF Afghanistan,1
4,month/1998-MAY,AF Afghanistan,1
5,month/1998-JUN,AF Afghanistan,1


In [10]:
# Attach the columns whose value is the same on every row of this table.
constant_columns = {
    'Unit': '£ Million',
    'Measure Type': 'GBP Total',
    'Flow': 'Exports',
}
for column_name, constant_value in constant_columns.items():
    new_table[column_name] = constant_value
new_table.tail(5)

Unnamed: 0,Period,ONS Partner Geography,OBS,Unit,Measure Type,Flow
55443,month/2017-JUL,ZW Zimbabwe,3,£ Million,GBP Total,Exports
55444,month/2017-AUG,ZW Zimbabwe,4,£ Million,GBP Total,Exports
55445,month/2017-SEP,ZW Zimbabwe,4,£ Million,GBP Total,Exports
55446,month/2017-OCT,ZW Zimbabwe,3,£ Million,GBP Total,Exports
55447,month/2017-NOV,ZW Zimbabwe,2,£ Million,GBP Total,Exports


In [11]:
# Rename the observation column to 'Value'; index=str also converts the row
# labels to strings.
new_table = new_table.rename(index=str, columns={'OBS': 'Value'})

In [12]:
# Start from an empty frame; the per-sheet tables are concatenated onto it further down.
Final_table = pd.DataFrame()

In [13]:
# Select the columns in the order they should appear in the output.
column_order = ['ONS Partner Geography', 'Period', 'Flow', 'Measure Type', 'Value', 'Unit']
new_table = new_table[column_order]

In [14]:
# Show the last five rows to check the reordered columns.
new_table.tail(5)

Unnamed: 0,ONS Partner Geography,Period,Flow,Measure Type,Value,Unit
55443,ZW Zimbabwe,month/2017-JUL,Exports,GBP Total,3,£ Million
55444,ZW Zimbabwe,month/2017-AUG,Exports,GBP Total,4,£ Million
55445,ZW Zimbabwe,month/2017-SEP,Exports,GBP Total,4,£ Million
55446,ZW Zimbabwe,month/2017-OCT,Exports,GBP Total,3,£ Million
55447,ZW Zimbabwe,month/2017-NOV,Exports,GBP Total,2,£ Million


In [15]:
# Cast 'Value' to integer. Rebinding to the frame returned by astype avoids
# item assignment into the column-subset frame built by the reordering step,
# which pandas reports with SettingWithCopyWarning.
new_table = new_table.astype({'Value': int})

In [16]:
# Inspect the column dtypes after the integer cast of 'Value'.
new_table.dtypes

ONS Partner Geography    object
Period                   object
Flow                     object
Measure Type             object
Value                     int32
Unit                     object
dtype: object

In [17]:
# Append the rows of new_table (Flow set to 'Exports') to the accumulated output.
Final_table = pd.concat([Final_table, new_table])

In [18]:
# Read the sheet at zero-based position 2 of the workbook, keeping every row as
# data (header=None) so the label row can be handled explicitly below.
tab = pd.read_excel(BytesIO(session.get(sourceUrl).content), header=None, sheet_name=2)
# Label the top-left cell with a single positional write. Chained indexing
# (`tab.iloc[0][0] = ...`) assigns into an intermediate row object, is not
# guaranteed to reach `tab`, and does nothing under pandas Copy-on-Write.
tab.iat[0, 0] = 'Dummy'
# Use the first row as the column labels, then give the first column its final name.
tab.columns = tab.iloc[0]
tab.rename(columns={'Dummy': 'Period'}, inplace=True)
tab

Unnamed: 0,Period,AF Afghanistan,AL Albania,DZ Algeria,AS American Samoa,AD Andorra,AO Angola,AI Anguilla,AQ Antarctica,AG Antigua & Barbuda,...,VI US Virgin Islands,UZ Uzbekistan,VU Vanuatu,VA Vatican City,VE Venezuela,VN Vietnam,WF Wallis & Futuna,YE Yemen,ZM Zambia,ZW Zimbabwe
0,Dummy,AF Afghanistan,AL Albania,DZ Algeria,AS American Samoa,AD Andorra,AO Angola,AI Anguilla,AQ Antarctica,AG Antigua & Barbuda,...,VI US Virgin Islands,UZ Uzbekistan,VU Vanuatu,VA Vatican City,VE Venezuela,VN Vietnam,WF Wallis & Futuna,YE Yemen,ZM Zambia,ZW Zimbabwe
1,1998JAN,-,-,1,-,-,1,-,1,-,...,9,1,-,-,8,21,-,-,2,11
2,1998FEB,1,-,18,-,-,-,-,1,-,...,2,-,-,-,6,21,-,-,2,6
3,1998MAR,-,-,7,-,-,1,-,-,-,...,-,1,-,-,10,23,-,-,2,7
4,1998APR,-,-,3,-,-,-,-,-,-,...,-,-,-,-,9,24,-,-,1,7
5,1998MAY,-,-,1,-,-,-,-,-,-,...,-,-,-,-,7,20,-,1,3,11
6,1998JUN,-,-,8,-,-,-,-,-,-,...,-,1,-,-,21,19,-,-,2,10
7,1998JUL,-,-,7,-,-,2,-,-,-,...,1,1,-,-,9,18,-,-,2,20
8,1998AUG,-,-,5,-,-,1,-,-,-,...,-,-,-,-,12,19,-,-,2,10
9,1998SEP,-,-,6,-,-,1,-,-,-,...,-,1,-,-,6,22,-,1,2,9


In [19]:
# Keep every row after the first, which merely repeats the column labels.
data_rows1 = tab.iloc[1:]
# Map an 'ONS Partner Geography' column to 'Period' where one exists.
# NOTE(review): the first column already appears to be called 'Period', so this
# is presumably a no-op here - verify before removing.
observations1 = data_rows1.rename(columns={'ONS Partner Geography': 'Period'})
observations1.head()

Unnamed: 0,Period,AF Afghanistan,AL Albania,DZ Algeria,AS American Samoa,AD Andorra,AO Angola,AI Anguilla,AQ Antarctica,AG Antigua & Barbuda,...,VI US Virgin Islands,UZ Uzbekistan,VU Vanuatu,VA Vatican City,VE Venezuela,VN Vietnam,WF Wallis & Futuna,YE Yemen,ZM Zambia,ZW Zimbabwe
1,1998JAN,-,-,1,-,-,1,-,1,-,...,9,1,-,-,8,21,-,-,2,11
2,1998FEB,1,-,18,-,-,-,-,1,-,...,2,-,-,-,6,21,-,-,2,6
3,1998MAR,-,-,7,-,-,1,-,-,-,...,-,1,-,-,10,23,-,-,2,7
4,1998APR,-,-,3,-,-,-,-,-,-,...,-,-,-,-,9,24,-,-,1,7
5,1998MAY,-,-,1,-,-,-,-,-,-,...,-,-,-,-,7,20,-,1,3,11


In [20]:
# Reshape from wide to long: every non-'Period' column becomes a value of
# 'ONS Partner Geography', with the cell content stored in 'OBS'.
new_table1 = observations1.melt(id_vars=['Period'], var_name='ONS Partner Geography', value_name='OBS')
# Renumber the rows from zero, discarding the previous index.
new_table1 = new_table1.reset_index(drop=True)
print(new_table1.shape[0])
new_table1.head()

55448


Unnamed: 0,Period,ONS Partner Geography,OBS
0,1998JAN,AF Afghanistan,-
1,1998FEB,AF Afghanistan,1
2,1998MAR,AF Afghanistan,-
3,1998APR,AF Afghanistan,-
4,1998MAY,AF Afghanistan,-


In [21]:
# Drop the cells that hold the '-' placeholder instead of a figure. The explicit
# copy makes the result an independent frame, so the column assignments in the
# following cells modify it directly rather than raising SettingWithCopyWarning.
new_table1 = new_table1[new_table1['OBS'] != '-'].copy()

In [22]:
# Count the non-null entries per column after the '-' rows were removed.
new_table1.count()

Period                   32630
ONS Partner Geography    32630
OBS                      32630
dtype: int64

In [23]:
# List the distinct period labels that remain in the table.
new_table1['Period'].unique()

array(['1998FEB', '2000FEB', '2000APR', '2000MAY', '2000JUN', '2000AUG',
       '2000SEP', '2000NOV', '2000DEC', '2004JUL', '2004NOV', '2006JUN',
       '2010JUN', '2010SEP', '2012JAN', '2012APR', '2012MAY', '2016JUN',
       '2016NOV', '2002SEP', '2002DEC', '2003MAR', '2003APR', '2003OCT',
       '2013NOV', '2014SEP', '2014OCT', '2017JAN', '2017MAY', '2017AUG',
       '1998JAN', '1998MAR', '1998APR', '1998MAY', '1998JUN', '1998JUL',
       '1998AUG', '1998SEP', '1998OCT', '1998NOV', '1998DEC', '1999JAN',
       '1999FEB', '1999MAR', '1999MAY', '1999JUN', '1999JUL', '1999AUG',
       '1999SEP', '1999OCT', '1999NOV', '1999DEC', '2000JAN', '2000MAR',
       '2000JUL', '2000OCT', '2001JAN', '2001FEB', '2001MAR', '2001APR',
       '2001MAY', '2001JUN', '2001JUL', '2001SEP', '2001OCT', '2001NOV',
       '2001DEC', '2002JAN', '2002FEB', '2002MAR', '2002APR', '2002MAY',
       '2002JUN', '2002JUL', '2002AUG', '2002OCT', '2002NOV', '2003FEB',
       '2003MAY', '2003JUN', '2003JUL', '2003AUG', 

In [24]:
# Rewrite labels such as '1998FEB' as 'month/1998-FEB': the first four
# characters are kept as the year, the last three as the month abbreviation.
period_labels1 = new_table1['Period'].astype(str)
year_part1 = period_labels1.str.slice(0, 4)
month_part1 = period_labels1.str.slice(-3)
new_table1['Period'] = 'month/' + year_part1 + '-' + month_part1
new_table1.head()

Unnamed: 0,Period,ONS Partner Geography,OBS
1,month/1998-FEB,AF Afghanistan,1
25,month/2000-FEB,AF Afghanistan,1
27,month/2000-APR,AF Afghanistan,1
28,month/2000-MAY,AF Afghanistan,1
29,month/2000-JUN,AF Afghanistan,1


In [25]:
# Attach the columns whose value is the same on every row of this table.
constant_columns1 = {
    'Unit': '£ Million',
    'Measure Type': 'GBP Total',
    'Flow': 'Imports',
}
for column_name, constant_value in constant_columns1.items():
    new_table1[column_name] = constant_value
new_table1.tail(5)

Unnamed: 0,Period,ONS Partner Geography,OBS,Unit,Measure Type,Flow
55443,month/2017-JUL,ZW Zimbabwe,2,£ Million,GBP Total,Imports
55444,month/2017-AUG,ZW Zimbabwe,9,£ Million,GBP Total,Imports
55445,month/2017-SEP,ZW Zimbabwe,12,£ Million,GBP Total,Imports
55446,month/2017-OCT,ZW Zimbabwe,1,£ Million,GBP Total,Imports
55447,month/2017-NOV,ZW Zimbabwe,14,£ Million,GBP Total,Imports


In [26]:
# Rename the observation column to 'Value'; index=str also converts the row
# labels to strings.
new_table1 = new_table1.rename(index=str, columns={'OBS': 'Value'})

In [27]:
# Select the columns in the order they should appear in the output.
column_order = ['ONS Partner Geography', 'Period', 'Flow', 'Measure Type', 'Value', 'Unit']
new_table1 = new_table1[column_order]

In [28]:
# Cast 'Value' to integer. Rebinding to the frame returned by astype avoids
# item assignment into the column-subset frame built by the reordering step,
# which pandas reports with SettingWithCopyWarning.
new_table1 = new_table1.astype({'Value': int})

In [29]:
# Inspect the dtypes of new_table1, the frame whose 'Value' column was just
# cast, rather than the exports frame new_table.
new_table1.dtypes

ONS Partner Geography    object
Period                   object
Flow                     object
Measure Type             object
Value                     int32
Unit                     object
dtype: object

In [30]:
# Append the rows of new_table1 (Flow set to 'Imports') to the accumulated output.
Final_table = pd.concat([Final_table, new_table1])

In [31]:
# Count the non-null entries per column of the combined table.
Final_table.count()

ONS Partner Geography    72243
Period                   72243
Flow                     72243
Measure Type             72243
Value                    72243
Unit                     72243
dtype: int64

In [32]:
# List the distinct partner geography labels present in the combined table.
Final_table['ONS Partner Geography'].unique()

array(['AF Afghanistan', 'AL Albania', 'DZ Algeria', 'AS American Samoa',
       'AD Andorra', 'AO Angola', 'AI Anguilla', 'AQ Antarctica',
       'AG Antigua & Barbuda', 'AR Argentina', 'AM Armenia', 'AW Aruba',
       'AU Australia', 'AT Austria', 'AZ Azerbaijan', 'BS Bahamas',
       'BH Bahrain', 'BD Bangladesh', 'BB Barbados', 'BY Belarus',
       'BE Belgium', 'BZ Belize', 'BJ Benin', 'BM Bermuda', 'BT Bhutan',
       'BO Bolivia', 'BA Bosnia & Herzegovina', 'BW Botswana', 'BR Brazil',
       'VG British Virgin Islands', 'BN Brunei', 'BG Bulgaria',
       'BF Burkina Faso', 'MM Myanmar', 'BI Burundi', 'KH Cambodia',
       'CM Cameroon', 'CA Canada', 'CV Cape Verde', 'KY Cayman Islands',
       'CF Central African Republic', 'XC Ceuta', 'TD Chad', 'CL Chile',
       'CN China', 'CX Christmas Islands', 'CC Cocos Islands',
       'CO Colombia', 'CD Congo (Democratic Republic)',
       'CG Congo (Republic)', 'CK Cook Islands', 'CR Costa Rica',
       'CI Ivory Coast', 'HR Croatia', 

In [33]:
# Make sure the 'out' directory exists (creating parents as needed, and not
# failing if it is already there).
destinationFolder = Path('out')
destinationFolder.mkdir(parents=True, exist_ok=True)

# Write the combined table as CSV without the index column.
output_path = destinationFolder / 'observations.csv'
Final_table.to_csv(output_path, index=False)