### MRET xlsx to Tidydata

In [1]:
from databaker.framework import *
import pandas as pd

In [2]:
import requests
from pathlib import Path

# Cache the ONS MRET workbook under ./in so it is only downloaded once.
sourceFolder = Path('in')
sourceFolder.mkdir(exist_ok=True)

inputURL = 'https://www.ons.gov.uk/file?uri=/economy/nationalaccounts/balanceofpayments/datasets/tradeingoodsmretsallbopeu2013timeseriesspreadsheet/current/mret.xlsx'
# NOTE(review): the URL serves mret.xlsx but the cached copy is named .xls;
# the name is kept so previously cached copies are still found - confirm before renaming.
inputFile = sourceFolder / 'mret.xls'
if not inputFile.is_file():
    response = requests.get(inputURL)
    # Fail loudly on an HTTP error instead of caching an error page as the workbook.
    response.raise_for_status()
    inputFile.write_bytes(response.content)

In [3]:
# inputFile = 'mret.xlsx'

In [4]:
# Load the tabs of the cached workbook; the load log below reports a single table named 'data'.
tab = loadxlstabs(inputFile)

Loading in/mret.xls which has size 2795578 bytes
Table names: ['data']


In [5]:
# Keep only the first loaded tab. This rebinds `tab` from the collection of tabs
# to a single tab, so the cell cannot simply be re-run without reloading first.
tab = tab[0]

In [6]:
# Observation cells: start at B8, expand down and then right, keep the non-blank cells.
observations = tab.excel_ref('B8').expand(DOWN).expand(RIGHT).is_not_blank()

In [7]:
# Series titles: non-blank cells starting at B1 and expanding to the right.
Title = tab.excel_ref('B1').expand(RIGHT).is_not_blank()

In [8]:
# CDID series codes: non-blank cells starting at B2 and expanding to the right.
CDID = tab.excel_ref('B2').expand(RIGHT).is_not_blank()

In [9]:
# Time labels: non-blank cells starting at A8 and expanding down. Despite the name,
# the resulting TIME values shown further down include monthly labels such as '2018 FEB'.
Year = tab.excel_ref('A8').expand(DOWN).is_not_blank()

In [10]:
# NOTE(review): cell P3 is captured here but `Currency` is not referenced by any
# later cell visible in this notebook - confirm whether it is still needed.
Currency = tab.excel_ref('P3')

In [11]:
# Dimensions attached to each observation: constant geography, unit and measure
# type, plus the time label looked up to the LEFT and the CDID / title looked up ABOVE.
Dimensions = [
    HDimConst('Geography', 'K02000001'),
    HDim(Year, 'TIME', DIRECTLY, LEFT),
    HDim(CDID, 'CDID', DIRECTLY, ABOVE),
    HDimConst('Unit', '£ Millions'),
    HDimConst('Measure Type', 'GBP Total'),
    HDim(Title, 'Title', DIRECTLY, ABOVE),
]

In [12]:
# Bind the observations to their dimensions. processTIMEUNIT=True is passed so a
# TIMEUNIT column accompanies TIME (see the TIMEUNIT summary printed by the next cell).
c1 = ConversionSegment(observations, Dimensions, processTIMEUNIT=True)

In [13]:
# Flatten the segment into a DataFrame. The message below shows TIMEUNIT comes out
# as 'Year', 'Quarter' or blank; the blank values are relabelled 'month' in a later cell.
new_table = c1.topandas()

multiple TIMEUNITs: 'Year'(15790), 'Quarter'(63160), ''(190162)


In [14]:
# new_table['Title'] = new_table['Title'].map(lambda cell:cell.replace('£m', ''))

In [15]:
# new_table = new_table.drop('TIMEUNIT', axis=1)

In [16]:
print(len(new_table))

269112


In [17]:
new_table.tail(5)

Unnamed: 0,OBS,TIME,TIMEUNIT,Geography,CDID,Unit,Measure Type,Title
269107,1282.0,2018 FEB,,K02000001,QALU,£ Millions,GBP Total,Balance of payments: Trade in Goods: Aircraft:...
269108,457.0,2018 FEB,,K02000001,SGRX,£ Millions,GBP Total,non-EU:BOP:EX:SA:Unspecified goods: SITC 9
269109,757.0,2018 FEB,,K02000001,QALW,£ Millions,GBP Total,Balance of payments: Trade in Goods: Aircraft:...
269110,525.0,2018 FEB,,K02000001,QALV,£ Millions,GBP Total,Balance of payments: Trade in Goods: Aircraft:...
269111,439.0,2018 FEB,,K02000001,SGTK,£ Millions,GBP Total,non-EU:BOP:IM:SA:Unspecified goods: SITC 9


In [18]:
def user_perc3(x):
    """Return 'month' when ``x`` stringifies to an empty string, otherwise ``x`` unchanged."""
    return 'month' if str(x) == '' else x
    
# Relabel blank TIMEUNIT values as 'month'. Mapping the single column gives the same
# result as a row-by-row DataFrame.apply(axis=1) without building a row object per record.
new_table['TIMEUNIT'] = new_table['TIMEUNIT'].map(user_perc3)


In [19]:
# Cache the cord_sitc classification workbook locally, then load its first sheet.
temp_table_file = sourceFolder / 'cord_sitc classification table.xlsx'
if not temp_table_file.is_file():
    response = requests.get('https://drive.google.com/uc?export=download&id=1uJck_DtSgLs0XcEuKDB0swzj1UrWmauj')
    # Fail loudly on an HTTP error instead of caching an error page as the workbook.
    response.raise_for_status()
    temp_table_file.write_bytes(response.content)

temp_table = pd.read_excel(temp_table_file, sheet_name = 0)
# reset_index() moves the index levels into ordinary columns so 'level_0' and
# 'level_1' can be dropped together with 'Sequence'.
temp_table = temp_table.reset_index().drop(['level_0','level_1','Sequence'], axis =1)
temp_table.tail(5)


Unnamed: 0,cdid,COMMODITY,AREA,DIRECTION,BASIS,PRICE,SEASADJ,PERIOD
6196,BPFR,5min8minE,WW,IM,BOP,IDEF,NSA,M
6197,BPGR,5min8minE,WW,IM,BOP,IDEF,SA,M
6198,BPCR,5min8minE,WW,IM,BOP,VM,NSA,M
6199,ELAI,5min8minE,WW,IM,BOP,VM,SA,M
6200,,,,,,,,


In [20]:
def _fetch_if_missing(url, destination):
    """Download ``url`` to ``destination`` unless that file already exists."""
    if destination.is_file():
        return
    response = requests.get(url)
    # Fail loudly on an HTTP error instead of caching an error page as the data file.
    response.raise_for_status()
    destination.write_bytes(response.content)


classificationTablesFile = sourceFolder / 'CSDB classification tables.xlsx'
_fetch_if_missing('https://drive.google.com/uc?export=download&id=1miAzQ6s8om4Ark3BpRk3Y90OAWfWErTb',
                  classificationTablesFile)

# Parse the workbook once and pull the three sheets out of the returned mapping.
classification_sheets = pd.read_excel(classificationTablesFile,
                                      sheet_name=['cord_sitc', 'cord_cpa', 'cord_country'])
classification1 = classification_sheets['cord_sitc']
classification2 = classification_sheets['cord_cpa']
classification3 = classification_sheets['cord_country']

codelistFile = sourceFolder / 'Codelist.csv'
_fetch_if_missing('https://drive.google.com/uc?export=download&id=161OtInylx2518gmhRu7UgUYnZZ_x9FQr',
                  codelistFile)

classification4 = pd.read_csv(codelistFile)

In [21]:
classification1.head(5)

Unnamed: 0,cdid,COMMODITY,AREA,DIRECTION,BASIS,PRICE,SEASADJ,PERIOD
0,SDSX,2plus4,EU,BAL,BOP,CP,NSA,Q
1,SGLO,5minus8,EU,BAL,BOP,CP,NSA,Q
2,SESL,5plus6,EU,BAL,BOP,CP,NSA,Q
3,SFJC,7plus8,EU,BAL,BOP,CP,NSA,Q
4,LKTX,TminusO,EU,BAL,BOP,CP,NSA,Q


In [22]:
classification2.head(5)

Unnamed: 0,cdid,PRODUCT,AREA,DIRECTION,BASIS,PRICE,SEASADJ,PERIOD
0,P42L,24.2,EU,EX,BOP,CP,NSA,Q
1,P483,24.2,EU,EX,BOP,CP,SA,Q
2,P4DJ,24.2,EU,EX,BOP,CVM,NSA,Q
3,P4IZ,24.2,EU,EX,BOP,CVM,SA,Q
4,P3EP,24.2,EU,IM,BOP,CP,NSA,Q


In [23]:
classification3.head(5)

Unnamed: 0,cdid,COUNTRY,DIRECTION,BASIS,SEASADJ,PERIOD
0,KN2O,XS,BAL,BOP,NSA,Q
1,LGDS,V4,BAL,BOP,NSA,Q
2,L87P,V3,BAL,BOP,NSA,Q
3,L87J,V2,BAL,BOP,NSA,Q
4,MHN8,I7,BAL,BOP,NSA,Q


In [24]:
classification4.head(5)

Unnamed: 0,cdid,COMMODITY,AREA,DIRECTION,BASIS,PRICE,SEASADJ,PERIOD
0,AJFB,Canadian dollar,UK,,BE,CP,NSA,
1,AJFD,Swiss franc,UK,,BE,CP,NSA,
2,AJFI,Swedish kroner,UK,,BE,CP,NSA,
3,AJFJ,Norwegian kroner,UK,,BE,CP,NSA,
4,AJFK,Danish kroner,UK,,BE,CP,NSA,


In [25]:
# Give the product column the same name as in the other classification tables.
# Only the column is renamed: passing index=str here would also turn every row label into a string.
classification2 = classification2.rename(columns={'PRODUCT': 'COMMODITY'})

In [26]:
# The country table has no COMMODITY or PRICE column; add blank ones so it has
# the same columns as the other classification tables.
classification3 = classification3.assign(COMMODITY='', PRICE='')

In [27]:
# Treat the country code as the AREA column used by the other classification tables.
# Only the column is renamed: passing index=str here would also turn every row label into a string.
classification3 = classification3.rename(columns={'COUNTRY': 'AREA'})

In [28]:
# Stack all lookup tables into one; temp_table goes first so its rows are the ones
# kept when duplicates on cdid are dropped (keep='first') in a later cell.
lookup_frames = [temp_table, classification1, classification2, classification3, classification4]
temp_table = pd.concat(lookup_frames)

In [29]:
temp_table.head()

Unnamed: 0,AREA,BASIS,COMMODITY,DIRECTION,PERIOD,PRICE,SEASADJ,cdid
0,EU,BOP,2plus4,BAL,M,CP,NSA,SDSX
1,EU,BOP,5minus8,BAL,M,CP,NSA,SGLO
2,EU,BOP,7plus8,BAL,M,CP,NSA,SFJC
3,EU,BOP,TminusO,BAL,M,CP,NSA,LKTX
4,EU,BOP,0plus1,BAL,M,CP,NSA,SDMS


In [30]:
temp_table.shape

(18295, 8)

In [31]:
# One lookup row per cdid: the first occurrence wins (the default keep='first').
temp_table = temp_table.drop_duplicates(subset=['cdid'])

In [32]:
temp_table.head()

Unnamed: 0,AREA,BASIS,COMMODITY,DIRECTION,PERIOD,PRICE,SEASADJ,cdid
0,EU,BOP,2plus4,BAL,M,CP,NSA,SDSX
1,EU,BOP,5minus8,BAL,M,CP,NSA,SGLO
2,EU,BOP,7plus8,BAL,M,CP,NSA,SFJC
3,EU,BOP,TminusO,BAL,M,CP,NSA,LKTX
4,EU,BOP,0plus1,BAL,M,CP,NSA,SDMS


In [33]:
new_table.head(5)

Unnamed: 0,OBS,TIME,TIMEUNIT,Geography,CDID,Unit,Measure Type,Title
0,1039.0,1955,Year,K02000001,IKBB,£ Millions,GBP Total,BOP: Exports:CP SA: Total Trade in Services £m
1,997.0,1955,Year,K02000001,IKBC,£ Millions,GBP Total,BOP:Imports:CP SA:Total Trade in Services £m
2,1147.0,1956,Year,K02000001,IKBB,£ Millions,GBP Total,BOP: Exports:CP SA: Total Trade in Services £m
3,1121.0,1956,Year,K02000001,IKBC,£ Millions,GBP Total,BOP:Imports:CP SA:Total Trade in Services £m
4,1250.0,1957,Year,K02000001,IKBB,£ Millions,GBP Total,BOP: Exports:CP SA: Total Trade in Services £m


In [34]:
new_table.head(1)['CDID']

0    IKBB
Name: CDID, dtype: object

In [35]:
temp_table.head(1)['cdid']

0    SDSX
Name: cdid, dtype: object

In [36]:
# Attach the classification attributes to every observation by CDID; the left join
# keeps observations whose CDID has no entry in the lookup table.
new_table = new_table.merge(temp_table, how='left', left_on='CDID', right_on='cdid')

In [37]:
new_table.tail(5)

Unnamed: 0,OBS,TIME,TIMEUNIT,Geography,CDID,Unit,Measure Type,Title,AREA,BASIS,COMMODITY,DIRECTION,PERIOD,PRICE,SEASADJ,cdid
269107,1282.0,2018 FEB,month,K02000001,QALU,£ Millions,GBP Total,Balance of payments: Trade in Goods: Aircraft:...,WW,BOP,792,EX,M,CP,SA,QALU
269108,457.0,2018 FEB,month,K02000001,SGRX,£ Millions,GBP Total,non-EU:BOP:EX:SA:Unspecified goods: SITC 9,RW,BOP,9,EX,M,CP,SA,SGRX
269109,757.0,2018 FEB,month,K02000001,QALW,£ Millions,GBP Total,Balance of payments: Trade in Goods: Aircraft:...,WW,BOP,792,BAL,M,CP,SA,QALW
269110,525.0,2018 FEB,month,K02000001,QALV,£ Millions,GBP Total,Balance of payments: Trade in Goods: Aircraft:...,WW,BOP,792,IM,M,CP,SA,QALV
269111,439.0,2018 FEB,month,K02000001,SGTK,£ Millions,GBP Total,non-EU:BOP:IM:SA:Unspecified goods: SITC 9,RW,BOP,9,IM,M,CP,SA,SGTK


In [38]:
# The lower-case join key duplicates CDID after the merge, so drop it.
new_table = new_table.drop(columns=['cdid'])

In [39]:
# The first four characters of the time label are the year.
new_table['Year'] = new_table['TIME'].astype(str).str[:4]

In [40]:
# Everything after the four-character year is the month/quarter part (empty for yearly rows).
new_table['Months/Quarter'] = new_table['TIME'].astype(str).str[4:]

In [41]:
# The raw time label has been split into its parts, so it is no longer needed.
new_table = new_table.drop(columns=['TIME'])

In [42]:
# Reuse the TIME name for the four-character year.
# Only the column is renamed: passing index=str here would also turn every row label into a string.
new_table = new_table.rename(columns={'Year': 'TIME'})

In [43]:
# Remove the space left in front of the month/quarter part after slicing off the year.
# lstrip('') strips nothing because its character set is empty; lstrip() with no
# argument strips leading whitespace.
new_table['Months/Quarter'] = new_table['Months/Quarter'].str.lstrip()

In [44]:
# Build the period label as '<time unit>/<year>-<month or quarter>'.
time_unit = new_table['TIMEUNIT'].astype(str)
new_table['Period'] = time_unit + '/' + new_table['TIME'] + '-' + new_table['Months/Quarter']

In [45]:
# Rows with an empty month/quarter part end in a dangling '-'; remove it.
new_table['Period'] = new_table['Period'].str.rstrip('-')

In [46]:
# Remove every space from the period label.
new_table['Period'] = new_table['Period'].str.replace(' ', '')

In [47]:
# Lower-case the 'Year' time-unit prefix.
new_table['Period'] = new_table['Period'].str.replace('Year', 'year')

In [48]:
# Lower-case the 'Quarter' time-unit prefix.
new_table['Period'] = new_table['Period'].str.replace('Quarter', 'quarter')

In [49]:
# Rename the columns to their output names in a single pass. Only columns are
# renamed: passing index=str here would also turn every row label into a string.
new_table = new_table.rename(columns={
    'OBS': 'Value',
    'DIRECTION': 'Flow',
    'COMMODITY': 'Product',
    'SEASADJ': 'Seasonal Adjustment',
})

In [50]:
new_table.tail(5)

Unnamed: 0,Value,TIMEUNIT,Geography,CDID,Unit,Measure Type,Title,AREA,BASIS,Product,Flow,PERIOD,PRICE,Seasonal Adjustment,TIME,Months/Quarter,Period
269107,1282.0,month,K02000001,QALU,£ Millions,GBP Total,Balance of payments: Trade in Goods: Aircraft:...,WW,BOP,792,EX,M,CP,SA,2018,FEB,month/2018-FEB
269108,457.0,month,K02000001,SGRX,£ Millions,GBP Total,non-EU:BOP:EX:SA:Unspecified goods: SITC 9,RW,BOP,9,EX,M,CP,SA,2018,FEB,month/2018-FEB
269109,757.0,month,K02000001,QALW,£ Millions,GBP Total,Balance of payments: Trade in Goods: Aircraft:...,WW,BOP,792,BAL,M,CP,SA,2018,FEB,month/2018-FEB
269110,525.0,month,K02000001,QALV,£ Millions,GBP Total,Balance of payments: Trade in Goods: Aircraft:...,WW,BOP,792,IM,M,CP,SA,2018,FEB,month/2018-FEB
269111,439.0,month,K02000001,SGTK,£ Millions,GBP Total,non-EU:BOP:IM:SA:Unspecified goods: SITC 9,RW,BOP,9,IM,M,CP,SA,2018,FEB,month/2018-FEB


In [51]:
# # new_table.drop(['TIME'], axis = 1, inplace = True)
# new_table.drop(['TIMEUNIT'], axis = 1, inplace = True)
# new_table.drop(['Geography'], axis = 1, inplace = True)
# new_table.drop(['PERIOD'], axis = 1, inplace = True)
# new_table.drop(['Months/Quarter'], axis = 1, inplace = True)


In [52]:
# Keep only the columns that go into the tidy output, in their final order.
output_columns = [
    'AREA', 'Period', 'CDID', 'BASIS', 'Product', 'Seasonal Adjustment',
    'Flow', 'PRICE', 'Measure Type', 'Value', 'Unit',
]
new_table = new_table[output_columns]

In [53]:
new_table.head(5)

Unnamed: 0,AREA,Period,CDID,BASIS,Product,Seasonal Adjustment,Flow,PRICE,Measure Type,Value,Unit
0,,year/1955,IKBB,,,SA,EX,CP,GBP Total,1039.0,£ Millions
1,,year/1955,IKBC,,,SA,IM,CP,GBP Total,997.0,£ Millions
2,,year/1956,IKBB,,,SA,EX,CP,GBP Total,1147.0,£ Millions
3,,year/1956,IKBC,,,SA,IM,CP,GBP Total,1121.0,£ Millions
4,,year/1957,IKBB,,,SA,EX,CP,GBP Total,1250.0,£ Millions


In [54]:
new_table.tail(5)

Unnamed: 0,AREA,Period,CDID,BASIS,Product,Seasonal Adjustment,Flow,PRICE,Measure Type,Value,Unit
269107,WW,month/2018-FEB,QALU,BOP,792,SA,EX,CP,GBP Total,1282.0,£ Millions
269108,RW,month/2018-FEB,SGRX,BOP,9,SA,EX,CP,GBP Total,457.0,£ Millions
269109,WW,month/2018-FEB,QALW,BOP,792,SA,BAL,CP,GBP Total,757.0,£ Millions
269110,WW,month/2018-FEB,QALV,BOP,792,SA,IM,CP,GBP Total,525.0,£ Millions
269111,RW,month/2018-FEB,SGTK,BOP,9,SA,IM,CP,GBP Total,439.0,£ Millions


Pull out missing Values

In [55]:
# temp_table1 = new_table[new_table['AREA'].isnull() == True] 
# Codelist = temp_table1['Cdid'].unique()
# Codelist.shape
# codes = pd.Series(Codelist)
# codes.to_csv('Codelist.csv', index = False)

In [56]:
new_table.shape

(269112, 11)

In [57]:
# Drop observations whose value is exactly zero.
# NOTE(review): presumably zeros stand for missing data in this source - confirm.
has_value = new_table['Value'].ne(0)
new_table = new_table[has_value]

In [58]:
# Mark every missing attribute (e.g. a CDID with no classification match) as 'NA'.
new_table = new_table.fillna('NA')

In [59]:
new_table.Flow.unique()

array(['EX', 'IM', 'BAL', 'NA'], dtype=object)

In [60]:
# Spell out the flow codes. Whole values are looked up rather than substrings being
# replaced, so a code that merely contains 'EX', 'IM' or 'BAL' is never rewritten;
# anything not in the mapping (such as 'NA') is left unchanged.
flow_labels = {'EX': 'Exports', 'IM': 'Imports', 'BAL': 'Balance'}
new_table['Flow'] = new_table['Flow'].map(lambda code: flow_labels.get(code, code))

In [61]:
new_table.head(5)

Unnamed: 0,AREA,Period,CDID,BASIS,Product,Seasonal Adjustment,Flow,PRICE,Measure Type,Value,Unit
0,,year/1955,IKBB,,,SA,Exports,CP,GBP Total,1039.0,£ Millions
1,,year/1955,IKBC,,,SA,Imports,CP,GBP Total,997.0,£ Millions
2,,year/1956,IKBB,,,SA,Exports,CP,GBP Total,1147.0,£ Millions
3,,year/1956,IKBC,,,SA,Imports,CP,GBP Total,1121.0,£ Millions
4,,year/1957,IKBB,,,SA,Exports,CP,GBP Total,1250.0,£ Millions


In [62]:
new_table.dtypes

AREA                    object
Period                  object
CDID                    object
BASIS                   object
Product                 object
Seasonal Adjustment     object
Flow                    object
PRICE                   object
Measure Type            object
Value                  float64
Unit                    object
dtype: object

In [63]:
# Write the tidy table to ./out, creating the folder if needed; the row index is not exported.
destinationFolder = Path('out')
destinationFolder.mkdir(parents=True, exist_ok=True)

output_path = destinationFolder / 'MRET_Tidydata.csv'
new_table.to_csv(output_path, index=False)