# Implement reconciliation

In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
start_time= time.time()
data = pd.read_csv('../Data/Trade_filtered.csv', sep=',', encoding='utf-8',index_col=0).reset_index()
data.columns = data.columns.str.lower().str.replace(' ', '_')
data

Unnamed: 0,index,reporter_country_code,reporter_countries,partner_country_code,partner_countries,item,element,year,unit,value
0,0,2,Afghanistan,4,Algeria,"Cashew nuts, shelled",Export Quantity,2016,tonnes,3.0
1,1,2,Afghanistan,4,Algeria,"Cashew nuts, shelled",Export Value,2016,1000 US$,23.0
2,4,2,Afghanistan,4,Algeria,Crude organic material n.e.c.,Export Value,2015,1000 US$,1.0
3,5,4,Algeria,2,Afghanistan,"Almonds, shelled",Import Quantity,2005,tonnes,3.0
4,6,4,Algeria,2,Afghanistan,"Almonds, shelled",Import Value,2005,1000 US$,3.0
...,...,...,...,...,...,...,...,...,...,...
46466144,46807394,150,Netherlands (Kingdom of the),181,Zimbabwe,Wine,Export Value,1988,1000 US$,1.0
46466145,46807395,150,Netherlands (Kingdom of the),181,Zimbabwe,Wine,Export Value,1992,1000 US$,8.0
46466146,46807396,150,Netherlands (Kingdom of the),181,Zimbabwe,Wine,Export Value,1999,1000 US$,2.0
46466147,46807397,150,Netherlands (Kingdom of the),181,Zimbabwe,Wine,Export Value,2004,1000 US$,2.0


In [3]:
filter_t = data.item.unique()

In [4]:
# Define origin & destination
to_add = data.loc[:,['reporter_country_code','reporter_countries','partner_country_code','partner_countries']]
to_add.columns = ['origin_country_code','origin_country','destin_country_code','destin_country']
data=data.join(to_add)

replace_bool = (data.element == 'Import Quantity') | (data.element == 'Import Value')
data.loc[replace_bool, ['origin_country', 'origin_country_code']] = data.loc[replace_bool, ['partner_countries', 'partner_country_code']].values
data.loc[replace_bool, ['destin_country', 'destin_country_code']] = data.loc[replace_bool, ['reporter_countries', 'reporter_country_code']].values

In [5]:
data.head(10)

Unnamed: 0,index,reporter_country_code,reporter_countries,partner_country_code,partner_countries,item,element,year,unit,value,origin_country_code,origin_country,destin_country_code,destin_country
0,0,2,Afghanistan,4,Algeria,"Cashew nuts, shelled",Export Quantity,2016,tonnes,3.0,2,Afghanistan,4,Algeria
1,1,2,Afghanistan,4,Algeria,"Cashew nuts, shelled",Export Value,2016,1000 US$,23.0,2,Afghanistan,4,Algeria
2,4,2,Afghanistan,4,Algeria,Crude organic material n.e.c.,Export Value,2015,1000 US$,1.0,2,Afghanistan,4,Algeria
3,5,4,Algeria,2,Afghanistan,"Almonds, shelled",Import Quantity,2005,tonnes,3.0,2,Afghanistan,4,Algeria
4,6,4,Algeria,2,Afghanistan,"Almonds, shelled",Import Value,2005,1000 US$,3.0,2,Afghanistan,4,Algeria
5,7,2,Afghanistan,4,Algeria,Crude organic material n.e.c.,Export Value,2016,1000 US$,1.0,2,Afghanistan,4,Algeria
6,9,7,Angola,3,Albania,"Coffee, green",Export Quantity,2019,tonnes,36.6,7,Angola,3,Albania
7,10,7,Angola,3,Albania,"Coffee, green",Export Value,2019,1000 US$,43.0,7,Angola,3,Albania
8,12,2,Afghanistan,4,Algeria,"Other stimulant, spice and aromatic crops, n.e.c.",Export Quantity,2014,tonnes,0.16,2,Afghanistan,4,Algeria
9,13,4,Algeria,2,Afghanistan,"Anise, badian, coriander, cumin, caraway, fenn...",Import Quantity,1996,tonnes,171.0,2,Afghanistan,4,Algeria


### Reconciliation
Find repeated instances of code.

In [6]:
# takes a bit good (3min)
data['double_reports']= 1*(data.groupby(["origin_country",'destin_country',"year","item","unit"], group_keys=False)['year'].transform('count')>1)
data['same_double_reports']= 1*(data.groupby(["origin_country",'destin_country',"year",'item','unit','value'], group_keys=False)["value"].transform('count')>1)

# Reliability estimation: (1.2 min)
reliability=data.groupby("reporter_countries", group_keys=False).apply(lambda x: sum(x.loc[:,'same_double_reports']) /sum(x.loc[:,'double_reports']))
reliability=pd.DataFrame(reliability,columns=['reliability'])

data=pd.merge(data,reliability, left_on='reporter_countries',right_index=True)
print('Cum time (Reliability):', (time.time()- start_time)/60,'min')

Cum time (Reliability): 2.388299822807312 min


In [7]:
# Remove where there is a double report and the country is not the preferred one 
# Find reliable country (can take up to 5min)

reliable_clean = data.sort_index().groupby(["origin_country",'destin_country',"year",'item','unit'], group_keys=False).tail(1)
reliable_clean=reliable_clean.drop(columns=['reporter_country_code', 'reporter_countries','partner_country_code','partner_countries','reliability','double_reports','same_double_reports'])

# Drop Paddy rice column
conditon= reliable_clean.item == 'Rice, paddy (rice milled equivalent)'
reliable_clean = reliable_clean.loc[~conditon,:]

reliable_clean.to_csv('../Data/Trade_reconciled.csv',encoding='utf-8',index=False) # Checked data clean same dimensions than R code need to check values
print('Cum time (after save):', (time.time()- start_time)/60,'min')
reliable_clean.head(10)  

Cum time (after save): 4.29031545718511 min


Unnamed: 0,index,item,element,year,unit,value,origin_country_code,origin_country,destin_country_code,destin_country
3,5,"Almonds, shelled",Import Quantity,2005,tonnes,3.0,2,Afghanistan,4,Algeria
4,6,"Almonds, shelled",Import Value,2005,1000 US$,3.0,2,Afghanistan,4,Algeria
6,9,"Coffee, green",Export Quantity,2019,tonnes,36.6,7,Angola,3,Albania
7,10,"Coffee, green",Export Value,2019,1000 US$,43.0,7,Angola,3,Albania
9,13,"Anise, badian, coriander, cumin, caraway, fenn...",Import Quantity,1996,tonnes,171.0,2,Afghanistan,4,Algeria
10,14,Cider and other fermented beverages,Import Quantity,2012,tonnes,0.0,2,Afghanistan,8,Antigua and Barbuda
11,15,Cider and other fermented beverages,Import Value,2012,1000 US$,0.0,2,Afghanistan,8,Antigua and Barbuda
12,16,"Anise, badian, coriander, cumin, caraway, fenn...",Import Quantity,1997,tonnes,160.0,2,Afghanistan,4,Algeria
14,19,Pastry,Import Quantity,2020,tonnes,18.85,3,Albania,7,Angola
15,20,beef and veal preparations nes,Export Quantity,1998,tonnes,25.0,9,Argentina,2,Afghanistan


In [8]:
# Compare to R dataset
reliable_clean = pd.read_csv('../Data/Trade_reconciled.csv', sep=',', encoding='utf-8',index_col='index').reset_index()
reliable_clean['item']=(reliable_clean.item.str.lower())
#                        .replace('matã©','mate',regex=True)
#                        .replace(';',',',regex=True)
#                        .replace('"','',regex=True).str.lower())# Separator is a coma

r_dataset = pd.read_csv('../FAO_2023_10.csv', sep=',', encoding='utf-8').reset_index()
r_dataset['item']=(r_dataset.item.str.lower())
#                        .replace('maté','mate',regex=True)
#                        .replace(';',',',regex=True)
#                        .replace('"','',regex=True).str.lower())# Separator is a coma

# Validate 
print('Items R:',len(r_dataset.item.unique()))
print('Items python:',len(reliable_clean.item.unique()))

#Differences in items shown
set(reliable_clean.item.unique()).difference(set(r_dataset.item.unique()))

print('Countries R:',len(set(r_dataset.origin_country_code.unique()).union(r_dataset.destin_country_code.unique())))

print('Countries python:',len(set(reliable_clean.origin_country_code.unique()).union(reliable_clean.destin_country_code.unique())))

print('Samples R:',len(r_dataset))
print('Samples python:',len(reliable_clean))

Items R: 558
Items python: 558
Countries R: 220
Countries python: 220
Samples R: 32426012
Samples python: 32426012
