# Implement reconciliation

In [1]:
import pandas as pd
import numpy as np
import time
import janitor

  @_expand_grid.register(pd.arrays.PandasArray)


In [2]:
start_time= time.time()
data = pd.read_csv('../Data/Trade_filtered.csv', sep=',', encoding='utf-8',index_col=0).reset_index()
data = data.clean_names()


In [3]:
filter_t=data.item.unique()

In [4]:
# Define origin & destination
to_add = data.loc[:,['reporter_country_code','reporter_countries','partner_country_code','partner_countries']]
to_add.columns = ['origin_country_code','origin_country','destin_country_code','destin_country']
data=data.join(to_add)

replace_bool = (data.element == 'Import Quantity') | (data.element == 'Import Value')
data.loc[replace_bool, ['origin_country', 'origin_country_code']] = data.loc[replace_bool, ['partner_countries', 'partner_country_code']].values
data.loc[replace_bool, ['destin_country', 'destin_country_code']] = data.loc[replace_bool, ['reporter_countries', 'reporter_country_code']].values

In [5]:
data.head(10)

Unnamed: 0,index,reporter_country_code,reporter_countries,partner_country_code,partner_countries,item,element,year,unit,value,origin_country_code,origin_country,destin_country_code,destin_country
0,0,2,Afghanistan,4,Algeria,"Cashew nuts, shelled",Export Quantity,2016,tonnes,3.0,2,Afghanistan,4,Algeria
1,1,2,Afghanistan,4,Algeria,"Cashew nuts, shelled",Export Value,2016,1000 US$,23.0,2,Afghanistan,4,Algeria
2,4,2,Afghanistan,4,Algeria,Crude organic material n.e.c.,Export Value,2015,1000 US$,1.0,2,Afghanistan,4,Algeria
3,5,4,Algeria,2,Afghanistan,"Almonds, shelled",Import Quantity,2005,tonnes,3.0,2,Afghanistan,4,Algeria
4,6,4,Algeria,2,Afghanistan,"Almonds, shelled",Import Value,2005,1000 US$,3.0,2,Afghanistan,4,Algeria
5,7,2,Afghanistan,4,Algeria,Crude organic material n.e.c.,Export Value,2016,1000 US$,1.0,2,Afghanistan,4,Algeria
6,9,7,Angola,3,Albania,"Coffee, green",Export Quantity,2019,tonnes,36.6,7,Angola,3,Albania
7,10,7,Angola,3,Albania,"Coffee, green",Export Value,2019,1000 US$,43.0,7,Angola,3,Albania
8,12,2,Afghanistan,4,Algeria,"Other stimulant, spice and aromatic crops, n.e.c.",Export Quantity,2014,tonnes,0.16,2,Afghanistan,4,Algeria
9,13,4,Algeria,2,Afghanistan,"Anise, badian, coriander, cumin, caraway, fenn...",Import Quantity,1996,tonnes,171.0,2,Afghanistan,4,Algeria


### Reconciliation
Find repeated instances of code.

In [6]:
# takes a bit good (3min)
data['double_reports']= 1*(data.groupby(["origin_country",'destin_country',"year","item","unit"], group_keys=False)['year'].transform('count')>1)
data['same_double_reports']= 1*(data.groupby(["origin_country",'destin_country',"year",'item','unit','value'], group_keys=False)["value"].transform('count')>1)

# Reliability estimation: (1.2 min)
reliability=data.groupby("reporter_countries", group_keys=False).apply(lambda x: sum(x.loc[:,'same_double_reports']) /sum(x.loc[:,'double_reports']))
reliability=pd.DataFrame(reliability,columns=['reliability'])

data=pd.merge(data,reliability, left_on='reporter_countries',right_index=True)
print('Cum time (Reliability):', (time.time()- start_time)/60,'min')

Cum time (Reliability): 2.486053212483724 min


In [7]:
# Remove where there is a double report and the country is not the preferred one 
# Find reliable country (can take up to 5min)

reliable_clean = data.sort_index().groupby(["origin_country",'destin_country',"year",'item','unit'], group_keys=False).tail(1)
reliable_clean=reliable_clean.drop(columns=['reporter_country_code', 'reporter_countries','partner_country_code','partner_countries','reliability','double_reports','same_double_reports'])

# Drop Paddy rice column
conditon= reliable_clean.item == 'Rice, paddy (rice milled equivalent)'
reliable_clean = reliable_clean.loc[~conditon,:]

reliable_clean.to_csv('../Data/Trade_reconciled.csv',encoding='utf-8',index=False) # Checked data clean same dimensions than R code need to check values
print('Cum time (after save):', (time.time()- start_time)/60,'min')
reliable_clean.head(10)  


Cum time (after save): 4.37853076060613 min


Unnamed: 0,index,item,element,year,unit,value,origin_country_code,origin_country,destin_country_code,destin_country
3,5,"Almonds, shelled",Import Quantity,2005,tonnes,3.0,2,Afghanistan,4,Algeria
4,6,"Almonds, shelled",Import Value,2005,1000 US$,3.0,2,Afghanistan,4,Algeria
6,9,"Coffee, green",Export Quantity,2019,tonnes,36.6,7,Angola,3,Albania
7,10,"Coffee, green",Export Value,2019,1000 US$,43.0,7,Angola,3,Albania
9,13,"Anise, badian, coriander, cumin, caraway, fenn...",Import Quantity,1996,tonnes,171.0,2,Afghanistan,4,Algeria
10,14,Cider and other fermented beverages,Import Quantity,2012,tonnes,0.0,2,Afghanistan,8,Antigua and Barbuda
11,15,Cider and other fermented beverages,Import Value,2012,1000 US$,0.0,2,Afghanistan,8,Antigua and Barbuda
12,16,"Anise, badian, coriander, cumin, caraway, fenn...",Import Quantity,1997,tonnes,160.0,2,Afghanistan,4,Algeria
14,19,Pastry,Import Quantity,2020,tonnes,18.85,3,Albania,7,Angola
15,20,beef and veal preparations nes,Export Quantity,1998,tonnes,25.0,9,Argentina,2,Afghanistan


In [8]:
# Compare to R dataset
reliable_clean = pd.read_csv('../Data/Trade_reconciled.csv', sep=',', encoding='utf-8',index_col='index').reset_index()
reliable_clean['item']=(reliable_clean.item.str.lower())
#                        .replace('matã©','mate',regex=True)
#                        .replace(';',',',regex=True)
#                        .replace('"','',regex=True).str.lower())# Separator is a coma

r_dataset = pd.read_csv('../FAO_2023_10.csv', sep=',', encoding='utf-8').reset_index()
r_dataset['item']=(r_dataset.item.str.lower())
#                        .replace('maté','mate',regex=True)
#                        .replace(';',',',regex=True)
#                        .replace('"','',regex=True).str.lower())# Separator is a coma

# Validate 
print('Items R:',len(r_dataset.item.unique()))
print('Items python:',len(reliable_clean.item.unique()))

#Differences in items shown
set(reliable_clean.item.unique()).difference(set(r_dataset.item.unique()))

print('Countries R:',len(r_dataset.item.unique()))
print('Countries python:',len(reliable_clean.item.unique()))

print('Samples R:',len(r_dataset))
print('Samples python:',len(reliable_clean))

Items R: 558
Items python: 558
Countries R: 558
Countries python: 558
Samples R: 32426012
Samples python: 32426012


**[END OF USEFUL CODE]**

# Add item labels to reliable_clean 

Include item_code to dataframe. Not needed but maybe helps for identification purposes. *(Already incorporated in Clean geography)*

In [9]:
'''
# Add item code (just in case), probably not needed. 3min (Requires A LOT OF MEMORY)

# Reload Trade data
reliable_clean = pd.read_csv('../Data/Trade_reconciled.csv', sep=',', encoding='utf-8',index_col='index').reset_index()
reliable_clean['item']=(reliable_clean.item.str.lower()
#                        .replace('matã©','mate',regex=True)
                        .replace(';',',',regex=True)
                        .replace('"','',regex=True).str.lower())# Separator is a coma

# Load item codes
item_code = pd.read_csv('../Data/raw_trade/Trade_DetailedTradeMatrix_E_ItemCodes.csv', sep=',', encoding='latin1')#.reset_index() # changed manually the wrong accent in matÈ
item_code['Item']=(item_code.Item.str.lower() #replace('é','e',regex=True)
                   .replace(';',',',regex=True)
                   .replace('"','',regex=True)
                   .str.lower())# Separator is a coma

item_code = item_code.clean_names()

#Merge datasets 
reliable_clean=pd.merge(reliable_clean, item_code, how='left', on='item')
reliable_clean= reliable_clean.drop(columns=['index','cpc_code'])

# Repeat validation to see nothing changed

r_dataset = pd.read_csv('../FAO_2023_10.csv', sep=',', encoding='utf-8').reset_index()
r_dataset['item']=(r_dataset.item.str.lower())

validation=reliable_clean.loc[np.isnan(reliable_clean.item_code),:]

print('Num. no item_code:', len(validation.item.unique()))

print('Num. food products (Items) R:',len(r_dataset.item.unique()))
print('Num. food products (Items) python:',len(reliable_clean.item.unique()))
print('Num. samples R:',len(r_dataset))
print('Num. samples python:',len(reliable_clean))

# Save clean data
#reliable_clean.to_csv('../Data/Trade_reconciled_clean.csv',index=False, encoding='utf-8')
print('Total time:', (time.time()- start_time)/60,'min')
'''

'\n# Add item code (just in case), probably not needed. 3min (Requires A LOT OF MEMORY)\n\n# Reload Trade data\nreliable_clean = pd.read_csv(\'../Data/Trade_reconciled.csv\', sep=\',\', encoding=\'utf-8\',index_col=\'index\').reset_index()\nreliable_clean[\'item\']=(reliable_clean.item.str.lower()\n#                        .replace(\'matã©\',\'mate\',regex=True)\n                        .replace(\';\',\',\',regex=True)\n                        .replace(\'"\',\'\',regex=True).str.lower())# Separator is a coma\n\n# Load item codes\nitem_code = pd.read_csv(\'../Data/raw_trade/Trade_DetailedTradeMatrix_E_ItemCodes.csv\', sep=\',\', encoding=\'latin1\')#.reset_index() # changed manually the wrong accent in matÈ\nitem_code[\'Item\']=(item_code.Item.str.lower() #replace(\'é\',\'e\',regex=True)\n                   .replace(\';\',\',\',regex=True)\n                   .replace(\'"\',\'\',regex=True)\n                   .str.lower())# Separator is a coma\n\nitem_code = item_code.clean_names()\n\n

## Add Domestic consumption 
To be honest the data for the FAOSTAT Food Balance matrices is very incomplete and cannot be mixed easily with the trade matrices. Maybe it is better to stick with the trade matrices. 

I will not be using it for now. But maybe it serves later. 

In [10]:
'''
#Load balance data and merge 2 datasets
balance_data = pd.read_csv('../Data/Balance_data.csv', sep=',', encoding='latin1',header=0).clean_names().reset_index()
balance_data2 = balance_data.loc[balance_data.element == 'Domestic supply quantity']#.rename(columns={'item_code':'item_code_og'})
balance_data2 = balance_data2.loc[~(balance_data2.item == 'Population'),:]

balance_data_old = pd.read_csv('../Data/FoodBalanceSheetsHistoric_E_All_Data_(Normalized)/FoodBalanceSheetsHistoric_E_All_Data_(Normalized).csv', sep=',', encoding='latin1',header=0).clean_names().reset_index()
balance_data_old = balance_data_old.loc[balance_data_old.element == 'Domestic supply quantity']#.rename(columns={'item_code':'item_code_og'})
balance_data_old = balance_data_old.loc[~(balance_data_old.item == 'Population'),:]

balance_result = pd.concat([balance_data_old, balance_data2], ignore_index=True, sort=False)
balance_result
'''

"\n#Load balance data and merge 2 datasets\nbalance_data = pd.read_csv('../Data/Balance_data.csv', sep=',', encoding='latin1',header=0).clean_names().reset_index()\nbalance_data2 = balance_data.loc[balance_data.element == 'Domestic supply quantity']#.rename(columns={'item_code':'item_code_og'})\nbalance_data2 = balance_data2.loc[~(balance_data2.item == 'Population'),:]\n\nbalance_data_old = pd.read_csv('../Data/FoodBalanceSheetsHistoric_E_All_Data_(Normalized)/FoodBalanceSheetsHistoric_E_All_Data_(Normalized).csv', sep=',', encoding='latin1',header=0).clean_names().reset_index()\nbalance_data_old = balance_data_old.loc[balance_data_old.element == 'Domestic supply quantity']#.rename(columns={'item_code':'item_code_og'})\nbalance_data_old = balance_data_old.loc[~(balance_data_old.item == 'Population'),:]\n\nbalance_result = pd.concat([balance_data_old, balance_data2], ignore_index=True, sort=False)\nbalance_result\n"

In [11]:
'''
#Balance data standardise
balance_result.loc[:,'item']=balance_result.item.replace(';',',',regex=True).replace('"','',regex=True).str.lower()# Separator is a coma

balance_result=balance_result.drop(columns=['area_code_m49_','item_code_fbs_','element_code','year_code','flag'])
'''


'\n#Balance data standardise\nbalance_result.loc[:,\'item\']=balance_result.item.replace(\';\',\',\',regex=True).replace(\'"\',\'\',regex=True).str.lower()# Separator is a coma\n\nbalance_result=balance_result.drop(columns=[\'area_code_m49_\',\'item_code_fbs_\',\'element_code\',\'year_code\',\'flag\'])\n'

In [12]:
'''
bal_set = set(balance_result.item.unique())
reliable_set = set(reliable_clean.item.unique())
check= bal_set.intersection(reliable_set)
check

bal_list=  list(balance_result.item.unique())
reliable_list = list(reliable_clean.item.unique())
'''

'\nbal_set = set(balance_result.item.unique())\nreliable_set = set(reliable_clean.item.unique())\ncheck= bal_set.intersection(reliable_set)\ncheck\n\nbal_list=  list(balance_result.item.unique())\nreliable_list = list(reliable_clean.item.unique())\n'

In [13]:
'''
#Balance data standardise
balance_result.loc[:,'item']=balance_result.item.replace(';',',',regex=True).replace('"','',regex=True).str.lower()# Separator is a coma

balance_result=balance_result.drop(columns=['area_code_m49_','item_code_fbs_','element_code','year_code','flag'])

#Add destination 
to_add = balance_result.loc[:,['area_code','area']]
to_add.columns = ['destin_country_code','destin_country']
balance_result2=balance_result.join(to_add).rename(columns={'area_code':'origin_country_code','area':'origin_country'}).drop(columns=['cpc_code'])


bal_set = set(balance_data2.item.unique())
reliable_set = set(reliable_clean.item.unique())
check= bal_set.intersection(reliable_set)
check

balance_data2

#balance_data2_filt=pd.merge(balance_data2, reliable_clean, how='left', on='item')

#item_code = pd.read_csv('../Data/Codes_FAOSTAT.csv',usecols=['Item Code','Item','CPC Code'], sep=',', encoding='latin1') 
#data deleated for now Source:https://data.apps.fao.org/catalog/dataset/faostat-code-list-global-country/resource/4d250dca-f7fe-49ad-9efb-d92799304c0a 
#item_code = item_code.clean_names()#.drop('CPC Code')
#balance_data2_filt=pd.merge(balance_data2, reliable_clean, how='left', on='item')

#reliable_clean.to_csv('../Data/Trade_reconciled.csv') # Checked data clean same dimensions than R code need to check values
'''

'\n#Balance data standardise\nbalance_result.loc[:,\'item\']=balance_result.item.replace(\';\',\',\',regex=True).replace(\'"\',\'\',regex=True).str.lower()# Separator is a coma\n\nbalance_result=balance_result.drop(columns=[\'area_code_m49_\',\'item_code_fbs_\',\'element_code\',\'year_code\',\'flag\'])\n\n#Add destination \nto_add = balance_result.loc[:,[\'area_code\',\'area\']]\nto_add.columns = [\'destin_country_code\',\'destin_country\']\nbalance_result2=balance_result.join(to_add).rename(columns={\'area_code\':\'origin_country_code\',\'area\':\'origin_country\'}).drop(columns=[\'cpc_code\'])\n\n\nbal_set = set(balance_data2.item.unique())\nreliable_set = set(reliable_clean.item.unique())\ncheck= bal_set.intersection(reliable_set)\ncheck\n\nbalance_data2\n\n#balance_data2_filt=pd.merge(balance_data2, reliable_clean, how=\'left\', on=\'item\')\n\n#item_code = pd.read_csv(\'../Data/Codes_FAOSTAT.csv\',usecols=[\'Item Code\',\'Item\',\'CPC Code\'], sep=\',\', encoding=\'latin1\') \n#da

In [14]:
'''
print('Balance',len(balance_data.item_code.unique()))
#print('Items',len(item_code.item_code.unique()))
#print('data',len(reliable_clean.item_code.unique()))
balance_data.item_code.unique()
'''

"\nprint('Balance',len(balance_data.item_code.unique()))\n#print('Items',len(item_code.item_code.unique()))\n#print('data',len(reliable_clean.item_code.unique()))\nbalance_data.item_code.unique()\n"

In [15]:
#prova= reliable_clean.loc[:,['item','item_code']].drop_duplicates()
#dup= prova.loc[prova.item.duplicated(),:]
#dup