# 3- Clean data: Fix problems with geometries

Country labels are messy: we need to include ISO codes and add each country's position on a map (for visualisation purposes). 

This script pre-processes the data and gives 2 main files as output (the save cell at the end currently writes them as `.pkl` pickle files; the equivalent `.csv` export is commented out): 

- ``Trade_geo`` (saved as ``Trade_geo.pkl``): Contains the clean version of Trade_reconciled. The country names are standardised and the correct ISO2 codes are included for all countries.

- ``Country_info`` (saved as ``Country_info.pkl``): Contains metadata about all the countries in the dataset including their Standardised name, ISO2 code, ISO3 code (for some) and the coordinates of a representative point inside the country. *Needs to be improved to find the spatial position of the countries that have changed their name.* 

In [1]:
import pandas as pd
import numpy as np
import time
import networkx as nx
import geopandas as gpd # pip installed
import matplotlib.pyplot as plt 

# FUNCTIONS 
def ISO2_fix (match_ids):
    """Assign hand-picked codes to areas that have no ISO-2 code yet.

    For every row whose ``'Country or Area'`` value equals one of the
    names in the lookup table below, the ``'ISO-alpha2 Code'`` column is
    overwritten with the associated code. Some codes are composite labels
    (e.g. ``'RS-ME'``, ``'UM-71'``) rather than plain two-letter codes.

    Parameters
    ----------
    match_ids : pandas.DataFrame
        Table with a ``'Country or Area'`` column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``match_ids`` object that was passed in.
    """
    name_col = 'Country or Area'
    code_col = 'ISO-alpha2 Code'

    # area label -> code to write for that label
    manual_codes = {
        'Canton and Enderbury Islands': 'CT',
        'China, Hong Kong SAR': 'HK',
        'Heard and McDonald Islands': 'HM',
        'China, Taiwan Province of': 'TW',
        'USSR': 'SU',
        'Serbia and Montenegro': 'RS-ME',
        'Midway Island': 'UM-71',
        'Yugoslav SFR': 'YU',
        'China, Macao SAR': 'MO',
        'Sudan (former)': 'SD-SS',
        'Johnston Island': 'UM-67',
        'Belgium-Luxembourg': 'BE-LU',
        'Czechoslovakia': 'CS',
        'Wake Island': 'UM-79',
    }

    for area_name, code in manual_codes.items():
        is_area = match_ids.loc[:, name_col] == area_name
        match_ids.loc[is_area, code_col] = code

    return match_ids


def _merged_region(shape_file, member_isos, merged_iso):
    """Build an ``{'ISO', 'geometry'}`` record by unioning the member countries' geometries."""
    is_member = shape_file.ISO.isin(member_isos)
    # NOTE: `unary_union` is kept for compatibility with older geopandas
    # releases; geopandas >= 1.0 deprecates it in favour of `union_all()`.
    merged_geometry = shape_file.loc[is_member, 'geometry'].unary_union
    return {'ISO': merged_iso, 'geometry': merged_geometry}


def Geometries_missing(shape_file):
    """Append geometries for composite regions that are absent from the map.

    Three extra rows are created by merging the geometries of existing
    countries (matched on the ``ISO`` column):

    - ``'SD-SS'``: union of ``SD`` and ``SS`` (former Sudan)
    - ``'RS-ME'``: union of ``RS`` and ``ME`` (Serbia & Montenegro)
    - ``'BE-LU'``: union of ``BE`` and ``LU`` (Belgium & Luxembourg)

    Parameters
    ----------
    shape_file : geopandas.GeoDataFrame
        Must expose an ``ISO`` column and a ``geometry`` column.

    Returns
    -------
    geopandas.GeoDataFrame
        ``shape_file`` followed by the three new rows, which are indexed by
        their composite code and carry only ``ISO`` and ``geometry``.
        The input frame itself is not modified.
    """
    # composite code -> ISO codes of the countries whose shapes are merged
    composite_regions = {
        'SD-SS': ['SD', 'SS'],  # former Sudan
        'RS-ME': ['RS', 'ME'],  # Serbia & Montenegro
        'BE-LU': ['BE', 'LU'],  # Belgium & Luxembourg
    }
    records = [
        _merged_region(shape_file, member_isos, merged_iso)
        for merged_iso, member_isos in composite_regions.items()
    ]

    # TODO: add shapes for the remaining historical/special areas
    # (e.g. Taiwan) once a geometry source is found.

    # Give the new rows the same CRS as the input so that the concatenation
    # does not mix CRS-aware and CRS-less geometry columns.
    to_merge = gpd.GeoDataFrame(
        records,
        index=list(composite_regions),
        geometry='geometry',
        crs=shape_file.crs,
    )
    shape_file2 = pd.concat([shape_file, to_merge])
    return shape_file2


def Standardise_names(data, country_match):
    """Harmonise country labels between the trade data and the reference table.

    Parameters
    ----------
    data : pandas.DataFrame
        Trade table with ``origin_country`` and ``destin_country`` columns.
        Modified in place: ``'Palestine'`` becomes ``'State of Palestine'``
        and ``'Ethiopia PDR'`` becomes ``'Ethiopia'`` in both columns.
    country_match : pandas.DataFrame
        Reference table with a ``'Country or Area'`` column. Modified in
        place: ``'China'`` becomes ``'China, mainland'`` and the
        typographic apostrophe in ``"Côte d’Ivoire"`` is replaced by a
        plain ``'``.

    Returns
    -------
    tuple
        ``(data, country_match, all_countries)`` where ``all_countries`` is
        a one-column DataFrame (``'Country or Area'``) listing every label
        found in either trade column, in sorted order.

    Side effects
    ------------
    Prints the labels present in the trade data that still have no match in
    ``country_match`` after the replacements.
    """
    trade_cols = ['origin_country', 'destin_country']
    ref_col = 'Country or Area'

    # Fix formatting issues in the trade data
    data.loc[:, trade_cols] = (
        data.loc[:, trade_cols]
        .replace('Palestine', 'State of Palestine')
        .replace("Ethiopia PDR", 'Ethiopia')
    )

    # Align the reference labels with the trade data
    # (second replace swaps the typographic apostrophe for a plain one)
    country_match.loc[:, ref_col] = (
        country_match.loc[:, ref_col]
        .replace('China', 'China, mainland')
        .replace("Côte d’Ivoire", "Côte d'Ivoire")
    )

    # Every label appearing as origin or destination. Sorting makes the row
    # order reproducible between runs (set iteration order is arbitrary);
    # key=str keeps the sort working if a missing value (NaN) is present.
    observed = set(data.origin_country.unique()) | set(data.destin_country.unique())
    all_countries = pd.DataFrame(sorted(observed, key=str), columns=[ref_col])

    # Labels that still cannot be found in the reference table
    no_overlap = sorted(
        observed.difference(country_match[ref_col].unique()), key=str
    )
    print('Countries that have changed their name:')
    print(no_overlap)
    return (data, country_match, all_countries)


## 3.1. Solve country name issues 
Many country labels in the trade dataset do not match the labels used in the UN reference dataset, which lists the ISO2 codes of all currently existing countries. Where the problem is only a labelling difference, we solve it with the function ``Standardise_names``. 

However, some of the countries reported in the trade data are administrative units or countries that no longer exist. In these cases, we track the countries/regions using specific ISO codes, which are assigned by ``ISO2_fix``.

Source: https://unstats.un.org/unsd/methodology/m49/overview/

In [2]:
# Load the reconciled trade table and drop its country-code columns
data_og = pd.read_csv(
    '../Data/Trade_reconciled.csv', encoding="utf-8", index_col=0
).reset_index()
data = data_og.drop(columns=['origin_country_code', 'destin_country_code'])

# Every country label that appears as origin or destination in any year
all_countries = list(
    set(data.origin_country.unique()) | set(data.destin_country.unique())
)

# ISO codes for all countries as reported by UN stats
# (https://unstats.un.org/unsd/methodology/m49/overview/).
# keep_default_na=False stops pandas from parsing strings such as "NA" as
# missing values (presumably needed for a two-letter code - TODO confirm).
country_match = pd.read_csv(
    '../Data/raw_trade/UNSD — Methodology.csv',
    encoding="utf-8",
    sep=';',
    index_col=0,
    keep_default_na=False,
).reset_index()

# Labels used in the trade data that have no exact match in the UN table
no_overlap = list(
    set(all_countries) - set(country_match['Country or Area'].unique())
)
print('Countries with a label mismatch:')
print(no_overlap)

# Resolve the mismatches that are purely a naming difference
data, country_match, all_countries_std = Standardise_names(data, country_match)

# Attach the UN ISO codes to every country label found in the trade data
match_ids = all_countries_std.merge(country_match, how='left', on='Country or Area')

# Hand-assign codes to the labels that have no UN entry
match_ids = ISO2_fix(match_ids)

# Validation check: count the labels still lacking an ISO-2 code
print('\nNum countries without ISO code after ISO2-fix (should be 0): '
      + str(int(match_ids['ISO-alpha2 Code'].isna().sum())))

# Possible alternative source of ISO codes:
# https://data.apps.fao.org/catalog/dataset/iso-2-code-list-global-region-country

Countries with a label mismatch:
['Heard and McDonald Islands', 'Czechoslovakia', 'Johnston Island', 'Midway Island', 'Wake Island', 'China, Hong Kong SAR', 'Serbia and Montenegro', 'Sudan (former)', 'China, Macao SAR', 'Yugoslav SFR', 'USSR', 'Palestine', "Côte d'Ivoire", 'China, Taiwan Province of', 'Belgium-Luxembourg', 'China, mainland', 'Canton and Enderbury Islands', 'Ethiopia PDR']
Countries that have changed their name:
['Heard and McDonald Islands', 'Czechoslovakia', 'Johnston Island', 'Midway Island', 'Wake Island', 'China, Hong Kong SAR', 'Sudan (former)', 'China, Macao SAR', 'USSR', 'Yugoslav SFR', 'China, Taiwan Province of', 'Belgium-Luxembourg', 'Serbia and Montenegro', 'Canton and Enderbury Islands']

Num countries without ISO code after ISO2-fix (should be 0): 0


## Add geometries to country properties 
If we want to plot trade-network on a map we need a representative position of each country. For that we need the spatial geometry of each country. This is loaded from a shapefile. The shapefile can be linked to the data using the ISO2 codes. 

Of course, there are some geometries missing in our current world map. Countries have changed a lot. The countries missing should be the same for which we created artificial ISO2 codes. 

Source: https://hub.arcgis.com/datasets/esri::world-countries-generalized/explore?location=0.031147%2C80.508596%2C2.00

In [3]:
# Load the world-countries shapefile into a GeoDataFrame.
# NOTE: the next cell repeats this exact read before processing the data.
shape_file = gpd.read_file('../Data/World_Countries_Generalized/World_Countries_Generalized.shp',keep_default_na=False)

In [4]:
# Load the country polygons; they are linked to the trade data via ISO-2 codes
shape_file = gpd.read_file(
    '../Data/World_Countries_Generalized/World_Countries_Generalized.shp',
    keep_default_na=False,
)
shape_file['ISO2'] = shape_file['ISO']  # copy of the code, used as dissolve key below
shape_file_pd = shape_file.drop(columns='geometry')  # attribute table only (for exploration)

# One row per ISO-2 code (the code becomes the index); keep code + geometry
shape_file = shape_file.dissolve(by='ISO2')[['ISO', 'geometry']]

# Append the merged geometries of the composite/old countries
shape_file = Geometries_missing(shape_file)

# One representative (x, y) point per region, stored as a tuple in 'pos'
rep_point = shape_file.representative_point().get_coordinates()
shape_file['pos'] = list(zip(rep_point['x'], rep_point['y']))

# Country table with its position, joined on the ISO-2 code
merged_match = gpd.GeoDataFrame(
    pd.merge(
        match_ids,
        shape_file,
        how='left',
        left_on='ISO-alpha2 Code',
        right_on='ISO',
    )
)
merged_match = merged_match[['Country or Area', 'ISO-alpha2 Code', 'pos']].rename(
    columns={'Country or Area': 'name', 'ISO-alpha2 Code': 'ISO'}
)

# ISO codes used in the country table that have no geometry in the shapefile
no_geometry = list(set(match_ids['ISO-alpha2 Code']) - set(shape_file['ISO']))
print('Check that the ISO codes here correspond only to the countries that were non-existing:')
print(no_geometry)

Check that the ISO codes here correspond only to the countries that were non-existing:
['YU', 'TW', 'CT', 'CS', 'UM-71', 'UM-67', 'UM-79', 'MO', 'HK', 'SU']


  return GeometryArray(data, crs=_get_common_crs(to_concat))


Save geometries from shapefile in the dataset containing all the country info and add the ISO2 codes to the data file. 

In [5]:
# Country table (name, ISO code, position), joined to the shapefile on ISO-2
merged_match = gpd.GeoDataFrame(
    pd.merge(
        match_ids,
        shape_file,
        how='left',
        left_on='ISO-alpha2 Code',
        right_on='ISO',
    )
)
merged_match = merged_match[['Country or Area', 'ISO-alpha2 Code', 'pos']].rename(
    columns={'Country or Area': 'name', 'ISO-alpha2 Code': 'ISO'}
)

# Attach the ISO code of the origin country to every trade row
data_merged = pd.merge(
    merged_match, data, how='left', left_on='name', right_on='origin_country'
)
data_merged = data_merged.rename(columns={'ISO': 'origin_country_ISO'})
data_merged = data_merged.drop(columns=['name', 'pos'])

# Attach the ISO code of the destination country (default inner join)
data_merged = pd.merge(
    merged_match, data_merged, left_on='name', right_on='destin_country'
)
data_merged = data_merged.rename(columns={'ISO': 'destin_country_ISO'})
data_merged = data_merged.drop(columns=['name', 'pos', 'index', 'element'])

In [6]:
# Persist the cleaned tables as pickle files
for frame, target in (
    (merged_match, '../Data/Country_info.pkl'),
    (data_merged, '../Data/Trade_geo.pkl'),
    (shape_file, '../Data/Shapefile_with_positions.pkl'),
):
    frame.to_pickle(target)

# Alternative exports (currently disabled):
#merged_match.to_csv('../Data/Country_info.csv',index=False, encoding='utf-8',na_rep="")
#data_merged.to_csv('../Data/Trade_geo.csv',index=False, encoding='utf-8', na_rep="")

#shape_file=shape_file.drop(columns=['pos']).reset_index()
#shape_file.to_file('../Data/Shapefile_with_positions.geojson', driver='GeoJSON') 


In [27]:

# Load a pickled dataset and the FAOSTAT table from CSV.
# NOTE: this rebinds `data`, replacing the trade table loaded earlier in the notebook.
data = pd.read_pickle('../Data/Data_year_groups_12.pkl')
FAO = pd.read_csv('../FAOSTAT_data_12-11-2023.csv')


In [28]:
# Number of distinct values in the `item` column of `data`
len(data.item.unique())

442

In [30]:
# Display the FAOSTAT table
FAO

Unnamed: 0,Domain Code,Domain,Item Group Code,Item Group,Item Code,Item,Factor,CPC Code,HS Code,HS07 Code,HS12 Code
0,CBH,"Commodity Balances (non-food) (-2013, old meth...",2924,Alcoholic Beverages,2659,"Alcohol, Non-Food",1.0,,,,
1,CBH,"Commodity Balances (non-food) (-2013, old meth...",2913,Oilcrops,2559,Cottonseed,1.0,,,,
2,CBH,"Commodity Balances (non-food) (-2013, old meth...",2913,Oilcrops,2562,Palm kernels,1.0,,,,
3,CBH,"Commodity Balances (non-food) (-2013, old meth...",2913,Oilcrops,2558,Rape and Mustardseed,1.0,,,,
4,ESB,Cropland Nutrient Budget,5081,Nutrient Budget,5082,Leaching,1.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...
11160,QV,Value of Agricultural Production,1735,Vegetables Primary,417,"Peas, green",1.0,01242,,070810,070810
11161,QV,Value of Agricultural Production,1735,Vegetables Primary,394,"Pumpkins, squash and gourds",1.0,01235,,070990,070993
11162,QV,Value of Agricultural Production,1735,Vegetables Primary,373,Spinach,1.0,01215,,070970,070970
11163,QV,Value of Agricultural Production,1735,Vegetables Primary,423,String beans,1.0,01241.01,,070820,070820


In [32]:
# Keep only the FAOSTAT rows whose Item also occurs in the `item` column of `data`
filtered = FAO[FAO['Item'].isin(data['item'].unique())]

In [37]:
# Distinct FAOSTAT domains among the filtered rows
filtered['Domain'].unique()

array(['Commodity Balances (non-food) (-2013, old methodology)',
       'Crops and livestock products', 'Emissions from Crops',
       'Emissions from Livestock', 'Food Balances (2010-)',
       'Food Balances (-2013, old methodology and population)',
       'Livestock Manure', 'Livestock Patterns', 'Producer Prices',
       'Producer Prices (old series)', 'Production Indices',
       'Supply Utilization Accounts (2010-)', 'Trade Indices',
       'Value of Agricultural Production'], dtype=object)

In [None]:
# FoodEx database: https://efsa.onlinelibrary.wiley.com/doi/epdf/10.2903/sp.efsa.2015.EN-804 
