# 3- Clean data: Fix problems with geometries

Country labels in the raw data are messy: they need to be standardised, matched to ISO codes, and given a position on a map (for visualisation purposes). 

This script pre-processes the data and produces two CSV files as output (the shapefile listed last is not generated for now): 

- ``Trade_fixed_geo.csv``: Contains the clean version of ``Trade_reconciled.csv``. The country names are standardised and the correct ISO3 codes are included for all countries.

- ``Country_info_metadata.csv``: Contains metadata about all the countries in the dataset, including their standardised name, ISO2 code, ISO3 code (for some) and the coordinates of a representative point inside the country. *To be improved: the spatial position of the countries that have changed their name still needs to be determined.* 
- ``Shapefile_with_positions_code.shp``: May be useful for plotting maps. *Not for now* 

In [24]:
import pandas as pd
import numpy as np
import time
import networkx as nx
import geopandas as gpd # pip installed
import matplotlib.pyplot as plt 

# FUNCTIONS 
def ISO2_fix(match_ids):
    """Fill in the 'ISO-alpha2 Code' of areas listed in a fixed override table.

    Every row of ``match_ids`` whose 'Country or Area' equals one of the
    names below gets the associated code written into 'ISO-alpha2 Code'.
    Rows with any other name are left untouched. Note that several of the
    assigned labels are longer than two letters (e.g. 'RS-ME', 'UM-71').

    Parameters
    ----------
    match_ids : pandas.DataFrame
        Table with a 'Country or Area' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``match_ids`` object, for convenience.
    """
    # Area name -> code to assign. Each area appears exactly once
    # ('Serbia and Montenegro' used to be assigned twice with the same value).
    iso2_overrides = {
        'Canton and Enderbury Islands': 'CT',
        'China, Hong Kong SAR': 'HK',
        'Heard and McDonald Islands': 'HM',
        'China, Taiwan Province of': 'TW',
        'USSR': 'SU',
        'Serbia and Montenegro': 'RS-ME',
        'Midway Island': 'UM-71',
        'Yugoslav SFR': 'YU',
        'China, Macao SAR': 'MO',
        'Sudan (former)': 'SD-SS',
        'Johnston Island': 'UM-67',
        'Belgium-Luxembourg': 'BE-LU',
        'Czechoslovakia': 'CS',
        'Wake Island': 'UM-79',
    }

    for area_name, iso2_code in iso2_overrides.items():
        is_area = match_ids.loc[:, 'Country or Area'] == area_name
        match_ids.loc[is_area, 'ISO-alpha2 Code'] = iso2_code
    return match_ids


def Standardise_names(data, country_match):
    """Align country labels between the trade data and a reference table.

    Both inputs are modified in place:

    * in ``data``, the 'origin_country' and 'destin_country' columns have
      'Palestine' and 'Ethiopia PDR' rewritten;
    * in ``country_match``, the 'Country or Area' column has 'China' and the
      curly-apostrophe spelling of Côte d'Ivoire rewritten.

    The labels found in ``data`` but absent from ``country_match`` are printed.

    Returns
    -------
    tuple
        ``(data, country_match, all_countries)`` where ``all_countries`` is a
        one-column DataFrame ('Country or Area') holding every distinct label
        that occurs as an origin or a destination in ``data``.
    """
    trade_columns = ['origin_country', 'destin_country']
    reference_column = 'Country or Area'

    # Fix formatting issues in the trade data
    data.loc[:, trade_columns] = data.loc[:, trade_columns].replace(
        {'Palestine': 'State of Palestine', "Ethiopia PDR": 'Ethiopia'})

    # Same for the reference table (the apostrophe is also changed in the map dataset)
    country_match.loc[:, reference_column] = country_match.loc[:, reference_column].replace(
        {'China': 'China, mainland', "Côte d’Ivoire": "Côte d'Ivoire"})

    # Every label used on either side of a trade flow
    origins = set(data.origin_country.unique())
    destinations = set(data.destin_country.unique())
    all_countries = pd.DataFrame(origins.union(destinations), columns=[reference_column])

    # Labels present in the trade data but missing from the reference table
    reference_labels = set(country_match[reference_column].unique())
    no_overlap = list(set(all_countries[reference_column]).difference(reference_labels))
    print('Countries that have changed their name:')
    print(no_overlap)
    return (data, country_match, all_countries)


def Fix_shapefile_old_countries(country_metadata):
    '''
    Impute the geometry of historical countries by merging present-day shapes.

    When possible, the shapes of the present-day countries that made up a
    past country are dissolved into a single geometry, which is then written
    into the 'geometry' cell of the row carrying the historical 'ISO3 Code'.
    Note that the borders may not be accurate for the time period of interest.
    This is only for visualisation purposes, and it does not affect the
    outcomes of our simulations.

    Only the 'geometry' column is touched; other columns of the historical
    rows are left as they are.

    Parameters
    ----------
    country_metadata : geopandas.GeoDataFrame
        Table with an 'ISO3 Code' column and a 'geometry' column.
        It is modified in place.

    Returns
    -------
    geopandas.GeoDataFrame
        The same ``country_metadata`` object.

    Not fixed (cannot be built from current countries): HKG, MAC, TWN, JTN,
    MID, PSE, WAK.
    '''
    # (historical 'ISO3 Code', 'ISO3 Code' of the present-day members)
    historical_unions = [
        # Belgium-Luxembourg
        ('F15', ['BEL', 'LUX']),
        # Czechoslovakia (1986-1993)
        ('F51', ['CZE', 'SVK']),
        # Ethiopia PDR (1987-1991)
        ('F62', ['ETH', 'ERI']),
        # Serbia and Montenegro (1992-2006)
        ('SCG', ['SRB', 'MNE']),
        # Sudan (1986-2011)
        ('F206', ['SDN', 'SSD']),
        # USSR (1986-1991). 'TKM' (Turkmenistan) replaces the former 'TUR'
        # entry, which is Türkiye and was never part of the USSR.
        ('F228', ['RUS', 'BLR', 'UKR', 'KAZ', 'UZB', 'TKM', 'AZE', 'ARM',
                  'GEO', 'MDA', 'EST', 'LVA', 'LTU', 'KGZ', 'TJK']),
        # Yugoslavia (1986-1992)
        ('F248', ['SRB', 'MNE', 'HRV', 'BIH', 'MKD', 'SVN', 'XKX']),
    ]

    for historical_code, member_codes in historical_unions:
        is_member = country_metadata['ISO3 Code'].isin(member_codes)
        members = country_metadata.loc[is_member, :]
        if members.empty:
            # None of the present-day members is in the table: nothing to merge.
            continue
        # dissolve() without a key collapses all member rows into one geometry
        merged_shape = members.dissolve().geometry.values[0]
        is_historical = country_metadata['ISO3 Code'] == historical_code
        country_metadata.loc[is_historical, 'geometry'] = merged_shape
    return country_metadata



## 3.1. Solve country name issues 
We now use a different reference list (the FAOSTAT country list), which only requires us to fix a single country: the Canton and Enderbury Islands, which we integrate into Kiribati. 

**Before:**
There are many countries that are not being correctly labeled in the dataset when compared to the UN labels dataset. This dataset has the list of all the ISO2 codes of the currently existing countries. We solve the issue using the function ``Standardise_names`` for the countries where the problem is a labeling issue. 

However, some of the countries reported in the trade data are administrative units or countries that do not exist anymore. In this case, we will track these countries/regions using their specific ISO codes. To define it we use ``ISO2_fix``.

Source: https://unstats.un.org/unsd/methodology/m49/overview/

In [25]:
# Load the inputs used to fix the country labels

# Reconciled trade flows; the numeric country codes are dropped here
# (ISO codes are attached further down from the FAOSTAT list)
data_og = pd.read_csv('../Data/intermediate/Trade_reconciled.csv', encoding="utf-8")
data = data_og.drop(columns=['origin_country_code', 'destin_country_code'])

# FAOSTAT country list to merge with.
# na_filter=False keeps every cell as read -- presumably so that codes such as
# 'NA' are not parsed as missing values (TODO confirm).
faostat_columns = ['Country', 'ISO3 Code', 'ISO2 Code', 'Start Year', 'End Year']
countries_FAOSTAT = pd.read_csv('../Data/input_2023/FAOSTAT_list_countries_ISO.csv', na_filter=False)
# keep only columns of interest
countries_FAOSTAT = countries_FAOSTAT.loc[:, faostat_columns]
countries_FAOSTAT

Unnamed: 0,Country,ISO3 Code,ISO2 Code,Start Year,End Year
0,Afghanistan,AFG,AF,,
1,Africa,X06,F5100,,
2,Åland Islands,ALA,F284,,
3,Albania,ALB,AL,,
4,Algeria,DZA,DZ,,
...,...,...,...,...,...
338,Yemen Ar Rp,F246,F246,,
339,Yemen Dem,F247,F247,,
340,Yugoslav SFR,F248,F248,,1991
341,Zambia,ZMB,ZM,,


In [26]:
# Every country label that appears as an origin or a destination in any year
origin_labels = set(data.origin_country.unique())
destin_labels = set(data.destin_country.unique())
all_countries = list(origin_labels.union(destin_labels))

# Labels used in the trade data that the FAOSTAT list does not contain
faostat_labels = set(countries_FAOSTAT['Country'].unique())
no_overlap = list(set(all_countries).difference(faostat_labels))
print('Countries with a label mismatch:',no_overlap)


Countries with a label mismatch: ['RÃ©union', "CÃ´te d'Ivoire", 'TÃ¼rkiye', 'Canton and Enderbury Islands']


In [27]:
# Integrate Canton and Enderbury Islands in Kiribati and fix other names.
# The garbled keys are the labels exactly as they appear in the trade data
# (see the mismatch list printed above); they look like UTF-8 text that was
# decoded with a single-byte codec upstream -- TODO confirm and fix at the source.
name_fixes = {'Canton and Enderbury Islands':'Kiribati',
              'RÃ©union':'Réunion','TÃ¼rkiye':'Türkiye',
              "CÃ´te d'Ivoire":"Côte d'Ivoire"}

# Only the two country columns hold country labels, so restrict the
# replacement to them instead of scanning every column of the frame.
data.replace({'origin_country': name_fixes, 'destin_country': name_fixes}, inplace=True)

# Re-check matching
all_countries = list(set(data.origin_country.unique()).union(set(data.destin_country.unique()))) #find all possible countries in dataset 2
no_overlap = list(set(all_countries).difference(set(countries_FAOSTAT['Country'].unique())))
print('Countries with a label mismatch:',no_overlap)

Countries with a label mismatch: []


In [28]:
# Attach the FAOSTAT ISO codes to both ends of every trade flow
iso_lookup = countries_FAOSTAT.loc[:, ['ISO3 Code', 'ISO2 Code', 'Country']]

# Origin side: match on the origin label, then drop the redundant 'Country' key
data_merged = (
    data.merge(iso_lookup, how='left', left_on='origin_country', right_on='Country')
    .rename(columns={'ISO3 Code': 'origin_country_ISO', 'ISO2 Code': 'origin_ISO2'})
    .drop(columns=['Country'])
)

# Destination side: same lookup, matched on the destination label
data_merged = (
    data_merged.merge(iso_lookup, how='left', left_on='destin_country', right_on='Country')
    .rename(columns={'ISO3 Code': 'destin_country_ISO', 'ISO2 Code': 'destin_ISO2'})
    .drop(columns=['Country'])
)
data_merged

Unnamed: 0,item,item_code,element,year,unit,value,origin_country,destin_country,origin_country_ISO,origin_ISO2,destin_country_ISO,destin_ISO2
0,"Cake, oilseeds nes",341,Import quantity,2017,t,463.11,Argentina,Afghanistan,ARG,AR,AFG,AF
1,"Cake, oilseeds nes",341,Import quantity,2019,t,1192.62,Argentina,Afghanistan,ARG,AR,AFG,AF
2,"Almonds, shelled",231,Import quantity,2005,t,3.00,Afghanistan,Algeria,AFG,AF,DZA,DZ
3,"Cake, oilseeds nes",341,Import value,2017,1000 USD,85.00,Argentina,Afghanistan,ARG,AR,AFG,AF
4,"Almonds, shelled",231,Import value,2005,1000 USD,3.00,Afghanistan,Algeria,AFG,AF,DZA,DZ
...,...,...,...,...,...,...,...,...,...,...,...,...
35528991,Wine,564,Export value,1988,1000 USD,1.00,Netherlands (Kingdom of the),Zimbabwe,NLD,NL,ZWE,ZW
35528992,Wine,564,Export value,1992,1000 USD,8.00,Netherlands (Kingdom of the),Zimbabwe,NLD,NL,ZWE,ZW
35528993,Wine,564,Export value,1999,1000 USD,2.00,Netherlands (Kingdom of the),Zimbabwe,NLD,NL,ZWE,ZW
35528994,Wine,564,Export value,2004,1000 USD,2.00,Netherlands (Kingdom of the),Zimbabwe,NLD,NL,ZWE,ZW


## Add geometries to country properties 
If we want to plot trade-network on a map we need a representative position of each country. For that we need the spatial geometry of each country. This is loaded from a shapefile. The shapefile can be linked to the data using the ISO2 codes. 

Of course, there are some geometries missing in our current world map (e.g. USSR, Yugoslavia...). We try to impute their shape for visualisation purposes by combining/dividing the shapes of the current countries they are associated with.  

Source: https://hub.arcgis.com/datasets/esri::world-countries-generalized/explore?location=0.031147%2C80.508596%2C2.00

In [29]:
# Add geometry of all countries (using ISO-2 mapping)
# NOTE(review): `keep_default_na` is a pandas.read_csv option, not a documented
# gpd.read_file one; it is presumably forwarded to the underlying reader --
# confirm that it has any effect (it may be what triggers the warning below).
shape_file = gpd.read_file('../Data/input_2023/World_Countries_Generalized/World_Countries_Generalized.shp',keep_default_na=False)
shape_file['ISO2']=shape_file.ISO # copy of the ISO column, used as the dissolve key so that 'ISO' itself stays available as a column
shape_file_pd= shape_file.drop('geometry',axis=1) # attribute table without the geometry, kept for manual exploration

# Merge all shapes that share the same ISO2 value into one row per code
shape_file= shape_file.dissolve(by='ISO2')
# Keep only the join key and the geometry
shape_file= shape_file.loc[:,['ISO','geometry']]
shape_file

# Create representative point for each region
rep_point=shape_file.representative_point().get_coordinates()

# Store the point as an (x, y) tuple per region
shape_file['pos']=list(zip(rep_point.x,rep_point.y))


  return ogr_read(


In [30]:
# Representative (x, y) point of each region, stored in the 'pos' column
rep_point = shape_file.representative_point().get_coordinates()
shape_file['pos'] = list(zip(rep_point.x, rep_point.y))

# Metadata table with the geoposition of all countries:
# FAOSTAT rows on the left, matched to the shapes through the ISO2 code
faostat_with_shapes = countries_FAOSTAT.merge(shape_file, how='left', left_on='ISO2 Code', right_on='ISO')
merged_match = gpd.GeoDataFrame(faostat_with_shapes)

# Impute the shapes of countries that no longer exist
merged_match = Fix_shapefile_old_countries(merged_match)

# Keep only the countries that occur in the trade data; the 'ISO' join key
# coming from the shapefile is no longer needed
in_trade_data = merged_match.Country.isin(all_countries)
merged_match = merged_match.loc[in_trade_data, :].drop(columns='ISO')

# Clean unnecessary columns in the trade dataframe
data_merged = data_merged.drop(columns=['origin_ISO2', 'destin_ISO2', 'element'])


In [None]:
# Save the cleaned data (row index is not written):
# - country metadata (FAOSTAT names and ISO codes, geometry, 'pos' point)
merged_match.to_csv('../Data/intermediate/Country_info_metadata.csv', index=False)
# - trade flows with standardised country names and ISO3 codes
data_merged.to_csv('../Data/intermediate/Trade_fixed_geo.csv', index=False)