# Trade Under Pressure



## Import packages

In [719]:
import pandas as pa
import numpy as np
import functools as ft

In [720]:
# Excluded country codes
# Any row that holds one of these values in any column is dropped from
# dist_cepii, gsdb_stata, gdp and country_codes further down in the notebook.
excluded_country_codes = [
    "NCL", "ZAR", "WLF", "NFK", "PAL", "MNP", "SHN", "COK",
    "BMU", "PYF", "NIU", "ANT", "VGB", "TMP", "GUF", "CYM",
    "TCA", "REU", "FLK", "GLP", "TKL", "MTQ", "AIA", "SPM",
    "SSD", "MNT", "DDR", "BYS", "MNE", "TLS", "LIE", "VAT",
    "CSK", "MSR", "PSE", "DHY", "KOS", "KSV", "RHO", "HVO",
    "VDR", "MCO", "SVU", "XKX", "ASM", "VIR", "SXM", "CUW",
    "MAF", "GUM", "IMN", "MAC", "PCN", "ALI", "YDR", "ATA",
    "CXR", "MID", "SJM", "SPE", "UMI", "SIK", "BAT", "GAZ",
    "BUN", "SGS", "ETF", "PCZ", "TAN", "JTN", "RYU", "PCE",
    "CCK", "BVT", "USP", "KN1", "ATF", "ZPM", "IOT", "MYT",
    "HMD", "SWK", "WAK", "SVR", "ZW1", "PMY", "FRE", "BLX",
    "UNS", "SBH", "NZE"
]

# Code harmonisation table: each key is the code to keep, and every entry of
# its "values" list is rewritten to that key in dist_cepii, gdp and gsdb_stata.
# NOTE(review): COD -> COG and CHI -> CHL rewrite one code to another code that
# may already exist in the same data set (e.g. gdp), which would leave two rows
# under a single label. Presumably COD/COG and CHI/CHL denote different
# territories in the source data — confirm that these two mappings are intended.
replace_country_codes = {
    "ROM" : {
       "values": ["ROU"]
    },
    "SER": {
        "values": ["SRB"]
    },
    "COG": {
        "values": ["COD"]
    },
    "CHL": {
        "values": ["CHI"]
    },
}

## Data preprocessing
### Dist CEPII

In [721]:
# Read the Dist CEPII workbook, discard the comcol/curcol/col45 columns and
# rename the two ISO code columns to origin/destination.
dist_cepii = (
    pa.read_excel(
        "./data/geographic/dist_cepii.xls",
        header=0,
        index_col=None,
        na_values=[".", "nan", "NaN"],
        decimal=",",
        verbose=True,
        # The five indicator columns are parsed straight into booleans.
        dtype=dict.fromkeys(
            ["contig", "comlang_off", "comlang_ethno", "colony", "smctry"], bool
        ),
    )
    .drop(columns=["comcol", "curcol", "col45"])
    .rename(columns={"iso_o": "origin", "iso_d": "destination"})
)

Reading sheet 0


Inspect DataFrame properties

In [722]:
dist_cepii.head(n=50)

Unnamed: 0,origin,destination,contig,comlang_off,comlang_ethno,colony,smctry,dist,distcap,distw,distwces
0,ABW,ABW,False,False,False,False,False,5.225315,5.225315,25.09354,23.04723
1,ABW,AFG,False,False,False,False,False,13257.81,13257.81,13168.22,13166.37
2,ABW,AGO,False,False,False,False,False,9516.913,9516.913,9587.316,9584.193
3,ABW,AIA,False,False,True,False,False,983.2682,983.2682,976.8974,976.8916
4,ABW,ALB,False,False,False,False,False,9091.742,9091.742,9091.576,9091.466
5,ABW,AND,False,True,False,False,False,7572.788,7572.788,7570.084,7570.083
6,ABW,ANT,False,True,True,False,True,136.3848,136.3848,239.9064,142.8583
7,ABW,ARE,False,False,False,False,False,12735.01,12735.01,12773.08,12772.95
8,ABW,ARG,False,True,False,False,False,5396.22,5396.22,5187.788,5157.126
9,ABW,ARM,False,False,False,False,False,11107.78,11107.78,11106.96,11106.76


In [723]:
dist_cepii.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50176 entries, 0 to 50175
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   origin         50176 non-null  object 
 1   destination    50176 non-null  object 
 2   contig         50176 non-null  bool   
 3   comlang_off    50176 non-null  bool   
 4   comlang_ethno  50176 non-null  bool   
 5   colony         50176 non-null  bool   
 6   smctry         50176 non-null  bool   
 7   dist           50176 non-null  float64
 8   distcap        50176 non-null  float64
 9   distw          47961 non-null  float64
 10  distwces       47961 non-null  float64
dtypes: bool(5), float64(4), object(2)
memory usage: 2.5+ MB


In [724]:
dist_cepii.dtypes

origin            object
destination       object
contig              bool
comlang_off         bool
comlang_ethno       bool
colony              bool
smctry              bool
dist             float64
distcap          float64
distw            float64
distwces         float64
dtype: object

Check how many rows have NaN

In [725]:
dist_cepii.isna().sum()

origin              0
destination         0
contig              0
comlang_off         0
comlang_ethno       0
colony              0
smctry              0
dist                0
distcap             0
distw            2215
distwces         2215
dtype: int64

Collect the origin/destination pair of every row that contains a NaN value, then count how often each country appears as origin in those rows

In [726]:
# Country pair of every row that has at least one missing value.
na_rows = dist_cepii.loc[dist_cepii.isna().any(axis="columns"), ["origin", "destination"]]
# Sanity check: the two code columns themselves should contain no NaN.
na_rows.isna().sum()

origin         0
destination    0
dtype: int64

In [727]:
na_rows[["origin"]].value_counts()

origin
PCN       224
CCK       224
MAC       224
MSR       224
CXR       224
         ... 
GNQ         5
GRC         5
GRD         5
GRL         5
ZWE         5
Name: count, Length: 224, dtype: int64

Check what percentage of all rows contain at least one NaN value

In [728]:
print("Percentage of NaN rows: ", (na_rows.shape[0]/dist_cepii.shape[0])*100, "%")

Percentage of NaN rows:  4.414461096938775 %


Drop NaN values

In [729]:
dist_cepii = dist_cepii.dropna()

In [730]:
# Sanity check
dist_cepii.isna().any()

origin           False
destination      False
contig           False
comlang_off      False
comlang_ethno    False
colony           False
smctry           False
dist             False
distcap          False
distw            False
distwces         False
dtype: bool

Check how many unique countries there are in both origin and destination. Numbers should match.

In [731]:
print("Unique countries in origin column", dist_cepii["origin"].nunique())
print("Unique countries in destination column", dist_cepii["destination"].nunique())

Unique countries in origin column 219
Unique countries in destination column 219


Remove unwanted countries: this is done for all data sets at once in the *Rename and remove countries* section further down.

### GSDB V4 Dyadic

Inspect `GSDB_V4_Dyadic.dta`, which should be a Stata file. How does it differ from `GSDB_V4.csv`?

In [732]:
gsdb_stata = pa.read_stata("data/sanctions/GSDB_V4_Dyadic.dta")
gsdb_csv = pa.read_csv("data/sanctions/GSDB_V4.csv")

In [733]:
# Sanctions against terrorist organisations carry an empty target ISO3 code;
# keep only the rows whose sanctioned_state_iso3 is a non-empty string.
gsdb_stata = gsdb_stata.loc[gsdb_stata["sanctioned_state_iso3"].astype(str).ne('')]

Retrieve rows that include `case_id` **471**:

In [734]:
# Show every row whose case_id string contains "471".
# NOTE(review): case_id concatenates several ids without a separator (see the
# output below), so this substring match would also hit any other id that merely
# contains the digits 471 — confirm before using it for more than inspection.
gsdb_stata[gsdb_stata["case_id"].astype(str).str.contains("471")]

Unnamed: 0,case_id,sanctioning_state_iso3,sanctioning_state,sanctioned_state_iso3,sanctioned_state,year,arms,military,trade,descr_trade,financial,travel,other,target_mult,sender_mult,objective,success
0,471,AFG,Afghanistan,AGO,Angola,1993-01-01,1,1,1,exp_part,0,0,0,0,1,end_war,success_total
1,471,AFG,Afghanistan,AGO,Angola,1994-01-01,1,1,1,exp_part,0,0,0,0,1,end_war,success_total
2,471,AFG,Afghanistan,AGO,Angola,1995-01-01,1,1,1,exp_part,0,0,0,0,1,end_war,success_total
3,471,AFG,Afghanistan,AGO,Angola,1996-01-01,1,1,1,exp_part,0,0,0,0,1,end_war,success_total
4,471574,AFG,Afghanistan,AGO,Angola,1997-01-01,1,1,1,exp_part,0,1,1,0,1,"end_war,end_war","success_total,success_total"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158537,595574471,ZWE,Zimbabwe,AGO,Angola,1998-01-01,1,1,1,"exp_part,imp_part",1,1,1,0,1,"end_war,end_war,end_war","success_total,success_total,success_total"
158538,574595471,ZWE,Zimbabwe,AGO,Angola,1999-01-01,1,1,1,"exp_part,imp_part",1,1,1,0,1,"end_war,end_war,end_war","success_total,success_total,success_total"
158539,595471574,ZWE,Zimbabwe,AGO,Angola,2000-01-01,1,1,1,"exp_part,imp_part",1,1,1,0,1,"end_war,end_war,end_war","success_total,success_total,success_total"
158540,471595574,ZWE,Zimbabwe,AGO,Angola,2001-01-01,1,1,1,"exp_part,imp_part",1,1,1,0,1,"end_war,end_war,end_war","success_total,success_total,success_total"


In [735]:
gsdb_stata.head(500)

Unnamed: 0,case_id,sanctioning_state_iso3,sanctioning_state,sanctioned_state_iso3,sanctioned_state,year,arms,military,trade,descr_trade,financial,travel,other,target_mult,sender_mult,objective,success
0,471,AFG,Afghanistan,AGO,Angola,1993-01-01,1,1,1,exp_part,0,0,0,0,1,end_war,success_total
1,471,AFG,Afghanistan,AGO,Angola,1994-01-01,1,1,1,exp_part,0,0,0,0,1,end_war,success_total
2,471,AFG,Afghanistan,AGO,Angola,1995-01-01,1,1,1,exp_part,0,0,0,0,1,end_war,success_total
3,471,AFG,Afghanistan,AGO,Angola,1996-01-01,1,1,1,exp_part,0,0,0,0,1,end_war,success_total
4,471574,AFG,Afghanistan,AGO,Angola,1997-01-01,1,1,1,exp_part,0,1,1,0,1,"end_war,end_war","success_total,success_total"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,300217321,AFG,Afghanistan,ZAF,South Africa,1987-01-01,1,1,1,exp_part,0,0,0,0,1,"policy_change,prevent_war,policy_change,preven...","success_total,success_total,success_total,succ..."
496,321300217,AFG,Afghanistan,ZAF,South Africa,1988-01-01,1,1,1,exp_part,0,0,0,0,1,"policy_change,democracy,human_rights,policy_ch...","success_total,success_total,success_total,succ..."
497,300321217,AFG,Afghanistan,ZAF,South Africa,1989-01-01,1,1,1,exp_part,0,0,0,0,1,"policy_change,prevent_war,policy_change,democr...","success_total,success_total,success_total,succ..."
498,300321217,AFG,Afghanistan,ZAF,South Africa,1990-01-01,1,1,1,exp_part,0,0,0,0,1,"policy_change,prevent_war,policy_change,democr...","success_total,success_total,success_total,succ..."


In [736]:
gsdb_csv.head(500)

Unnamed: 0,case_id,sanctioned_state,sanctioning_state,begin,end,trade,descr_trade,arms,military,financial,travel,other,target_mult,sender_mult,objective,success
0,1,German Democratic Republic,Germany,1949,1973,0,,0,0,0,0,1,0,0,territorial_conflict,success_total
1,2,Pakistan,India,1949,1951,1,"exp_compl,imp_compl",0,0,0,0,0,0,0,policy_change,nego_settlement
2,3,Bulgaria,United States,1950,1963,0,,0,0,1,0,0,0,0,destab_regime,success_part
3,4,Bulgaria,United States,1950,1966,0,,0,0,0,0,1,0,0,destab_regime,failed
4,5,Bulgaria,United States,1950,1959,0,,0,0,0,1,0,0,0,destab_regime,success_part
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,Russia,Ukraine,1993,1996,1,imp_part,0,1,0,0,0,0,0,policy_change,nego_settlement
496,497,Sudan,United States,1993,2020,0,,0,0,1,0,0,0,0,terrorism,success_part
497,498,Togo,EU,1993,2007,0,,0,0,1,0,0,0,1,"democracy,human_rights","success_total,success_total"
498,499,Togo,France,1993,2008,0,,0,0,1,0,0,0,0,"human_rights,democracy,end_war","success_total,success_total,success_total"


In [737]:
# Compare the size of the dyadic Stata data with the case-level data.
# NOTE: the second label says "XLS", but gsdb_csv was read from GSDB_V4.csv.
print("Shape for Stata data: ", gsdb_stata.shape)
print("Shape for XLS data:", gsdb_csv.shape)

Shape for Stata data:  (150200, 17)
Shape for XLS data: (1547, 16)


In [738]:
# Column names Stata
gsdb_stata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 150200 entries, 0 to 159064
Data columns (total 17 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   case_id                 150200 non-null  object        
 1   sanctioning_state_iso3  150200 non-null  object        
 2   sanctioning_state       150200 non-null  object        
 3   sanctioned_state_iso3   150200 non-null  object        
 4   sanctioned_state        150200 non-null  object        
 5   year                    150200 non-null  datetime64[ns]
 6   arms                    150200 non-null  int8          
 7   military                150200 non-null  int8          
 8   trade                   150200 non-null  int8          
 9   descr_trade             150200 non-null  object        
 10  financial               150200 non-null  int8          
 11  travel                  150200 non-null  int8          
 12  other                   150200 non-

In [739]:
gsdb_stata.dtypes

case_id                           object
sanctioning_state_iso3            object
sanctioning_state                 object
sanctioned_state_iso3             object
sanctioned_state                  object
year                      datetime64[ns]
arms                                int8
military                            int8
trade                               int8
descr_trade                       object
financial                           int8
travel                              int8
other                               int8
target_mult                         int8
sender_mult                         int8
objective                         object
success                           object
dtype: object

In [740]:
# Column names, non-null counts and dtypes of the CSV data (gsdb_csv)
gsdb_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1547 entries, 0 to 1546
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   case_id            1547 non-null   int64 
 1   sanctioned_state   1547 non-null   object
 2   sanctioning_state  1547 non-null   object
 3   begin              1547 non-null   int64 
 4   end                1547 non-null   int64 
 5   trade              1547 non-null   int64 
 6   descr_trade        588 non-null    object
 7   arms               1547 non-null   int64 
 8   military           1547 non-null   int64 
 9   financial          1547 non-null   int64 
 10  travel             1547 non-null   int64 
 11  other              1547 non-null   int64 
 12  target_mult        1547 non-null   int64 
 13  sender_mult        1547 non-null   int64 
 14  objective          1547 non-null   object
 15  success            1547 non-null   object
dtypes: int64(11), object(5)
memory usage: 193.

In [741]:
gsdb_csv.dtypes

case_id               int64
sanctioned_state     object
sanctioning_state    object
begin                 int64
end                   int64
trade                 int64
descr_trade          object
arms                  int64
military              int64
financial             int64
travel                int64
other                 int64
target_mult           int64
sender_mult           int64
objective            object
success              object
dtype: object

In [742]:
gsdb_stata.isna().any()

case_id                   False
sanctioning_state_iso3    False
sanctioning_state         False
sanctioned_state_iso3     False
sanctioned_state          False
year                      False
arms                      False
military                  False
trade                     False
descr_trade               False
financial                 False
travel                    False
other                     False
target_mult               False
sender_mult               False
objective                 False
success                   False
dtype: bool

In [743]:
(gsdb_stata["sanctioned_state_iso3"].astype(str) == '').sum()

np.int64(0)

In [785]:
gsdb_stata[gsdb_stata["sanctioning_state_iso3"] == "YUG"]

Unnamed: 0,case_id,sanctioning_state_iso3,sanctioning_state,sanctioned_state_iso3,sanctioned_state,year,arms,military,trade,descr_trade,financial,travel,other,target_mult,sender_mult,objective,success
157697,420,YUG,Yugoslavia,BIH,Bosnia and Herzegovina,1991-01-01,1,0,0,,0,0,0,1,1,end_war,success_total
157698,22,YUG,Yugoslavia,CHN,China,1951-01-01,1,0,0,,0,0,0,0,1,"end_war,destab_regime","success_total,failed"
157699,22,YUG,Yugoslavia,CHN,China,1952-01-01,1,0,0,,0,0,0,0,1,"end_war,destab_regime","success_total,failed"
157700,22,YUG,Yugoslavia,CHN,China,1953-01-01,1,0,0,,0,0,0,0,1,"end_war,destab_regime","success_total,failed"
157701,420,YUG,Yugoslavia,HRV,Croatia,1991-01-01,1,0,0,,0,0,0,1,1,end_war,success_total
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157820,300217321,YUG,Yugoslavia,ZAF,South Africa,1987-01-01,1,1,1,exp_part,0,0,0,0,1,"policy_change,prevent_war,policy_change,preven...","success_total,success_total,success_total,succ..."
157821,300217321,YUG,Yugoslavia,ZAF,South Africa,1988-01-01,1,1,1,exp_part,0,0,0,0,1,"policy_change,prevent_war,policy_change,preven...","success_total,success_total,success_total,succ..."
157822,300217321,YUG,Yugoslavia,ZAF,South Africa,1989-01-01,1,1,1,exp_part,0,0,0,0,1,"policy_change,prevent_war,policy_change,preven...","success_total,success_total,success_total,succ..."
157823,300217321,YUG,Yugoslavia,ZAF,South Africa,1990-01-01,1,1,1,exp_part,0,0,0,0,1,"policy_change,prevent_war,policy_change,preven...","success_total,success_total,success_total,succ..."


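Try to unify names: the country codes are harmonised across the data sets in the *Rename and remove countries* section further down.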

### GDP Data

In [745]:
gdp = pa.read_csv(filepath_or_buffer="data/economic/GDP(currentUSD)_1974-2023.csv", sep=",", na_values=["..", "nan"])

In [746]:
# Only the country code and the yearly value columns are needed downstream.
gdp = gdp.drop(columns=["Series Name", "Series Code", "Country Name"])

In [747]:
# Column names ending in "]" are cut down to the text before the first space
# (the plain year, judging by the head() output); other names stay untouched.
gdp = gdp.rename(
    columns=lambda name: name.split(" ")[0] if name.endswith("]") else name
)

In [748]:
gdp.head(10)

Unnamed: 0,Country Code,1974,1975,1976,1977,1978,1979,1980,1981,1982,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,AFG,,,,,,,,,,...,20497130000.0,19134220000.0,18116570000.0,18753460000.0,18053220000.0,18799440000.0,19955930000.0,14260000000.0,14497240000.0,17233050000.0
1,ALB,,,,,,,1578102000.0,1808177000.0,1861163000.0,...,13228150000.0,11386850000.0,11861200000.0,13019730000.0,15379510000.0,15585110000.0,15241460000.0,18032010000.0,19017240000.0,23547180000.0
2,DZA,13209870000.0,15557900000.0,17728240000.0,20972110000.0,26364490000.0,33243710000.0,42345830000.0,44348590000.0,45207170000.0,...,238942700000.0,187493900000.0,180763800000.0,189880900000.0,194554500000.0,193459700000.0,164873400000.0,186231200000.0,225638500000.0,247626200000.0
3,ASM,,,,,,,,,,...,643000000.0,673000000.0,671000000.0,612000000.0,639000000.0,647000000.0,721000000.0,750000000.0,871000000.0,
4,AND,186557100.0,220112600.0,227283900.0,253997900.0,308020300.0,411548700.0,446377800.0,388983300.0,375914700.0,...,3271686000.0,2789881000.0,2896610000.0,3000162000.0,3218420000.0,3155149000.0,2891001000.0,3324648000.0,3380613000.0,3785067000.0
5,AGO,,,,,,,5930503000.0,5550483000.0,5550483000.0,...,135966800000.0,90496420000.0,52761620000.0,73690150000.0,79450690000.0,70897960000.0,48501560000.0,66505130000.0,104399700000.0,84824650000.0
6,ATG,,,,77496300.0,88033330.0,109585200.0,132440700.0,149377800.0,166425900.0,...,1378830000.0,1437756000.0,1489693000.0,1531152000.0,1661530000.0,1725352000.0,1410796000.0,1601367000.0,1867733000.0,2033085000.0
7,ARG,72436780000.0,52438650000.0,51169500000.0,56781000000.0,89049450000.0,69252330000.0,76961920000.0,78676840000.0,84307490000.0,...,526319700000.0,594749300000.0,557532300000.0,643628400000.0,524819900000.0,447754700000.0,385740500000.0,486564100000.0,632790100000.0,646075300000.0
8,ARM,,,,,,,,,,...,11609510000.0,10553340000.0,10546140000.0,11527460000.0,12457940000.0,13619290000.0,12641700000.0,13878910000.0,19513510000.0,24085750000.0
9,ABW,,,,,,,,,,...,2790850000.0,2962907000.0,2983635000.0,3092429000.0,3276184000.0,3395799000.0,2481857000.0,2929447000.0,3279344000.0,3648573000.0


In [749]:
gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222 entries, 0 to 221
Data columns (total 51 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country Code  217 non-null    object 
 1   1974          147 non-null    float64
 2   1975          150 non-null    float64
 3   1976          150 non-null    float64
 4   1977          153 non-null    float64
 5   1978          152 non-null    float64
 6   1979          153 non-null    float64
 7   1980          163 non-null    float64
 8   1981          163 non-null    float64
 9   1982          164 non-null    float64
 10  1983          164 non-null    float64
 11  1984          166 non-null    float64
 12  1985          168 non-null    float64
 13  1986          168 non-null    float64
 14  1987          174 non-null    float64
 15  1988          176 non-null    float64
 16  1989          176 non-null    float64
 17  1990          192 non-null    float64
 18  1991          193 non-null    

In [750]:
gdp.shape

(222, 51)

Check how many countries there are in the dataset.

In [751]:
gdp["Country Code"].nunique()

217

In [752]:
gdp["Country Code"].isna().sum()

np.int64(5)

In [753]:
# Keep only the rows that actually carry a country code.
gdp = gdp.loc[gdp["Country Code"].notna()]
print("Shape after removing NaN: ", gdp.shape)

Shape after removing NaN:  (217, 51)


Check countries that are in CEPII, but not in GDP dataset.

In [754]:
# Origin codes present in the CEPII table that have no row in the GDP table.
countries_unique_to_cepii = list(
    set(dist_cepii["origin"].unique()).difference(set(gdp["Country Code"].unique()))
)
countries_unique_to_cepii

['TMP',
 'WLF',
 'REU',
 'ZAR',
 'ANT',
 'GLP',
 'SPM',
 'NIU',
 'TWN',
 'TKL',
 'PAL',
 'ROM',
 'YUG',
 'GUF',
 'FLK',
 'AIA',
 'ESH',
 'COK',
 'NFK',
 'MTQ',
 'SHN']

Check countries that are in GDP, but not in CEPII dataset.

In [755]:
# Country codes present in the GDP table that never occur as a CEPII origin.
countries_unique_to_gdp = list(
    set(gdp["Country Code"].unique()).difference(set(dist_cepii["origin"].unique()))
)
countries_unique_to_gdp

['XKX',
 'MNE',
 'TLS',
 'MAF',
 'IMN',
 'MAC',
 'SXM',
 'LIE',
 'CUW',
 'PSE',
 'ROU',
 'GUM',
 'ASM',
 'CHI',
 'SRB',
 'MCO',
 'SSD',
 'VIR',
 'COD']

### UN Comtrade

In [756]:
import pandas as pd
import os

# Specify the input and output file paths
input_file = "data/trade/Romania_2020_Plus"  # Update with your input file's path
output_file = "data/trade/Romania_2020_Plus.xlsx"  # Desired output Excel file path

# Determine the file extension to decide how to read the input file
_, ext = os.path.splitext(input_file)
ext = ext.lower()

# Module-level placeholder kept for backward compatibility; convert() works on
# its own local frame and never assigns this name.
df = None  # Initialize df


def convert(active):
    """Convert ``input_file`` into an Excel workbook written to ``output_file``.

    The reader is chosen from the module-level ``ext``: ``.csv`` or no
    extension is read as tab-separated text, ``.xls``/``.xlsx`` is read with
    ``pd.read_excel``. Read and write errors are reported with ``print`` and
    never propagate, so a failed conversion does not stop the notebook.

    Parameters
    ----------
    active : bool
        When truthy the function returns immediately and touches no file;
        the conversion only runs for a falsy value.
        NOTE(review): the flag therefore acts as a "skip" switch, which reads
        inverted relative to its name — confirm this is intended.

    Returns
    -------
    None
    """
    if active:
        return

    # The frame must be initialised inside the function: assigning to a name in
    # the try block makes it local, so without this line a failed read left it
    # unbound and the check below raised UnboundLocalError instead of printing
    # the "Skipping conversion" message.
    frame = None
    try:
        if ext in ['.csv', '']:
            # Assume the file is tab-delimited (TSV)
            frame = pd.read_csv(input_file, sep='\t', low_memory=False)
        elif ext in ['.xls', '.xlsx']:
            frame = pd.read_excel(input_file)
        else:
            raise ValueError(f"Unsupported file extension: {ext}")
        print("File read successfully!")
    except Exception as e:
        # Best effort by design: report the problem and fall through.
        print(f"Error reading the input file: {e}")

    # Only attempt conversion if the DataFrame was successfully created
    if frame is not None:
        try:
            frame.to_excel(output_file, index=False)
            print(f"Conversion successful! The Excel file is saved as: {output_file}")
        except Exception as e:
            print(f"Error writing to Excel: {e}")
    else:
        print("Skipping conversion to Excel because the input file could not be read.")


# Conversion is switched off here: a truthy flag makes convert() return at once.
convert(True)


### Rename and remove countries

In [757]:
# Harmonise the country codes: every code listed under "values" is rewritten to
# its key in all three data sets (whole-cell matches only).
for key in replace_country_codes:
    for value in replace_country_codes[key]["values"]:
        dist_cepii = dist_cepii.replace(value, key)
        gdp = gdp.replace(value, key)
        gsdb_stata = gsdb_stata.replace(value, key)
        print("Replacing", value, "with", key)

# Drop every row that mentions an excluded code in any of its columns.
dist_cepii = dist_cepii[~dist_cepii.isin(excluded_country_codes).any(axis=1)]
gsdb_stata = gsdb_stata[~gsdb_stata.isin(excluded_country_codes).any(axis=1)]
gdp = gdp[~gdp.isin(excluded_country_codes).any(axis=1)]

# Set distance for SER to the distance of YUG
# The country codes only live in the origin/destination columns, so a
# vectorised comparison on those two columns selects the same rows as a
# row-by-row string search over every cell, without the Python-level loop.
involves_yug = (dist_cepii["origin"] == "YUG") | (dist_cepii["destination"] == "YUG")
yug_rows = dist_cepii[involves_yug].copy()
yug_rows = yug_rows.replace("YUG", "SER")
# Use the notebook-wide pandas alias `pa` (imported in the first cell) so this
# cell does not depend on the `pd` import made in the UN Comtrade cell.
dist_cepii = pa.concat([dist_cepii, yug_rows], ignore_index=True)
# Keep the cell idempotent: re-running it must not append the SER copies a
# second time. On the first run no pair is duplicated, so nothing is removed.
dist_cepii = dist_cepii.drop_duplicates(subset=["origin", "destination"], ignore_index=True)

Replacing ROU with ROM
Replacing SRB with SER
Replacing COD with COG
Replacing CHI with CHL




## Collect and analyze country labels in all the data sets

### Country codes

In [758]:
# Load the reference table of country codes (CountryCode is read as a generic
# object column) and drop every row that mentions an excluded code.
country_codes = pa.read_excel(
    "./data/geographic/country_codes.xls",
    dtype={"CountryCode": object},
)
country_codes = country_codes.loc[
    ~country_codes.isin(excluded_country_codes).any(axis="columns")
]

In [759]:
country_codes.shape

(198, 3)

In [760]:
# Every row must carry its own ISO3 code: the number of distinct codes has to
# equal the number of rows.
print("All Unique ?: ", country_codes["ISO3"].nunique(dropna=False) == len(country_codes))

All Unique ?:  True


In [761]:
country_codes_labels = country_codes["ISO3"]

In [762]:
np.array(country_codes_labels)

array(['ABW', 'AFG', 'AGO', 'ALB', 'AND', 'ARE', 'ARG', 'ARM', 'ATG',
       'AUS', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA', 'BGD', 'BGR',
       'BHR', 'BHS', 'BIH', 'BLR', 'BLZ', 'BOL', 'BRA', 'BRB', 'BRN',
       'BTN', 'BWA', 'CAF', 'CAN', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR',
       'COG', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CYP', 'CZE', 'DEU',
       'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESH',
       'ESP', 'EST', 'ETH', 'EUN', 'FIN', 'FJI', 'FRA', 'FRO', 'FSM',
       'GAB', 'GBR', 'GEO', 'GHA', 'GIB', 'GIN', 'GMB', 'GNB', 'GNQ',
       'GRC', 'GRD', 'GRL', 'GTM', 'GUY', 'HKG', 'HND', 'HRV', 'HTI',
       'HUN', 'IDN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA',
       'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA',
       'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LKA', 'LSO',
       'LTU', 'LUX', 'LVA', 'MAR', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL',
       'MKD', 'MLI', 'MLT', 'MMR', 'MNG', 'MOZ', 'MRT', 'MUS', 'MWI',
       'MYS', 'NAM',

### DIST CEPII Labels

In [763]:
dist_labels_origin = dist_cepii["origin"].unique()
print(len(dist_labels_origin))
dist_labels_origin

196


array(['ABW', 'AFG', 'AGO', 'ALB', 'AND', 'ARE', 'ARG', 'ARM', 'ATG',
       'AUS', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA', 'BGD', 'BGR',
       'BHR', 'BHS', 'BIH', 'BLR', 'BLZ', 'BOL', 'BRA', 'BRB', 'BRN',
       'BTN', 'BWA', 'CAF', 'CAN', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR',
       'COG', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CYP', 'CZE', 'DEU',
       'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESH',
       'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FRA', 'FRO', 'FSM', 'GAB',
       'GBR', 'GEO', 'GHA', 'GIB', 'GIN', 'GMB', 'GNB', 'GNQ', 'GRC',
       'GRD', 'GRL', 'GTM', 'GUY', 'HKG', 'HND', 'HRV', 'HTI', 'HUN',
       'IDN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM',
       'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR',
       'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LKA', 'LSO', 'LTU',
       'LUX', 'LVA', 'MAR', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', 'MKD',
       'MLI', 'MLT', 'MMR', 'MNG', 'MOZ', 'MRT', 'MUS', 'MWI', 'MYS',
       'NAM', 'NER',

In [764]:
# Number of missing origin codes. The former leading `.unique()` call was
# removed: its result was discarded, so it neither displayed nor changed anything.
dist_cepii["origin"].isna().sum()

np.int64(0)

In [765]:
dist_labels_dest = dist_cepii["destination"].unique()
print(len(dist_labels_dest))
dist_labels_dest

196


array(['ABW', 'AFG', 'AGO', 'ALB', 'AND', 'ARE', 'ARG', 'ARM', 'ATG',
       'AUS', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA', 'BGD', 'BGR',
       'BHR', 'BHS', 'BIH', 'BLR', 'BLZ', 'BOL', 'BRA', 'BRB', 'BRN',
       'BTN', 'BWA', 'CAF', 'CAN', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR',
       'COG', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CYP', 'CZE', 'DEU',
       'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESH',
       'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FRA', 'FRO', 'FSM', 'GAB',
       'GBR', 'GEO', 'GHA', 'GIB', 'GIN', 'GMB', 'GNB', 'GNQ', 'GRC',
       'GRD', 'GRL', 'GTM', 'GUY', 'HKG', 'HND', 'HRV', 'HTI', 'HUN',
       'IDN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM',
       'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR',
       'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LKA', 'LSO', 'LTU',
       'LUX', 'LVA', 'MAR', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', 'MKD',
       'MLI', 'MLT', 'MMR', 'MNG', 'MOZ', 'MRT', 'MUS', 'MWI', 'MYS',
       'NAM', 'NER',

In [766]:
dist_cepii["destination"]

0        ABW
1        AFG
2        AGO
3        ALB
4        AND
        ... 
38409    ZMB
38410    ZWE
38411    SER
38412    SER
38413    SER
Name: destination, Length: 38414, dtype: object

In [767]:
# Check if there are any differences
# Compare by membership rather than position: an element-wise `==` on the two
# unique() arrays depends on both columns listing their codes in the same
# order, and it errors or collapses to a single scalar (depending on the NumPy
# version) as soon as the arrays differ in length.
# dist_labels_check stays a 1-D boolean array with one entry per origin label.
dist_labels_check = np.isin(dist_labels_origin, dist_labels_dest)
# Labels that occur only as destination are invisible in the mask above, so
# they are counted separately and folded into the printed verdict.
dest_only_count = np.setdiff1d(dist_labels_dest, dist_labels_origin).size
print("Any false:", bool((~dist_labels_check).any() or dest_only_count > 0))

Any false: False


In [768]:
# Sanity check
print("Sum of TRUE values: ", sum(dist_labels_check))
print("Lenght of ALL values: ", len(dist_labels_check))

Sum of TRUE values:  196
Lenght of ALL values:  196


In [769]:
dist_labels = dist_labels_origin

In [770]:
len(dist_labels)

196

In [771]:
len(np.unique(dist_labels))

196

### GSDB Labels

In [772]:
gsdb_sanctioning_labels = gsdb_stata["sanctioning_state_iso3"].unique()
gsdb_sanctioned_labels = gsdb_stata["sanctioned_state_iso3"].unique()

In [773]:
print("Number of unique sanctioning: ", len(gsdb_sanctioning_labels))
print("Number of unique sanctioned: ", len(gsdb_sanctioned_labels))

Number of unique sanctioning:  191
Number of unique sanctioned:  176


In [774]:
# Codes that appear only as sender of a sanction ...
unique_to_sanctioning = list(
    set(gsdb_sanctioning_labels).difference(set(gsdb_sanctioned_labels))
)
print("Unique to SANCTIONING:", unique_to_sanctioning)

# ... and codes that appear only as target.
unique_to_sanctioned = list(
    set(gsdb_sanctioned_labels).difference(set(gsdb_sanctioning_labels))
)
print("Unique to SANCTIONED: ", unique_to_sanctioned)

Unique to SANCTIONING: ['VCT', 'TON', 'PLW', 'STP', 'VUT', 'KNA', 'TUV', 'SWZ', 'KIR', 'SLB', 'BHS', 'ESH', 'BRN', 'FSM', 'BTN', 'PNG']
Unique to SANCTIONED:  ['GIB']


In [775]:
gsdb_labels = np.union1d(gsdb_sanctioned_labels, gsdb_sanctioning_labels)
print("Total numbers of GSDB labels: ", len(gsdb_labels))

Total numbers of GSDB labels:  192


### GDP Labels

In [776]:
gdp_labels = gdp["Country Code"].unique()
gdp_labels

array(['AFG', 'ALB', 'DZA', 'AND', 'AGO', 'ATG', 'ARG', 'ARM', 'ABW',
       'AUS', 'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL',
       'BLZ', 'BEN', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA', 'BRN', 'BGR',
       'BFA', 'BDI', 'CPV', 'KHM', 'CMR', 'CAN', 'CAF', 'TCD', 'CHL',
       'CHN', 'COL', 'COM', 'COG', 'CRI', 'CIV', 'HRV', 'CUB', 'CYP',
       'CZE', 'DNK', 'DJI', 'DMA', 'DOM', 'ECU', 'EGY', 'SLV', 'GNQ',
       'ERI', 'EST', 'SWZ', 'ETH', 'FRO', 'FJI', 'FIN', 'FRA', 'GAB',
       'GMB', 'GEO', 'DEU', 'GHA', 'GIB', 'GRC', 'GRL', 'GRD', 'GTM',
       'GIN', 'GNB', 'GUY', 'HTI', 'HND', 'HKG', 'HUN', 'ISL', 'IND',
       'IDN', 'IRN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR',
       'KAZ', 'KEN', 'KIR', 'PRK', 'KOR', 'KWT', 'KGZ', 'LAO', 'LVA',
       'LBN', 'LSO', 'LBR', 'LBY', 'LTU', 'LUX', 'MDG', 'MWI', 'MYS',
       'MDV', 'MLI', 'MLT', 'MHL', 'MRT', 'MUS', 'MEX', 'FSM', 'MDA',
       'MNG', 'MAR', 'MOZ', 'MMR', 'NAM', 'NRU', 'NPL', 'NLD', 'NZL',
       'NIC', 'NER',

### Matching

In [777]:
# Label collections of the four sources, in the order they are reported.
label_arrays = [dist_labels, gsdb_labels, gdp_labels, country_codes_labels]

print("Number of DIST CEPII labels: ", len(label_arrays[0]))
print("Number of GSDB labels: ", len(label_arrays[1]))
print("Number of GDP labels: ", len(label_arrays[2]))
print("Number of country codes: ", len(label_arrays[3]))

# Fold the collections pairwise, left to right, into the set of codes that
# every source knows.
intersection_labels = ft.reduce(
    lambda common, labels: np.intersect1d(common, labels),
    label_arrays,
)

print("Total number of intersect labels: ", len(intersection_labels))

Number of DIST CEPII labels:  196
Number of GSDB labels:  192
Number of GDP labels:  193
Number of country codes:  198
Total number of intersect labels:  189


In [778]:
intersection_labels

array(['AFG', 'AGO', 'ALB', 'AND', 'ARE', 'ARG', 'ARM', 'ATG', 'AUS',
       'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA', 'BGD', 'BGR', 'BHR',
       'BHS', 'BIH', 'BLR', 'BLZ', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN',
       'BWA', 'CAF', 'CAN', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COG',
       'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CYP', 'CZE', 'DEU', 'DJI',
       'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESP', 'EST',
       'ETH', 'FIN', 'FJI', 'FRA', 'FSM', 'GAB', 'GBR', 'GEO', 'GHA',
       'GIB', 'GIN', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD', 'GTM', 'GUY',
       'HKG', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IND', 'IRL', 'IRN',
       'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN',
       'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR',
       'LBY', 'LCA', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAR', 'MDA',
       'MDG', 'MDV', 'MEX', 'MHL', 'MKD', 'MLI', 'MLT', 'MMR', 'MNG',
       'MOZ', 'MRT', 'MUS', 'MWI', 'MYS', 'NAM', 'NER', 'NGA', 'NIC',
       'NLD', 'NOR',

In [779]:
# Intersection of each set with each other
inter_dist_gsdb = np.intersect1d(gsdb_labels, dist_labels)
print("Intersect DIST CEPII and GSDB labels: ", len(inter_dist_gsdb))
list(set(gsdb_labels) - set(inter_dist_gsdb))

Intersect DIST CEPII and GSDB labels:  192


[]

In [780]:
list(set(dist_labels) - set(intersection_labels))

['PRI', 'FRO', 'ABW', 'YUG', 'TWN', 'GRL', 'ESH']

In [781]:
list(set(gsdb_labels) - set(intersection_labels))

['ESH', 'YUG', 'TWN']

In [782]:
list(set(gdp_labels) - set(intersection_labels))

['FRO', 'GRL', 'PRI', 'ABW']

In [783]:
list(set(country_codes_labels) - set(intersection_labels))

['WLD', 'PRI', 'EUN', 'FRO', 'ABW', 'YUG', 'TWN', 'GRL', 'ESH']

In [784]:
############
############
############

---







# --- ENDE GELÄNDE ---