In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the dataset from `african_crises.csv` (path is relative to the working directory).
df = pd.read_csv("african_crises.csv")

In [3]:
# Peek at the first rows (head() defaults to 5).
df.head()

Unnamed: 0,case,cc3,country,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises,banking_crisis
0,1,DZA,Algeria,1870,1,0.052264,0,0,0.0,3.441456,0,0,0,crisis
1,1,DZA,Algeria,1871,0,0.052798,0,0,0.0,14.14914,0,0,0,no_crisis
2,1,DZA,Algeria,1872,0,0.052274,0,0,0.0,-3.718593,0,0,0,no_crisis
3,1,DZA,Algeria,1873,0,0.05168,0,0,0.0,11.203897,0,0,0,no_crisis
4,1,DZA,Algeria,1874,0,0.051308,0,0,0.0,-3.848561,0,0,0,no_crisis


In [4]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,case,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises
count,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0
mean,35.613787,1967.767705,0.077432,43.140831,0.03966,0.152975,0.006402,20848.89,0.776204,0.1322,0.129367
std,23.692402,33.530632,0.267401,111.47538,0.195251,0.360133,0.043572,675727.4,0.416984,0.349847,0.335765
min,1.0,1860.0,0.0,0.0,0.0,0.0,0.0,-28.50214,0.0,0.0,0.0
25%,15.0,1951.0,0.0,0.19535,0.0,0.0,0.0,2.086162,1.0,0.0,0.0
50%,38.0,1973.0,0.0,0.8684,0.0,0.0,0.0,5.76233,1.0,0.0,0.0
75%,56.0,1994.0,0.0,8.46275,0.0,0.0,0.0,11.64405,1.0,0.0,0.0
max,70.0,2014.0,1.0,744.306139,1.0,1.0,0.4,21989700.0,1.0,2.0,1.0


In [5]:
def df_info(df):
    """Display a per-column overview of ``df``.

    The displayed table has one row per column of ``df`` and three columns:
    ``dtypes`` (column dtype), ``n_unique`` (number of distinct non-null
    values) and ``nan_vals`` (number of missing values).

    Nothing is returned; the table is rendered through ``display``.
    """
    column_types = df.dtypes.rename("dtypes")
    distinct_counts = df.nunique().rename("n_unique")
    missing_counts = df.isna().sum().rename("nan_vals")

    overview = pd.concat(
        [column_types, distinct_counts, missing_counts],
        axis=1,
    )
    display(overview)

In [6]:
# Per-column dtype, number of distinct values and NaN count for the loaded frame.
df_info(df)

Unnamed: 0,dtypes,n_unique,nan_vals
case,int64,13,0
cc3,object,13,0
country,object,13,0
year,int64,155,0
systemic_crisis,int64,2,0
exch_usd,float64,772,0
domestic_debt_in_default,int64,2,0
sovereign_external_debt_default,int64,2,0
gdp_weighted_default,float64,6,0
inflation_annual_cpi,float64,1022,0


In [7]:
# List the distinct values of every column that is neither int nor float,
# separating the columns with a 100-character rule.
non_numeric = df.select_dtypes(exclude=[int, float])
for column in non_numeric.columns:
    distinct_values = df[column].unique()
    print(column)
    print(distinct_values)
    print("-" * 100)

cc3
['DZA' 'AGO' 'CAF' 'CIV' 'EGY' 'KEN' 'MUS' 'MAR' 'NGA' 'ZAF' 'TUN' 'ZMB'
 'ZWE']
----------------------------------------------------------------------------------------------------
country
['Algeria' 'Angola' 'Central African Republic' 'Ivory Coast' 'Egypt'
 'Kenya' 'Mauritius' 'Morocco' 'Nigeria' 'South Africa' 'Tunisia' 'Zambia'
 'Zimbabwe']
----------------------------------------------------------------------------------------------------
banking_crisis
['crisis' 'no_crisis']
----------------------------------------------------------------------------------------------------


### Transforming the binary `banking_crisis` label into a 0/1 integer flag

In [8]:
# Encode the label as 1 for "crisis" and 0 for anything else.
# The guard keeps the cell idempotent: once the column is numeric, comparing it
# with "crisis" again would silently turn every row into 0.
if not pd.api.types.is_numeric_dtype(df["banking_crisis"]):
    df["banking_crisis"] = np.where(df["banking_crisis"] == "crisis", 1, 0)

## Converting countries to geo locations

In [9]:
# CSV with country names, ISO codes and average coordinates. The raw URL embeds a
# revision hash, so it should keep pointing at the same version of the file.
url = "https://gist.githubusercontent.com/tadast/8827699/raw/f5cac3d42d16b78348610fc4ec301e9234f82821/countries_codes_and_coordinates.csv"

In [10]:
# skipinitialspace=True makes the parser skip the spaces that follow each delimiter.
geo_df = pd.read_csv(url, skipinitialspace=True)

In [11]:
# Peek at the first rows of the lookup table (head() defaults to 5).
geo_df.head()

Unnamed: 0,Country,Alpha-2 code,Alpha-3 code,Numeric code,Latitude (average),Longitude (average)
0,Afghanistan,AF,AFG,4,33.0,65.0
1,Albania,AL,ALB,8,41.0,20.0
2,Algeria,DZ,DZA,12,28.0,3.0
3,American Samoa,AS,ASM,16,-14.3333,-170.0
4,Andorra,AD,AND,20,42.5,1.6


In [12]:
# Join on the ISO alpha-3 code rather than on the free-text country name.
# The two sources are independent and are not guaranteed to spell every country
# the same way (e.g. "Ivory Coast" may appear under a different name in the
# lookup table); with a left join any such mismatch would silently produce NaN
# coordinates.
geo_coords = geo_df[
    ["Alpha-3 code", "Latitude (average)", "Longitude (average)"]
].drop_duplicates(subset="Alpha-3 code")  # one row per code, so the join cannot duplicate rows of df

df = pd.merge(
    df, geo_coords,
    left_on="cc3", right_on="Alpha-3 code",
    how="left"
).drop(columns=["Alpha-3 code"])  # helper key only repeats `cc3`

In [13]:
# The country is now represented by its coordinates; remove the textual identifiers.
df = df.drop(["cc3", "country"], axis="columns")

`case` is a numeric identifier assigned to each country (13 distinct values, one per country). It is only an ID and is not needed for our analysis, so we drop it.

In [14]:
# Remove the identifier column.
df = df.drop("case", axis="columns")

# Impute missing features

Imputation, normalization and standardization all compute statistics from the feature values. To keep information from the test set out of those statistics, we first split the dataset into train and test sets and fit every such step on the train set only.
We do this with `sklearn.model_selection.train_test_split`:

In [23]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for testing.
test_size = 0.1

# A fixed random_state makes the shuffle, and therefore the split, reproducible.
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    random_state=42,
)

In [24]:
# Per-column dtype, number of distinct values and NaN count for the training split.
df_info(df_train)

Unnamed: 0,dtypes,n_unique,nan_vals
year,int64,149,0
systemic_crisis,int64,2,0
exch_usd,float64,691,16
domestic_debt_in_default,int64,2,0
sovereign_external_debt_default,int64,2,0
gdp_weighted_default,float64,6,0
inflation_annual_cpi,float64,828,100
independence,int64,2,0
currency_crises,int64,3,0
inflation_crises,int64,2,0


As we can see, there are 116 missing values in our train set: 100 in `inflation_annual_cpi` and 16 in `exch_usd`. Let's focus on these two features:

In [26]:
# Summary statistics of the two features that contain missing values.
df_train.loc[:, ["inflation_annual_cpi", "exch_usd"]].describe()

Unnamed: 0,inflation_annual_cpi,exch_usd
count,853.0,937.0
mean,95.267719,42.30139
std,2271.712714,110.499645
min,-28.502137,0.0
25%,2.317243,0.1952
50%,5.805,0.8681
75%,11.584,8.4566
max,66279.89237,744.306139


In [29]:
# Share of missing entries per feature: the mean of the boolean NaN mask
# equals (number of NaNs) / (number of rows).
df_train[["inflation_annual_cpi", "exch_usd"]].isna().mean()

inflation_annual_cpi    0.104932
exch_usd                0.016789
dtype: float64

About 10.5% of the `inflation_annual_cpi` values and 1.7% of the `exch_usd` values are missing. Both shares are small enough to keep the columns and impute the missing entries with the mean computed on the train set:

In [31]:
# The fill value is computed on the training split only and then applied to both
# splits, so the test rows do not influence the imputation.
ann_inf_cpi_mean = df_train['inflation_annual_cpi'].mean()

# `assign` returns new frames instead of writing a column into the objects that
# came out of `train_test_split`; on pandas versions without copy-on-write the
# in-place column assignment can raise SettingWithCopyWarning.
df_train = df_train.assign(
    inflation_annual_cpi=df_train['inflation_annual_cpi'].fillna(ann_inf_cpi_mean)
)
df_test = df_test.assign(
    inflation_annual_cpi=df_test['inflation_annual_cpi'].fillna(ann_inf_cpi_mean)
)