In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the raw crisis dataset; the file is semicolon-separated.
data = pd.read_csv(
    filepath_or_buffer="data/african_crises.csv",
    sep=";",
)

### 1. Inventaire des variables

In [3]:
# Number of rows and columns, plus the number of distinct countries.
# dropna=False keeps the count identical to len(unique()), which counts NaN as a value.
{
    "Dimensions du tableau": data.shape,
    "nombre pays distincts": data["country"].nunique(dropna=False),
}

{'Dimensions du tableau': (1059, 12), 'nombre pays distincts': 13}

In [4]:
# Data type of each column
data.dtypes

country                             object
year                                 int64
systemic_crisis                      int64
exch_usd                           float64
domestic_debt_in_default             int64
sovereign_external_debt_default      int64
gdp_weighted_default               float64
inflation_annual_cpi               float64
independence                         int64
currency_crises                      int64
inflation_crises                     int64
banking_crisis                      object
dtype: object

Il y a 2 variables de type <span style="color:red">Object</span> qui sont en fait des variables <span style="color:red">qualitatives</span> ; parmi elles figure la variable cible (target) <b>banking_crisis</b>. Les 10 autres variables sont <span style="color:blue">numériques</span>.

In [5]:
# Names of the columns that contain at least one missing value
# (an empty Index means the dataset has no missing data).
data.columns[data.isnull().any(axis=0)]

Index([], dtype='object')

### 2. Essai de reponse

Pour prédire la variable cible, on peut utiliser :
1. Une régression logistique
2. Une forêt aléatoire (random forest)
3. Une classification à l'aide de machines à vecteurs de support (SVM)

### 3. Codage du nom des pays

In [6]:
# Map each full country name to its first two letters.
# NOTE(review): the prefixes are distinct for the 13 countries in this dataset;
# a new country sharing a prefix would silently be merged with an existing one.
countries_ind = {
    name: name[:2]
    for name in data["country"].unique()
}
# Work on a copy whose `country` column holds the short codes; `data` is left untouched.
df = data.replace(to_replace={"country": countries_ind})

In [7]:
# Preview the first three rows to check the shortened country codes
df.head(3)

Unnamed: 0,country,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises,banking_crisis
0,Al,1870,1,0.052264,0,0,0.0,3.441456,0,0,0,crisis
1,Al,1871,0,0.052798,0,0,0.0,14.14914,0,0,0,no_crisis
2,Al,1872,0,0.052274,0,0,0.0,-3.718593,0,0,0,no_crisis


### 4. Proportion de crises

### 4.1 Dummification

In [9]:
# One-hot encode the qualitative (object-typed) columns; numeric columns pass through unchanged.
df_dummy = pd.get_dummies(data=df)

### 4.2 Proportion de crises dans les observations

a. <span style="color:blue">Proportions de crises par pays en %</span>

In [48]:
# Share of crisis observations per country, in %
tmp = data.drop(columns="banking_crisis")
# Indicator column for rows labelled "crisis"
tmp["crisis"] = pd.get_dummies(df["banking_crisis"])["crisis"]
# The mean of the indicator is the crisis frequency; scale it to a percentage.
tmp.groupby("country")["crisis"].mean().mul(100)

country
Algeria                      4.705882
Angola                       9.090909
Central African Republic    32.758621
Egypt                        7.096774
Ivory Coast                  6.349206
Kenya                       11.940299
Mauritius                    1.470588
Morocco                      2.666667
Nigeria                     18.333333
South Africa                 2.631579
Tunisia                      6.666667
Zambia                       5.555556
Zimbabwe                    16.666667
Name: crisis, dtype: float64

b. <span style="color:blue">Proportion de crises par année en %</span>

In [50]:
# Share of crisis observations per year, in %
tmp = data.drop("banking_crisis", axis=1)
# Indicator column for rows labelled "crisis"
tmp["crisis"] = pd.get_dummies(df.banking_crisis)["crisis"]
yearly_crisis = tmp.groupby(by=["year"])["crisis"].mean() * 100

# Show only the years whose crisis rate is non-zero.
# Series.nonzero() was deprecated in pandas 0.24 and removed in 1.0, so the
# original `.iloc[yearly_crisis.nonzero()]` raises AttributeError on current
# pandas; a boolean mask selects exactly the same rows.
yearly_crisis[yearly_crisis != 0]

year
1870    50.000000
1907    50.000000
1931    25.000000
1976     7.692308
1977    15.384615
1978    15.384615
1979     7.692308
1980     7.692308
1981    16.666667
1982    16.666667
1983    16.666667
1984     8.333333
1985     8.333333
1986     8.333333
1987     8.333333
1988    25.000000
1989    25.000000
1990    33.333333
1991    41.666667
1992    53.846154
1993    46.153846
1994    46.153846
1995    61.538462
1996    38.461538
1997    38.461538
1998    30.769231
1999    15.384615
2000     7.692308
2001     7.692308
2002     7.692308
2003     7.692308
2004     7.692308
2005     7.692308
2006     7.692308
2007     7.692308
2008     7.692308
2009    15.384615
2010     7.692308
2011     7.692308
2012     7.692308
2013     7.692308
2014     9.090909
Name: crisis, dtype: float64

### 4.3 Matrice des donnees

In [54]:
from sklearn.model_selection import train_test_split

# Target vector: the indicator column for banking_crisis == "crisis".
y = df_dummy["banking_crisis_crisis"]
# Feature matrix: every column except the two target dummies,
# which would otherwise leak the label into the features.
X = df_dummy.drop(
    columns=["banking_crisis_crisis", "banking_crisis_no_crisis"],
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

# Shapes of the train and test feature matrices
x_train.shape, x_test.shape

((847, 23), (212, 23))