In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
# Load the merged CSV from the Cleaned_data directory; the path is relative,
# so it resolves against the notebook's current working directory.
data = pd.read_csv('../Cleaned_data/merged.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,country_code,country_name,year,gdp_growth_rate,government_type,gdp_per_capita
0,1,AFG,Afghanistan,1960,,Dictatorship,
1,3,ALB,Albania,1960,,Dictatorship,
2,7,ARG,Argentina,1960,,Democracy,5642.765138
3,11,AUS,Australia,1960,,Democracy,19378.370795
4,12,AUT,Austria,1960,,Democracy,13031.031769


In [5]:
# 'Unnamed: 0' holds no analysis variable (presumably a row index that was
# saved into the CSV — confirm against the file that produced merged.csv).
# Rebinding the name instead of mutating in place, and ignoring a column that
# is already gone, keeps this cell safe to re-run.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
data.shape

(9717, 6)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9717 entries, 0 to 9716
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   country_code     9717 non-null   object 
 1   country_name     9717 non-null   object 
 2   year             9717 non-null   int64  
 3   gdp_growth_rate  8281 non-null   float64
 4   government_type  9717 non-null   object 
 5   gdp_per_capita   8443 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 455.6+ KB


In [12]:
def check_na_percentage(df, col_name):
    '''
    Prints, for every year, the share (in percent) of rows whose value in the
    given column is missing

    df : dataframe with a 'year' column and the column to inspect
    col_name : name of the column whose missing values are counted

    returns : None

    '''
    for year, rows in df.groupby(by='year'):
        column = rows[col_name]
        missing = column.isna().sum()
        # share of missing entries among all rows recorded for this year
        share = (missing / len(column)) * 100
        print(year, ' : ', share)

In [13]:
check_na_percentage(data, 'gdp_growth_rate')

1960  :  100.0
1961  :  32.38095238095238
1962  :  32.142857142857146
1963  :  31.858407079646017
1964  :  31.896551724137932
1965  :  32.5
1966  :  30.64516129032258
1967  :  28.225806451612907
1968  :  29.6875
1969  :  28.125
1970  :  28.24427480916031
1971  :  23.703703703703706
1972  :  23.703703703703706
1973  :  23.52941176470588
1974  :  23.91304347826087
1975  :  25.517241379310345
1976  :  23.809523809523807
1977  :  23.64864864864865
1978  :  23.841059602649008
1979  :  23.376623376623375
1980  :  23.225806451612904
1981  :  17.197452229299362
1982  :  15.286624203821656
1983  :  13.924050632911392
1984  :  13.836477987421384
1985  :  12.578616352201259
1986  :  12.578616352201259
1987  :  11.949685534591195
1988  :  11.949685534591195
1989  :  10.69182389937107
1990  :  9.937888198757763
1991  :  10.555555555555555
1992  :  10.497237569060774
1993  :  9.782608695652174
1994  :  9.18918918918919
1995  :  8.108108108108109
1996  :  5.9459459459459465
1997  :  5.945945945945946

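Removing the boundary years 1960 and 2019. In 1960, `gdp_growth_rate` is missing for every country (100% in the output above).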

In [14]:
# Collect the row labels belonging to each boundary year, then drop them in place.
is_1960 = data['year'] == 1960
is_2019 = data['year'] == 2019
indices_1960 = data.index[is_1960].values
indices_2019 = data.index[is_2019].values

data.drop(index=indices_2019, inplace=True)
data.drop(index=indices_1960, inplace=True)

In [15]:
data.year.min(), data.year.max()

(1961, 2018)

In [16]:
data.shape

(9428, 6)

# Using data from 2000 onwards.

In [24]:
# Keep only observations from the year 2000 onwards. The explicit copy gives
# `data_after_2000` its own data, so later in-place edits do not act on a
# slice of `data`.
data_new = data.loc[data['year'] >= 2000]

data_after_2000 = data_new.copy()


In [18]:
data_after_2000.shape

(3541, 6)

In [19]:
data_after_2000.head()

Unnamed: 0,country_code,country_name,year,gdp_growth_rate,government_type,gdp_per_capita
5989,AFG,Afghanistan,2000,,Dictatorship,
5990,AGO,Angola,2000,3.054624,Dictatorship,2195.631044
5991,ALB,Albania,2000,6.950036,Democracy,2244.624632
5992,ARE,United Arab Emirates,2000,10.852704,Dictatorship,63251.522987
5993,ARG,Argentina,2000,-0.788999,Democracy,8224.112827


In [17]:
data_after_2000.tail()

Unnamed: 0,country_code,country_name,year,gdp_growth_rate,government_type,gdp_per_capita
9525,WSM,Samoa,2018,-2.168099,Dictatorship,3748.844075
9526,YEM,"Yemen, Rep.",2018,-2.701475,Dictatorship,632.90943
9527,ZAF,South Africa,2018,0.787056,Dictatorship,7433.615398
9528,ZMB,Zambia,2018,3.794901,Dictatorship,1678.169124
9529,ZWE,Zimbabwe,2018,6.15919,Dictatorship,1305.783295


# Handling missing values.

## Handling missing values of gdp growth rate.

In [25]:
# Group the post-2000 rows by government type; the groupby object is passed
# to country_count() below.
grouped_government_type = data_after_2000.groupby(by='government_type')

In [32]:
def country_count(grouped_data):
    '''
    Prints number of unique countries in each group

    Works for any number of groups (one line is printed per group), not only
    for exactly two.

    grouped_data : DataFrameGroupBy object whose frame has a 'country_name' column

    returns : None
    '''
    # nunique() returns one count per group, labelled by the group key, so each
    # key is printed together with its own count instead of being matched up
    # by integer position. dropna=False mirrors unique(), which keeps NaN.
    counts = grouped_data['country_name'].nunique(dropna=False)
    for group_key, n_countries in counts.items():
        print(group_key, ' : ', n_countries)

In [33]:
country_count(grouped_government_type)

Democracy  :  117
Dictatorship  :  86


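About 58% (117 of 203) of the countries counted above for the period from 2000 onwards are democracies. Note that a country whose government type changed during this period would be counted in both groups.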

In [40]:
# Rows of `data_after_2000` whose gdp_growth_rate is missing
growth_rate_is_missing = data_after_2000['gdp_growth_rate'].isna()
data_after_2000_missing = data_after_2000[growth_rate_is_missing]

# Split those rows by government type
grouped_government_type_missing = data_after_2000_missing.groupby('government_type')

# Print, per government type, how many distinct countries have a missing gdp_growth_rate
country_count(grouped_government_type_missing)

Democracy  :  6
Dictatorship  :  9


In [42]:
# Group the rows with a missing gdp_growth_rate by country
grouped_country_missing = data_after_2000_missing.groupby('country_name')

# Rows per country, i.e. the number of years in which gdp_growth_rate is missing for it
grouped_country_missing['country_name'].count()

country_name
Afghanistan                   3
Djibouti                     14
Eritrea                       7
Iran, Islamic Rep.            1
Korea, Dem. People’s Rep.    19
Liberia                       1
Liechtenstein                 9
Nauru                         8
Palau                         1
Qatar                         1
San Marino                    1
Sao Tome and Principe         2
Somalia                      19
Syrian Arab Republic         11
Venezuela, RB                 4
Name: country_name, dtype: int64

I have decided to remove the rows with a missing gdp_growth_rate. Here is why:
- GDP growth rates can change quickly from year to year, so filling the missing values with an average (or any other single value) would not be reliable.
- Moreover, one possible reason for the missing values is that economic conditions were poor during those years, in which case an imputed value would be biased. So imputation is not a good idea.
- Most countries with missing values have them for consecutive years, which makes the values even harder to impute.

In [43]:
data_after_2000.gdp_growth_rate.isna().sum()

101

In [48]:
# Keep a snapshot that still contains the rows with a missing gdp_growth_rate.
# Later cells read `data_after_2000_missing_included`, and without this
# assignment they raise NameError on a fresh kernel.
# NOTE(review): the cell that originally created this frame is not in the
# notebook; taking the snapshot at this point is an assumption — confirm.
data_after_2000_missing_included = data_after_2000.copy()

# Drop the rows whose gdp_growth_rate is missing. Rebinding (instead of
# inplace=True) keeps the cell safe to re-run.
data_after_2000 = data_after_2000.dropna(subset=['gdp_growth_rate'])

In [49]:
data_after_2000.gdp_growth_rate.isna().sum()

0

## Handling missing values of gdp per capita

In [62]:
# Show the rows for Eritrea.
# NOTE(review): this cell relies on `data_after_2000_missing_included` having
# been created earlier in the run — confirm that it works on a fresh kernel.
data_after_2000_missing_included[data_after_2000_missing_included.country_name == 'Eritrea']

Unnamed: 0,country_code,country_name,year,gdp_growth_rate,government_type,gdp_per_capita
6039,ERI,Eritrea,2000,-3.141986,Dictatorship,635.17796
6224,ERI,Eritrea,2001,8.755435,Dictatorship,666.848297
6409,ERI,Eritrea,2002,3.005429,Dictatorship,657.450441
6594,ERI,Eritrea,2003,-2.655516,Dictatorship,610.485625
6779,ERI,Eritrea,2004,1.451736,Dictatorship,592.288525
6964,ERI,Eritrea,2005,2.57445,Dictatorship,584.570025
7149,ERI,Eritrea,2006,-0.969217,Dictatorship,560.743668
7336,ERI,Eritrea,2007,1.426822,Dictatorship,553.877207
7523,ERI,Eritrea,2008,-9.78303,Dictatorship,488.883713
7710,ERI,Eritrea,2009,3.876502,Dictatorship,498.534349


In [86]:
data_after_2000[data_after_2000.gdp_per_capita.isna()]

Unnamed: 0,country_code,country_name,year,gdp_growth_rate,government_type,gdp_per_capita
6087,LIE,Liechtenstein,2000,3.220191,Democracy,
6148,SYR,Syrian Arab Republic,2000,2.742857,Dictatorship,
6272,LIE,Liechtenstein,2001,-0.733091,Democracy,
6333,SYR,Syrian Arab Republic,2001,5.200943,Dictatorship,
6457,LIE,Liechtenstein,2002,-0.993919,Democracy,
6518,SYR,Syrian Arab Republic,2002,5.9,Dictatorship,
6642,LIE,Liechtenstein,2003,-1.937566,Democracy,
6703,SYR,Syrian Arab Republic,2003,0.6,Dictatorship,
6827,LIE,Liechtenstein,2004,3.039174,Democracy,
6888,SYR,Syrian Arab Republic,2004,6.9,Dictatorship,


I will replace the missing values for LIE with 141200.38, for SYR with 0, and for DJI with 1300.

I do so for the following reasons:-
- I only need to classify the countries according to income levels. I do not need exact values.
- It is not very likely that a country's status changes from one income level to another, quickly.
- There are only three countries with missing values for gdp_per_capita.

In [31]:
def fill_missing(dataframe, column_name, codes, new_values):
    '''
    Replaces the missing entries of a column, country by country, in place

    dataframe : dataframe to modify; needs a 'country_code' column
    column_name : name of the column whose missing values are filled
    codes : country codes to fill, matched by position with new_values
    new_values : replacement value for each code, in the same order as codes

    returns : None
    '''
    for position, code in enumerate(codes):
        is_country = dataframe.country_code == code
        is_missing = dataframe[column_name].isna()
        # only rows of this country that are still missing are overwritten
        dataframe.loc[is_country & is_missing, column_name] = new_values[position]

In [32]:
# Replacement gdp_per_capita per country code, as stated in the text above:
# LIE -> 141200.38, SYR -> 0, DJI -> 1300.
# Code and value are kept side by side in one mapping so they cannot get out of
# step; the two separate lists used before paired DJI with 0 and SYR with 1300.
fill_values_by_code = {'LIE': 141200.38, 'SYR': 0, 'DJI': 1300}
countries = list(fill_values_by_code)
values = list(fill_values_by_code.values())

In [33]:
# Fill the missing gdp_per_capita entries in place; `countries` and `values`
# are matched by position, so their order must correspond.
fill_missing(data_after_2000, 'gdp_per_capita', countries, values)

In [34]:
data_after_2000.gdp_per_capita.isna().sum()

0

# Saving as csv

In [68]:
# Write the cleaned frame to disk. The row index is written as well (to_csv
# default), so the file gets an unnamed first column that read_csv will load
# as 'Unnamed: 0' unless index_col=0 is passed when reading.
data_after_2000.to_csv('../Cleaned_data/data_after_2000.csv')

In [67]:
# Write the second frame to disk; as above, the row index is written as an
# unnamed first column.
# NOTE(review): this relies on `data_after_2000_missing_included` having been
# created earlier in the run — confirm that it works on a fresh kernel.
data_after_2000_missing_included.to_csv('../Cleaned_data/data_after_2000_including_missing.csv')