In [1]:
import pandas as pd
from pycountry import countries

In [20]:
data_dir = '.\\..\\data\\'
pickle_out_file = 'cleaned_data.pkl'
world_bank_file_input = "world_bank_data"
wb_data = pd.read_pickle(data_dir + world_bank_file_input)

In [21]:
#Create list of valid countries taken from the pycountry library
#This gives us a list of 249 countries that we will focus on
countries_list = []
for country in countries:
    countries_list.append(country.name)

*Let's have a look at regions that are used in the world bank data that are not in the pycountry list:*

In [22]:
exclude_list = [wb_country for wb_country in wb_data.index.levels[0].values if wb_country not in countries_list]
print(exclude_list)

['Arab World', 'Bahamas, The', 'Bolivia', 'British Virgin Islands', 'Caribbean small states', 'Central Europe and the Baltics', 'Channel Islands', 'Congo, Dem. Rep.', 'Congo, Rep.', "Cote d'Ivoire", 'Curacao', 'Czech Republic', 'Early-demographic dividend', 'East Asia & Pacific', 'East Asia & Pacific (IDA & IBRD countries)', 'East Asia & Pacific (excluding high income)', 'Egypt, Arab Rep.', 'Eswatini', 'Euro area', 'Europe & Central Asia', 'Europe & Central Asia (IDA & IBRD countries)', 'Europe & Central Asia (excluding high income)', 'European Union', 'Fragile and conflict affected situations', 'Gambia, The', 'Heavily indebted poor countries (HIPC)', 'High income', 'Hong Kong SAR, China', 'IBRD only', 'IDA & IBRD total', 'IDA blend', 'IDA only', 'IDA total', 'Iran, Islamic Rep.', 'Korea, Dem. People’s Rep.', 'Korea, Rep.', 'Kosovo', 'Kyrgyz Republic', 'Lao PDR', 'Late-demographic dividend', 'Latin America & Caribbean', 'Latin America & Caribbean (excluding high income)', 'Latin Americ

*Clearly, some of the regions in the list above seem to be countries of interest so we will investigate these further now to see why that are not in the pycountry list*

To investigate:  
Bahamas, The  
Bolivia  
British Virgin Islands  
Congo, Dem. Rep.  
Congo, Rep.  
Cote d'Ivoire  
Curacao  
Czech Republic  
Egypt, Arab Rep.  
Eswatini  
Gambia, The  
Hong Kong SAR, China  
Iran, Islamic Rep.  
Korea, Dem. People’s Rep.  
Korea, Rep.  
Kosovo  
Kyrgyz Republic  
Lao PDR  
Macao SAR, China  
Micronesia, Fed. Sts.  
Moldova  
North Macedonia  
Slovak Republic  
St. Kitts and Nevis  
St. Lucia  
St. Martin (French part)  
St. Vincent and the Grenadines  
Tanzania  
Venezuela, RB  
Vietnam  
Virgin Islands (U.S.)  
West Bank and Gaza  
Yemen, Rep.  

*After a bit of investigation all of the above seem to fall into 3 categories:*

1. Countries that have different names in both lists and the name in the World Bank dataset is clearer or simpler.
2. Countries that have different names in both lists and the name in the pycountry library is decided to be the standard for all further processing  (most of the above)
3. 'West Bank and Gaza' and 'Kosovo' are not listed in the pycountry list. I will add these to the countries_list seperately.

*1. Countries that have different names in both lists and the name in the World Bank dataset is clearer or simpler*

In [23]:
#Some of the names in the pycountry list could be simplified or made clearer.
pycountry_list_transform={
#Remove names                      #Replace with these
'Bolivia, Plurinational State of' :'Bolivia',
'Côte d\'Ivoire'                  :'Cote d\'Ivoire',
'Curaçao'                         :'Curacao',
'Czechia'                         :'Czech Republic',
'Viet Nam'                        :'Vietnam'
}

#remove the old names specifie above dictinary
countries_list = [c for c in countries_list if c not in list(pycountry_list_transform.keys()) ]

#Add the new names
countries_list.extend(list(pycountry_list_transform.values()))

*2. Countries that have different names in both lists and the name in the pycountry library is decided to be the standard for all further processing (most of the above)*

In [24]:
countries_transform = {
'Bahamas, The'                  :'Bahamas',
'British Virgin Islands'        :'Virgin Islands, British',
'Congo, Dem. Rep.'              :'Congo, The Democratic Republic of the',
'Congo, Rep.'                   :'Congo',
'Egypt, Arab Rep.'              :'Egypt',
'Eswatini'                      :'Swaziland',
'Gambia, The'                   :'Gambia',
'Hong Kong SAR, China'          :'Hong Kong',
'Iran, Islamic Rep.'            :'Iran, Islamic Republic of',
'Korea, Dem. People’s Rep.'     :'Korea, Democratic People\'s Republic of',
'Korea, Rep.'                   :'Korea, Republic of',
'Kyrgyz Republic'               :'Kyrgyzstan',
'Lao PDR'                       :'Lao People\'s Democratic Republic',
'Macao SAR, China'              :'Macao',
'Micronesia, Fed. Sts.'         :'Micronesia, Federated States of',
'Moldova'                       :'Moldova, Republic of',
'North Macedonia'               :'Macedonia, Republic of',
'Slovak Republic'               :'Slovakia',
'St. Kitts and Nevis'           :'Saint Kitts and Nevis',
'St. Lucia'                     :'Saint Lucia',
'St. Martin (French part)'      :'Saint Martin (French part)',
'St. Vincent and the Grenadines':'Saint Vincent and the Grenadines',
'Tanzania'                      :'Tanzania, United Republic of',
'Venezuela, RB'                 :'Venezuela, Bolivarian Republic of',
'Virgin Islands (U.S.)'         :'Virgin Islands, U.S.',
'Yemen, Rep.'                   :'Yemen'}
    

In [11]:
#Get index as a list, make changes and reapply the list to the World Bank dataframe
index_of_wb_countries = list(wb_data.index.levels[0])

for remove_region, replace_with_region in countries_transform.items():
    index_of_wb_countries[index_of_wb_countries.index(remove_region)] = replace_with_region

#reapply the index to the original dataframe
wb_data.index = wb_data.index.set_levels(index_of_wb_countries, level=0)

*3. 'West Bank and Gaza' and 'Kosovo' are not listed in the pycountry list. I will add these to the countries_list seperately.*

In [25]:
add_list = ['West Bank and Gaza', 'Kosovo']

countries_list.extend(add_list)

*Have another look at what regions are going to be excluded:*

In [26]:
new_exclude_list = [wb_country for wb_country in wb_data.index.levels[0].values if wb_country not in countries_list]
print(new_exclude_list)

['Arab World', 'Bahamas, The', 'British Virgin Islands', 'Caribbean small states', 'Central Europe and the Baltics', 'Channel Islands', 'Congo, Dem. Rep.', 'Congo, Rep.', 'Early-demographic dividend', 'East Asia & Pacific', 'East Asia & Pacific (IDA & IBRD countries)', 'East Asia & Pacific (excluding high income)', 'Egypt, Arab Rep.', 'Eswatini', 'Euro area', 'Europe & Central Asia', 'Europe & Central Asia (IDA & IBRD countries)', 'Europe & Central Asia (excluding high income)', 'European Union', 'Fragile and conflict affected situations', 'Gambia, The', 'Heavily indebted poor countries (HIPC)', 'High income', 'Hong Kong SAR, China', 'IBRD only', 'IDA & IBRD total', 'IDA blend', 'IDA only', 'IDA total', 'Iran, Islamic Rep.', 'Korea, Dem. People’s Rep.', 'Korea, Rep.', 'Kyrgyz Republic', 'Lao PDR', 'Late-demographic dividend', 'Latin America & Caribbean', 'Latin America & Caribbean (excluding high income)', 'Latin America & the Caribbean (IDA & IBRD countries)', 'Least developed countri

*Now that we have built up our countries_list and refactored the World Bank dataframes country index, it is time to apply the filter to the World bank dataframe.*

In [27]:
include_list = countries_list
wb_data_countries_only = wb_data.loc[include_list]

In [28]:
#A little bit of dataframe butchery here!!
#For some reason after the dataframe subset creation above, the index of the new..
#dataframe was the same as the original. Below is a way of getting around this..
#(i.e. by generating a new one using reset_index/set_index)
temp_data = wb_data_countries_only.reset_index()
filtered_data = temp_data.set_index(['country', 'year'])

*Summary of Missing Data:*

In [29]:
filtered_data.isna().sum()/filtered_data.shape[0]

SI.POV.DDAY          0.846585
SI.POV.GINI          0.847368
EN.POP.SLUM.UR.ZS    0.955207
SI.SPR.PC40          0.980515
SE.PRM.UNER          0.595185
SE.XPD.TOTL.GD.ZS    0.636170
SL.TLF.TOTL.IN       0.468533
NY.GDP.MKTP.KD.ZG    0.209295
SP.URB.TOTL.IN.ZS    0.031131
dtype: float64

In [30]:
#According to pandas docs on multiIndex usage: For objects to be indexed and sliced effectively, they need to be sorted.
filtered_data = filtered_data.sort_index()

### Write Files to disk

In [31]:
#Write the original dataset filtered to include only countries
filtered_data.to_pickle(data_dir + pickle_out_file)

### Globals

In [104]:
target = 'SI.POV.DDAY'
regressors = list(filtered_data.columns)
predict_year=2010

### Fill in Missing Values

Use straightforward forward fill method to fill in missing values to allow for Linear Regression

In [27]:
for country in countries_in_orig_data:
    filtered_data.loc[country, 'SI.POV.GINI':].fillna(method='ffill',inplace=True)
    
for country in countries_in_orig_data:
    filtered_data.loc[country, 'SI.POV.GINI':].fillna(method='bfill',inplace=True)


In [72]:
#Above we did not fill data for the target variable
#But it is probably worth doing so up but not including the year that we are predicting
for country in countries_in_orig_data:
    filtered_data.loc[(country):(country, str(predict_year-1)), target].fillna(method='ffill',inplace=True)

for country in countries_in_orig_data:
    filtered_data.loc[(country):(country, str(predict_year-1)), target].fillna(method='bfill',inplace=True)

### Reorganise the data into overlapping window frames

For consumption by the machine forecasting algo we reorganise the dataframe to be a 3 layer multilevel index dataframe: Country, Frame, Year.

In [74]:
countries_in_orig_data = list(filtered_data.index.levels[0]) 

In [75]:
lag = 5
diff = 1
number_of_windows=3

In [76]:
#Create an empty test and training dataframes
regressors_index = pd.MultiIndex(levels=[[],[],[]],
                            codes=[[],[],[]],
                            names=[u'country', u'window', u'lag'])
target_index = pd.MultiIndex(levels=[[],[]],
                           codes=[[],[]],
                           names=[u'country', u'window'])
test_regressors_index =  pd.MultiIndex(levels=[[],[]],
                           codes=[[],[]],
                           names=[u'country', u'lag'])
test_target_index = countries_in_orig_data


columns_spilt = regressors
training_data_regressors = pd.DataFrame(index=regressors_index, columns=columns_spilt)
training_data_targets = pd.DataFrame(index=target_index, columns=[target])
test_data_regressors = pd.DataFrame(index=test_regressors_index, columns=columns_spilt)
test_data_targets = pd.DataFrame(index=test_target_index, columns=[target])

In [77]:
#fill out the training dataframes : training_data_regressors and training_data_targets
#..and the testing dataframes     : test_data_regressors and test_data_targets
for country in countries_in_orig_data:
    test_data_targets.loc[country, target] =  filtered_data.loc[(country,str(predict_year)), target]
    
    for l in range(1,lag+1):
        year = predict_year - l
        test_data_regressors.loc[(country,l),:] = filtered_data.loc[(country,str(year)), regressors]  
    
    for window in range(1,number_of_windows+1):
        year= predict_year - window       
        #Add the target value for the spilt to the test_data 
        training_data_targets.loc[(country, window),:] = filtered_data.loc[(country,str(year)), target]
        for l in range(1,lag+1):
            year = year -1
            training_data_regressors.loc[(country,window,l),:] = filtered_data.loc[(country,str(year)), regressors]  
                        

In [78]:
#According to pandas docs on multiIndex usage: For objects to be indexed and sliced effectively, they need to be sorted.
training_data_regressors = training_data_regressors.sort_index()
training_data_targets = training_data_targets.sort_index()
test_data_targets = test_data_targets.sort_index()

*Note on efficiency: If needed when the datasets get bigger the above 3 loop copy can be made faster.  It is writing overlapping windows from one dataframe to another so unless there is a major overhaul we probably need to hold on to one of the for loops but the other 2 can go.*

*1. (Straightforward) The inner-most for loop can be replaced by copying a block of 'lag' rows in one go.*  
*2. (a bit trickier) The outerr-most loop (countries) could be replaced by placing it inside of the split loop and copying one split each for all the countries in one go.*

In [79]:
#unstacking the input features. Each row will now represent a set of features.
training_data_regressors  = training_data_regressors.unstack(level=2)
test_data_regressors  = test_data_regressors.unstack(level=1)

In [80]:
training_data_regressors.shape

(648, 45)

In [81]:
training_data_regressors.isna().sum()

                   lag
SI.POV.DDAY        1      177
                   2      177
                   3      177
                   4      177
                   5      177
SI.POV.GINI        1      156
                   2      156
                   3      156
                   4      156
                   5      156
EN.POP.SLUM.UR.ZS  1      330
                   2      330
                   3      330
                   4      330
                   5      330
SI.SPR.PC40        1      369
                   2      369
                   3      369
                   4      369
                   5      369
SE.PRM.UNER        1       72
                   2       72
                   3       72
                   4       72
                   5       72
SE.XPD.TOTL.GD.ZS  1       57
                   2       57
                   3       57
                   4       57
                   5       57
SL.TLF.TOTL.IN     1       90
                   2       90
                 

### Output data structures

Training data: The regressors dataframe and the targets dataframe have the same number of row as each row of the regressor represents a set of features with matching target variable in the equivalent row of the tatget dataframe.

Test data: There is one target for each country for the target year. This will be later compared to the output of the forecast algos to measure performance.

In [82]:
training_data_regressors.shape

(648, 45)

In [83]:
training_data_regressors.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,SI.POV.DDAY,SI.POV.DDAY,SI.POV.DDAY,SI.POV.DDAY,SI.POV.DDAY,SI.POV.GINI,SI.POV.GINI,SI.POV.GINI,SI.POV.GINI,SI.POV.GINI,...,NY.GDP.MKTP.KD.ZG,NY.GDP.MKTP.KD.ZG,NY.GDP.MKTP.KD.ZG,NY.GDP.MKTP.KD.ZG,NY.GDP.MKTP.KD.ZG,SP.URB.TOTL.IN.ZS,SP.URB.TOTL.IN.ZS,SP.URB.TOTL.IN.ZS,SP.URB.TOTL.IN.ZS,SP.URB.TOTL.IN.ZS
Unnamed: 0_level_1,lag,1,2,3,4,5,1,2,3,4,5,...,1,2,3,4,5,1,2,3,4,5
country,window,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Afghanistan,1,,,,,,,,,,,...,3.92498,13.8263,5.3574,11.2297,1.41412,23.32,23.113,22.907,22.703,22.5
Afghanistan,2,,,,,,,,,,,...,13.8263,5.3574,11.2297,1.41412,8.83228,23.113,22.907,22.703,22.5,22.353
Afghanistan,3,,,,,,,,,,,...,5.3574,11.2297,1.41412,8.83228,8.83228,22.907,22.703,22.5,22.353,22.261
Albania,1,0.4,1.1,1.1,1.1,2.0,30.0,30.6,30.6,30.6,31.7,...,7.5,5.98,5.9,5.53,5.51,49.991,48.902,47.815,46.731,45.651
Albania,2,1.1,1.1,1.1,2.0,2.0,30.6,30.6,30.6,31.7,31.7,...,5.98,5.9,5.53,5.51,5.53,48.902,47.815,46.731,45.651,44.573
Albania,3,1.1,1.1,2.0,2.0,2.0,30.6,30.6,31.7,31.7,31.7,...,5.9,5.53,5.51,5.53,4.54,47.815,46.731,45.651,44.573,43.501
Algeria,1,5.8,5.8,5.8,5.8,5.8,35.3,35.3,35.3,35.3,35.3,...,2.36013,3.37288,1.68449,5.90779,4.30162,66.097,65.348,64.593,63.83,63.061
Algeria,2,5.8,5.8,5.8,5.8,5.8,35.3,35.3,35.3,35.3,35.3,...,3.37288,1.68449,5.90779,4.30162,7.20187,65.348,64.593,63.83,63.061,62.284
Algeria,3,5.8,5.8,5.8,5.8,5.8,35.3,35.3,35.3,35.3,35.3,...,1.68449,5.90779,4.30162,7.20187,5.60932,64.593,63.83,63.061,62.284,61.501
American Samoa,1,,,,,,,,,,,...,-2.61348,1.96353,-4.16667,-0.401606,0.538358,87.799,87.9,88.0,88.1,88.198


In [54]:
training_data_targets.shape

(648, 1)

In [55]:
training_data_targets.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,SI.POV.DDAY
country,window,Unnamed: 2_level_1
Afghanistan,1,
Afghanistan,2,
Afghanistan,3,
Albania,1,
Albania,2,0.4
Albania,3,
Algeria,1,
Algeria,2,
Algeria,3,
American Samoa,1,


In [85]:
#Write the time series window data (ready for the forecasting algos)
training_data_regressors.to_pickle(out_dir + train_regressors_out_file)
training_data_targets.to_pickle(out_dir + train_targets_out_file)
test_data_regressors.to_pickle(out_dir + test_regressors_out_file)
test_data_targets.to_pickle(out_dir + test_targets_out_file)