In [125]:
%load_ext jupyternotify
import glob
import os
import pickle

import numpy as np
import pandas as pd

The jupyternotify extension is already loaded. To reload it, use:
  %reload_ext jupyternotify


In [126]:
# Pickling functions used to save and load the dictionary file

def save_obj(obj, name):
    """Pickle ``obj`` into ``dataset/<name>.pkl`` with the highest pickle protocol.

    ``name`` is the file name without the ``.pkl`` extension.
    """
    target = 'dataset/' + name + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object pickled in ``dataset/<name>.pkl``.

    ``name`` is the file name without the ``.pkl`` extension.
    Unpickling can execute arbitrary code, so only load files you trust.
    """
    source = 'dataset/' + name + '.pkl'
    with open(source, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [447]:
# Importing UN data
data=[]
name=[]
for file_name in glob.glob('dataset/UNDP_HDI/*.csv'):
    data.append(pd.read_csv(file_name,
                             skiprows=0,
                             na_values="..",
                             na_filter=True,
                             header=1))
    name.append(file_name.split('/')[-1].split('.')[0])

# This snippet is for dealing with the format of the imported dataframes
for i in range(len(data)):
    for col in data[i].columns:
        # Drop columns containing 'Unnamed'
        if "Unnamed" in col: data[i].drop(labels=col, axis=1, inplace=True) 

In [448]:
# Contest training table, plus its column labels in file order
data_train = pd.read_csv('dataset/train.csv')
columns_train = data_train.columns.tolist()

In [133]:
# WARNING!------------------------------------------------------------------------
# This code snippet is not to be run if one wants to get a map
# of columns. In that case refer to the *pickled dictionary file* in the dataset
# folder. This is due to the manual corrections present later in this block, which
# are dependent on a certain sequence of columns from UN data, which in turn
# depends on the glob order. This load order might change on your run!

# This code is kept only to record my steps. 

import re

# Function str_comp compares how similar two strings are
# based on matching words or numbers.
# It returns a value from 0 to 1 (if duplicate words present could be more than 1).
# 0 means strings lstr1 and lstr2 contain completely different words
# and the larger the metric — the better similarity. 
# It's very crude, but gets the job done relatively well for my purpose. 

def str_comp(lstr1, lstr2): # lstr1 and lstr2 are strings to be compared
    """Score the similarity of two strings by counting matching words.

    Both strings are split on non-word characters, empty fragments are
    discarded, and the remaining words are compared case-insensitively.

    Returns the number of matching word pairs divided by the word count of
    the longer string. Without repeated words the result lies in [0, 1].
    Repeated words are counted once per pair, so they can push the result
    above 1; this is kept on purpose so existing scores do not change.
    When neither string contains a word the score is 0.0 (previously this
    raised ZeroDivisionError).
    """
    # Lower-case once up front instead of on every comparison
    words1 = [word.lower() for word in re.split(r'\W', lstr1) if word]
    words2 = [word.lower() for word in re.split(r'\W', lstr2) if word]

    longest = max(len(words1), len(words2))
    if longest == 0:
        # Nothing to compare, so report "no similarity" instead of dividing by zero
        return 0.0

    # For every word of the first string, count its occurrences in the second
    matchings = sum(words2.count(word) for word in words1)
    return matchings / longest


# Making a list containing indices to match columns between datasets
# based on the str_comp function 'metric'.
# Each entry is (position in name, position in columns_train).
col_map=[]
# Positions of columns_train that are already assigned. Kept as a set so the
# membership check does not rebuild a list on every inner iteration.
used_cols = set()
for i, col1 in enumerate(name):
    maxim = 0
    j_m = 0  # fallback when no candidate column is left (may duplicate column 0)
    for j, col2 in enumerate(columns_train):
        # A column that was matched before may not be matched again
        if j in used_cols:
            continue
        comp = str_comp(col1,col2)
        # '>=' means that among equally good candidates the last one wins
        if comp >= maxim:
            maxim = comp
            j_m = j
    col_map.append((i, j_m))
    used_cols.add(j_m)

# Manually fixing mistakes made by the mapping script
# NOTE! These manual corrections are tied to one particular load order of the
# data and may need to change on a different load.
#
# Please, use the columns dict pickle file in the dataset folder
# for final mapping.

# wrong (name index, train column index) pair -> corrected pair
corrections = dict([
    ((37, 72), (37, 8)),
    ((43, 78), (43, 72)),
    ((56, 41), (56, 42)),
    ((57, 8), (57, 78)),
    ((10, 42), (10, 41)),
])

# Final list with corrections: pairs without a correction are kept unchanged
col_map_corr = [
    corrections[pair] if pair in corrections else pair
    for pair in col_map
]

# Print each UN indicator followed by the train column it is mapped to
for i, j in col_map_corr:
    print(i, name[i])
    print(j, columns_train[j], '\n')

0 Education Index
9 Education Index 

1 Employment to population ratio (% ages 15 and older)
34 Employment to population ratio (% ages 15 and older) 

2 Population, ages 65 and older (millions)
20 Population, ages 65 and older (millions) 

3 Mortality rate, infant (per 1,000 live births)
17 Mortality rate, infant (per 1,000 live births) 

4 Human Development Index (HDI), female
70 Intergalactic Development Index (IDI), female 

5 Estimated gross national income per capita, female (2011 PPP$)
56 Estimated gross galactic income per capita, female 

6 Share of seats in parliament (% held by women)
47 Share of seats in senate (% held by female) 

7 Labour force participation rate (% ages 15 and older), male
33 Labour force participation rate (% ages 15 and older), male 

8 Remittances, inflows (% of GDP)
62 Remittances, inflows (% of GGP) 

9 International inbound tourists (thousands)
64 Intergalactic inbound tourists (thousands) 

10 Infants lacking immunization, measles (% of one-year-ol


This code was used to save the pickled dict.  
```python
col_map_abs = {}
for i, j in col_map_corr:
    col_map_abs[name[i].strip()] = columns_train[j].strip()

save_obj(col_map_abs, 'column_dict_from_UN_to_Contest_format')
```

In [549]:
# Collect every distinct, whitespace-stripped entry of the Country column
# across all tables. Missing entries (whose str() is 'nan') are skipped.

full_list_dirty = set()
for table in data:
    full_list_dirty.update(
        entry.strip()
        for entry in table.loc[:, 'Country']
        if str(entry) != 'nan'
    )

In [569]:
# Entries of the Country column that should not be kept in the final list
entries_to_remove = {
    'Arab States',
    'Developing Countries',
    'High human development',
    'Human Development',
    'Least Developed Countries',
    'Low human development',
    'Medium human development',
    'Organization for Economic Co-operation and Development',
    'Small Island Developing States',
    'South Asia',
    'Very high human development',
    'World',
    'East Asia and the Pacific',
    'Latin America and the Caribbean',
    'Regions',
    'Sub-Saharan Africa',
}

# Sorted list of the remaining entries
full_list = sorted(full_list_dirty - entries_to_remove)

In [323]:
# Index rows of each dataframe in data by Country column.
# NOTE(review): other cells in this notebook read 'Country' as a regular
# column; confirm this cell is still wanted before running it in sequence.
for i in range(len(data)):
    # The guard makes the cell safe to re-run: once 'Country' has become the
    # index it is no longer a column and set_index would raise KeyError.
    if 'Country' in data[i].columns:
        data[i].set_index('Country', inplace=True)

In [325]:
# Select only rows with countries and columns with years and the country list:
# keep the first 189 rows and drop the first column of every table.
# Slice assignment updates the existing list object in place.
data[:] = [table.iloc[:189, 1:] for table in data]

In [583]:
# Strip surrounding whitespace from every country name.
# The whole column is rewritten at once. The previous cell-by-cell version
# used the row position as a .loc label, which is only correct while the table
# still has its default 0..n-1 index.
for i in range(len(data)):
    data[i]['Country'] = data[i]['Country'].map(
        # Missing values (NaN) are not strings and are passed through untouched
        lambda country: country.strip() if isinstance(country, str) else country
    )

In [403]:
# Look the table up by indicator name rather than by a hard-coded position:
# positions in `data`/`name` follow the order in which the files were loaded.
i = name.index('Human Development Index (HDI)')
print(name[i])
# First 189 rows, without the first column
d = data[i].iloc[0:189, 1:]
d

Human Development Index (HDI)


Unnamed: 0_level_0,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0.304,0.312,0.308,0.303,0.327,0.331,0.335,0.339,0.343,0.345,...,0.447,0.464,0.465,0.479,0.485,0.488,0.490,0.491,0.493,0.496
Albania,0.625,0.608,0.611,0.617,0.629,0.639,0.639,0.649,0.660,0.667,...,0.729,0.740,0.759,0.771,0.781,0.787,0.788,0.788,0.789,0.791
Algeria,0.582,0.589,0.593,0.597,0.602,0.610,0.619,0.629,0.638,0.646,...,0.720,0.730,0.738,0.737,0.746,0.749,0.751,0.755,0.758,0.759
Andorra,,,,,,,,,,0.759,...,0.830,0.828,0.827,0.849,0.846,0.853,0.850,0.854,0.852,0.857
Angola,,,,,,,,,0.384,0.394,...,0.508,0.510,0.525,0.537,0.547,0.557,0.565,0.570,0.576,0.574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Venezuela (Bolivarian Republic of),0.648,0.654,0.656,0.657,0.660,0.662,0.666,0.668,0.670,0.672,...,0.752,0.753,0.764,0.767,0.772,0.770,0.763,0.752,0.735,0.726
Viet Nam,0.484,0.496,0.506,0.517,0.529,0.540,0.539,0.559,0.566,0.578,...,0.650,0.653,0.663,0.668,0.673,0.675,0.680,0.685,0.690,0.693
Yemen,0.396,0.395,0.398,0.398,0.393,0.408,0.418,0.430,0.423,0.432,...,0.503,0.499,0.511,0.501,0.506,0.504,0.493,0.477,0.463,0.463
Zambia,0.421,0.420,0.422,0.418,0.419,0.419,0.420,0.419,0.424,0.428,...,0.521,0.531,0.541,0.552,0.559,0.565,0.570,0.580,0.589,0.591


In [246]:
# Labels of every column of d except the first, converted to int
cols = list(map(int, d.columns[1:]))

In [571]:
idx = pd.IndexSlice

# Empty frame with one row per (year, country) combination and no columns yet
undp_ind = pd.MultiIndex.from_product(
    [cols, full_list],
    names=['year', 'country'],
)
undp_data = pd.DataFrame(index=undp_ind)

In [576]:
# Every year for one country; idx[:, 'Yemen'] is the tuple (slice(None), 'Yemen')
undp_data.loc[idx[:, 'Yemen'], :]

year,country
1990,Yemen
1991,Yemen
1992,Yemen
1993,Yemen
1994,Yemen
1995,Yemen
1996,Yemen
1997,Yemen
1998,Yemen
1999,Yemen


In [591]:
# 1990 value(s) of the first table for the rows whose Country is 'Afghanistan'
data[0].loc[data[0]['Country'] == 'Afghanistan', '1990']

0    0.122
Name: 1990, dtype: object

In [608]:
# Check what `.values` yields when no row matches the country filter.
# `np.data` does not exist; the recorded output (numpy.ndarray) shows that this
# cell inspected the type of the `.values` result.
type(data[0][data[0]['Country'] == 'a'].loc[:, '1990'].values)

numpy.ndarray

In [597]:
# Column labels of the first table from the third column onwards
data[0].columns[2:]

Index(['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998',
       '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007',
       '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017', '2018'],
      dtype='object')

In [613]:
# Scratch check: an empty list has length zero
a = list()
0 == len(a)

True

In [619]:
# Copy every indicator table into undp_data: one column per indicator (named
# after its entry in `name`), addressed by (year, country).

for df, nam in zip(data, name):
    # The first two columns are skipped; the remaining labels are used as years
    year_columns = df.columns[2:]
    for country in full_list:
        # Filter this country's rows once, not once per year
        country_rows = df[df['Country'] == country]
        for year in year_columns:
            val = country_rows.loc[:, year].values
            # No row for this country in this table -> store a missing value;
            # otherwise take the first matching row
            undp_data.loc[(int(year), country), nam] = val[0] if len(val) else np.nan

In [None]:
%%notify

In [624]:
# Every year for one country: slice(None) selects all values of the first index level
undp_data.loc[(slice(None),'Albania'),]

Unnamed: 0_level_0,Unnamed: 1_level_0,Education Index,Employment to population ratio (% ages 15 and older),"Population, ages 65 and older (millions)","Mortality rate, infant (per 1,000 live births)","Human Development Index (HDI), female","Estimated gross national income per capita, female (2011 PPP$)",Share of seats in parliament (% held by women),"Labour force participation rate (% ages 15 and older), male","Remittances, inflows (% of GDP)",International inbound tourists (thousands),...,Domestic credit provided by financial sector (% of GDP),Forest area (% of total land area),Life expectancy Index,Adjusted net savings (% of GNI),Population using at least basic drinking-water services (%),"Tuberculosis incidence (per 100,000 people)","Mortality rate, male adult (per 1,000 people)","Population, total (millions)",Young age (0-14) dependency ratio (per 100 people ages 15-64),"Population with at least some secondary education, female (% ages 25 and older)"
year,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1990,Albania,0.584,,0.2,34.8,,,,74.6,,,...,,28.8,0.797,2.3,,,144.0,3.3,53.1,59.5
1991,Albania,0.589,54.9,,,,,,,,,...,,,0.797,,,,,,,
1992,Albania,0.558,,,,,,,,,,...,,,0.797,,,,,,,
1993,Albania,0.543,,,,,,,,,,...,,,0.798,,,,,,,
1994,Albania,0.542,,,,,,,,,,...,,,0.8,,,,,,,
1995,Albania,0.551,52.3,0.2,28.5,0.603,3032.0,,73.9,17.86,304.0,...,40.5,28.4,0.803,9.9,,,144.0,3.1,53.8,60.6
1996,Albania,0.558,,,,,,,,,,...,,,0.808,,,,,,,
1997,Albania,0.571,,,,,,,,,,...,,,0.813,,,,,,,
1998,Albania,0.58,,,,,,,,,,...,,,0.819,,,,,,,
1999,Albania,0.586,,,,,,,,,,...,,,0.824,,,,,,,


In [508]:
# Index entries of the rows whose first index level equals 1990
undp_data.loc[(1990, slice(None)), :].index

MultiIndex([(1990,                         'Afghanistan'),
            (1990,                             'Albania'),
            (1990,                             'Algeria'),
            (1990,                             'Andorra'),
            (1990,                              'Angola'),
            (1990,                 'Antigua and Barbuda'),
            (1990,                           'Argentina'),
            (1990,                             'Armenia'),
            (1990,                           'Australia'),
            (1990,                             'Austria'),
            ...
            (1990,                            ' Vanuatu'),
            (1990, ' Venezuela (Bolivarian Republic of)'),
            (1990,                           ' Viet Nam'),
            (1990,                              ' Yemen'),
            (1990,                             ' Zambia'),
            (1990,                           ' Zimbabwe'),
            (1990,                      

In [512]:
# Labels of the second index level, as reported for the 1990 selection
undp_data.loc[(1990, slice(None)), :].index.levels[1]

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       ' Vanuatu', ' Venezuela (Bolivarian Republic of)', ' Viet Nam',
       ' Yemen', ' Zambia', ' Zimbabwe', ' Andorra', ' Dominica',
       ' Saint Kitts and Nevis', ' Liechtenstein'],
      dtype='object', name='country', length=384)

In [424]:
# Display the table 29th from the end of `data`; which indicator that is
# depends on the order in which the files were loaded
data[-29]

Unnamed: 0_level_0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0.466,0.440,0.431,0.376,0.328,0.381,0.366,0.354,0.343,0.331,...,0.409,0.426,0.421,0.435,0.438,0.436,0.435,0.434,0.434,0.432
Albania,0.572,0.521,0.506,0.526,0.541,0.563,0.578,0.561,0.576,0.595,...,0.686,0.693,0.699,0.700,0.705,0.707,0.711,0.717,0.721,0.727
Algeria,0.695,0.687,0.687,0.683,0.677,0.678,0.681,0.681,0.688,0.689,...,0.731,0.735,0.734,0.734,0.735,0.737,0.739,0.745,0.743,0.743
Andorra,0.936,0.934,0.929,0.923,0.922,0.923,0.929,0.942,0.947,0.953,...,0.924,0.916,0.910,0.910,0.914,0.920,0.924,0.928,0.931,0.935
Angola,0.562,0.576,0.431,0.428,0.388,0.506,0.505,0.530,0.527,0.511,...,0.615,0.614,0.613,0.622,0.626,0.630,0.629,0.620,0.617,0.607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Uzbekistan,0.518,0.514,0.492,0.486,0.475,0.471,0.470,0.474,0.479,0.483,...,0.562,0.571,0.580,0.590,0.601,0.607,0.612,0.618,0.623,0.630
Vanuatu,0.493,0.476,0.472,0.480,0.474,0.481,0.477,0.480,0.496,0.496,...,0.508,0.506,0.504,0.498,0.505,0.505,0.497,0.501,0.503,0.504
Venezuela (Bolivarian Republic of),0.752,0.763,0.766,0.763,0.756,0.760,0.756,0.762,0.760,0.750,...,0.779,0.774,0.775,0.780,0.780,0.777,0.758,0.730,0.705,0.681
Viet Nam,0.395,0.402,0.415,0.423,0.435,0.448,0.459,0.468,0.475,0.481,...,0.546,0.567,0.574,0.580,0.586,0.593,0.600,0.609,0.616,0.624


In [443]:
# Print the positions of the columns that hold a value (not NaN) in the
# (2000, ' Algeria') row; note the leading space in the country label
row_2000 = undp_data.loc[(2000, ' Algeria'), :]
for i, x in enumerate(row_2000):
    if np.isnan(x):
        continue
    print(i)

44


In [628]:
# Write the assembled table to disk; missing values are written as the text 'nan'
undp_data.to_csv('dataset/undp_data.csv', na_rep='nan')

In [444]:
# Display the table at position 44 of `data`; which indicator that is
# depends on the order in which the files were loaded
data[44]

Unnamed: 0_level_0,1990,1995,2000,2005,2010,2011,2012,2013,2014,2015,2016,2017,2018
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Afghanistan,6.1,9.0,10.1,12.8,14.4,15.0,15.7,16.5,17.3,18.1,18.8,19.5,20.2
Albania,2.0,1.9,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
Algeria,13.7,16.3,19.0,21.8,24.2,24.6,25.0,25.3,25.7,26.0,26.3,26.6,26.8
Angola,5.9,7.0,8.2,9.8,11.8,12.2,12.7,13.1,13.6,14.1,14.6,15.1,15.7
Antigua and Barbuda,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe,5.4,6.1,6.5,6.7,7.0,7.1,7.2,7.3,7.4,7.5,7.6,7.8,7.9
Andorra,,,,,,,,,,,,,
Dominica,,,,,,,,,,,,,
Saint Kitts and Nevis,,,,,,,,,,,,,


In [413]:
# Distinct second elements of the (year, country) index entries, in order of appearance
pd.Series([country for _year, country in undp_data.index]).unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria',
       'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo',
       'Congo (Democratic Republic of the)', 'Costa Rica', 'Croatia',
       'Cuba', 'Cyprus', 'Czechia', "Côte d'Ivoire", 'Denmark',
       'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini (Kingdom of)', 'Ethiopia', 'Fiji', 'Finland', 'France',
       'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece',
     