In [125]:
%load_ext jupyternotify
import numpy as np
import pandas as pd
import glob
import pickle

The jupyternotify extension is already loaded. To reload it, use:
  %reload_ext jupyternotify


In [126]:
# Pickling functions used to save and load the dictionary file

def save_obj(obj, name):
    """Pickle `obj` into dataset/<name>.pkl with the highest pickle protocol."""
    target = ''.join(('dataset/', name, '.pkl'))
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Read dataset/<name>.pkl and return the unpickled object.

    Unpickling can execute arbitrary code, so only use this on files that
    were produced by this project.
    """
    source = ''.join(('dataset/', name, '.pkl'))
    with open(source, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [447]:
# Importing UN data
def indicator_name(file_name):
    """Return the indicator name encoded in a CSV path.

    The name is the last '/'-separated path component with only its final
    extension removed, so dots that are part of the indicator name itself
    (e.g. "$3.20 a day") are preserved instead of truncating the name.
    """
    return file_name.split('/')[-1].rsplit('.', 1)[0]


data=[]
name=[]
for file_name in glob.glob('dataset/UNDP_HDI/*.csv'):
    # header=1 takes the column labels from the second line of each file;
    # ".." cells are read as missing values.
    data.append(pd.read_csv(file_name,
                             skiprows=0,
                             na_values="..",
                             na_filter=True,
                             header=1))
    # NOTE(review): names containing a dot used to be cut at the first dot;
    # a mapping pickled from such truncated names may need regenerating.
    name.append(indicator_name(file_name))

# This snippet is for dealing with the format of the imported dataframes
for df in data:
    # Drop columns containing 'Unnamed'
    unnamed_cols = [col for col in df.columns if "Unnamed" in col]
    df.drop(labels=unnamed_cols, axis=1, inplace=True)

In [645]:
# Contest training table, plus its column labels as a plain Python list
data_train = pd.read_csv('dataset/train.csv')
columns_train = data_train.columns.tolist()

In [133]:
# WARNING!------------------------------------------------------------------------
# Do not run this code snippet if you only want the map of columns.
# In that case refer to the *pickled dictionary file* in the dataset
# folder. The manual corrections later in this block depend on a
# specific ordering of the UN data columns, which in turn depends on the
# order returned by glob. That load order might differ on your run!

# This code is kept only to record my steps.

import re

def str_comp(lstr1, lstr2):
    """Crude word-overlap similarity between two strings.

    Both strings are split on non-word characters, empty fragments are
    discarded, and the remaining words/numbers are compared
    case-insensitively. The result is the number of matching word pairs
    divided by the word count of the longer string.

    Parameters
    ----------
    lstr1, lstr2 : str
        Strings to be compared.

    Returns
    -------
    float
        0 when the strings share no word; larger means more similar.
        Duplicate words are counted once per matching pair, so the value
        can exceed 1 when a word repeats in one or both strings. Two
        strings without any word at all yield 0.0 (this used to raise
        ZeroDivisionError).
    """
    # Lower-case once up front instead of on every pairwise comparison.
    words1 = [w.lower() for w in re.split(r'\W', lstr1) if w]
    words2 = [w.lower() for w in re.split(r'\W', lstr2) if w]

    longest = max(len(words1), len(words2))
    if longest == 0:
        # Nothing to compare: define the similarity as 0 rather than divide by 0.
        return 0.0

    # Pairwise count on purpose: duplicates inflate the metric exactly as
    # before, so previously recorded matches stay reproducible.
    matchings = sum(1 for w1 in words1 for w2 in words2 if w1 == w2)
    return matchings / longest


# Making a list containing indices to match columns between datasets
# based on the str_comp function 'metric'.
# Each UN indicator name (index i) is paired with the index j of the train
# column that scores highest among the columns not paired yet; on ties the
# later column wins because the comparison is >=.
col_map=[]
taken = set()  # train-column indices already paired (avoids duplicates)
for i, col1 in enumerate(name):
    maxim = 0
    j_m = 0
    for j, col2 in enumerate(columns_train):
        if j in taken:
            # Already assigned to an earlier indicator: never reuse it.
            continue
        comp = str_comp(col1,col2)
        if comp >= maxim:
            maxim = comp
            j_m = j
    col_map.append((i, j_m))
    taken.add(j_m)

# Manual fixes for pairs the automatic matching got wrong.
# NOTE! These manual corrections may change on different loads of the data,
# because the indices depend on the order in which the files were read.
#
# Please use the columns dict pickle file in the dataset folder
# for the final mapping.

corrections = dict([
    ((37, 72), (37, 8)),
    ((43, 78), (43, 72)),
    ((56, 41), (56, 42)),
    ((57, 8), (57, 78)),
    ((10, 42), (10, 41)),
])

# Final list with corrections: a pair is replaced only when it is listed above
col_map_corr = [corrections[pair] if pair in corrections else pair
                for pair in col_map]

# Show every (UN indicator, train column) pairing for a visual check
for i, j in col_map_corr:
    print(i, name[i])
    print(j, columns_train[j], '\n')

0 Education Index
9 Education Index 

1 Employment to population ratio (% ages 15 and older)
34 Employment to population ratio (% ages 15 and older) 

2 Population, ages 65 and older (millions)
20 Population, ages 65 and older (millions) 

3 Mortality rate, infant (per 1,000 live births)
17 Mortality rate, infant (per 1,000 live births) 

4 Human Development Index (HDI), female
70 Intergalactic Development Index (IDI), female 

5 Estimated gross national income per capita, female (2011 PPP$)
56 Estimated gross galactic income per capita, female 

6 Share of seats in parliament (% held by women)
47 Share of seats in senate (% held by female) 

7 Labour force participation rate (% ages 15 and older), male
33 Labour force participation rate (% ages 15 and older), male 

8 Remittances, inflows (% of GDP)
62 Remittances, inflows (% of GGP) 

9 International inbound tourists (thousands)
64 Intergalactic inbound tourists (thousands) 

10 Infants lacking immunization, measles (% of one-year-ol


This code was used to save the pickled dict.  
```python
col_map_abs = {}
for i, j in col_map_corr:
    col_map_abs[name[i].strip()] = columns_train[j].strip()

save_obj(col_map_abs, 'column_dict_from_UN_to_Contest_format')
```

In [549]:
# Collect every distinct, whitespace-stripped entry found in the Country
# column of any imported table (cells whose str() is 'nan' are skipped).

full_list_dirty = set()
for frame in data:
    full_list_dirty.update(
        entry.strip()
        for entry in frame.loc[:, 'Country']
        if str(entry) != 'nan'
    )

In [569]:
# Country-column labels to exclude from the country list
# (presumably aggregate/regional rows rather than single countries).
entries_to_remove = {
    'Arab States',
    'Developing Countries',
    'High human development',
    'Human Development',
    'Least Developed Countries',
    'Low human development',
    'Medium human development',
    'Organization for Economic Co-operation and Development',
    'Small Island Developing States',
    'South Asia',
    'Very high human development',
    'World',
    'East Asia and the Pacific',
    'Latin America and the Caribbean',
    'Regions',
    'Sub-Saharan Africa',
}

# Remaining entries, in alphabetical order
full_list = sorted(full_list_dirty.difference(entries_to_remove))

In [323]:
# Index rows of each dataframe in data by Country column
for df in data:
    # A stray `data[i].loc(:, 'Country')` expression used to sit here; it is
    # not valid Python (the whole cell failed to parse), so it was removed.
    # NOTE(review): other cells read 'Country' as a regular column
    # (e.g. df['Country']), which no longer works once it becomes the
    # index - confirm this cell is still meant to be run.
    df.set_index('Country', inplace=True)

In [325]:
# Trim every table in place: keep the first 189 rows and drop the first
# column (per the original note: only country rows, the years and the
# country list remain).
for pos in range(len(data)):
    data[pos] = data[pos].iloc[:189, 1:]

In [583]:
# Strip surrounding whitespace from every country name.
# Done column-wise: the earlier per-cell version used the enumerate counter
# as a .loc row label, which is only correct for a default 0..n-1 index.
# Non-string cells (e.g. NaN) are passed through untouched.
for i, df in enumerate(data):
    data[i] = df.assign(
        Country=df['Country'].map(lambda v: v.strip() if isinstance(v, str) else v)
    )

In [403]:
# Look at one indicator table: first 189 rows, first column dropped.
# The position 37 depends on the order in which the files were loaded.
i = 37
d = data[i].iloc[:189, 1:]
print(name[i])
d

Human Development Index (HDI)


Unnamed: 0_level_0,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0.304,0.312,0.308,0.303,0.327,0.331,0.335,0.339,0.343,0.345,...,0.447,0.464,0.465,0.479,0.485,0.488,0.490,0.491,0.493,0.496
Albania,0.625,0.608,0.611,0.617,0.629,0.639,0.639,0.649,0.660,0.667,...,0.729,0.740,0.759,0.771,0.781,0.787,0.788,0.788,0.789,0.791
Algeria,0.582,0.589,0.593,0.597,0.602,0.610,0.619,0.629,0.638,0.646,...,0.720,0.730,0.738,0.737,0.746,0.749,0.751,0.755,0.758,0.759
Andorra,,,,,,,,,,0.759,...,0.830,0.828,0.827,0.849,0.846,0.853,0.850,0.854,0.852,0.857
Angola,,,,,,,,,0.384,0.394,...,0.508,0.510,0.525,0.537,0.547,0.557,0.565,0.570,0.576,0.574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Venezuela (Bolivarian Republic of),0.648,0.654,0.656,0.657,0.660,0.662,0.666,0.668,0.670,0.672,...,0.752,0.753,0.764,0.767,0.772,0.770,0.763,0.752,0.735,0.726
Viet Nam,0.484,0.496,0.506,0.517,0.529,0.540,0.539,0.559,0.566,0.578,...,0.650,0.653,0.663,0.668,0.673,0.675,0.680,0.685,0.690,0.693
Yemen,0.396,0.395,0.398,0.398,0.393,0.408,0.418,0.430,0.423,0.432,...,0.503,0.499,0.511,0.501,0.506,0.504,0.493,0.477,0.463,0.463
Zambia,0.421,0.420,0.422,0.418,0.419,0.419,0.420,0.419,0.424,0.428,...,0.521,0.531,0.541,0.552,0.559,0.565,0.570,0.580,0.589,0.591


In [246]:
# Column labels of d after the first one, converted to integers
cols = list(map(int, d.columns[1:]))

In [571]:
idx = pd.IndexSlice

# Column-less frame indexed by every (year, country) combination
undp_ind = pd.MultiIndex.from_product(
    iterables=[cols, full_list],
    names=['year', 'country'],
)
undp_data = pd.DataFrame(index=undp_ind)

In [576]:
# Show the rows for 'Yemen' on the 'country' index level, across all years
undp_data.loc[(slice(None), 'Yemen'), :]

year,country
1990,Yemen
1991,Yemen
1992,Yemen
1993,Yemen
1994,Yemen
1995,Yemen
1996,Yemen
1997,Yemen
1998,Yemen
1999,Yemen


In [591]:
# '1990' column of the first table, restricted to rows whose Country is 'Afghanistan'
data[0][data[0]['Country'] == 'Afghanistan'].loc[:, '1990']

0    0.122
Name: 1990, dtype: object

In [608]:
# Check what `.values` yields when the country filter matches no row
# ('a' is not a country name). The cell previously read `np.data[0]...`,
# which fails because numpy has no attribute `data`; the recorded output
# `numpy.ndarray` corresponds to this type() call.
type(data[0][data[0]['Country'] == 'a'].loc[:, '1990'].values)

numpy.ndarray

In [597]:
# Column labels of the first table from the third column onward
data[0].columns[2:]

Index(['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998',
       '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007',
       '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017', '2018'],
      dtype='object')

In [613]:
# An empty list reports a length of zero
a = list()
len(a) == 0

True

In [619]:
# Fill undp_data: one column per indicator table (named after the table),
# one row per (year, country). For every country in full_list the value of
# each year column (all columns from the third onward) is copied over; when
# the table has no row for that country the cell is set to NaN.

for df, nam in zip(data, name):
    year_columns = df.columns[2:]
    for country in full_list:
        # The row filter does not depend on the year, so compute it once
        # per country instead of once per (country, year).
        country_rows = df[df['Country'] == country]
        for year in year_columns:
            val = country_rows.loc[:, year].values
            # First matching row wins; no matching row means missing data.
            undp_data.loc[(int(year), country), nam] = val[0] if len(val) else np.nan

In [624]:
# Show the rows for 'Albania' on the 'country' index level, across all years
undp_data.loc[(slice(None),'Albania'),]

Unnamed: 0_level_0,Unnamed: 1_level_0,Education Index,Employment to population ratio (% ages 15 and older),"Population, ages 65 and older (millions)","Mortality rate, infant (per 1,000 live births)","Human Development Index (HDI), female","Estimated gross national income per capita, female (2011 PPP$)",Share of seats in parliament (% held by women),"Labour force participation rate (% ages 15 and older), male","Remittances, inflows (% of GDP)",International inbound tourists (thousands),...,Domestic credit provided by financial sector (% of GDP),Forest area (% of total land area),Life expectancy Index,Adjusted net savings (% of GNI),Population using at least basic drinking-water services (%),"Tuberculosis incidence (per 100,000 people)","Mortality rate, male adult (per 1,000 people)","Population, total (millions)",Young age (0-14) dependency ratio (per 100 people ages 15-64),"Population with at least some secondary education, female (% ages 25 and older)"
year,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1990,Albania,0.584,,0.2,34.8,,,,74.6,,,...,,28.8,0.797,2.3,,,144.0,3.3,53.1,59.5
1991,Albania,0.589,54.9,,,,,,,,,...,,,0.797,,,,,,,
1992,Albania,0.558,,,,,,,,,,...,,,0.797,,,,,,,
1993,Albania,0.543,,,,,,,,,,...,,,0.798,,,,,,,
1994,Albania,0.542,,,,,,,,,,...,,,0.8,,,,,,,
1995,Albania,0.551,52.3,0.2,28.5,0.603,3032.0,,73.9,17.86,304.0,...,40.5,28.4,0.803,9.9,,,144.0,3.1,53.8,60.6
1996,Albania,0.558,,,,,,,,,,...,,,0.808,,,,,,,
1997,Albania,0.571,,,,,,,,,,...,,,0.813,,,,,,,
1998,Albania,0.58,,,,,,,,,,...,,,0.819,,,,,,,
1999,Albania,0.586,,,,,,,,,,...,,,0.824,,,,,,,


In [508]:
# Index entries of the rows selected for year 1990
undp_data.loc[(1990, slice(None)), :].index

MultiIndex([(1990,                         'Afghanistan'),
            (1990,                             'Albania'),
            (1990,                             'Algeria'),
            (1990,                             'Andorra'),
            (1990,                              'Angola'),
            (1990,                 'Antigua and Barbuda'),
            (1990,                           'Argentina'),
            (1990,                             'Armenia'),
            (1990,                           'Australia'),
            (1990,                             'Austria'),
            ...
            (1990,                            ' Vanuatu'),
            (1990, ' Venezuela (Bolivarian Republic of)'),
            (1990,                           ' Viet Nam'),
            (1990,                              ' Yemen'),
            (1990,                             ' Zambia'),
            (1990,                           ' Zimbabwe'),
            (1990,                      

In [512]:
# Values of the second index level ('country') for the 1990 selection
undp_data.loc[(1990, slice(None)), :].index.levels[1]

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       ' Vanuatu', ' Venezuela (Bolivarian Republic of)', ' Viet Nam',
       ' Yemen', ' Zambia', ' Zimbabwe', ' Andorra', ' Dominica',
       ' Saint Kitts and Nevis', ' Liechtenstein'],
      dtype='object', name='country', length=384)

In [443]:
# Print the positions of the non-NaN entries in the row for year 2000 and
# country ' Algeria'.
# NOTE(review): the key has a leading space; the country list is built from
# stripped names elsewhere in this notebook, so this lookup presumably only
# worked against an older, unstripped index - confirm before re-running.
for i,x in enumerate((undp_data.loc[(2000, ' Algeria'), :])):
    if not np.isnan(x): print(i)

44


In [628]:
# Save the assembled table to CSV; missing values are written as the text 'nan'
undp_data.to_csv('dataset/undp_data.csv', na_rep='nan')

In [631]:
# Show the rows for year 2017, all countries
undp_data.loc[(2017, slice(None)),]

Unnamed: 0_level_0,Unnamed: 1_level_0,Education Index,Employment to population ratio (% ages 15 and older),"Population, ages 65 and older (millions)","Mortality rate, infant (per 1,000 live births)","Human Development Index (HDI), female","Estimated gross national income per capita, female (2011 PPP$)",Share of seats in parliament (% held by women),"Labour force participation rate (% ages 15 and older), male","Remittances, inflows (% of GDP)",International inbound tourists (thousands),...,Domestic credit provided by financial sector (% of GDP),Forest area (% of total land area),Life expectancy Index,Adjusted net savings (% of GNI),Population using at least basic drinking-water services (%),"Tuberculosis incidence (per 100,000 people)","Mortality rate, male adult (per 1,000 people)","Population, total (millions)",Young age (0-14) dependency ratio (per 100 people ages 15-64),"Population with at least some secondary education, female (% ages 25 and older)"
year,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2017,Afghanistan,0.408,64.9,0.9,51.5,0.408,1112,27.4,82.2,3.64,,...,-2.5,,0.679,2.7,67,189,240,36.3,81.3,11.4
2017,Albania,0.758,48.5,0.4,7.8,0.769,9423,27.9,65,10.06,4643,...,63.6,,0.897,8.2,91,20,77,2.9,26.3,93.1
2017,Algeria,0.674,36.4,2.6,20.6,0.684,4093,21.3,67.6,1.07,2451,...,67.9,,0.869,21.2,94,70,104,41.4,46.2,37.5
2017,Andorra,0.701,,,3.2,,,32.1,,,,...,,,0.949,,100,1.5,,0.1,,71.5
2017,Angola,0.498,72.2,0.7,53.8,0.547,5054,30.5,80.2,0,261,...,29.7,,0.621,-16.3,56,359,273,29.8,92.4,23.1
2017,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017,Venezuela (Bolivarian Republic of),0.7,57.7,2,25.7,0.738,7827,22.2,77.3,,427,...,,,0.804,,96,42,189,29.4,43,71.7
2017,Viet Nam,0.626,76,6.7,16.7,0.69,5455,26.7,82.6,6.7,12922,...,141.9,,0.85,13.4,95,129,178,94.6,33.1,66.2
2017,Yemen,0.341,33.5,0.8,43.2,0.255,182,0.5,70.7,14.06,,...,,,0.709,,63,48,240,27.8,70,18.7
2017,Zambia,0.572,69.8,0.4,41.5,0.562,2985,18,79.9,0.36,1083,...,21.8,,0.662,,60,361,342,16.9,86.5,39.2


In [None]:
# Preview of the training table. The cell held the unfinished identifier
# `data_tra`, which raises NameError when run; presumably `data_train`
# was meant.
data_train.head()

In [661]:
# Column labels of undp_data (one per imported indicator table)
undp_data.columns

Index(['Education Index',
       'Employment to population ratio (% ages 15 and older)',
       'Population, ages 65 and older (millions)',
       'Mortality rate, infant (per 1,000 live births)',
       'Human Development Index (HDI), female',
       'Estimated gross national income per capita, female (2011 PPP$)',
       'Share of seats in parliament (% held by women)',
       'Labour force participation rate (% ages 15 and older), male',
       'Remittances, inflows (% of GDP)',
       'International inbound tourists (thousands)',
       'Infants lacking immunization, measles (% of one-year-olds)',
       'Foreign direct investment, net inflows (% of GDP)',
       'Employment in services (% of total employment)',
       'Labour force participation rate (% ages 15 and older)',
       'Mean years of schooling, male (years)',
       'Expected years of schooling, male (years)',
       'Mobile phone subscriptions (per 100 people)',
       'Renewable energy consumption (% of total final e

In [646]:
# Author's note: the test dataset contains objects only from the latest
# 10 years, including the latest year 1016064, which is not present in
# the train data.
LATEST_TEST_YEAR = 1016064

unique_years = data_train['galactic year'].unique()
print (unique_years)
print (unique_years.shape)

# print(np.sort(test['galactic year'].unique()))
# print(test['galactic year'].unique().shape)

unique_names = np.sort(data_train['galaxy'].unique())
print (unique_names[:5])
print (unique_names.shape)

# Let's map all galactic years to years with an increment of 1 year
# and map all galaxies to their unique integer key.

# Dictionary for replacing galactic years with normal years (1990, 1991, ...).
# unique() returns values in order of appearance, so the years are sorted
# (and de-duplicated) explicitly with np.unique; otherwise the mapping
# would silently depend on the row order of train.csv.
all_years = np.unique(np.append(unique_years, LATEST_TEST_YEAR))
di = {val: ind+1990 for ind, val in enumerate(all_years)}
# Dictionary for replacing galaxy names with integer keys (1-based)
di_names = {val: ind+1 for ind, val in enumerate(unique_names)}
data_train.replace({'galactic year':di}, inplace=True)
#test.replace({'galactic year':di}, inplace=True)

# data_train.replace({'galaxy':di_names}, inplace=True)
# test.replace({'galaxy':di_names}, inplace=True)

[ 990025  991020  992016  993012  994009  995006  996004  997002  998001
  999000 1000000 1001000 1002001 1003002 1004004 1005006 1006009 1007012
 1008016 1009020 1010025 1011030 1012036 1013042 1014049 1015056]
(26,)
['Andromeda Galaxy (M31)' 'Andromeda I' 'Andromeda II' 'Andromeda III'
 'Andromeda IX']
(181,)


In [653]:
# Distinct values of the two key columns of the training table
unique_data_years, unique_galaxy_names = (
    data_train[label].unique() for label in ('galactic year', 'galaxy')
)

In [659]:
# Training table indexed by (galactic year, galaxy); the result is stored in data_train_m
data_train_m = data_train.set_index(['galactic year', 'galaxy'])

In [668]:
# Prepare the train set for comparison with the UNDP dataset. The rank
# columns are removed because the HDI ranks of the year the original
# dataset was pulled from may differ from the 2018 ones; the 'y' column
# is removed as well.
rank_and_target_columns = [
    'Intergalactic Development Index (IDI), Rank',
    'Intergalactic Development Index (IDI), female, Rank',
    'Intergalactic Development Index (IDI), male, Rank',
    'y',
]
data_train_cl = data_train_m.drop(columns=rank_and_target_columns)

In [669]:
# Display the cleaned training table
data_train_cl

Unnamed: 0_level_0,Unnamed: 1_level_0,existence expectancy index,existence expectancy at birth,Gross income per capita,Income Index,Expected years of education (galactic years),Mean years of education (galactic years),Intergalactic Development Index (IDI),Education Index,Population using at least basic drinking-water services (%),Population using at least basic sanitation services (%),...,Interstellar phone subscriptions (per 100 people),"Interstellar Data Net users, total (% of population)",Current health expenditure (% of GGP),"Intergalactic Development Index (IDI), female","Intergalactic Development Index (IDI), male",Gender Development Index (GDI),Adjusted net savings,"Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total",Private galaxy capital flows (% of GGP),Gender Inequality Index (GII)
galactic year,galaxy,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1990,Large Magellanic Cloud (LMC),0.628657,63.125200,27109.234310,0.646039,8.240543,,,,,,...,,,,,,,,,,
1990,Camelopardalis B,0.818082,81.004994,30166.793958,0.852246,10.671823,4.742470,0.833624,0.467873,,,...,,,,,,,19.177926,,22.785018,
1990,Virgo I,0.659443,59.570534,8441.707353,0.499762,8.840316,5.583973,0.469110,0.363837,,,...,,,,,,,21.151265,6.534020,,
1990,UGC 8651 (DDO 181),0.555862,52.333293,,,,,,,,,...,,,,,,,,5.912194,,
1990,Tucana Dwarf,0.991196,81.802464,81033.956906,1.131163,13.800672,13.188907,0.910341,0.918353,,,...,,,,,,,,5.611753,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015,Columba I,1.029704,82.832063,34310.471408,0.855094,18.578586,10.557143,0.906573,0.862826,116.585709,121.891775,...,121.836488,79.897857,10.392312,0.943410,0.902237,1.060532,26.438719,3.023709,29.294865,0.580785
2015,"Leo II Dwarf (Leo B, DDO 93)",0.937869,75.877098,36899.067719,0.929494,16.153857,9.151665,0.865822,0.747577,121.672753,115.422812,...,157.818080,53.061093,10.296360,0.915225,0.798083,1.055118,20.637654,4.470596,31.085400,0.517558
2015,Canes Venatici I Dwarf,1.036144,93.540275,37002.977875,1.085245,21.066473,16.661344,0.983835,1.100779,125.376956,114.907359,...,177.621424,116.206334,9.601421,1.097208,1.044890,1.114754,28.154859,5.193997,32.145570,0.363862
2015,KKs 3,0.939034,78.274427,28180.459770,0.687655,9.388911,8.908748,0.735694,0.602703,105.345928,88.416415,...,96.875054,41.923105,4.137744,0.596164,0.754729,0.825864,38.963157,2.854140,27.227179,0.711878


In [700]:
# Inverse dictionary of col_map_abs: Contest column names -> UNDP column names.
# col_map_abs is read back from its pickle in the dataset folder because no
# executable cell of this notebook defines it (the code that built it is
# only kept as a markdown snippet), so a fresh kernel would otherwise fail
# here with a NameError.
col_map_abs = load_obj('column_dict_from_UN_to_Contest_format')
col_map_abs_inv = {v: k for k, v in col_map_abs.items()}

In [704]:
# UNDP column name for each train column, in train-column order
# (a train column without a mapping yields None)
undp_order = list(map(col_map_abs_inv.get, data_train_cl.columns))

In [705]:
# UNDP table with its columns selected and ordered according to undp_order
undp_data[undp_order]

Unnamed: 0_level_0,Unnamed: 1_level_0,Life expectancy Index,Life expectancy at birth,Gross national income (GNI) per capita (2011 PPP$),Income Index,Expected years of schooling (years),Mean years of schooling (years),Human Development Index (HDI),Education Index,Population using at least basic drinking-water services (%),Population using at least basic sanitation services (%),...,Mobile phone subscriptions (per 100 people),"Internet users, total (% of population)",Current health expenditure (% of GDP),"Human Development Index (HDI), female","Human Development Index (HDI), male",Gender Development Index (GDI),Adjusted net savings (% of GNI),"HIV prevalence, adult (% ages 15-49), total",Private capital flows (% of GDP),Gender Inequality Index (GII)
year,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1990,Afghanistan,0.467,50.3,2193,0.466,2.6,1.5,0.298,0.122,,,...,,,,,,,,,,
1990,Albania,0.797,71.8,4415,0.572,11.6,7.8,0.644,0.584,,,...,,,,,,,2.3,0.1,,
1990,Algeria,0.722,66.9,9989,0.695,9.6,3.6,0.578,0.385,,,...,,,,,,,9.0,0.1,,
1990,Andorra,0.870,76.5,49062,0.936,10.8,,,,,,...,,,,,,,,,,
1990,Angola,0.389,45.3,4139,0.562,3.4,,,,,,...,,,,,,,,0.2,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,Venezuela (Bolivarian Republic of),0.802,72.1,9070,0.681,12.8,10.3,0.726,0.7,,,...,71.8,,,0.728,0.719,1.013,,,,0.458
2018,Viet Nam,0.851,75.3,6220,0.624,12.7,8.2,0.693,0.626,,,...,147.2,70.3,,0.693,0.692,1.003,,,,0.314
2018,Yemen,0.709,66.1,1433,0.402,8.7,3.2,0.463,0.347,,,...,,,,0.245,0.535,0.458,,,,0.834
2018,Zambia,0.669,63.5,3582,0.541,12.1,7.1,0.591,0.572,,,...,89.2,14.3,,0.565,0.596,0.948,,,0.2,0.54


In [None]:
from skleanr 