In [125]:
%load_ext jupyternotify
import glob
import os
import pickle
import re
from collections import Counter

import numpy as np
import pandas as pd

The jupyternotify extension is already loaded. To reload it, use:
  %reload_ext jupyternotify


In [46]:
def indicator_name(path):
    """Return the indicator title encoded in a CSV file path.

    The directory part is removed with os.path.basename and only the final
    extension is stripped, so a title that itself contains a period is kept
    whole (splitting on '.' would cut it at the first period).
    """
    return os.path.splitext(os.path.basename(path))[0]


# Read every UNDP indicator CSV; data[k] is the frame and name[k] the
# indicator title taken from the file name, appended in lockstep.
# NOTE: glob yields files in arbitrary order, so the positions in these two
# lists are not guaranteed to be the same on another machine.
data = []
name = []
for file_name in glob.glob('dataset/UNDP_HDI/*.csv'):
    data.append(pd.read_csv(file_name,
                            na_values="..",   # '..' cells are read as NaN
                            na_filter=True,
                            header=1))        # row 1 (0-based) holds the column labels
    name.append(indicator_name(file_name))

In [6]:
# Position of every UNDP indicator title in `name`
list(enumerate(name))

[(0, 'Education Index'),
 (1, 'Employment to population ratio (% ages 15 and older)'),
 (2, 'Population, ages 65 and older (millions)'),
 (3, 'Mortality rate, infant (per 1,000 live births)'),
 (4, 'Human Development Index (HDI), female'),
 (5, 'Estimated gross national income per capita, female (2011 PPP$)'),
 (6, 'Share of seats in parliament (% held by women)'),
 (7, 'Labour force participation rate (% ages 15 and older), male'),
 (8, 'Remittances, inflows (% of GDP)'),
 (9, 'International inbound tourists (thousands)'),
 (10, 'Infants lacking immunization, measles (% of one-year-olds)'),
 (11, 'Foreign direct investment, net inflows (% of GDP)'),
 (12, 'Employment in services (% of total employment)'),
 (13, 'Labour force participation rate (% ages 15 and older)'),
 (14, 'Mean years of schooling, male (years)'),
 (15, 'Expected years of schooling, male (years)'),
 (16, 'Mobile phone subscriptions (per 100 people)'),
 (17, 'Renewable energy consumption (% of total final energy consu

In [47]:
def str_comp(lstr1, lstr2):
    """Crude word-overlap similarity between two strings, from 0 to 1.

    Both strings are split on non-word characters (regex ``\\W``) and compared
    case-insensitively. The score is the number of words the two strings
    share, divided by the word count of the longer string.

    Each occurrence of a word can be matched at most once, so repeated words
    cannot push the score above 1. Two strings without any words score 0.0
    instead of dividing by zero.

    Parameters
    ----------
    lstr1, lstr2 : str
        The strings to compare.

    Returns
    -------
    float
        Similarity in the closed interval [0, 1].
    """
    # filter(None, ...) drops the empty strings re.split leaves behind
    # wherever two separators are adjacent.
    words1 = Counter(w.lower() for w in filter(None, re.split(r'\W', lstr1)))
    words2 = Counter(w.lower() for w in filter(None, re.split(r'\W', lstr2)))

    longest = max(sum(words1.values()), sum(words2.values()))
    if longest == 0:
        return 0.0

    # Counter & Counter keeps, per word, the smaller of the two counts.
    matchings = sum((words1 & words2).values())
    return matchings / longest

In [48]:
# Load the contest training set and keep its column labels as a plain list
data_train = pd.read_csv('dataset/train.csv')
columns_train = data_train.columns.tolist()

In [49]:
# Position of every training-set column label
[(position, label) for position, label in enumerate(columns_train)]

[(0, 'galactic year'),
 (1, 'galaxy'),
 (2, 'existence expectancy index'),
 (3, 'existence expectancy at birth'),
 (4, 'Gross income per capita'),
 (5, 'Income Index'),
 (6, 'Expected years of education (galactic years)'),
 (7, 'Mean years of education (galactic years)'),
 (8, 'Intergalactic Development Index (IDI)'),
 (9, 'Education Index'),
 (10, 'Intergalactic Development Index (IDI), Rank'),
 (11, 'Population using at least basic drinking-water services (%)'),
 (12, 'Population using at least basic sanitation services (%)'),
 (13, 'Gross capital formation (% of GGP)'),
 (14, 'Population, total (millions)'),
 (15, 'Population, urban (%)'),
 (16, 'Mortality rate, under-five (per 1,000 live births)'),
 (17, 'Mortality rate, infant (per 1,000 live births)'),
 (18,
  'Old age dependency ratio (old age (65 and older) per 100 creatures (ages 15-64))'),
 (19, 'Population, ages 15–64 (millions)'),
 (20, 'Population, ages 65 and older (millions)'),
 (21, 'Life expectancy at birth, male (gala

In [53]:
# Build col_map: a list of (i, j) pairs where i indexes `name` (UNDP
# indicator titles) and j indexes `columns_train`, pairing each indicator
# with the most similar training column that no earlier indicator has taken.
col_map = []
used_cols = set()   # training-column indices already present in col_map
for i, col1 in enumerate(name):
    maxim = 0
    j_m = 0   # remains 0 only when every training column is already taken
    for j, col2 in enumerate(columns_train):
        # Skip columns that were already assigned, to avoid duplicates.
        if j in used_cols:
            continue
        comp = str_comp(col1, col2)
        # `>=` means that among equally similar candidates the one that
        # appears last in columns_train is kept.
        if comp >= maxim:
            maxim = comp
            j_m = j
    col_map.append((i, j_m))
    used_cols.add(j_m)

In [54]:
# Print each UNDP indicator above the training column it was paired with
for undp_idx, train_idx in col_map:
    print(f"{undp_idx} {name[undp_idx]}")
    print(f"{train_idx} {columns_train[train_idx]} \n")

0 Education Index
9 Education Index 

1 Employment to population ratio (% ages 15 and older)
34 Employment to population ratio (% ages 15 and older) 

2 Population, ages 65 and older (millions)
20 Population, ages 65 and older (millions) 

3 Mortality rate, infant (per 1,000 live births)
17 Mortality rate, infant (per 1,000 live births) 

4 Human Development Index (HDI), female
70 Intergalactic Development Index (IDI), female 

5 Estimated gross national income per capita, female (2011 PPP$)
56 Estimated gross galactic income per capita, female 

6 Share of seats in parliament (% held by women)
47 Share of seats in senate (% held by female) 

7 Labour force participation rate (% ages 15 and older), male
33 Labour force participation rate (% ages 15 and older), male 

8 Remittances, inflows (% of GDP)
62 Remittances, inflows (% of GGP) 

9 International inbound tourists (thousands)
64 Intergalactic inbound tourists (thousands) 

10 Infants lacking immunization, measles (% of one-year-ol

In [92]:
# Manually fixing mistakes made by the mapping script.
#
# Corrections are keyed by title (UNDP indicator title -> training column
# label) rather than by position, so they keep working if the CSV files are
# loaded in a different order.
corrections = {
    'Human Development Index (HDI)': 'Intergalactic Development Index (IDI)',
    'Gender Inequality Index (GII)': 'Gender Inequality Index (GII)',
    'Infants lacking immunization, DTP (% of one-year-olds)': 'Infants lacking immunization, Combination Vaccine (% of one-galactic year-olds)',
    'Infants lacking immunization, measles (% of one-year-olds)': 'Infants lacking immunization, red hot disease (% of one-galactic year-olds)',
}

# Case- and whitespace-insensitive lookup tables.
corrections_norm = {src.strip().lower(): dst.strip().lower()
                    for src, dst in corrections.items()}
train_col_index = {}
for idx, label in enumerate(columns_train):
    # setdefault keeps the first position if two labels normalise identically
    train_col_index.setdefault(label.strip().lower(), idx)

# col_map_corr has the same (i, j) layout as col_map, with j replaced for
# every indicator that has a manual correction.
# NOTE(review): a correction does not re-assign an indicator that the
# automatic step had already pointed at the same training column; check the
# printed mapping for duplicates.
col_map_corr = []
for i, j in col_map:
    target = corrections_norm.get(name[i].strip().lower())
    if target is not None:
        if target not in train_col_index:
            raise KeyError(
                f"corrected column {target!r} for {name[i]!r} not found in columns_train")
        j = train_col_index[target]
    col_map_corr.append((i, j))

In [100]:
# Show which UNDP indicator titles have a manual correction
corrections.keys()

dict_keys(['Human Development Index (HDI)', 'Gender Inequality Index (GII)', 'Infants lacking immunization, DTP (% of one-year-olds)', 'Infants lacking immunization, measles (% of one-year-olds)'])

In [111]:
# Check that the HDI title matches one of the correction keys when case is ignored
'Human Development Index (HDI)'.strip().lower() in {key.lower() for key in corrections}

True

In [113]:
# Print the corrected pairing: UNDP indicator first, training column below it
for undp_idx, train_idx in col_map_corr:
    print(f"{undp_idx} {name[undp_idx]}")
    print(f"{train_idx} {columns_train[train_idx]} \n")

0 Education Index
9 Education Index 

1 Employment to population ratio (% ages 15 and older)
34 Employment to population ratio (% ages 15 and older) 

2 Population, ages 65 and older (millions)
20 Population, ages 65 and older (millions) 

3 Mortality rate, infant (per 1,000 live births)
17 Mortality rate, infant (per 1,000 live births) 

4 Human Development Index (HDI), female
70 Intergalactic Development Index (IDI), female 

5 Estimated gross national income per capita, female (2011 PPP$)
56 Estimated gross galactic income per capita, female 

6 Share of seats in parliament (% held by women)
47 Share of seats in senate (% held by female) 

7 Labour force participation rate (% ages 15 and older), male
33 Labour force participation rate (% ages 15 and older), male 

8 Remittances, inflows (% of GDP)
62 Remittances, inflows (% of GGP) 

9 International inbound tourists (thousands)
64 Intergalactic inbound tourists (thousands) 

10 Infants lacking immunization, measles (% of one-year-ol

In [119]:
# Translate the index pairs into a {UNDP title: training column label} dict,
# with surrounding whitespace stripped from both sides
col_map_abs = {
    name[undp_idx].strip(): columns_train[train_idx].strip()
    for undp_idx, train_idx in col_map_corr
}

In [120]:
# Display the final UNDP-title -> training-column mapping
col_map_abs

{'Education Index': 'Education Index',
 'Employment to population ratio (% ages 15 and older)': 'Employment to population ratio (% ages 15 and older)',
 'Population, ages 65 and older (millions)': 'Population, ages 65 and older (millions)',
 'Mortality rate, infant (per 1,000 live births)': 'Mortality rate, infant (per 1,000 live births)',
 'Human Development Index (HDI), female': 'Intergalactic Development Index (IDI), female',
 'Estimated gross national income per capita, female (2011 PPP$)': 'Estimated gross galactic income per capita, female',
 'Share of seats in parliament (% held by women)': 'Share of seats in senate (% held by female)',
 'Labour force participation rate (% ages 15 and older), male': 'Labour force participation rate (% ages 15 and older), male',
 'Remittances, inflows (% of GDP)': 'Remittances, inflows (% of GGP)',
 'International inbound tourists (thousands)': 'Intergalactic inbound tourists (thousands)',
 'Infants lacking immunization, measles (% of one-year-ol

In [126]:
def save_obj(obj, name, directory='dataset'):
    """Pickle `obj` to the file '<directory>/<name>.pkl'.

    Parameters
    ----------
    obj : object
        Any picklable object.
    name : str
        File name without the '.pkl' extension.
    directory : str, optional
        Folder the file is written to. Defaults to 'dataset', which is the
        location this function used before the parameter existed.
    """
    with open(os.path.join(directory, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name, directory='dataset'):
    """Load and return the object pickled in '<directory>/<name>.pkl'.

    Parameters
    ----------
    name : str
        File name without the '.pkl' extension.
    directory : str, optional
        Folder the file is read from. Defaults to 'dataset', which is the
        location this function used before the parameter existed.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # were produced by this project.
    with open(os.path.join(directory, name + '.pkl'), 'rb') as f:
        return pickle.load(f)

In [127]:
# Persist the UNDP-title -> training-column mapping as a pickle under dataset/
save_obj(col_map_abs, 'column_dict_from_UN_to_Contest_format')

In [None]:
# Read the saved mapping back to confirm the pickle round-trips.
# load_obj requires the file name (without '.pkl') as its argument.
load_obj('column_dict_from_UN_to_Contest_format')

In [167]:
# Tidy the imported dataframes: remove, in place, every column whose label
# contains 'Unnamed'
for frame in data:
    unnamed_cols = [label for label in frame.columns if "Unnamed" in label]
    if unnamed_cols:
        frame.drop(labels=unnamed_cols, axis=1, inplace=True)

In [169]:
# Find the frame by indicator title instead of a hard-coded position:
# positions in `name` / `data` follow the order in which glob returned the
# files, which is not guaranteed to be stable.
hdi_title = 'Human Development Index (HDI)'
i = [title.strip() for title in name].index(hdi_title)  # ValueError if absent
print(name[i])
# First 189 rows; presumably this leaves out trailing non-country rows — TODO confirm
data[i][:189]

Human Development Index (HDI)


Unnamed: 0,HDI Rank (2018),Country,1990,1991,1992,1993,1994,1995,1996,1997,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,170,Afghanistan,0.298,0.304,0.312,0.308,0.303,0.327,0.331,0.335,...,0.447,0.464,0.465,0.479,0.485,0.488,0.490,0.491,0.493,0.496
1,69,Albania,0.644,0.625,0.608,0.611,0.617,0.629,0.639,0.639,...,0.729,0.740,0.759,0.771,0.781,0.787,0.788,0.788,0.789,0.791
2,82,Algeria,0.578,0.582,0.589,0.593,0.597,0.602,0.610,0.619,...,0.720,0.730,0.738,0.737,0.746,0.749,0.751,0.755,0.758,0.759
3,36,Andorra,,,,,,,,,...,0.830,0.828,0.827,0.849,0.846,0.853,0.850,0.854,0.852,0.857
4,149,Angola,,,,,,,,,...,0.508,0.510,0.525,0.537,0.547,0.557,0.565,0.570,0.576,0.574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,96,Venezuela (Bolivarian Republic of),0.638,0.648,0.654,0.656,0.657,0.660,0.662,0.666,...,0.752,0.753,0.764,0.767,0.772,0.770,0.763,0.752,0.735,0.726
185,118,Viet Nam,0.475,0.484,0.496,0.506,0.517,0.529,0.540,0.539,...,0.650,0.653,0.663,0.668,0.673,0.675,0.680,0.685,0.690,0.693
186,177,Yemen,0.392,0.396,0.395,0.398,0.398,0.393,0.408,0.418,...,0.503,0.499,0.511,0.501,0.506,0.504,0.493,0.477,0.463,0.463
187,143,Zambia,0.424,0.421,0.420,0.422,0.418,0.419,0.419,0.420,...,0.521,0.531,0.541,0.552,0.559,0.565,0.570,0.580,0.589,0.591


In [123]:
undp_data = pd.DataFrame()