# Grouping countries based on the Human Development Index (HDI)

In [None]:
import pandas as pd
import warnings
import functions

warnings.filterwarnings(action='ignore')

In [None]:
def get_hdi(path,filename):
    '''Create two dictionaries based on the Human Development Index (HDI) and its levels.

    The Excel file is read without a header row. Column 1 is used as the country
    name and column 2 as the HDI value; only rows 7 up to (not including) 200 are
    considered. Rows whose name contains "HUMAN DEVELOPMENT" are treated as title
    rows that separate the four HDI levels (very high, high, medium, low, in that
    order of appearance).

    Parameters:
        path: directory that contains the Excel file.
        filename: name of the Excel file.

    Returns:
        A tuple (dict_hdi, dict_hdi_levels):
        dict_hdi maps country name -> HDI value,
        dict_hdi_levels maps country name -> "very high" / "high" / "medium" / "low".
        Taiwan is added to both by hand (0.911, "very high").

    Raises:
        IndexError: if fewer than four title rows are found in the selected rows.
    '''

    # Country names as spelled in the HDI file -> names used in the regions data frame.
    # NOTE(review): every other entry maps an HDI-file spelling to the regions
    # spelling, but the last one maps "American Samoa" to "Samoa". If the HDI file
    # only lists "Samoa" this entry has no effect - confirm the intended direction
    # (and whether these two territories should be matched at all).
    hdi_to_region_names = {
        "Viet Nam": "Vietnam",
        "Czechia": "Czech Republic",
        "Russian Federation": "Russia",
        "Venezuela (Bolivarian Republic of)": "Venezuela",
        "Korea (Republic of)": "South Korea",
        "Bolivia (Plurinational State of)": "Bolivia",
        "Hong Kong, China (SAR)": "Hong Kong",
        "Moldova (Republic of)": "Moldova",
        "Tanzania (United Republic of)": "Tanzania",
        "Lao People's Democratic Republic": "Laos",
        "Congo (Democratic Republic of the)": "Democratic Republic of the Congo",
        "American Samoa": "Samoa",
    }

    # Row window (positional) of the sheet that holds the title rows and countries.
    first_row, last_row = 7, 200

    # Read in the xlsx-file with data
    file = "{}/{}".format(path,filename)
    hdi_data = pd.read_excel(file, header=None)

    # Align the country names with the ones used in the regions data frame.
    hdi_data[1] = hdi_data[1].replace(hdi_to_region_names)

    table = hdi_data.iloc[first_row:last_row]

    ## Create hdi dictionary

    # Keep the name and value columns and drop the title rows, which have no value.
    df_hdi = table.iloc[:, 1:3]
    df_hdi = df_hdi[df_hdi[2].notna()]

    # Append Taiwan with a hard-coded value.
    df_hdi_taiwan = pd.DataFrame([["Taiwan", 0.911]], columns = [1,2])
    df_hdi = pd.concat([df_hdi, df_hdi_taiwan])

    # Countries as keys, hdi as values.
    dict_hdi = dict(zip(df_hdi[1], df_hdi[2]))

    ## Create hdi-levels dictionary

    df_levels = table.iloc[:, [1]]

    # Row labels of the title rows that introduce each hdi-level.
    is_title = df_levels[1].str.contains("HUMAN DEVELOPMENT")
    idx = df_levels[is_title].index

    # The row labels still count from the top of the sheet, so the position of a
    # title row inside df_levels is (label - first_row). A level starts one row
    # after its own title and stops right before the next title; the last level
    # runs to the end of the selected rows.
    level_names = ("very high", "high", "medium", "low")
    starts = [idx[k] - first_row + 1 for k in range(4)]
    stops = [idx[k] - first_row for k in range(1, 4)] + [None]

    level_frames = []
    for level, start, stop in zip(level_names, starts, stops):
        # Explicit copy: adding a column to a bare slice is chained assignment.
        frame = df_levels.iloc[start:stop, :].copy()
        frame[2] = level
        level_frames.append(frame)

    # Append Taiwan directly after the "very high" countries.
    df_levels_taiwan = pd.DataFrame([["Taiwan", "very high"]], columns = [1,2])
    level_frames.insert(1, df_levels_taiwan)

    df_hdi_levels = pd.concat(level_frames)

    # Countries as keys, hdi-levels as values.
    dict_hdi_levels = dict(zip(df_hdi_levels[1], df_hdi_levels[2]))

    print('Creating dictionaries for hdi and hdi-levels completed.')
    return dict_hdi, dict_hdi_levels

In [None]:
def create_hdi_columns(regions, dict_hdi, dict_hdi_levels):
    '''Add an "hdi" and an "hdi_level" column to the regions table.

    Each value in the "country_agg" column is looked up in the two dictionaries.

    Parameters:
        regions: data frame with a "country_agg" column; it is modified in place.
        dict_hdi: dictionary with countries as keys and the hdi as values.
        dict_hdi_levels: dictionary with countries as keys and the hdi-level as values.

    Returns:
        The same regions data frame, with the columns "hdi" and "hdi_level" set.
        Countries missing from a dictionary get the string "NaN" (not a float
        NaN); this placeholder is kept as-is so existing callers are unaffected.
    '''

    # Iterate over the column values directly instead of doing a label lookup per
    # row: this also works when the index contains duplicate labels.
    countries = regions["country_agg"].tolist()

    # Look up the hdi value for every country.
    index = [dict_hdi.get(country, "NaN") for country in countries]
    print('Creating hdi list completed.')

    # Look up the hdi-level for every country.
    levels = [dict_hdi_levels.get(country, "NaN") for country in countries]
    print('Creating hdi-level list completed.')

    # Fill the lists into their corresponding columns.
    regions["hdi"] = index
    regions["hdi_level"] = levels

    return regions

In [None]:
# Load the region data files (csv) into a dictionary of dataframes.
region_data_dir = "data/CMU_Global_data/Full_Survey_Data/region/smooth/"
dfs_region = functions.get_data(region_data_dir, "region")

# Stack the individual dataframes from the dictionary into a single regions dataframe.
regions = pd.concat(objs=dfs_region)

In [None]:
# Getting the dictionaries for the hdi and the hdi-levels.
dict_hdi, dict_hdi_levels = get_hdi("data","hdro_statistical_data_tables_1_15_d1_d5.xlsx")

# Creating a new column with the hdi and the hdi-levels in the regions dataframe.
# The hdi dictionary returned above is named dict_hdi, so that is the name passed on.
regions = create_hdi_columns(regions, dict_hdi, dict_hdi_levels)

### On checking differences in names

Looking up the countries in the dictionaries initially produced many NaN values, most of which could be explained by differences in country names between the regions data and the HDI data. These differences are listed in the table below. The solution is already included in the `get_hdi` function.

### Differences in naming regions and hdi

regions|hdi
---|---
Vietnam|Viet Nam
Czech Republic|Czechia
Russia|Russian Federation
Venezuela|Venezuela (Bolivarian Republic of)
South Korea|Korea (Republic of)
Bolivia|Bolivia (Plurinational State of)
Hong Kong|Hong Kong, China (SAR)
Taiwan|*non-existent*
Moldova|Moldova (Republic of)
Tanzania|Tanzania (United Republic of)
Laos|Lao People's Democratic Republic
Democratic Republic of the Congo|Congo (Democratic Republic of the)
American Samoa|Samoa