In [3]:
import os

# Root of the repository checkout, supplied through the REPO_DIR environment variable.
repo_dir = os.environ.get("REPO_DIR")
if repo_dir is None:
    # Fail early with an actionable message; os.path.join(None, ...) would
    # otherwise raise an opaque TypeError.
    raise RuntimeError("The REPO_DIR environment variable must be set to the repository root.")

code_dir = os.path.join(repo_dir, "code/")
data_dir = os.path.join(repo_dir, "data/")

# Work from the code directory; presumably the project-local `mosaiks` imports
# further down rely on this -- TODO confirm.
os.chdir(code_dir)

import geopandas as gpd
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

import os
import shutil


import sys


from mosaiks.label_utils.utils import geopandas_shape_grid, box_grid, assign_grid_points_to_gpdFile, get_dense_grid_for_gpdf_file
from mosaiks.label_utils.plotting_utils import plot_label_map_hist

# HDI data (Smits et al)

**Data Download:**

*Shapefiles and tabular data are separate downloads*

*Files downloaded on July 17, 2023:*

Tabular data:
https://globaldatalab.org/mygdl/downloads/


https://globaldatalab.org/asset/394/SHDI-SGDI-Total%207.0.csv

We are using the SHDI V7.0 data in this analysis. The full database is downloaded from the link above. The version history is [here](https://globaldatalab.org/shdi/archive/).


*A previous version of this manuscript used the V4 version of these labels.*


Shapefiles:
https://globaldatalab.org/shdi/shapefiles/

https://globaldatalab.org/asset/403/GDL%20Shapefiles%20V6.1.zip

We use the newest shapefile available on July 17, 2023. This is the `GDL Shapefiles V6.1`. This file is NOT included in the GitHub repository and must be downloaded to replicate our data cleaning.



**Data Citation**

Smits, J., Permanyer, I. The Subnational Human Development Database. Sci Data 6, 190038 (2019). https://doi.org/10.1038/sdata.2019.38

**Corresponding paper:**

https://www.nature.com/articles/sdata201938


**Abstract**

In this paper we describe the Subnational Human Development Database. This database contains for the period 1990–2017 for 1625 regions within 161 countries the national and subnational values of the Subnational Human Development Index (SHDI), for the three dimension indices on the basis of which the SHDI is constructed – education, health and standard of living --, and for the four indicators needed to create the dimension indices -- expected years of schooling, mean years of schooling, life expectancy and gross national income per capita. The subnational values of the four indicators were computed using data from statistical offices and from the Area Database of the Global Data Lab, which contains indicators aggregated from household surveys and census datasets. Values for missing years were estimated by interpolation and extrapolation from real data. By normalizing the population-weighted averages of the indicators to their national levels in the UNDP-HDI database, values of the SHDI and its dimension indices were obtained that at national level equal their official versions of the UNDP.


**Data sources**

Three major data sources were used to create our SHDI database. We approached statistical offices, including Eurostat, the statistical office of the European Union (https://ec.europa.eu/eurostat), by email communication or visiting their websites to obtain data. We downloaded data from the Area Database of the Global Data Lab (https://www.globaldatalab.org). And we downloaded data from the HDI website of the Human Development Report Office of the United Nations Development Program (http://hdr.undp.org). In the ‘SHDI Start’ data file (Data Citation 1), for each country information is provided on the data source(s) used for the subnational values of the indicators. In this file also for each country the years for which data is available, the number of subnational regions and the population size is presented. Below we discuss the three main data sources in more detail.



## Read in shape files

In [None]:
# Raw inputs live under raw/GDL_HDI/, cleaned intermediate outputs under int/GDL_HDI/.
directory = data_dir + "raw/GDL_HDI/"
out_directory = data_dir + "int/GDL_HDI/"

## This file MUST be downloaded from the link above and placed in the correct subdirectory
shp_dir = directory + "GDL_Shapefiles_V6.1/"
shp_path = shp_dir + "shdi2022_World_large.shp"
zip_path = directory + "GDL Shapefiles V6.1.zip"

# If only the downloaded archive is present, extract it first. Previously the
# extraction ran only when creating the target directory failed, and the
# shapefile was never read after unzipping.
if not os.path.exists(shp_path) and os.path.exists(zip_path):
    print("unzipping file")
    os.makedirs(shp_dir, exist_ok=True)
    shutil.unpack_archive(zip_path, shp_dir)

# Stop here with a clear error instead of letting later cells fail with a
# NameError on `gpdf`.
if not os.path.exists(shp_path):
    raise FileNotFoundError(
        "Shapefile needs to be downloaded and placed in the correct directory. "
        "See details above. Expected file: " + shp_path
    )

print("reading shp")
gpdf = gpd.read_file(shp_path)

In [None]:
# Revert to an older naming convention for the region identifier column
gpdf = gpdf.rename(columns={"gdlcode": "GDLcode"})

In [None]:
# Rows with a missing GDLcode (expected to be empty: no null GDLcodes)
missing_gdlcode = gpdf["GDLcode"].isna()
gpdf.loc[missing_gdlcode]

In [None]:
# Key the shapefile by GDLcode, then patch known iso_code problems
gpdf = gpdf.set_index("GDLcode")

gpdf.loc["BHRt", "iso_code"] = "BHR"  # Fix weird anomaly in shapefile

is_cuba = gpdf.index.str.startswith("CUB")
gpdf.loc[is_cuba, "iso_code"] = "CUB"  # fix missing iso code for Cuba

# Recode XKO as KSV
gpdf["iso_code"] = gpdf["iso_code"].replace({"XKO": "KSV"})

In [None]:
# Keep a copy of the rows that still lack a country code, then remove them from gpdf
missing_iso = gpdf["iso_code"].isna()
nulls = gpdf.loc[missing_iso]

gpdf = gpdf.dropna(subset=["iso_code"])

### Let's make and save a country-aggregated version of this shapefile -- it will be useful later

In [2]:
# Left commented out: dissolves the shapefile by iso_code and pickles the result.
# The pickle path written here is the same file that the following cell reads.
# gpdf_country = gpdf.dissolve("iso_code")
# gpdf_country.to_pickle(out_directory + "/HDI_ADM0_dissolved_shapefile.p")

In [None]:
# Country-level (ADM0) shapefile, cached on disk. Reuse the pickle if it exists;
# otherwise build it by dissolving the ADM1 polygons by iso_code and save it, so
# that a fresh "Restart & Run All" does not depend on a file nothing creates.
country_shp_path = out_directory + "/HDI_ADM0_dissolved_shapefile.p"

if os.path.exists(country_shp_path):
    gpdf_country = pd.read_pickle(country_shp_path)
else:
    gpdf_country = gpdf.dissolve("iso_code")
    os.makedirs(out_directory, exist_ok=True)
    gpdf_country.to_pickle(country_shp_path)

## Read and clean data files

See above for details on this tabular data download.

In [None]:
# Load the full SHDI table
shdi_csv_path = directory + "/SHDI-SGDI-Total 7.0.csv"
data = pd.read_csv(shdi_csv_path, low_memory=False)

# Subset to only 2019 observations. This is the year for which we have MOSAIKS features
is_2019 = data["year"] == 2019
data = data[is_2019]

In [None]:
# Shapefile rows whose iso_code was still null after the manual fixes; these rows were dropped from gpdf
nulls

In [None]:
# None of the remaining null iso codes have matching HDI values
has_null_iso_shape = data["GDLCODE"].isin(nulls.index)
data.loc[has_null_iso_shape]

In [None]:
# Map the raw column names to the descriptive labels used downstream
rename_dictionary = {
    "shdi": "Sub-national HDI",
    "msch": "Mean years schooling",
    "esch": "Expected years schooling",
    "lifexp": "Life expectancy",
    "gnic": "GNI per capita in thousands of US$ (2011 PPP)",
    "iso_code": "ISO_Code",
}

# The prediction tasks are every renamed column except the country code
tasks = [label for raw_name, label in rename_dictionary.items() if raw_name != "iso_code"]

data = data.rename(columns=rename_dictionary)

In [None]:
# Columns that are dropped from the tabular data
unneeded_cols = [
    "sgdi", "shdif", "shdim",
    "healthindex", "healthindexf", "healthindexm",
    "incindex", "incindexf", "incindexm",
    "edindex", "edindexf", "edindexm",
    "eschf", "eschm",
    "mschf", "mschm",
    "gnicf", "gnicm",
    "lgnic", "lgnicf", "lgnicm",
    "lifexpf", "lifexpm",
]

data = data.drop(columns=unneeded_cols)

# Set ISO code for Kosovo (XKO -> KSV). Original note: for our use, first 3 of GDLcode
data["ISO_Code"] = data["ISO_Code"].replace({"XKO": "KSV"})

In [None]:
# Coerce every task column to numeric; values that cannot be parsed become NaN
data[tasks] = data[tasks].apply(pd.to_numeric, errors="coerce")

In [None]:
# Number of rows per ISO_Code. Note: the next cell reassigns this name to a
# boolean mask (group size == 1).
national_data_only_indices = data.groupby("ISO_Code").size()

In [None]:
## Combine the countries that only have national data with the dataframe of subnational entities

# A country with exactly one row has no subnational breakdown
rows_per_country = data.groupby("ISO_Code").size()
national_data_only_indices = rows_per_country == 1

first_row_per_country = data.groupby("ISO_Code").first()
national_data_only = first_row_per_country[national_data_only_indices].reset_index()

is_subnational = data["level"] == "Subnat"
subnational_data_only = data[is_subnational]

df = pd.concat([national_data_only, subnational_data_only])



### Let's inspect the set of countries that do not have subnational province observations

In [None]:
# Display the countries represented by a single row in the 2019 data.
# Uncomment the next line to show every row instead of pandas' truncated view.
#pd.set_option('display.max_rows', None)
print("Countries that do not have ADM1 child regions:")
national_data_only

These are all very small countries and this appears to be reasonable.

### The shapefile is not a perfect match for the tabular data

Let's analyze what is missing


#### First, let's inspect the set of countries that cannot be linked to a shapefile primary key

In [None]:
# National-only countries whose GDLCODE has no polygon in the shapefile
nat_has_shape = national_data_only["GDLCODE"].isin(gpdf.index)
nats_dropped = national_data_only[~nat_has_shape]
nats_dropped

This is a very small country. Excluding it from our analysis seems reasonable.

#### Second, let's inspect the set of ADM1 polygons that cannot be linked to a shapefile primary key

In [None]:
# Subnational regions whose GDLCODE has no polygon in the shapefile
subnat_has_shape = subnational_data_only["GDLCODE"].isin(gpdf.index)
subnats_dropped = subnational_data_only[~subnat_has_shape]
subnats_dropped

In [None]:
# Number of subnational observations that cannot be linked to a polygon
subnats_dropped.shape[0]

Dropping these 47 subnational observations is the best we can do. Some appear quite reasonable (e.g., it probably doesn't make sense to consider Guadeloupe a part of France for the purpose of this analysis).

### Now let's see if there is any data in the shapefile that is missing from the tabular data

In [None]:
print("Shape file obs that don't match tabular data")

# Polygons whose GDLcode never appears in the tabular data -- just a few
in_tabular = gpdf.index.isin(df.GDLCODE)
gpdf[~in_tabular]

In [None]:
# Total tabular observations lost because they have no matching polygon
n_dropped = nats_dropped.shape[0] + subnats_dropped.shape[0]

## Let's go ahead and subset both of these files to the matching set of indices

In [None]:
# Key the tabular data by GDLCODE so it can be aligned with the shapefile index.
# gpdf was already indexed by GDLcode earlier, so only df needs re-indexing.
df = df.set_index("GDLCODE")

In [None]:
# Tabular index labels that also exist in the shapefile index
in_shapefile = df.index.isin(gpdf.index)
matching_locs = df.index[in_shapefile]

In [None]:
# Restrict both tables to the shared labels, in the same order
df, gpdf = df.loc[matching_locs], gpdf.loc[matching_locs]

In [None]:
# Tabular data restricted to the GDLCODEs that also appear in the shapefile index
df

In [None]:
# Persist the aligned tabular data and shapefile
indicators_out_path = out_directory + "/HDI_indicators_and_indices_clean.p"
shapefile_out_path = out_directory + "/HDI_ADM1_shapefile_clean.p"

df.to_pickle(indicators_out_path)
gpdf.to_pickle(shapefile_out_path)

#### Also write the national level data

In [None]:
# National-level rows, keyed by ISO country code.
nat_data = data[data["level"] == "National"].set_index("ISO_Code")

# Only include countries that also have a shapefile. The filtered frame has to be
# assigned back: a bare `nat_data.loc[...]` expression leaves nat_data unfiltered.
# A membership mask is used so that a shapefile country missing from the tabular
# data cannot raise a KeyError.
has_shapefile = nat_data.index.isin(gpdf_country.index)
nat_data = nat_data.loc[has_shapefile]

nat_data.to_pickle(out_directory + "/HDI_indicators_and_indices_adm0_clean.p")

In [None]:
# Number of observations kept after matching tabular data to the shapefile
matching_locs.shape[0]

In [None]:
# Share of tabular observations that could not be linked to a shapefile polygon.
# Scale to percent before rounding: rounding first and then multiplying by 100
# can reintroduce floating-point noise (e.g. 0.07 * 100 == 7.000000000000001).
share_dropped = n_dropped / (len(matching_locs) + n_dropped)
print(round(100 * share_dropped, 1), "% of HDI data dropped")

##  Transform shapefile to .01 x .01 degree grid

This is the form needed for aggregating features in the existing pipeline

In [None]:
# Preview the shapefile rows that remain after subsetting to the matching GDLCODEs
gpdf.head()

In [None]:
# dense_grid = get_dense_grid_for_gpdf_file(gpdf.reset_index(), columns=["GDLCODE", "iso_code"])

In [None]:
# dense_grid.head()

In [None]:
# outpath = data_dir + "/features/prepared_labels/GDL_HDI_polygon_coords_for_featurization.p"
# dense_grid["constant"] = 1


# dense_grid.to_pickle(outpath)
# dense_grid = pd.read_pickle(outpath)

Check to see if any polygon observations were dropped. This would occur if they are very small and don't overlay any grid centroids.

In [1]:
# len(dense_grid["GDLCODE"].unique()) == len(matching_locs)