In [1]:
import os
import pandas as pd
import geopandas as gpd

from typing import Tuple, List, Dict, Optional, Set

In [2]:
BASE_PATH = os.path.join("D:\\", "Workspaces", "vscode-workspace", "ai_x_medicine", "data")
RES_GDP_PATH = os.path.join(BASE_PATH, "out", "res-gdp.csv")
RES_GINI_PATH = os.path.join(BASE_PATH, "out", "res-gini.csv")
RES_PAIN_PATH = os.path.join(BASE_PATH, "out", "socioeconomic-pain.csv")
RES_GINI_SOV_PATH = os.path.join(BASE_PATH, "out", "res-gini-sov.csv")
RES_PAIN_SOV_PATH = os.path.join(BASE_PATH, "out", "socioeconomic-pain-sov2.csv")
# Projections map (easy names -> EPSG/PROJ strings)
PROJECTIONS = {
    "PlateCarree": "EPSG:4326",
    "Mercator": "EPSG:3395",
    "Robinson": "+proj=robin +lon_0=0 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs",
    "Mollweide": "+proj=moll +lon_0=0 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs",
    "EqualEarth": "+proj=eqearth +lon_0=0 +datum=WGS84 +units=m +no_defs",
    "WinkelTripel": "+proj=wintri +lon_0=0 +datum=WGS84 +units=m +no_defs",
}
CODE_COLUMN = "SOV_A3"

In [3]:
def load_world(verbose: bool = False):
    # Use Natural Earth low-res that ships with GeoPandas
    #world_path = gpd.datasets.get_path("naturalearth_lowres")
    world = gpd.read_file(os.path.join(BASE_PATH, "countries_map.zip"))#"https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip")
    #world = gpd.read_file("https://naciscdn.org/naturalearth/110m/physical/ne_110m_land.zip") #geodatasets.get_path("naturalearth_lowres"))
    if verbose: 
        print(world)
    # Fix some known ISO code quirks
    # Natural Earth uses "France" overseas territories as single polygon, ISO A3 is in 'iso_a3'
    world.loc[world["SOVEREIGNT"] == "France", "sov_a3"] = "FRA"
    world.loc[world["SOVEREIGNT"] == "Norway", "sov_a3"] = "NOR"
    world.loc[world["SOVEREIGNT"] == "Somaliland", "sov_a3"] = "SOL"  # non-ISO, avoid collision
    return world

In [4]:
def compute_sova3_subset(dataframe: pd.DataFrame, df_country_label: str = "Country", df_value_label: str = "value") -> Tuple[pd.DataFrame, Set]:
    # Find intersection of country/location names
    new_world = load_world()
    common_data = set(new_world["SOVEREIGNT"]).intersection(set(dataframe[df_country_label]))
    diff_data = set(new_world["SOVEREIGNT"]).difference(set(dataframe[df_country_label]))

    # Subset both DataFrames to only those in the intersection
    dataframe_sub = dataframe[dataframe["Country"].isin(common_data)].copy()

    result_data = []
    for _, row in dataframe_sub.iterrows():
        country = row[df_country_label]
        value = row[df_value_label]
        nw_row = new_world.loc[new_world["SOVEREIGNT"] == country]  # get the row corresponding to country
        sova3 = nw_row.iloc[0]["SOV_A3"]
        result_data.append({
            "sov_a3": sova3,
            "Country": country,
            "value": value
        })
    #dataframe_sub.merge(new_world[["SOVEREIGNT", "SOV_A3"]], how="left")

    return pd.DataFrame(result_data), diff_data

In [65]:
# Adapt socio-economic pain data
df_pain = pd.read_csv(RES_PAIN_PATH, index_col=False)
df_pain_sub, _ = compute_sova3_subset(df_pain, "Country", "pain")
df_pain_sub.to_csv(RES_PAIN_SOV_PATH, index=False)

In [5]:
df_gini = pd.read_csv(RES_GINI_PATH, index_col=False)
df_gini_sub, missing_countries = compute_sova3_subset(df_gini, "Country", "gini")
df_gini_sub.to_csv(RES_GINI_SOV_PATH, index=False)

print(f"The susbet includes {len(df_gini_sub['Country'])} out of {len(df_gini['Country'])}.")
print("The following countries are missing:")
for country in missing_countries:
    print(f"- {country}")

The susbet includes 142 out of 170.
The following countries are missing:
- Cambodia
- Eritrea
- Somaliland
- United Republic of Tanzania
- eSwatini
- Afghanistan
- Cuba
- Equatorial Guinea
- Saudi Arabia
- North Korea
- Oman
- Azerbaijan
- Northern Cyprus
- East Timor
- Democratic Republic of the Congo
- Czechia
- The Bahamas
- Antarctica
- United States of America
- Kosovo
- Republic of Serbia
- New Zealand
- Western Sahara
- Lebanon
- Libya
- Brunei
- Belize
- Kuwait
- Nepal


In [None]:
df_gdp = pd.read_csv(RES_GDP_PATH)
df_gdp_sub, missing_countries = compute_sova3_subset(df_gini, "Country")

print(f"The susbet includes {len(df_gdp_sub['Country'])} out of {len(df_gdp['Country'])}.")
print("The following countries are missing:")
for country in missing_countries:
    print(f"- {country}")

The susbet includes 142 out of 262.
The following countries are missing:
- Northern Cyprus
- Oman
- Cuba
- Nepal
- Kuwait
- North Korea
- Libya
- Equatorial Guinea
- The Bahamas
- Western Sahara
- Lebanon
- United Republic of Tanzania
- Cambodia
- Republic of Serbia
- United States of America
- Somaliland
- Kosovo
- Azerbaijan
- Belize
- Saudi Arabia
- Czechia
- Eritrea
- East Timor
- Antarctica
- eSwatini
- New Zealand
- Afghanistan
- Brunei
- Democratic Republic of the Congo
