In [47]:
import pandas as pd
import pycountry



In [48]:
# Open the WHO workbook; the handle is kept so individual sheets can be parsed afterwards
file_path = "../raw_data/Alternative_sources_country_level/WHO_health_data_2024.xlsx"
excel_data = pd.ExcelFile(file_path)

# Show which sheets the workbook contains (last expression = cell output)
sheet_names = list(excel_data.sheet_names)
sheet_names


['readme', 'data']

In [49]:
# Read the 'data' sheet of the already-open workbook into a DataFrame
# NOTE(review): excel_data is never closed in this notebook — consider closing it once parsing is done
df = excel_data.parse(sheet_name='data')

# Preview the first rows to see the columns and the long (one row per observation) layout
df.head()


Unnamed: 0,IND_NAME,DIM_GEO_NAME,IND_CODE,DIM_GEO_CODE,DIM_TIME_YEAR,DIM_1_CODE,VALUE_NUMERIC,VALUE_STRING,VALUE_COMMENTS
0,Adolescent birth rate (per 1000 women),Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS15-19,62.0,62.0,Afghanistan 2022-2023 Multiple Indicator Clust...
1,Adolescent birth rate (per 1000 women),Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS10-14,18.0,18.0,Afghanistan 2022-2023 Multiple Indicator Clust...
2,Age-standardized mortality rate attributed to ...,Afghanistan,SDGAIRBODA,AFG,2019,SEX_BTSX,265.66452,265.7,
3,Age-standardized prevalence of hypertension am...,Afghanistan,NCD_HYP_PREVALENCE_A,AFG,2019,SEX_BTSX,40.200001,40.2,
4,Age-standardized prevalence of obesity among a...,Afghanistan,NCD_BMI_30A,AFG,2022,SEX_BTSX,19.222589,19.2,


In [50]:
# Collect the distinct indicator names (IND_NAME) present in the data
indicator_column = df["IND_NAME"]
unique_indicators = indicator_column.unique()

# Display only the first 30 names to keep the output short
unique_indicators[:30]


array(['Adolescent birth rate (per 1000 women)',
       'Age-standardized mortality rate attributed to household and ambient air pollution  (per 100 000 population) ',
       'Age-standardized prevalence of hypertension among adults aged 30–79 years (%)',
       'Age-standardized prevalence of obesity among adults (18+ years) (%)',
       'Age-standardized prevalence of tobacco use among persons 15 years and older (%) ',
       'Amount of water- and sanitation-related official development assistance that is part of a government-coordinated spending plan (constant 2020 US$ millions)',
       'Annual mean concentrations of fine particulate matter (PM2.5) in urban areas (µg/m3)',
       'Average of 15 International Health Regulations core capacity scores',
       'Density of dentists (per 10 000 population) ',
       'Density of medical doctors (per 10 000 population) ',
       'Density of nursing and midwifery personnel (per 10 000 population) ',
       'Density of pharmacists (per 10 00

In [51]:
# Indicators that feed the composite index, as (IND_NAME value, weight, inverse) triples.
#   weight  - multiplier applied to the indicator's normalized score; the ten weights sum to 1.0
#   inverse - True when a lower raw value is the better outcome (e.g. mortality rates)
# The names must match IND_NAME exactly, including the trailing space some of them carry.
_indicator_specs = [
    ('Life expectancy at birth (years)', 0.15, False),
    ('Neonatal mortality rate (per 1000 live births)', 0.10, True),
    ('Maternal mortality ratio (per 100 000 live births)', 0.15, True),
    ('Diphtheria-tetanus-pertussis (DTP3) immunization coverage among 1-year-olds (%)', 0.10, False),
    ('Density of medical doctors (per 10 000 population) ', 0.075, False),
    ('Density of nursing and midwifery personnel (per 10 000 population) ', 0.075, False),
    ('Healthy life expectancy at birth (years)', 0.10, False),
    ('Average of 15 International Health Regulations core capacity scores', 0.10, False),
    ('Population with household expenditures on health > 10% of total household expenditure or income (%)', 0.075, True),
    ('Domestic general government health expenditure (GGHE-D) as percentage of general government expenditure (GGE) (%)', 0.075, False),
]

# Same mapping shape as before: indicator name -> {'weight': ..., 'inverse': ...}
selected_indicators = {
    indicator_name: {'weight': weight, 'inverse': inverse}
    for indicator_name, weight, inverse in _indicator_specs
}





In [52]:
# Keep only the rows belonging to the indicators that feed the index
filtered_df = df[df["IND_NAME"].isin(list(selected_indicators))]

# Reduce to one row per (indicator, geography).
# The sheet can hold several rows for the same indicator/geography/year that differ
# only in DIM_1_CODE (the preview shows two age groups for one indicator), so sorting
# on the year alone and keeping the last duplicate would retain an arbitrary
# breakdown row. Rows with no breakdown, or with the SEX_BTSX code, are therefore
# ranked above every other row, and within each rank the most recent year wins.
# NOTE(review): SEX_BTSX is assumed to be the both-sexes aggregate — confirm against
# the workbook's readme sheet. An older aggregate row is preferred over a newer
# disaggregated one.
is_aggregate = filtered_df["DIM_1_CODE"].isna() | filtered_df["DIM_1_CODE"].eq("SEX_BTSX")
latest_df = (
    filtered_df.assign(_is_aggregate=is_aggregate)
    # Multi-column sort: non-aggregate rows first, then ascending year, so the
    # last duplicate is the newest aggregate row when one exists
    .sort_values(["_is_aggregate", "DIM_TIME_YEAR"])
    .drop_duplicates(subset=["IND_NAME", "DIM_GEO_NAME"], keep="last")
    .drop(columns="_is_aggregate")
)

In [53]:
# Reshape long -> wide: one row per DIM_GEO_NAME entry, one column per indicator,
# cells filled from VALUE_NUMERIC
pivot_df = latest_df.pivot(
    index="DIM_GEO_NAME",
    columns="IND_NAME",
    values="VALUE_NUMERIC",
)

# Empty frame sharing pivot_df's index; the weighted, normalized indicator
# columns are written into it by the normalization loop
normalized_df = pd.DataFrame(index=pivot_df.index)




In [54]:
# Fill normalized_df with one weighted score column per selected indicator
for indicator_name, settings in selected_indicators.items():
    if indicator_name not in pivot_df.columns:
        continue  # indicator absent from the pivoted data: no column is produced

    raw_values = pivot_df[indicator_name]
    lowest = raw_values.min()

    # Min-max scaling: (x - min) / (max - min) maps the column onto 0..1
    scaled = (raw_values - lowest) / (raw_values.max() - lowest)

    # Flip the scale when a lower raw value is the better outcome
    if settings['inverse']:
        scaled = 1 - scaled

    normalized_df[indicator_name] = scaled * settings['weight']



In [55]:
# Health Index = row-wise sum of the weighted, normalized indicator columns.
# An existing "Health Index" column is excluded from the sum so that re-running
# this cell does not add the previous total into the new one.
# NOTE(review): sum() skips NaN by default, so an indicator with no value for a
# row contributes 0 to that row's index — confirm this is the intended treatment
# of missing data.
indicator_scores = normalized_df.drop(columns="Health Index", errors="ignore")
normalized_df["Health Index"] = indicator_scores.sum(axis=1)



In [56]:
# Raw indicator values plus the computed index in one frame; assign() returns a
# new frame, so pivot_df itself is left unchanged
result_df = pivot_df.assign(**{"Health Index": normalized_df["Health Index"]})



In [57]:
# Rank the rows from highest to lowest Health Index
result_df_sorted = result_df.sort_values("Health Index", ascending=False)



In [58]:
# Two-column table: the former index (geography name) and its Health Index
health_index_df = result_df_sorted.loc[:, ["Health Index"]].reset_index()

# Give the two columns their output names
health_index_df.columns = ["Country", "Health Index"]



In [59]:
# Standardize a country name through pycountry, keeping the input when no match exists
def get_official_country_name(name):
    """Return the `name` attribute of the pycountry record matching `name`.

    When `pycountry.countries.lookup` raises LookupError, the input is
    returned unchanged so that no row loses its label.
    """
    try:
        record = pycountry.countries.lookup(name)
    except LookupError:
        # No record matched: fall back to the caller's spelling
        return name
    return record.name




In [60]:
# Replace each label in the Country column with the name pycountry reports for it
# (labels pycountry cannot match are kept unchanged by the helper)
standardized_names = health_index_df["Country"].apply(get_official_country_name)
health_index_df["Country"] = standardized_names



In [61]:
# Order the table A-Z by country name, then renumber the rows from 0
alphabetical_df = health_index_df.sort_values(by="Country")
health_index_df_sorted = alphabetical_df.reset_index(drop=True)



In [62]:
# Write the alphabetically sorted table to CSV without the row index.
# NOTE(review): the final cell of this notebook writes cleaned_df to this same
# path, so this file is overwritten on a full run.
csv_path = "../raw_data/Alternative_sources_country_level/WHO_health_index_2024.csv"
health_index_df_sorted.to_csv(path_or_buf=csv_path, index=False)

# Echo the output location as the cell result
csv_path

'../raw_data/Alternative_sources_country_level/WHO_health_index_2024.csv'

In [63]:
# Find the labels in the Country column that pycountry still cannot resolve
def _lookup_fails(country_label):
    """Return True when pycountry.countries.lookup raises LookupError for the label."""
    try:
        pycountry.countries.lookup(country_label)
    except LookupError:
        return True
    return False


fallbacks = [
    country_label
    for country_label in health_index_df["Country"]
    if _lookup_fails(country_label)
]

# Show the first 10 unresolved labels together with the total count
fallback_count = len(fallbacks)
fallbacks[:10], fallback_count


(['Republic of Korea',
  'Netherlands (Kingdom of the)',
  'European Region',
  'Iran (Islamic Republic of)',
  'Region of the Americas',
  'Western Pacific Region',
  'occupied Palestinian territory, including east Jerusalem',
  'South-East Asia Region',
  'Bolivia (Plurinational State of)',
  'Eastern Mediterranean Region'],
 16)

In [64]:
# WHO spellings of actual countries -> ISO 3166-1 alpha-3 code.
# The values are handed to pycountry.countries.lookup, which matches ISO codes and
# ISO names. Informal aliases are unreliable there: 'Congo (Kinshasa)' and
# 'Congo (Brazzaville)' are not ISO names, and short names such as 'Russia' or
# 'Syria' resolve only if the installed ISO data lists them as a common name.
# Alpha-3 codes resolve regardless of how pycountry spells the country.
custom_country_mapping = {
    'Republic of Korea': 'KOR',
    'Türkiye': 'TUR',
    'Iran (Islamic Republic of)': 'IRN',
    'Bolivia (Plurinational State of)': 'BOL',
    'Netherlands (Kingdom of the)': 'NLD',
    'Venezuela (Bolivarian Republic of)': 'VEN',
    'Russian Federation': 'RUS',
    'Czechia': 'CZE',
    'United Kingdom of Great Britain and Northern Ireland': 'GBR',
    'United Republic of Tanzania': 'TZA',
    'Syrian Arab Republic': 'SYR',
    'Lao People\'s Democratic Republic': 'LAO',
    'Brunei Darussalam': 'BRN',
    'Moldova (Republic of)': 'MDA',
    'Viet Nam': 'VNM',
    'Democratic Republic of the Congo': 'COD',
    'Congo': 'COG'
}



In [65]:
# Resolve a label to a pycountry country name, or None when it is not a country
def map_to_standard_country(name):
    """Return the pycountry name for `name`, or None when nothing matches.

    The alias from `custom_country_mapping` (if any) is tried first, then the
    label itself. Trying the raw label second means an alias that pycountry
    cannot resolve no longer discards a label that resolves on its own.

    Args:
        name: label to resolve; anything that is not a string yields None.

    Returns:
        The `name` attribute of the first matching pycountry record, or None
        when neither candidate matches (such rows are dropped by the caller).
    """
    if not isinstance(name, str):
        # e.g. NaN from a missing label: lookup() would fail on a non-string
        return None

    candidates = []
    alias = custom_country_mapping.get(name)
    if alias is not None:
        candidates.append(alias)
    candidates.append(name)

    for candidate in candidates:
        try:
            return pycountry.countries.lookup(candidate).name
        except LookupError:
            continue  # try the next candidate, if any
    return None  # will be dropped later



In [66]:
# Resolve every label; entries that are not countries come back as None
health_index_df["Standard Country"] = health_index_df["Country"].apply(map_to_standard_country)

# Drop the unresolved rows, keep the standardized name as "Country",
# and return the table in alphabetical order with a fresh 0..n-1 index
cleaned_df = (
    health_index_df
    .dropna(subset=["Standard Country"])
    .loc[:, ["Standard Country", "Health Index"]]
    .rename(columns={"Standard Country": "Country"})
    .sort_values(by="Country")
    .reset_index(drop=True)
)



In [67]:
# Write the cleaned table (resolved countries only) to CSV without the row index
csv_path = "../raw_data/Alternative_sources_country_level/WHO_health_index_2024.csv"
cleaned_df.to_csv(path_or_buf=csv_path, index=False)

# Echo the output location as the cell result
csv_path

'../raw_data/Alternative_sources_country_level/WHO_health_index_2024.csv'