# Data Cleaning for ML (`health_panel_2000_2021.csv`)

This notebook cleans the panel dataset for machine learning with a balance between:
- keeping enough data,
- removing low-value noise,
- and ensuring a fully numeric, trainable output.

Cleaning strategy:
1. Keep all years (2000-2021).
2. Drop variables with very high missingness (threshold: >40%).
3. Drop countries with very low overall feature coverage (threshold: <60%).
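4. Impute remaining missing values using country-wise linear interpolation over time, then a same-year median fallback, then a global median as a last resort.
5. Clip each feature to its 1st–99th percentile range (mild winsorization).
6. Build a numeric table (numeric country ID, log-transformed features) and export the cleaned datasets for ML.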


In [9]:
from pathlib import Path
import numpy as np
import pandas as pd

# Location of the exported panel data, relative to this notebook.
DATA_DIR = Path('../Assets/cleaner_exports')
INPUT_PATH = DATA_DIR.joinpath('health_panel_2000_2021.csv')

# Fail fast, reporting the absolute path so a wrong working directory is obvious.
if not INPUT_PATH.exists():
    raise FileNotFoundError(f'Input file not found: {INPUT_PATH.resolve()}')

# Load the raw panel and report where it came from and how large it is.
raw = pd.read_csv(INPUT_PATH)
print('Loaded:', INPUT_PATH.resolve())
print('Shape:', raw.shape)

# Preview the first three rows as the cell output.
raw.head(3)


Loaded: /Users/bikki/Documents/Projects_Spring_2026/STATS201/Assets/cleaner_exports/health_panel_2000_2021.csv
Shape: (4532, 17)


Unnamed: 0,Country,ISO3,Year,Life expectancy at birth (years) Both sexes,Adult mortality rate (probability of dying between 15 and 60 years per 1000 population) Both sexes,"Alcohol, total per capita (15+) consumption (in litres of pure alcohol) (SDG Indicator 3.5.2), three-year average Both sexes",Current health expenditure (CHE) as percentage of gross domestic product (GDP) (%),Hepatitis B (HepB3) immunization coverage among 1-year-olds (%),Number of infant deaths Both sexes,"Prevalence of overweight among adults, BMI ≥ 25 (age-standardized estimate) (%) Both sexes",Polio (Pol3) immunization coverage among 1-year-olds (%),Under-five mortality rate (probability of dying by age 5 per 1000 live births) Both sexes,"Prevalence of underweight among adults, BMI < 18 (age-standardized estimate) (%) Both sexes",Diphtheria tetanus toxoid and pertussis (DTP3) immunization coverage among 1-year-olds (%),Prevalence of HIV among adults aged 15 to 49 (%),"Prevalence of thinness among children and adolescents, BMI < -2 standard deviations below the median (crude estimate) (%) Both sexes 5-19 years",GDP per capita (constant 2015 US$)
0,Afghanistan,AFG,2000,53.8,378.0,0.0,,0.0,110740.0,17.9,24.0,131.7,20.4,24.0,0.1,15.9,308.31827
1,Afghanistan,AFG,2001,53.9,380.0,0.0,,0.0,107225.0,18.9,35.0,127.4,19.6,33.0,,15.5,277.118051
2,Afghanistan,AFG,2002,55.2,367.0,0.0,9.44,0.0,103124.0,19.9,36.0,123.1,18.9,36.0,,15.1,338.139974


In [10]:
# Rename columns to concise snake_case names for modeling convenience.
# Keys are the verbose indicator headers exactly as they appear in the raw CSV.
rename_map = {
    'Life expectancy at birth (years) Both sexes': 'life_expectancy',
    'Adult mortality rate (probability of dying between 15 and 60 years per 1000 population) Both sexes': 'adult_mortality_15_60',
    'Alcohol, total per capita (15+) consumption (in litres of pure alcohol) (SDG Indicator 3.5.2), three-year average Both sexes': 'alcohol_per_capita_15plus',
    'Current health expenditure (CHE) as percentage of gross domestic product (GDP) (%)': 'che_pct_gdp',
    'Hepatitis B (HepB3) immunization coverage among 1-year-olds (%)': 'hepb3_coverage_pct',
    'Number of infant deaths Both sexes': 'infant_deaths',
    'Prevalence of overweight among adults, BMI ≥ 25 (age-standardized estimate) (%) Both sexes': 'overweight_adults_pct',
    'Polio (Pol3) immunization coverage among 1-year-olds (%)': 'pol3_coverage_pct',
    'Under-five mortality rate (probability of dying by age 5 per 1000 live births) Both sexes': 'u5_mortality_rate',
    'Prevalence of underweight among adults, BMI < 18 (age-standardized estimate) (%) Both sexes': 'underweight_adults_pct',
    'Diphtheria tetanus toxoid and pertussis (DTP3) immunization coverage among 1-year-olds (%)': 'dtp3_coverage_pct',
    'Prevalence of HIV among adults aged 15 to 49 (%)': 'hiv_prev_15_49_pct',
    'Prevalence of thinness among children and adolescents, BMI < -2 standard deviations below the median (crude estimate) (%) Both sexes 5-19 years': 'thinness_children_adolescents_pct',
    'GDP per capita (constant 2015 US$)': 'gdp_per_capita_2015usd',
}

# errors='raise' turns a header that no longer matches the CSV into a KeyError.
# By default rename() silently skips unknown keys, which would leave a long-named
# column in feature_cols without any warning.
df = raw.rename(columns=rename_map, errors='raise').copy()

# Identifier columns are carried along but never treated as model features.
id_cols = ['Country', 'ISO3', 'Year']
feature_cols = [c for c in df.columns if c not in id_cols]

print('Years in data:', int(df['Year'].min()), '-', int(df['Year'].max()), '| unique years =', df['Year'].nunique())
print('Countries:', df['ISO3'].nunique())
print('Feature columns:', len(feature_cols))


Years in data: 2000 - 2021 | unique years = 22
Countries: 206
Feature columns: 14


In [11]:
# 1) Feature-level missingness filter
# Features whose share of missing values exceeds this percentage are dropped.
MAX_FEATURE_MISSING_PCT = 40.0

# Percentage of missing cells per feature, most-missing first.
feature_missing_pct = (df[feature_cols].isna().mean() * 100).sort_values(ascending=False)
print('Missingness by feature (%):')
print(feature_missing_pct)

keep_features = feature_missing_pct[feature_missing_pct <= MAX_FEATURE_MISSING_PCT].index.tolist()
drop_features = feature_missing_pct[feature_missing_pct > MAX_FEATURE_MISSING_PCT].index.tolist()

# Build the message from the constant so the log stays correct if the threshold changes.
print(f'\nDropped features (>{MAX_FEATURE_MISSING_PCT:g}% missing):', drop_features)
print('Kept features:', keep_features)


Missingness by feature (%):
hiv_prev_15_49_pct                   80.406002
life_expectancy                      10.679612
adult_mortality_15_60                10.679612
alcohol_per_capita_15plus             9.223301
che_pct_gdp                           7.811121
hepb3_coverage_pct                    6.729921
pol3_coverage_pct                     6.729921
dtp3_coverage_pct                     6.729921
gdp_per_capita_2015usd                5.119153
overweight_adults_pct                 3.883495
underweight_adults_pct                3.883495
thinness_children_adolescents_pct     3.883495
infant_deaths                         3.398058
u5_mortality_rate                     3.398058
dtype: float64

Dropped features (>40% missing): ['hiv_prev_15_49_pct']
Kept features: ['life_expectancy', 'adult_mortality_15_60', 'alcohol_per_capita_15plus', 'che_pct_gdp', 'hepb3_coverage_pct', 'pol3_coverage_pct', 'dtp3_coverage_pct', 'gdp_per_capita_2015usd', 'overweight_adults_pct', 'underweight_adults_pct

In [12]:
# 2) Country-level completeness filter (across kept features)
# Countries whose average feature coverage falls below this fraction are dropped.
MIN_COUNTRY_COMPLETENESS = 0.60


def _mean_coverage(block):
    """Return the non-missing share per feature, averaged across features."""
    return block.notna().mean().mean()


# Working copy restricted to identifiers plus the features that survived step 1.
work = df[id_cols + keep_features].copy()

# One coverage score per country, lowest first.
country_completeness = work.groupby('ISO3')[keep_features].apply(_mean_coverage).sort_values()

keep_iso3 = country_completeness[country_completeness >= MIN_COUNTRY_COMPLETENESS].index
drop_iso3 = country_completeness[country_completeness < MIN_COUNTRY_COMPLETENESS].index

print('Countries kept:', len(keep_iso3))
print('Countries dropped:', len(drop_iso3))
print('\nLowest-coverage dropped countries:')
print(country_completeness.loc[drop_iso3].head(20))

# Keep only rows belonging to sufficiently covered countries.
in_kept_country = work['ISO3'].isin(keep_iso3)
work = work.loc[in_kept_country].copy()
print('\nShape after country filter:', work.shape)
print('Years preserved:', int(work['Year'].min()), '-', int(work['Year'].max()), '| unique =', work['Year'].nunique())


Countries kept: 192
Countries dropped: 14

Lowest-coverage dropped countries:
ISO3
LIE    0.034965
AIA    0.153846
VGB    0.153846
MSR    0.153846
XKX    0.202797
TCA    0.206294
TKL    0.230769
ASM    0.300699
GRL    0.307692
BMU    0.307692
PYF    0.307692
PRI    0.461538
MCO    0.538462
SMR    0.538462
dtype: float64

Shape after country filter: (4224, 16)
Years preserved: 2000 - 2021 | unique = 22


In [13]:
# 3) Missing-value imputation
def _interpolate_over_time(series):
    """Linearly interpolate gaps; limit_direction='both' also fills leading/trailing gaps."""
    return series.interpolate(method='linear', limit_direction='both')


# Step A: order each country's rows by year, then interpolate within the country.
work = work.sort_values(['ISO3', 'Year']).reset_index(drop=True)
for feature in keep_features:
    work[feature] = work.groupby('ISO3')[feature].transform(_interpolate_over_time)

# Step B: same-year median across countries, broadcast back to every row.
year_medians = work.groupby('Year')[keep_features].transform('median')

# Steps B + C per feature: fill from the year median first, then from the
# global median of that feature for any value still missing (rare edge cases).
for feature in keep_features:
    year_filled = work[feature].fillna(year_medians[feature])
    work[feature] = year_filled.fillna(year_filled.median())

print('Any missing left:', work[keep_features].isna().sum().sum())


Any missing left: 0


In [14]:
# 4) Mild outlier clipping to stabilize ML training (winsorize at 1st/99th percentiles)
# Values outside the bounds are pulled to the bound, so the original units are kept.
for feature in keep_features:
    values = work[feature]
    lower_bound = values.quantile(0.01)
    upper_bound = values.quantile(0.99)
    work[feature] = values.clip(lower=lower_bound, upper=upper_bound)

# Snapshot the cleaned panel under its final name.
clean = work.copy()
print('Clean shape:', clean.shape)
clean.head(3)


Clean shape: (4224, 16)


Unnamed: 0,Country,ISO3,Year,life_expectancy,adult_mortality_15_60,alcohol_per_capita_15plus,che_pct_gdp,hepb3_coverage_pct,pol3_coverage_pct,dtp3_coverage_pct,gdp_per_capita_2015usd,overweight_adults_pct,underweight_adults_pct,thinness_children_adolescents_pct,infant_deaths,u5_mortality_rate
0,Afghanistan,AFG,2000,53.8,378.0,0.0,9.44,0.0,38.0,36.0,317.072114,17.9,20.4,15.9,110740.0,131.7
1,Afghanistan,AFG,2001,53.9,380.0,0.0,9.44,0.0,38.0,36.0,317.072114,18.9,19.6,15.5,107225.0,127.4
2,Afghanistan,AFG,2002,55.2,367.0,0.0,9.44,0.0,38.0,36.0,338.139974,19.9,18.9,15.1,103124.0,123.1


In [15]:
# 5) Build ML-ready numeric table
ml_numeric = clean.copy()

# Country as numeric ID for models that need numeric inputs.
# factorize() numbers ISO3 codes in order of first appearance, so the mapping can
# be recovered from `clean`, which has the same row order.
ml_numeric['country_id'] = pd.factorize(ml_numeric['ISO3'])[0].astype(int)

# Optional transformed features useful for many models.
ml_numeric['log_gdp_per_capita_2015usd'] = np.log1p(ml_numeric['gdp_per_capita_2015usd'])
ml_numeric['log_infant_deaths'] = np.log1p(ml_numeric['infant_deaths'])

# Keep all years by design.
assert ml_numeric['Year'].nunique() == 22, 'Year coverage changed unexpectedly.'
assert set(range(2000, 2022)) == set(ml_numeric['Year'].unique()), 'Some years are missing.'

# Count countries before the string identifiers are removed.
n_final_countries = ml_numeric['ISO3'].nunique()

# Keep only numeric columns so the table is actually fully numeric: the string
# identifiers Country/ISO3 are dropped and country_id stands in for them.
ml_numeric = ml_numeric.select_dtypes(include='number')
assert ml_numeric.isna().sum().sum() == 0, 'Numeric table still contains missing values.'

print('Final years:', ml_numeric['Year'].min(), '-', ml_numeric['Year'].max())
print('Final countries:', n_final_countries)
print('Final features (numeric incl ids/transforms):', ml_numeric.shape[1])


Final years: 2000 - 2021
Final countries: 192
Final features (numeric incl ids/transforms): 17


In [16]:
# 6) Export
OUT_CLEAN = DATA_DIR / 'health_panel_ml_clean.csv'          # with Country/ISO3 identifiers
OUT_NUMERIC = DATA_DIR / 'health_panel_ml_numeric.csv'      # numeric-friendly for direct ML use
OUT_DROPPED = DATA_DIR / 'health_panel_dropped_summary.csv' # audit trail
dropped_features_path = DATA_DIR / 'dropped_features.csv'
dropped_countries_path = DATA_DIR / 'dropped_countries.csv'

# Audit tables: which features and countries were removed, and with what coverage.
dropped_features_frame = pd.DataFrame({
    'dropped_feature': pd.Series(drop_features, dtype='object')
})
dropped_countries_frame = pd.DataFrame({
    'ISO3': pd.Index(drop_iso3, dtype='object'),
    'country_completeness': country_completeness.loc[drop_iso3].values
})

# Before/after size metrics, stored as (metric, value) rows.
summary_rows = [
    ('raw_rows', len(raw)),
    ('raw_countries', raw['ISO3'].nunique()),
    ('raw_features', len(feature_cols)),
    ('clean_rows', len(clean)),
    ('clean_countries', clean['ISO3'].nunique()),
    ('kept_features', len(keep_features)),
    ('dropped_features', len(drop_features)),
]
summary = pd.DataFrame(summary_rows, columns=['metric', 'value'])

# Write every table first, then report the paths in the same order.
exports = [
    (OUT_CLEAN, clean),
    (OUT_NUMERIC, ml_numeric),
    (dropped_features_path, dropped_features_frame),
    (dropped_countries_path, dropped_countries_frame),
    (OUT_DROPPED, summary),
]
for out_path, frame in exports:
    frame.to_csv(out_path, index=False)

for out_path, _ in exports:
    print('Saved:', out_path.resolve())


Saved: /Users/bikki/Documents/Projects_Spring_2026/STATS201/Assets/cleaner_exports/health_panel_ml_clean.csv
Saved: /Users/bikki/Documents/Projects_Spring_2026/STATS201/Assets/cleaner_exports/health_panel_ml_numeric.csv
Saved: /Users/bikki/Documents/Projects_Spring_2026/STATS201/Assets/cleaner_exports/dropped_features.csv
Saved: /Users/bikki/Documents/Projects_Spring_2026/STATS201/Assets/cleaner_exports/dropped_countries.csv
Saved: /Users/bikki/Documents/Projects_Spring_2026/STATS201/Assets/cleaner_exports/health_panel_dropped_summary.csv
