In [1]:
import os, glob, statistics
from functools import reduce

import pandas as pd
import pycountry

RAW_DIR   = "data/raw/worldbank"
CLEAN_DIR = "data/clean"
os.makedirs(CLEAN_DIR, exist_ok=True)

In [2]:
frames = {}      # key = indicator name, value = DataFrame

for path in glob.glob(f"{RAW_DIR}/*.csv"):
    ind = os.path.basename(path).replace(".csv", "")     # indicator name
    df_full = pd.read_csv(path, usecols=["iso3", "year", "value"])

    latest_year = df_full["year"].max()
    df = (df_full.loc[df_full["year"] == latest_year, ["iso3", "value"]]
                  .drop_duplicates("iso3")
                  .rename(columns={"value": ind}))

    frames[ind] = df
    print(f"{ind:<30} year:{latest_year}  rows:{len(df)}")

gdp_per_capita_ppp             year:2023  rows:262
gdp_total_usd                  year:2023  rows:262
gini_index                     year:2023  rows:262
inflation_cpi_pct              year:2024  rows:262
life_expectancy                year:2023  rows:262
literacy_rate                  year:2023  rows:262
military_expenditure_pct_gdp   year:2023  rows:262
population_density             year:2022  rows:262
population_growth_pct          year:2023  rows:262
real_gdp_growth_pct            year:2023  rows:262
total_population               year:2023  rows:262
unemployment_rate              year:2024  rows:262
urbanization_rate              year:2023  rows:262


In [3]:
master = reduce(
    lambda l, r: l.merge(r, on="iso3", how="outer"),
    frames.values()
)
print("Merged shape:", master.shape)
master.head()

Merged shape: (262, 14)


Unnamed: 0,iso3,gdp_per_capita_ppp,gdp_total_usd,gini_index,inflation_cpi_pct,life_expectancy,literacy_rate,military_expenditure_pct_gdp,population_density,population_growth_pct,real_gdp_growth_pct,total_population,unemployment_rate,urbanization_rate
0,ABW,44967.344513,3648573000.0,,,76.353,,,596.166667,0.045652,4.263719,107359.0,,44.254
1,AFE,4374.229532,1245472000000.0,,,65.146291,73.275108,0.999402,49.297201,2.552859,2.320138,750503764.0,7.772656,38.424898
2,AFG,2211.280635,17233050000.0,,-6.601186,66.035,,,62.215541,2.135594,2.710887,41454761.0,13.295,26.933
3,AFW,5343.468529,799106000000.0,,,58.855722,60.50555,1.177399,54.985592,2.414901,3.354733,509398589.0,3.218313,49.711184
4,AGO,8040.70245,84824650000.0,,28.240495,64.617,,1.332529,28.583484,3.080655,1.001289,36749906.0,14.464,68.688


In [None]:
nulls = (master.isna()
                 .mean()
                 .mul(100).round(1)
                 .astype(str) + "%")
print("Percent missing per column:")
print(nulls)

# Drop columns with >50 % missing (unlikely but safe)
master = master.loc[:, master.isna().mean() < 0.50]

# Median‑impute remaining NaNs
for col in master.columns[1:]:          # skip iso3
    master[col].fillna(master[col].median(), inplace=True)
# 
# NOT MEDIAN FILLED

Percent missing per column:
iso3                             0.4%
gdp_per_capita_ppp               9.5%
gdp_total_usd                    8.8%
gini_index                      98.5%
inflation_cpi_pct               52.3%
life_expectancy                  0.0%
literacy_rate                   84.4%
military_expenditure_pct_gdp    27.1%
population_density               2.7%
population_growth_pct            0.0%
real_gdp_growth_pct              8.4%
total_population                 0.0%
unemployment_rate               13.4%
urbanization_rate                0.8%
dtype: object


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  master[col].fillna(master[col].median(), inplace=True)


In [5]:
# out_path = f"{CLEAN_DIR}/country_snapshot_master.csv"
# master.to_csv(out_path, index=False)
# print("✅  Saved:", out_path, "| rows:", len(master))