In [1]:
import os, io, zipfile, requests, pandas as pd, pycountry

# Folder that receives every CSV written by the download cells below.
RAW_EXT = "data/raw/external"
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(RAW_EXT, exist_ok=True)

def clean_iso(df, col="iso3"):
    """Keep only the rows of ``df`` whose ``col`` is a known ISO alpha-3 code.

    The code column is normalised (surrounding whitespace stripped, then
    upper-cased) and compared against the ``alpha_3`` codes exposed by
    ``pycountry.countries``. Rows whose code is missing or not in that set
    are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column of country codes.
    col : str, default "iso3"
        Name of the column holding the codes.

    Returns
    -------
    pandas.DataFrame
        A new frame with the normalised code column, restricted to rows with
        a valid code. Row order and index labels of the kept rows are
        preserved. The frame passed in is left unmodified.
    """
    # Work on a copy: assigning into the caller's frame would mutate the
    # argument and can raise SettingWithCopyWarning when `df` is itself the
    # result of a filter / drop_duplicates.
    normalised = df.copy()
    normalised[col] = normalised[col].str.strip().str.upper()

    valid_codes = {country.alpha_3 for country in pycountry.countries}
    keep = normalised[col].isin(valid_codes)
    return normalised[keep].copy()

In [2]:
# Literacy rate — World Bank indicator SE.ADT.LITR.ZS.
# mrv=3 restricts the reply to the most recent values per economy.
url = ("https://api.worldbank.org/v2/country/all/indicator/SE.ADT.LITR.ZS"
       "?format=json&per_page=20000&mrv=3")
resp = requests.get(url, timeout=30)
resp.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error body
payload = resp.json()

# This cell relies on a two-element reply: [paging metadata, list of records].
# Anything else (e.g. an API error message or an empty record list) used to
# surface as an opaque unpacking / normalisation error further down.
if not (isinstance(payload, list) and len(payload) == 2 and payload[1]):
    raise RuntimeError(f"Unexpected World Bank API response: {str(payload)[:300]}")
_, data = payload

# Flatten the records, keep the three columns we need, coerce types, then
# keep the latest non-null observation per country code.
lit = (pd.json_normalize(data)
         .loc[:, ["countryiso3code", "date", "value"]]
         .rename(columns={"countryiso3code": "iso3", "date": "year"})
         .assign(year=lambda d: d["year"].astype(int),
                 value=lambda d: pd.to_numeric(d["value"], errors="coerce"))
         .dropna(subset=["value"])
         .sort_values("year")
         .drop_duplicates("iso3", keep="last"))

# Drop rows whose code is not a valid ISO alpha-3 country code.
lit = clean_iso(lit)

lit.to_csv(f"{RAW_EXT}/literacy_rate_unesco.csv", index=False)
print("✅ literacy rows:", len(lit))

✅ literacy rows: 95


In [6]:
# Inflation, consumer prices (annual %)  — indicator FP.CPI.TOTL.ZG
url = ("https://api.worldbank.org/v2/country/all/indicator/FP.CPI.TOTL.ZG"
       "?format=json&per_page=20000&mrv=3")          # last 3 years

resp = requests.get(url, timeout=30)
resp.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error body
payload = resp.json()

# This cell relies on a two-element reply: [paging metadata, list of records].
# Anything else (e.g. an API error message or an empty record list) used to
# surface as an opaque unpacking / normalisation error further down.
if not (isinstance(payload, list) and len(payload) == 2 and payload[1]):
    raise RuntimeError(f"Unexpected World Bank API response: {str(payload)[:300]}")
_, data = payload

# Flatten the records, keep the three columns we need, coerce types, then
# keep the latest non-null observation per country code.
infl = (pd.json_normalize(data)
          .loc[:, ["countryiso3code", "date", "value"]]
          .rename(columns={"countryiso3code": "iso3", "date": "year"})
          .assign(year=lambda d: d["year"].astype(int),
                  value=lambda d: pd.to_numeric(d["value"], errors="coerce"))
          .dropna(subset=["value"])
          .sort_values("year")
          .drop_duplicates("iso3", keep="last"))

# Drop rows whose code is not a valid ISO alpha-3 country code.
infl = clean_iso(infl)
infl.to_csv(f"{RAW_EXT}/inflation_pct_wb.csv", index=False)
print("✅ inflation rows:", len(infl), "| snapshot years span:",
      infl["year"].min(), "–", infl["year"].max())


✅ inflation rows: 175 | snapshot years span: 2022 – 2024


In [7]:
# Governance indicator GE.EST from the World Bank API; the output filename
# labels it as WGI government effectiveness. mrv=1 asks for the single most
# recent value per economy.
wgi_url = ("https://api.worldbank.org/v2/country/all/indicator/GE.EST"
           "?format=json&per_page=20000&mrv=1")
resp = requests.get(wgi_url, timeout=30)
resp.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error body
payload = resp.json()

# This cell relies on a two-element reply: [paging metadata, list of records].
# Anything else (e.g. an API error message or an empty record list) used to
# surface as an opaque unpacking / normalisation error further down.
if not (isinstance(payload, list) and len(payload) == 2 and payload[1]):
    raise RuntimeError(f"Unexpected World Bank API response: {str(payload)[:300]}")
_, data = payload

# NOTE(review): unlike the other download cells, rows with a missing value are
# NOT dropped here, so the CSV may contain empty `value` entries — confirm that
# downstream code expects this.
wgi = (pd.json_normalize(data)
         .loc[:, ["countryiso3code", "date", "value"]]
         .rename(columns={"countryiso3code": "iso3", "date": "year"})
         .assign(year=lambda d: d["year"].astype(int),
                 value=lambda d: pd.to_numeric(d["value"], errors="coerce")))

# Drop rows whose code is not a valid ISO alpha-3 country code.
wgi = clean_iso(wgi)

wgi.to_csv(f"{RAW_EXT}/wgi_gov_effectiveness.csv", index=False)
print("✅ WGI rows:", len(wgi))

✅ WGI rows: 215


In [14]:
# Military expenditure in current USD — World Bank indicator MS.MIL.XPND.CD.
# The previous code requested MS.MIL.XPND.CN, which is the *local currency*
# series: those values are not comparable across countries and contradict the
# USD label on this dataset.
# NOTE(review): the output filename still says "constusd" so downstream readers
# keep working, but this series is current (not constant-price) USD — confirm
# whether a deflated series is required.
wb_mil_url = ("https://api.worldbank.org/v2/country/all/indicator/MS.MIL.XPND.CD"
              "?format=json&per_page=20000&mrv=3")

resp = requests.get(wb_mil_url, timeout=30)
resp.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error body
payload = resp.json()

# This cell relies on a two-element reply: [paging metadata, list of records].
# Anything else (e.g. an API error message or an empty record list) used to
# surface as an opaque unpacking / normalisation error further down.
if not (isinstance(payload, list) and len(payload) == 2 and payload[1]):
    raise RuntimeError(f"Unexpected World Bank API response: {str(payload)[:300]}")
_, data = payload

# Flatten the records, keep the three columns we need, coerce types, then
# keep the latest non-null observation per country code.
mil = (pd.json_normalize(data)
         .loc[:, ["countryiso3code", "date", "value"]]
         .rename(columns={"countryiso3code": "iso3", "date": "year"})
         .assign(year=lambda d: d["year"].astype(int),
                 value=lambda d: pd.to_numeric(d["value"], errors="coerce"))
         .dropna(subset=["value"])
         .sort_values("year")
         .drop_duplicates("iso3", keep="last"))

# Drop rows whose code is not a valid ISO alpha-3 country code.
mil = clean_iso(mil)
mil.to_csv(f"{RAW_EXT}/military_expenditure_constusd.csv", index=False)
print("✅ WB military rows:", len(mil))


✅ WB military rows: 154


In [15]:
# Summarise every CSV in the download folder with its row count.
for fname in sorted(os.listdir(RAW_EXT)):
    path = os.path.join(RAW_EXT, fname)
    # os.listdir returns every entry; skip anything that is not a regular
    # .csv file (checkpoint folders, OS metadata files, ...) so read_csv
    # does not fail on it.
    if not (fname.lower().endswith(".csv") and os.path.isfile(path)):
        continue
    rows = len(pd.read_csv(path))
    print(f"{fname:<40} rows: {rows}")
print("🎉  Extra sources downloaded.")

inflation_pct_wb.csv                     rows: 175
literacy_rate_unesco.csv                 rows: 95
military_expenditure_constusd.csv        rows: 154
wgi_gov_effectiveness.csv                rows: 215
🎉  Extra sources downloaded.
