In [4]:
import sqlite3
import pandas as pd
from pathlib import Path
import re

In [9]:

# Absolute location of the SQLite database file used by this notebook.
# NOTE(review): this is a machine-specific path; adjust it when running elsewhere.
DB_PATH = Path("/Users/qwuiris/Desktop/QTM 350 group project/data/wdi_population.db")

# Module-level connection shared by every step below (to_sql, executescript, read_sql).
con = sqlite3.connect(DB_PATH)


# ---------------------------------------------------------------
# 1.  Helper to read -> tidy -> clean one indicator
# ---------------------------------------------------------------
ISO3_REGEX = re.compile(r"^[A-Z]{3}$")           # crude ISO-3 check


def load_wdi_csv(
    path: Path,
    value_name: str,
    skiprows: int = 4,
    *,
    year_min: int = 1960,
    year_max: int = 2023,
) -> pd.DataFrame:
    """
    Read a World-Bank "wide" CSV and return a tidy, cleaned long DataFrame.

    The wide file is expected to have ``Country Name`` and ``Country Code``
    columns plus one column per year whose header is all digits.

    Parameters
    ----------
    path
        Location of the CSV file.
    value_name
        Name given to the melted value column in the result.
    skiprows
        Number of leading lines to skip before the header row.
    year_min, year_max
        Inclusive bounds of the year window to keep.

    Returns
    -------
    pd.DataFrame
        Columns, in order: ``country_name``, ``country_code``, ``year``
        (int16) and ``<value_name>``.  Rows with a missing value, a year
        outside the window, or a country code that is missing or does not
        look like three upper-case letters are dropped.
    """
    df_wide = pd.read_csv(
        path,
        skiprows=skiprows,
        na_values=[".."],            # ".." is treated as a missing value
        encoding="utf-8-sig",        # tolerate a UTF-8 byte-order mark
    )

    id_cols = ["Country Name", "Country Code"]
    # Year columns are the ones whose header is purely digits; anything else
    # (indicator name/code, unnamed trailing columns, ...) is ignored.
    year_cols = [c for c in df_wide.columns if c.isdigit()]

    df_long = (
        df_wide[id_cols + year_cols]
        .melt(id_vars=id_cols,
              value_vars=year_cols,
              var_name="year",
              value_name=value_name)
        .rename(columns={"Country Name": "country_name",
                         "Country Code": "country_code"})
        .astype({"year": "int16"})
        # cleaning
        .loc[lambda d: d["year"].between(year_min, year_max)]   # inclusive window
        .dropna(subset=[value_name])                            # toss NaNs
        # na=False: a missing country code must count as "no match"; otherwise
        # the mask contains NaN and boolean indexing raises.  The pattern is
        # passed as a plain string, which is the documented `pat` type.
        .loc[lambda d: d["country_code"].str.match(ISO3_REGEX.pattern, na=False)]
        .reset_index(drop=True)
    )
    return df_long


# ---------------------------------------------------------------
# 2.  Load & clean each indicator
# ---------------------------------------------------------------
# NOTE(review): machine-specific data directory; adjust when running elsewhere.
root = Path("/Users/qwuiris/Desktop/QTM 350 group project/data")

# Everything that touches the database runs inside try/finally so the
# connection is closed even when a file is missing or a step fails.
try:
    lifeexp = load_wdi_csv(root / "life expectancy/life expectancy.csv", "lifeexp")
    u5mort = load_wdi_csv(root / "mortality rate/mortality rate.csv", "u5mort")
    adofert = load_wdi_csv(root / "adolecent/adolecent.csv", "adofert")

    # if_exists="replace" rebuilds each table from scratch, so the cell can
    # be re-run without accumulating duplicate rows.
    lifeexp.to_sql("lifeexp", con, if_exists="replace", index=False)
    u5mort.to_sql("u5mort", con, if_exists="replace", index=False)
    adofert.to_sql("adofert", con, if_exists="replace", index=False)

    # -----------------------------------------------------------
    # 3.  Country-level meta  (keep only rows with a valid income group)
    # -----------------------------------------------------------
    meta_path = root / "adolecent/Metadata_Country_API_SH.DYN.MORT_DS2_en_csv_v2_21068.csv"
    meta_raw = pd.read_csv(meta_path, encoding="utf-8-sig")

    # The income column is located by substring; fail with a clear message
    # instead of a bare StopIteration when no such column exists.
    col_income = next((c for c in meta_raw.columns if "Income" in c), None)
    if col_income is None:
        raise KeyError(f"No column containing 'Income' found in {meta_path}")

    meta_clean = (
        meta_raw[["Country Code", col_income]]
        .rename(columns={"Country Code": "country_code",
                         col_income: "income_group"})
        .dropna(subset=["income_group"])            # toss rows without a group
    )

    meta_clean.to_sql("meta", con, if_exists="replace", index=False)

    # -----------------------------------------------------------
    # 4.  Build the full panel inside SQLite (inner-join to META)
    # -----------------------------------------------------------
    # Inner joins throughout: a (country_code, year) pair survives only if it
    # is present in all three indicator tables and its country has a row in
    # meta.  country_name is taken from the lifeexp table.
    merge_sql = """
DROP TABLE IF EXISTS population_panel;
CREATE TABLE population_panel AS
SELECT  l.country_code,
        l.country_name,
        l.year,
        l.lifeexp,
        u.u5mort,
        a.adofert,
        m.income_group
FROM    lifeexp AS l
JOIN    u5mort  AS u USING (country_code, year)
JOIN    adofert AS a USING (country_code, year)
JOIN    meta    AS m USING (country_code);   -- inner join keeps only valid groups
"""
    con.executescript(merge_sql)

    # -----------------------------------------------------------
    # 5.  Export the clean panel to CSV
    # -----------------------------------------------------------
    panel = pd.read_sql("SELECT * FROM population_panel;", con)
    out_path = root / "population_panel_clean.csv"
    panel.to_csv(out_path, index=False)
    print(f"✔  Clean panel written to {out_path}  ({len(panel):,} rows)")
finally:
    con.close()

✔  Clean panel written to /Users/qwuiris/Desktop/QTM 350 group project/data/population_panel_clean.csv  (11,536 rows)
