# Preliminary exploration

In [None]:
from __future__ import annotations

from pathlib import Path
import re
from functools import lru_cache

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Directory holding the DDF CSV files. The path is relative, so it resolves
# against the current working directory (normally the notebook's folder).
DATASET_DIR = Path("dataset")

# DDF datapoints filename convention:
#   ddf--datapoints--<indicator>--by--geo--time.csv
# GLOB_PATTERN selects candidate files; FILENAME_RX captures <indicator>.
GLOB_PATTERN = "ddf--datapoints--*--by--geo--time.csv"
FILENAME_RX = re.compile(r"^ddf--datapoints--(.+)--by--geo--time\.csv$")


In [None]:
# Discover every datapoints file; sorting keeps the scan order deterministic.
files = sorted(DATASET_DIR.glob(GLOB_PATTERN))

# One catalog row per file whose name matches the DDF convention. The regex
# capture group is the indicator id.
catalog_rows = []
for f in files:
    m = FILENAME_RX.match(f.name)
    if not m:
        continue
    indicator = m.group(1)
    catalog_rows.append({"indicator": indicator, "path": f, "filename": f.name})

# Passing `columns` explicitly keeps the frame well-formed when no files are
# found: without it an empty DataFrame has no "indicator" column and
# sort_values raises KeyError before the count below can even be printed.
catalog = (
    pd.DataFrame(catalog_rows, columns=["indicator", "path", "filename"])
    .sort_values("indicator")
    .reset_index(drop=True)
)

print(f"Found {len(catalog)} datapoints files in {DATASET_DIR.resolve()}")
catalog.head(10)


In [None]:
def _read_csv(path: Path) -> pd.DataFrame:
    """Read one datapoints CSV with every column kept as text.

    ``dtype=str`` stops pandas from guessing column types, so the caller
    decides which columns to convert and how to treat unparsable cells.
    Cells that pandas recognises as missing are still returned as NaN.

    Args:
        path: Location of the CSV file.

    Returns:
        The raw table with string-typed columns.
    """
    return pd.read_csv(path, dtype=str)

@lru_cache(maxsize=512)
def load_indicator(indicator: str) -> pd.DataFrame:
    """Load one indicator's datapoints as a tidy four-column frame.

    The result is memoised by ``lru_cache``, so repeated calls hand back the
    same DataFrame object; callers should treat it as read-only.

    Args:
        indicator: Indicator id as listed in ``catalog["indicator"]``.

    Returns:
        A frame with columns ``indicator``, ``geo`` (str), ``time`` (nullable
        ``Int64``) and ``value`` (numeric). Cells in ``time`` or ``value``
        that cannot be parsed as numbers become missing values.

    Raises:
        KeyError: The indicator is not in the catalog.
        ValueError: The CSV lacks ``geo``, ``time`` or the indicator column.
    """
    matches = catalog.loc[catalog["indicator"].eq(indicator)]
    if matches.empty:
        raise KeyError(f"Unknown indicator: {indicator}")

    path = Path(matches.iloc[0]["path"])
    df = _read_csv(path)

    # The file must carry the two key columns plus one column named after
    # the indicator itself.
    expected = {"geo", "time", indicator}
    missing = expected - set(df.columns)
    if missing:
        raise ValueError(f"{path.name} is missing columns: {missing}. Has: {list(df.columns)}")

    # Rename the indicator column to a common name, then coerce types.
    # errors="coerce" turns unparsable cells into NaN instead of raising.
    tidy = df.rename(columns={indicator: "value"}).assign(
        indicator=indicator,
        geo=lambda d: d["geo"].astype(str),
        time=lambda d: pd.to_numeric(d["time"], errors="coerce").astype("Int64"),
        value=lambda d: pd.to_numeric(d["value"], errors="coerce"),
    )

    return tidy[["indicator", "geo", "time", "value"]]


In [None]:
def describe_indicator(indicator: str) -> dict:
    """Summarise one indicator: size, geo count, time range, missing values.

    Args:
        indicator: Indicator id understood by ``load_indicator``.

    Returns:
        A dict with keys ``indicator``, ``rows``, ``geos``, ``time_min``,
        ``time_max`` and ``missing_values``. The two time bounds are ``None``
        when the indicator has no usable time values at all.
    """
    df = load_indicator(indicator)
    years = df["time"]
    # min()/max() of an all-missing column cannot be turned into an int.
    has_years = years.notna().any()

    stats = {"indicator": indicator}
    stats["rows"] = int(len(df))
    stats["geos"] = int(df["geo"].nunique())
    stats["time_min"] = int(years.min()) if has_years else None
    stats["time_max"] = int(years.max()) if has_years else None
    stats["missing_values"] = int(df["value"].isna().sum())
    return stats

# One summary row per catalogued indicator. This loads every datapoints file
# through load_indicator, whose LRU cache keeps up to 512 results for reuse.
summary = pd.DataFrame([describe_indicator(ind) for ind in catalog["indicator"]])
# Notebook display: the 20 indicators with the most rows.
summary.sort_values(["rows"], ascending=False).reset_index(drop=True).head(20)


In [None]:
def find_indicators(keyword: str, case: bool = False) -> pd.DataFrame:
    """Return catalog entries whose indicator id contains `keyword`.

    The keyword is matched as a literal substring, not as a regex.

    Args:
        keyword: Text to look for inside the indicator id.
        case: Match case-sensitively when True; the default ignores case.

    Returns:
        The matching rows' ``indicator`` and ``filename`` columns, re-indexed
        from zero.
    """
    names = catalog["indicator"]
    hits = names.str.contains(keyword, case=case, regex=False)
    selected = catalog.loc[hits, ["indicator", "filename"]]
    return selected.reset_index(drop=True)

# examples:
find_indicators("suicide").head(30)


In [None]:
def coverage(indicator: str) -> dict:
    """Report which geos and which time range an indicator covers.

    Args:
        indicator: Indicator id understood by ``load_indicator``.

    Returns:
        A dict with keys ``indicator``, ``n_geos``, ``geos_sample`` (the first
        25 geo codes in sorted order), ``time_min`` and ``time_max``. The time
        bounds are ``None`` when no time value is present.
    """
    df = load_indicator(indicator)
    geo_codes = sorted(df["geo"].unique())

    years = df["time"]
    if years.notna().any():
        first_year = int(years.min())
        last_year = int(years.max())
    else:
        first_year = last_year = None

    return {
        "indicator": indicator,
        "n_geos": int(df["geo"].nunique()),
        "geos_sample": geo_codes[:25],
        "time_min": first_year,
        "time_max": last_year,
    }

# pick one:
coverage(catalog["indicator"].iloc[0])


In [None]:
def timeseries(indicator: str, geo: str) -> pd.DataFrame:
    """Return one geo's rows for an indicator, ordered by time.

    Rows without a time value are dropped and the index is reset.

    Args:
        indicator: Indicator id understood by ``load_indicator``.
        geo: Geo code to keep; compared for exact equality.

    Returns:
        The filtered frame, empty when the geo has no rows.
    """
    data = load_indicator(indicator)
    rows = data.loc[data["geo"].eq(geo)]
    rows = rows.dropna(subset=["time"])
    return rows.sort_values("time").reset_index(drop=True)

def plot_timeseries(indicator: str, geo: str):
    """Draw a line chart of one indicator over time for a single geo.

    Prints a short notice and draws nothing when the geo has no data.

    NOTE: the "Graphs" section later in this file defines another
    ``plot_timeseries(indicator, geos=...)``; once that cell has run, it
    replaces this function.

    Args:
        indicator: Indicator id understood by ``load_indicator``.
        geo: Geo code to plot.
    """
    series = timeseries(indicator, geo)
    if series.empty:
        print(f"No data for geo={geo!r} in indicator={indicator!r}")
        return

    # time is a nullable integer column; plot it as plain floats.
    years = series["time"].astype(float)

    _, ax = plt.subplots()
    ax.plot(years, series["value"])
    ax.set_title(f"{indicator} — {geo}")
    ax.set_xlabel("time")
    ax.set_ylabel("value")
    plt.show()

# Example (change geo to something you have, e.g. 'swe', 'usa', 'esp', etc.)
plot_timeseries(catalog["indicator"].iloc[0], "swe")


In [None]:
def plot_compare_geos(indicator: str, geos: list[str]):
    """Plot one indicator over time for several geos on shared axes.

    Geos without any rows are reported with a printed notice instead of
    being dropped silently. When none of the requested geos has data, no
    figure is created; this avoids an empty chart and a legend with no
    entries.

    Args:
        indicator: Indicator id understood by ``load_indicator``.
        geos: Geo codes to draw, one line each, in the given order.
    """
    df = load_indicator(indicator).dropna(subset=["time"]).sort_values("time")

    # Separate geos that have rows from those that do not. A list of pairs
    # (rather than a dict) keeps request order and repeated geos intact.
    found = []
    skipped = []
    for g in geos:
        ts = df[df["geo"].eq(g)]
        if ts.empty:
            skipped.append(g)
        else:
            found.append((g, ts))

    if skipped:
        print(f"No data for geos={skipped!r} in indicator={indicator!r}")
    if not found:
        return

    plt.figure()
    for g, ts in found:
        # time is a nullable integer column; plot it as plain floats.
        plt.plot(ts["time"].astype(float), ts["value"], label=g)
    plt.title(f"{indicator} — compare geos")
    plt.xlabel("time")
    plt.ylabel("value")
    plt.legend()
    plt.show()

plot_compare_geos("suicide_per_100000_people", ["swe", "usa", "esp"])


In [None]:
def snapshot(geo: str, time: int) -> pd.DataFrame:
    """Collect the value of every catalogued indicator for one geo and time.

    Each indicator is loaded through ``load_indicator``. When several rows
    match, the first one is used; indicators whose first matching value is
    missing are left out.

    Args:
        geo: Geo code to look up.
        time: Time value to look up.

    Returns:
        A frame with columns ``indicator`` and ``value`` sorted by indicator.
        It is empty, but still has both columns, when nothing matches.
    """
    rows = []
    for ind in catalog["indicator"]:
        df = load_indicator(ind)
        v = df.loc[df["geo"].eq(geo) & df["time"].eq(time), "value"]
        if v.empty:
            continue
        first = v.iloc[0]
        rows.append({"indicator": ind, "value": float(first) if pd.notna(first) else np.nan})

    # Naming the columns up front keeps dropna/sort_values working when no
    # indicator matched; a bare pd.DataFrame([]) has no "value" column and
    # dropna(subset=["value"]) would raise KeyError.
    out = pd.DataFrame(rows, columns=["indicator", "value"])
    out["value"] = out["value"].astype(float)
    return out.dropna(subset=["value"]).sort_values("indicator").reset_index(drop=True)

snapshot("swe", 2019).head(30)


In [None]:
# Stack every indicator into one long table with the columns
# indicator / geo / time / value (the schema load_indicator returns).
all_long = pd.concat([load_indicator(ind) for ind in catalog["indicator"]], ignore_index=True)
# Notebook display: the first rows together with the (rows, columns) shape.
all_long.head(), all_long.shape


In [None]:
# Persist the combined long table to a parquet file. The path is relative,
# so the file lands in the current working directory.
# NOTE(review): DataFrame.to_parquet needs a parquet engine (pyarrow or
# fastparquet) — confirm one is installed in this environment.
OUT_PARQUET = Path("all_datapoints_long.parquet")
all_long.to_parquet(OUT_PARQUET, index=False)
print("Wrote:", OUT_PARQUET.resolve())


In [None]:
# Reload the long table from disk; this rebinds all_long to the parquet copy.
# The filename literal repeats the value assigned to OUT_PARQUET earlier.
all_long = pd.read_parquet("all_datapoints_long.parquet")
# Notebook display: (rows, columns) of the reloaded table.
all_long.shape


In [None]:
# Reshape long -> wide: one row per (geo, time), one column per indicator.
# aggfunc="first" resolves repeated (geo, time, indicator) keys by taking a
# single entry instead of pivot_table's default of averaging them.
wide = all_long.pivot_table(
    index=["geo", "time"],
    columns="indicator",
    values="value",
    aggfunc="first"
).reset_index()

wide.head()


In [None]:
# Per-indicator missingness: total row count, number of missing values, and
# their ratio, sorted with the worst indicators first.
# Note that missing_pct is a fraction between 0 and 1 (missing / rows), not a
# percentage, despite the "_pct" suffix.
missing_by_indicator = (
    all_long.assign(is_missing=lambda d: d["value"].isna())
            .groupby("indicator", as_index=False)
            .agg(rows=("value", "size"), missing=("is_missing", "sum"))
            .assign(missing_pct=lambda d: d["missing"] / d["rows"])
            .sort_values("missing_pct", ascending=False)
            .reset_index(drop=True)
)

missing_by_indicator.head(20)


In [None]:
# Rank indicators by coverage: most geos first, then the longest time span
# (time_max - time_min), then the most rows. All three keys sort descending.
coverage_rank = summary.assign(
    time_span=lambda d: (d["time_max"] - d["time_min"])
).sort_values(["geos", "time_span", "rows"], ascending=False).reset_index(drop=True)

coverage_rank.head(20)


# Graphs

In [None]:
from pathlib import Path
import re
from functools import lru_cache

import pandas as pd
import matplotlib.pyplot as plt

# Folder with the DDF CSV files, relative to the current working directory.
DATASET_DIR = Path("dataset")
# Captures <indicator> from ddf--datapoints--<indicator>--by--geo--time.csv.
RX = re.compile(r"^ddf--datapoints--(.+)--by--geo--time\.csv$")


In [None]:
# Candidate datapoints files. Path.glob returns a lazy iterator and the
# results are not sorted here.
files = DATASET_DIR.glob("ddf--datapoints--*--by--geo--time.csv")

# Map indicator id -> CSV path for every file that matches the naming rule.
# NOTE(review): this rebinds `catalog` to a dict. The exploration section's
# helpers (e.g. load_indicator) use `catalog` as a DataFrame, so they stop
# working once this cell has run in the same kernel.
catalog = {}
for f in files:
    m = RX.match(f.name)
    if m:
        catalog[m.group(1)] = f

@lru_cache(maxsize=512)
def load(indicator: str) -> pd.DataFrame:
    """Read one indicator's CSV as a three-column geo / time / value frame.

    The indicator's own column is renamed to ``value``. ``time`` is converted
    to numbers and rows whose time cannot be parsed are dropped. Results are
    memoised by ``lru_cache``, so callers that want to modify the frame
    should copy it first.

    Args:
        indicator: Key into the module-level ``catalog`` mapping.

    Returns:
        A frame with columns ``geo``, ``time`` and ``value``.

    Raises:
        KeyError: The indicator is not in ``catalog``, or the CSV lacks one
            of the expected columns.
    """
    raw = pd.read_csv(catalog[indicator])
    tidy = raw.rename(columns={indicator: "value"})
    # errors="coerce" turns unparsable time cells into NaN so they can be
    # filtered out below.
    tidy["time"] = pd.to_numeric(tidy["time"], errors="coerce")
    has_time = tidy["time"].notna()
    return tidy.loc[has_time, ["geo", "time", "value"]]


In [None]:
def plot_timeseries(indicator, geos=("usa", "swe", "deu", "fra", "jpn")):
    """Plot one indicator over time, one labelled line per geo.

    Geos with no rows in the data are skipped without a message.

    Args:
        indicator: Indicator id understood by ``load``.
        geos: Geo codes to draw, in this order.
    """
    data = load(indicator)

    _, ax = plt.subplots()
    for code in geos:
        series = data[data.geo == code]
        if series.empty:
            continue
        ax.plot(series.time, series.value, label=code)

    # The indicator id doubles as the title once underscores become spaces.
    ax.set_title(indicator.replace("_", " "))
    ax.set_xlabel("Year")
    ax.set_ylabel("Value")
    ax.legend()
    plt.show()


def plot_latest_bar(indicator, year=None, top_n=15):
    """Draw a horizontal bar chart of the top geos for one year.

    Args:
        indicator: Indicator id understood by ``load``.
        year: Year to show; defaults to the largest time value in the data.
        top_n: Number of highest-valued geos to include.
    """
    data = load(indicator)
    if year is None:
        year = int(data.time.max())

    # Keep complete rows for the chosen year and take the largest values.
    in_year = data[data.time == year].dropna()
    top = in_year.sort_values("value", ascending=False).head(top_n)

    _, ax = plt.subplots()
    ax.barh(top.geo, top.value)
    ax.set_title(f"{indicator.replace('_', ' ')} ({year})")
    ax.set_xlabel("Value")
    # barh draws the first row at the bottom; flip so the largest is on top.
    ax.invert_yaxis()
    plt.show()


In [None]:
plot_timeseries("alcohol_consumption_per_adult_15plus_litres")


In [None]:
plot_timeseries("body_mass_index_bmi_men_kgperm2")


In [None]:
plot_timeseries("body_mass_index_bmi_women_kgperm2")


In [None]:
plot_latest_bar("breast_cancer_deaths_per_100000_women")


In [None]:
plot_timeseries("cell_phones_per_100_people")


In [None]:
plot_latest_bar("cell_phones_total", top_n=20)


In [None]:
plot_timeseries("cholesterol_fat_in_blood_men_mmolperl")


In [None]:
plot_timeseries("cholesterol_fat_in_blood_women_mmolperl")


In [None]:
plot_latest_bar("data_quality_income_per_person")


In [None]:
plot_timeseries("economic_growth_over_the_past_10_years")


In [None]:
plot_timeseries("females_aged_15_24_unemployment_rate_percent")


In [None]:
plot_timeseries("females_aged_25_54_unemployment_rate_percent")


In [None]:
plot_timeseries("male_long_term_unemployment_rate_percent")


In [None]:
plot_timeseries("suicide_per_100000_people")


In [None]:
plot_timeseries("suicide_men_per_100000_people")

In [None]:
plot_timeseries("suicide_women_per_100000_people")

In [None]:
plot_timeseries("suicide_total_deaths", geos=("usa", "jpn", "rus", "ind", "chn"))


In [None]:
plot_latest_bar("total_number_of_dollar_billionaires")


In [None]:
plot_timeseries("working_hours_per_week")


# Heat Maps

In [None]:
!pip install plotly


In [None]:
import plotly.express as px
import pandas as pd


In [None]:
import plotly.io as pio

# Select the renderer Plotly uses when a figure is shown.
# "vscode" is usually the right choice inside VS Code notebooks:
pio.renderers.default = "vscode"

# In classic Jupyter Notebook / JupyterLab, if "vscode" does not work, try:
# pio.renderers.default = "notebook_connected"
# or:
# pio.renderers.default = "jupyterlab"


In [None]:
import pandas as pd
import plotly.express as px

# Smoke test for the renderer setting: a tiny hand-written table of five
# ISO-3 country codes, drawn as a choropleth. If the map appears, Plotly
# output works in this environment.
test = pd.DataFrame({
    "iso3": ["AFG", "SWE", "USA", "ESP", "CHN"],
    "value": [2, 10, 8, 6, 4]
})

fig = px.choropleth(
    test,
    locations="iso3",
    locationmode="ISO-3",
    color="value",
    title="Renderer test map"
)
fig.show()


In [None]:
import plotly.express as px

def plot_world_map(indicator: str, year: int | None = None):
    """Show a world choropleth of one indicator for a single year.

    Prints a one-line summary (year, row count, distinct country codes)
    before displaying the figure.

    Args:
        indicator: Indicator id understood by ``load``.
        year: Year to map; defaults to the largest time value in the data.
    """
    # assign() returns a new frame, so the cached result of load() is left
    # untouched. The geo codes are upper-cased for the "ISO-3" location mode.
    data = load(indicator).assign(iso3=lambda d: d["geo"].astype(str).str.upper())

    if year is None:
        year = int(data["time"].max())

    in_year = data[data["time"] == year]
    d = in_year.dropna(subset=["value", "iso3"])

    print(f"{indicator} | year={year} | rows={len(d)} | unique ISO3={d['iso3'].nunique()}")

    fig = px.choropleth(
        d,
        locations="iso3",
        locationmode="ISO-3",
        color="value",
        hover_name="iso3",
        hover_data={"value": True},
        title=f"{indicator.replace('_', ' ')} ({year})",
        color_continuous_scale="Viridis",
    )

    # Trim the outer margins, leaving room at the top for the title.
    fig.update_layout(margin={"l": 0, "r": 0, "t": 40, "b": 0})
    fig.show()


In [None]:
plot_world_map("suicide_women_per_100000_people")


In [None]:
def plot_world_map_animated(indicator: str):
    """Show an animated world choropleth with one frame per time value.

    Args:
        indicator: Indicator id understood by ``load``.
    """
    # Work on a copy so the cached frame returned by load() is not modified.
    df = load(indicator).copy()
    # The geo codes are upper-cased for the "ISO-3" location mode.
    df["iso3"] = df["geo"].astype(str).str.upper()

    df = df.dropna(subset=["value", "iso3", "time"])

    # Plotly Express creates animation frames in the order the frame values
    # first appear in the data. Sorting by time makes the animation play
    # chronologically even when the rows are grouped by geo. The stable sort
    # keeps the original geo order within each time value.
    df = df.sort_values("time", kind="stable")

    # NOTE(review): the colour scale may be fitted to the first frame only;
    # consider passing range_color for a fixed scale — confirm in the Plotly
    # documentation.
    fig = px.choropleth(
        df,
        locations="iso3",
        locationmode="ISO-3",
        color="value",
        animation_frame="time",
        color_continuous_scale="Viridis",
        title=indicator.replace("_", " "),
    )

    # Trim the outer margins, leaving room at the top for the title.
    fig.update_layout(margin=dict(l=0, r=0, t=40, b=0))
    fig.show()


In [None]:
plot_world_map_animated("suicide_women_per_100000_people")


In [None]:
plot_world_map_animated("alcohol_consumption_per_adult_15plus_litres")


In [None]:
plot_world_map_animated("body_mass_index_bmi_men_kgperm2")


In [None]:
plot_world_map_animated("body_mass_index_bmi_women_kgperm2")


In [None]:
plot_world_map_animated("breast_cancer_deaths_per_100000_women")


In [None]:
plot_world_map_animated("cell_phones_per_100_people")


In [None]:
plot_world_map_animated("cell_phones_total")


In [None]:
plot_world_map_animated("cholesterol_fat_in_blood_men_mmolperl")


In [None]:
plot_world_map_animated("cholesterol_fat_in_blood_women_mmolperl")


In [None]:
plot_world_map_animated("data_quality_income_per_person")


In [None]:
plot_world_map_animated("economic_growth_over_the_past_10_years")


In [None]:
plot_world_map_animated("females_aged_15_24_unemployment_rate_percent")


In [None]:
plot_world_map_animated("females_aged_25_54_unemployment_rate_percent")


In [None]:
plot_world_map_animated("male_long_term_unemployment_rate_percent")


In [None]:
plot_world_map_animated("suicide_age_15_19_per_100000_people")


In [None]:
plot_world_map_animated("suicide_age_15_24_per_100000_people")


In [None]:
plot_world_map_animated("suicide_age_15_29_per_100000_people")


In [None]:
plot_world_map_animated("suicide_age_25_34_per_100000_people")


In [None]:
plot_world_map_animated("suicide_age_35_44_per_100000_people")


In [None]:
plot_world_map_animated("suicide_age_45_54_per_100000_people")


In [None]:
plot_world_map_animated("suicide_age_55_64_per_100000_people")


In [None]:
plot_world_map_animated("suicide_age_65_74_per_100000_people")


In [None]:
plot_world_map_animated("suicide_age_75_84_per_100000_people")


In [None]:
plot_world_map_animated("suicide_age_85plus_per_100000_people")


In [None]:
plot_world_map_animated("suicide_men_per_100000_people")


In [None]:
plot_world_map_animated("suicide_per_100000_people")


In [None]:
plot_world_map_animated("suicide_total_deaths")


In [None]:
# NOTE(review): this is the same call as the first animated-map cell of this
# section, so the identical figure is rendered a second time.
plot_world_map_animated("suicide_women_per_100000_people")


In [None]:
plot_world_map_animated("total_number_of_dollar_billionaires")


In [None]:
plot_world_map_animated("working_hours_per_week")
