# 🌬️ Day 3 · Air Quality & Health
We connect economic development and particulate pollution. Follow the same micro-loop rhythm to keep students engaged.

## 🔄 How to Use This Solution
- Merge datasets step-by-step; each diagnostic cell confirms column names and units.
- Encourage students to hypothesise the relationship before plotting.
- Offer extensions (log scales, regional filters) once the baseline scatter is solid.

> ### 🗂️ Data Card — World Bank PM2.5 & GDP
> - **Sources:** World Bank World Development Indicators — mean annual PM2.5 exposure (`EN.ATM.PM25.MC.M3`) and GDP per capita in current US$ (`NY.GDP.PCAP.CD`).
> - **Temporal coverage:** Annual, 1990–2021 (we focus on 2019 for alignment).
> - **Metrics:** Population-weighted PM2.5 exposure (µg/m³) and GDP per capita (current USD).
> - **Refresh cadence:** Annual updates; download timestamp January 2024.
> - **Caveats:** Missing data for small states; GDP in current USD does not adjust for purchasing power.
> - **Ethics & framing:** Highlight that exposure impacts health inequities and that high GDP does not guarantee clean air.

In [None]:

from __future__ import annotations
from pathlib import Path
from typing import Mapping, Sequence

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

# Input CSVs are read from ./data and figures are written to ./plots, both
# resolved against the working directory at the time this cell runs.
DATA_DIR = Path.cwd() / "data"
PLOTS_DIR = Path.cwd() / "plots"
# Create the plots folder up front; no error if it already exists.
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# Apply the seaborn theme first, then layer the Matplotlib overrides on top.
# NOTE(review): the order presumably matters (the theme call may reset some of
# these rcParams) — confirm before reordering.
sns.set_theme(style="whitegrid", context="notebook", palette="colorblind")
plt.rcParams.update({
    "figure.dpi": 120,
    "axes.titlesize": 18,
    "axes.labelsize": 13,
    "axes.titleweight": "semibold",
    "axes.grid": True,
})


def baseline_style() -> None:
    """Reset plot style to the shared course defaults.

    Re-applies the seaborn theme and then the course's Matplotlib overrides.
    The theme call with ``context="notebook"`` rewrites the title and label
    font sizes, so the sizes chosen in the notebook's setup cell are applied
    again here; without them a "reset" would silently drop those settings.
    """
    sns.set_theme(style="whitegrid", context="notebook", palette="colorblind")
    plt.rcParams.update({
        "axes.grid": True,
        "axes.spines.top": False,
        "axes.spines.right": False,
        # Same title/label settings as the setup cell, restored after the
        # theme call above has overwritten them.
        "axes.titlesize": 18,
        "axes.titleweight": "semibold",
        "axes.labelsize": 13,
        "figure.dpi": 120,
        "font.size": 12,
    })


def load_data(path: Path | str, *, read_kwargs: Mapping[str, object] | None = None) -> pd.DataFrame:
    """Read a CSV file into a DataFrame and print a one-line confirmation.

    Args:
        path: Location of the CSV file. Plain strings are accepted as well as
            ``Path`` objects.
        read_kwargs: Optional keyword arguments forwarded to ``pd.read_csv``.
            The mapping is copied, so the caller's object is never mutated.

    Returns:
        The loaded DataFrame.
    """
    # Normalise first so ``path.name`` below works for string paths too.
    path = Path(path)
    options = dict(read_kwargs or {})
    df = pd.read_csv(path, **options)
    print(f"✅ Loaded {path.name} with shape {df.shape}")
    return df


def validate_columns(df: pd.DataFrame, required: Sequence[str]) -> None:
    """Confirm that every name in *required* is a column of *df*.

    Prints a confirmation listing the required names when all are present.

    Raises:
        ValueError: If any required column is absent; the message lists the
            absent names in the order they were requested.
    """
    absent = []
    for name in required:
        if name not in df.columns:
            absent.append(name)
    if absent:
        raise ValueError(f"Missing columns: {absent}")
    print("✅ Column check passed:", ", ".join(required))


def expect_rows_between(df: pd.DataFrame, lower: int, upper: int) -> None:
    """Check that *df* has between *lower* and *upper* rows, bounds included.

    Prints a confirmation with the row count when the check passes.

    Raises:
        ValueError: If the row count falls outside the inclusive range.
    """
    n_rows = len(df)
    too_few = n_rows < lower
    too_many = n_rows > upper
    if too_few or too_many:
        raise ValueError(f"Expected between {lower} and {upper} rows, got {n_rows}")
    print(f"✅ Row count within expected range ({n_rows} rows)")


def quick_diagnose(df: pd.DataFrame, *, label: str = "Data preview", n: int = 5) -> None:
    """Show a quick health check for *df*: preview, numeric summary, null counts.

    Args:
        df: Frame (or slice of a frame) to inspect.
        label: Heading printed above the preview.
        n: Number of leading rows to display.
    """
    # Line breaks in the messages are written as "\n" escapes: a physical line
    # break inside a single-quoted literal is a SyntaxError.
    print(f"\n🔍 {label}")
    display(df.head(n))
    numeric = df.select_dtypes(include="number")
    if not numeric.empty:
        # Transposed so each numeric column becomes one row of statistics.
        display(numeric.describe().T)
    nulls = df.isna().sum()
    if nulls.any():
        print("⚠️ Null values detected:\n", nulls[nulls > 0])
    else:
        print("✅ No null values detected in this slice.")


def accessibility_check(ax: plt.Axes) -> None:
    """Verify that *ax* has a title and both axis labels; tidy crowded x ticks.

    When the x axis carries more than 12 tick labels, they are rotated by
    35 degrees. A confirmation is printed once the checks pass.

    Raises:
        ValueError: If the title, the x label, or the y label is empty.
    """
    has_title = bool(ax.get_title())
    has_axis_labels = bool(ax.get_xlabel()) and bool(ax.get_ylabel())
    if not has_title or not has_axis_labels:
        raise ValueError("Add a descriptive title and axis labels before proceeding.")

    tick_texts = [tick.get_text() for tick in ax.get_xticklabels()]
    crowded = len(tick_texts) > 12
    if crowded:
        ax.tick_params(axis="x", labelrotation=35)
    print("✅ Accessibility check: title, labels, and readable ticks confirmed.")


def annotate_latest_point(ax: plt.Axes, x: float, y: float, text: str) -> None:
    """Highlight the point ``(x, y)`` and label it with an arrow-linked note.

    Args:
        ax: Axes to draw on.
        x: Data x-coordinate of the point to highlight.
        y: Data y-coordinate of the point to highlight.
        text: Annotation text, anchored at (0.96, 0.85) in axes-fraction
            coordinates and right-aligned.
    """
    # Match the first line's colour when the axes has one. Axes without line
    # artists (e.g. scatter-only plots) have nothing to copy a colour from, so
    # pass ``None`` and let Matplotlib choose the marker colour.
    marker_color = ax.lines[0].get_color() if ax.lines else None
    ax.scatter([x], [y], color=marker_color, s=60, zorder=5)
    ax.annotate(
        text,
        xy=(x, y),
        xytext=(0.96, 0.85),
        textcoords="axes fraction",
        ha="right",
        arrowprops={"arrowstyle": "->", "color": "#333"},
        fontsize=11,
    )


def require_story_elements(story: Mapping[str, str]) -> None:
    """Ensure the storytelling scaffold has a non-blank entry for every part.

    The parts checked are claim, evidence, visual, takeaway, source and units.
    An entry counts as blank when it is absent or contains only whitespace.
    Prints a confirmation when the scaffold is complete.

    Raises:
        ValueError: If any part is blank; the message lists the blank parts.
    """
    expected_parts = ("claim", "evidence", "visual", "takeaway", "source", "units")
    blanks = []
    for part in expected_parts:
        entry = story.get(part, "")
        if not entry.strip():
            blanks.append(part)
    if blanks:
        raise ValueError(f"Fill in the storytelling scaffold: missing {blanks}")
    print("✅ Story scaffold complete.")


def save_last_fig(filename: str, fig: plt.Figure | None = None) -> None:
    """Save a figure into ``PLOTS_DIR`` and print where it went.

    Args:
        filename: File name (including extension) to write inside ``PLOTS_DIR``.
        fig: Figure to save. Defaults to Matplotlib's current figure.

    Raises:
        ValueError: If the figure has no axes, i.e. there is nothing to save.
    """
    if fig is None:
        fig = plt.gcf()
    if not fig.axes:
        raise ValueError("No Matplotlib figure found to save.")
    output_path = PLOTS_DIR / filename
    # The folder may have been removed since it was first created.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_path, bbox_inches="tight")
    try:
        shown_path = output_path.relative_to(Path.cwd())
    except ValueError:
        # The working directory is no longer a parent of PLOTS_DIR, so there
        # is no relative form; report the full path rather than failing after
        # the file has already been written.
        shown_path = output_path
    print(f"💾 Figure saved to {shown_path}")


## Loop 1 · Load & Align 2019 Snapshots
Goal: select 2019 columns, rename them clearly, and confirm we have matching country codes before merging.

✅ **You should see:** Two tidy dataframes with columns `Country Name`, `Country Code`, and the metric column.

In [None]:

baseline_style()

pm25 = load_data(DATA_DIR / "pm25_exposure.csv")
gdp = load_data(DATA_DIR / "gdp_per_country.csv")

# Check the raw files before selecting from them, so a missing column is
# reported by validate_columns' readable error instead of a pandas KeyError.
validate_columns(pm25, ["Country Name", "Country Code", "2019"])
validate_columns(gdp, ["Country Name", "Country Code", "2019"])

# Keep the identifier columns plus the 2019 value, renamed to say what it holds.
pm25_2019 = (
    pm25[["Country Name", "Country Code", "2019"]]
    .rename(columns={"2019": "pm25_ug_m3"})
)

gdp_2019 = (
    gdp[["Country Name", "Country Code", "2019"]]
    .rename(columns={"2019": "gdp_per_capita_usd"})
)

# Confirm the renamed columns so students see the tidy names they will use next.
validate_columns(pm25_2019, ["Country Name", "Country Code", "pm25_ug_m3"])
validate_columns(gdp_2019, ["Country Name", "Country Code", "gdp_per_capita_usd"])
quick_diagnose(pm25_2019.head(), label="PM2.5 snapshot")
quick_diagnose(gdp_2019.head(), label="GDP snapshot")

# Confirm the two snapshots share country codes before merging.
shared_codes = set(pm25_2019["Country Code"]) & set(gdp_2019["Country Code"])
print(f"🔗 {len(shared_codes)} country codes appear in both snapshots")


## Loop 2 · Merge & Diagnose
Goal: inner-join on country code, drop missing values, and flag the shape so teachers can spot issues early.

✅ **You should see:** ~180 countries with both metrics and no nulls.

In [None]:

# Join on the country code alone: a spelling difference in "Country Name"
# between the two files would otherwise silently drop that country from the
# inner join. The name column is kept from the PM2.5 frame.
merged = (
    pm25_2019.merge(
        gdp_2019.drop(columns="Country Name"),
        on="Country Code",
        how="inner",
    )
    .dropna(subset=["pm25_ug_m3", "gdp_per_capita_usd"])
)

# Fail fast if the merge kept far fewer or far more rows than expected.
expect_rows_between(merged, 150, 220)
# Fixed seed so every run previews the same five rows.
quick_diagnose(merged.sample(5, random_state=42), label="Merged sample")

print("PM2.5 range:", merged["pm25_ug_m3"].min(), "→", merged["pm25_ug_m3"].max())
print("GDP per capita range:", merged["gdp_per_capita_usd"].min(), "→", merged["gdp_per_capita_usd"].max())


## Loop 3 · Baseline Scatter Plot
Goal: visualise the relationship with thoughtful scaling, annotations, and story scaffold.

✅ **You should see:** A negative trend (wealthier countries tend to have lower exposure) with notable outliers.

In [None]:

TITLE = "Wealthier Countries Tend to Breathe Cleaner Air"
SUBTITLE = "PM2.5 exposure versus GDP per capita, 2019"
SOURCE = "Source: World Bank World Development Indicators (downloaded Jan 2024)"
UNITS = "PM2.5 (µg/m³) vs GDP per capita (USD)"

# Median exposure among the richest and the poorest quarter of countries,
# ranked by GDP per capita.
quartile_size = len(merged) // 4
top_quartile_median = merged.nlargest(quartile_size, "gdp_per_capita_usd")["pm25_ug_m3"].median()
bottom_quartile_median = merged.nsmallest(quartile_size, "gdp_per_capita_usd")["pm25_ug_m3"].median()

story = {
    "claim": "Economic development often coincides with lower particulate exposure, but inequalities remain.",
    "evidence": (
        f"Median PM2.5 exposure among the top income quartile is {top_quartile_median:.1f} µg/m³ "
        f"versus {bottom_quartile_median:.1f} µg/m³ for the lowest quartile."
    ),
    "visual": "Log-scale scatter plot with region annotations.",
    "takeaway": "Policy and technology investments matter—income alone doesn't guarantee clean air.",
    "source": SOURCE,
    "units": UNITS,
}
require_story_elements(story)

fig, ax = plt.subplots(figsize=(11, 6))
scatter = ax.scatter(
    merged["gdp_per_capita_usd"],
    merged["pm25_ug_m3"],
    alpha=0.75,
    edgecolor="#222",
    linewidth=0.3,
)
ax.set_xscale("log")
# Two-line title: headline on the first line, subtitle on the second. The line
# break must be the "\n" escape; a physical line break inside the literal is a
# SyntaxError.
ax.set_title(f"{TITLE}\n{SUBTITLE}")
ax.set_xlabel("GDP per capita (log scale, USD)")
ax.set_ylabel("PM2.5 exposure (µg/m³)")
# Source credit in the lower-left corner, positioned in axes coordinates.
ax.text(0.01, 0.03, SOURCE, transform=ax.transAxes, fontsize=9, color="#555")
accessibility_check(ax)

# Call out the country with the lowest PM2.5 exposure.
highlight = merged.nsmallest(1, "pm25_ug_m3").iloc[0]
ax.annotate(
    f"{highlight['Country Name']}: {highlight['pm25_ug_m3']:.1f} µg/m³",
    xy=(highlight["gdp_per_capita_usd"], highlight["pm25_ug_m3"]),
    xytext=(highlight["gdp_per_capita_usd"]*1.5, highlight["pm25_ug_m3"] + 10),
    arrowprops={"arrowstyle": "->"},
    fontsize=10,
)

# Keep a handle on the figure so a later cell can save it.
last_fig = fig
plt.show()

display(pd.DataFrame([story]).T.rename(columns={0: "Story Scaffold"}))


## Loop 4 · Interpret & Extend
Discussion starters:
- Which regions break the overall pattern and why?
- How would the picture change using population-weighted averages by income group?
- Extension: colour points by World Bank income classification and add a regression line.

In [None]:
# Write the Loop 3 figure (kept in `last_fig`) into the plots folder as a PNG.
save_last_fig("day03_solution_plot.png", fig=last_fig)