## 🔗 Open This Notebook in Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DavidLangworthy/ds4s/blob/master/Day%203%20%E2%80%93%20Pollution%20and%20Public%20Health.ipynb)

# 🌫️ Day 3 · Linking Pollution and Prosperity

Today we practice pairing two datasets to explain a public-health risk. We'll move through focused rounds—load, join, inspect, and visualize—so you always know the next action.

## 🗂️ Data Card · PM₂.₅ Exposure + GDP per Capita
| Field | Details |
| --- | --- |
| Sources | [World Bank PM₂.₅ exposure](https://data.worldbank.org/indicator/EN.ATM.PM25.MC.M3) & [World Bank GDP per capita](https://data.worldbank.org/indicator/NY.GDP.PCAP.CD) |
| Temporal coverage | 1990–2021 (PM₂.₅), 1960–2022 (GDP) |
| Geographic scope | Countries & select regions |
| Units | PM₂.₅: µg/m³ annual mean; GDP: current US dollars |
| Update cadence | Annual |
| Caveats | Missing data for small states; GDP in current USD ignores purchasing power; PM₂.₅ focuses on outdoor exposure. |
| What this chart cannot show | Within-country inequality, other pollutants (ozone, NO₂), or causal mechanisms. Pair with policy case studies for nuance. |

In [None]:
# 🔁 Shared scaffolds used across DS4S notebooks
from __future__ import annotations

import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Matplotlib defaults shared by the DS4S notebooks. Updating rcParams changes
# the defaults for figures created afterwards in this session.
plt.rcParams.update({
    "figure.dpi": 120,
    "axes.facecolor": "#f8f9fb",  # very light grey plot background
    "axes.grid": True,
    "grid.alpha": 0.25,  # faint grid lines so the data stays dominant
    "grid.linestyle": "--",
    "axes.titlesize": 18,
    "axes.labelsize": 12,
    "axes.titleweight": "bold",
    "legend.frameon": False,
    "legend.fontsize": 11,
    "font.family": "DejaVu Sans",
})

def quick_diagnostics(
    df: pd.DataFrame,
    dataset_name: str,
    *,
    expected_columns: list[str] | None = None,
    expected_rows: tuple[int, int] | None = None,
) -> None:
    """Print a non-fatal health summary of ``df``.

    The report lists the shape, the column names, per-column null counts and
    the first rows. Optional expectations are checked along the way; a failed
    expectation is reported with ``warnings.warn`` and never raises.

    Args:
        df: Table to describe.
        dataset_name: Label shown in the report header.
        expected_columns: Column names that should be present, if given.
        expected_rows: Inclusive ``(low, high)`` bounds for ``len(df)``, if given.
    """
    header_lines = (
        f"\n🔍 {dataset_name}",
        f"Shape: {df.shape}",
        f"Columns: {list(df.columns)}",
    )
    for header_line in header_lines:
        print(header_line)

    # Column expectation: warn only when at least one name is absent.
    absent = []
    if expected_columns is not None:
        absent = [name for name in expected_columns if name not in df.columns]
    if absent:
        warnings.warn(f"Missing expected columns: {absent}")

    # Row-count expectation: both bounds are inclusive.
    if expected_rows is not None:
        minimum, maximum = expected_rows
        row_total = len(df)
        if minimum <= row_total <= maximum:
            print(f"Row count within expected range {expected_rows}.")
        else:
            warnings.warn(f"Row count {row_total} outside expected range {expected_rows}")

    print("Null counts:")
    print(df.isna().sum())
    print("Preview:")
    print(df.head())
    print("-" * 60)

def expect_value_range(series: pd.Series, *, lower: float | None = None, upper: float | None = None, context: str = "") -> None:
    """Warn when values fall outside an expected numeric window.

    The observed minimum and maximum are always printed. A warning is emitted
    (never an exception for numeric data) when the minimum is below ``lower``,
    when the maximum is above ``upper``, or when the series has no non-null
    values at all, in which case no range check is possible.

    Args:
        series: Values to inspect.
        lower: Smallest acceptable value, or ``None`` to skip the lower check.
        upper: Largest acceptable value, or ``None`` to skip the upper check.
        context: Label for messages; falls back to ``series.name``, then "series".
    """
    label = context or series.name or "series"
    # Compute the extrema once; every check and the summary line reuse them.
    observed_min = series.min()
    observed_max = series.max()

    # An empty or all-NaN series yields NaN extrema. NaN comparisons are always
    # False, so without this check such a series would look like it passed.
    if pd.isna(observed_min) or pd.isna(observed_max):
        warnings.warn(f"{label}: no non-null values to check against the expected range")

    if lower is not None and float(observed_min) < lower:
        warnings.warn(f"{label}: minimum {observed_min:.3f} is below expected {lower}")
    if upper is not None and float(observed_max) > upper:
        warnings.warn(f"{label}: maximum {observed_max:.3f} is above expected {upper}")
    print(f"{label}: {observed_min:.3f} → {observed_max:.3f}")

def validate_story_elements(*, title: str, subtitle: str, annotation: str, source: str, units: str) -> None:
    """Confirm the storytelling scaffold is filled before plotting.

    A field counts as missing when it is ``None`` or when its text is empty or
    whitespace-only. Missing fields are reported in one warning; otherwise a
    confirmation line is printed. The function never raises for missing fields.

    Args:
        title: Chart headline.
        subtitle: Supporting line shown under the title.
        annotation: Narrative takeaway placed on the chart.
        source: Data attribution text.
        units: Units label for the plotted measure.
    """
    elements = {
        "TITLE": title,
        "SUBTITLE": subtitle,
        "ANNOTATION": annotation,
        "SOURCE": source,
        "UNITS": units,
    }
    # Check None explicitly: str(None) is the non-empty text "None", which
    # would otherwise make an unset field look filled in.
    missing = [
        key
        for key, value in elements.items()
        if value is None or not str(value).strip()
    ]
    if missing:
        warnings.warn(f"Please fill these storytelling fields: {', '.join(missing)}")
    else:
        print("👍 Story scaffold complete.")

def baseline_style(ax: plt.Axes | None = None) -> plt.Axes:
    """Standardise axes styling for consistency across notebooks.

    Hides the top and right spines and sets a white plot background.

    Args:
        ax: Axes to style. When ``None``, the current axes (``plt.gca()``) is used.

    Returns:
        The styled axes, so calls can be chained.
    """
    # Test against None explicitly rather than relying on truthiness, so an
    # axes-like object that evaluates as falsy is never silently replaced.
    if ax is None:
        ax = plt.gca()
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.set_facecolor("#ffffff")
    return ax

def save_last_visual(fig, filename: str, *, subfolder: str = "plots") -> None:
    """Persist the most recent Matplotlib or Plotly figure without failing the run.

    The figure is written to ``<current working directory>/<subfolder>/<filename>``.
    Objects exposing ``write_image`` are exported through it; otherwise
    ``savefig`` is used. Any problem, including failure to create the output
    folder, is reported as a warning instead of an exception.

    Args:
        fig: Figure object exposing ``write_image`` or ``savefig``.
        filename: Output file name, including the extension.
        subfolder: Folder under the current working directory to write into.
    """
    # Deliberate best-effort boundary: a failed export must never stop the
    # notebook, so directory creation is guarded together with the export.
    try:
        plots_dir = Path.cwd() / subfolder
        plots_dir.mkdir(parents=True, exist_ok=True)
        output_path = plots_dir / filename
        if hasattr(fig, "write_image"):
            fig.write_image(str(output_path))
        elif hasattr(fig, "savefig"):
            fig.savefig(output_path, dpi=300, bbox_inches="tight")
        else:
            warnings.warn("Figure type not supported for export; skipping save.")
            return
        print(f"Saved visual to {output_path}")
    except Exception as exc:
        warnings.warn(f"Plot export skipped: {exc}")


## Step 1 · Load both World Bank tables
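Read the wide-format CSVs and verify that each one has the identifier columns (`Country Name`, `Country Code`) and a plausible number of rows. The year columns are used in Step 2.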

In [None]:
# Both CSVs are read from a "data" folder under the current working directory.
data_dir = Path.cwd().joinpath("data")
df_pm = pd.read_csv(data_dir.joinpath("pm25_exposure.csv"))
df_gdp = pd.read_csv(data_dir.joinpath("gdp_per_country.csv"))

# Warn-only checks: identifier columns present, row counts within range.
quick_diagnostics(df_pm, "PM2.5 exposure (wide)", expected_columns=["Country Name", "Country Code"], expected_rows=(200, 250))
quick_diagnostics(df_gdp, "GDP per capita (wide)", expected_columns=["Country Name", "Country Code"], expected_rows=(250, 300))


## Step 2 · Slice a single year and join
Select one year covered by both tables (`TARGET_YEAR`, set to 2019 below), rename that year's column in each table, and perform an inner join to keep only countries with both metrics.

In [None]:
TARGET_YEAR = 2019

# Keep the identifier columns plus the chosen year, and give the year column a
# metric-specific name so the two tables can be combined without a clash.
pm_year = df_pm[["Country Name", "Country Code", str(TARGET_YEAR)]].rename(columns={str(TARGET_YEAR): "PM25"})
gdp_year = df_gdp[["Country Name", "Country Code", str(TARGET_YEAR)]].rename(columns={str(TARGET_YEAR): "GDP_per_capita"})

quick_diagnostics(
    pm_year,
    f"PM2.5 in {TARGET_YEAR}",
    expected_columns=["Country Name", "Country Code", "PM25"],
    expected_rows=(200, 250),
)
# Selecting columns does not remove rows, so the slice must be checked against
# the same row window as the wide GDP table (250–300); the previous (200, 250)
# window could only pass together with the Step 1 check at exactly 250 rows.
quick_diagnostics(
    gdp_year,
    f"GDP per capita in {TARGET_YEAR}",
    expected_columns=["Country Name", "Country Code", "GDP_per_capita"],
    expected_rows=(250, 300),
)

# Join on the country code only. Also matching on the display name silently
# drops a country whenever the two files spell its name differently. The name
# column is kept from the PM2.5 side, so the output columns are unchanged.
# dropna() then removes rows that lack a value for either metric.
df_merge = pm_year.merge(
    gdp_year.drop(columns="Country Name"),
    on="Country Code",
    how="inner",
).dropna()
quick_diagnostics(
    df_merge,
    f"Merged PM2.5 + GDP in {TARGET_YEAR}",
    expected_columns=["Country Name", "Country Code", "PM25", "GDP_per_capita"],
    expected_rows=(180, 230),
)


## Step 3 · Run quick health checks
Before plotting, inspect ranges and highlight a few outliers so you know what story points to amplify.

In [None]:
# Range checks only warn; they never stop the notebook.
expect_value_range(
    df_merge.loc[:, "PM25"],
    lower=0,
    upper=120,
    context="PM2.5 (µg/m³)",
)
expect_value_range(
    df_merge.loc[:, "GDP_per_capita"],
    lower=100,
    upper=150000,
    context="GDP per capita (USD)",
)

# Rows with the highest PM2.5 first, then rows with the lowest.
print(df_merge.sort_values(by="PM25", ascending=False).head(n=5))
print(df_merge.sort_values(by="PM25", ascending=True).head(n=5))


### Expected trend preview
Higher-income countries typically meet WHO guidelines, while many lower-income nations face hazardous air. Compare your scatter to this preview.

![Preview of the finished chart](../../../plots/day03_solution_plot.png)

## Step 4 · Build the interactive story
Use Plotly Express to create a scatter with GDP per capita on a log-scale x-axis. Add the WHO guideline line and a narrative annotation so the takeaway is explicit.

In [None]:
import plotly.express as px

# Storytelling scaffold: every field below is checked by validate_story_elements.
TITLE = "Economic gains correlate with cleaner air, but not for everyone"
SUBTITLE = f"PM₂.₅ exposure vs. GDP per capita (log scale), {TARGET_YEAR}"
ANNOTATION = "Wealthier countries cluster below the WHO guideline; many low-income nations breathe 6× that level."
SOURCE = "Source: World Bank PM₂.₅ exposure & GDP per capita (downloaded 2025-01-05)"
UNITS = "PM₂.₅ annual mean exposure (µg/m³)"

validate_story_elements(
    title=TITLE,
    subtitle=SUBTITLE,
    annotation=ANNOTATION,
    source=SOURCE,
    units=UNITS,
)

# Guideline level in µg/m³; used for both the reference line and its label.
who_guideline = 5

# One marker per row of df_merge; PM2.5 drives both marker colour and size.
fig_pollution = px.scatter(
    df_merge,
    x="GDP_per_capita",
    y="PM25",
    hover_name="Country Name",
    color="PM25",
    size="PM25",
    size_max=40,
    color_continuous_scale="Viridis",
    template="plotly_white",
)

fig_pollution.update_traces(
    marker=dict(opacity=0.75, line=dict(color="#333333", width=0.5))
)

fig_pollution.update_layout(
    title=dict(text=f"{TITLE}<br><sup>{SUBTITLE}</sup>", x=0, font=dict(size=22)),
    xaxis=dict(
        title="GDP per capita (current US$)",
        type="log",
        tickformat="$~s",
    ),
    yaxis=dict(title=UNITS),
    coloraxis_colorbar=dict(title=UNITS),
    # The bottom margin is large enough to hold the two-line footnote added
    # below; annotations do not enlarge the margin on their own.
    margin=dict(l=40, r=40, t=100, b=130),
)

fig_pollution.add_hline(
    y=who_guideline,
    line_dash="dash",
    line_color="#ef553b",
    # Built from who_guideline so the label cannot drift from the line position.
    annotation_text=f"WHO guideline ({who_guideline} µg/m³)",
    annotation_position="top left",
    annotation_font=dict(size=12, color="#ef553b"),
)

# Footnote: anchored to the bottom edge of the plot area and pushed down by a
# fixed pixel offset, so it clears the x-axis title at any figure height.
fig_pollution.add_annotation(
    xref="paper",
    yref="paper",
    x=0.02,
    xanchor="left",
    y=0,
    yanchor="top",
    yshift=-70,
    text=f"{ANNOTATION}<br>{SOURCE} · Units: {UNITS}",
    showarrow=False,
    font=dict(size=12, color="#555555"),
    align="left",
)

fig_pollution.show()


In [None]:
# Export the Plotly figure to <current working directory>/plots/day03_solution_plot.png.
# NOTE(review): Plotly's write_image needs a static-export backend (e.g. kaleido) —
# confirm it is installed, otherwise the export is skipped with a warning.
save_last_visual(fig_pollution, "day03_solution_plot.png")

## 🔍 Reflection & limitations
- This scatter does not prove causation—economic development can both reduce and increase pollution depending on policy.
- Encourage discussion about indoor air quality, wildfire smoke, and monitoring coverage, none of which appear here.
- Ask students to draft a short caption using the Claim → Evidence → Visual → Takeaway scaffold.