## 🔗 Open This Notebook in Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DavidLangworthy/ds4s/blob/master/Day%205_%20Capstone%20%E2%80%93%20CO%E2%82%82%20Emissions%20%26%20Global%20Temperature.ipynb)

# 🔥 Day 5 – Capstone Project: CO₂ and Climate
### Telling the Big Story with Data

The capstone weaves together data cleaning, diagnostics, multi-source merging, and a polished narrative. You’ll build a two-panel story that shows how human-caused CO₂ emissions track with global temperature change.

### 🗂️ Data Card
| Field | Details |
| --- | --- |
| **Dataset** | Global Fossil CO₂ Emissions & NASA GISTEMP |
| **Source & link** | Our World in Data (Global Carbon Project) & NASA GISTEMP v4 |
| **Temporal / spatial coverage** | Global totals, annual 1880–2023 |
| **Key units** | CO₂: gigatonnes per year • Temperature: anomaly in °C (1951–1980 baseline) |
| **Method & caveats** | CO₂ data aggregates fossil fuels and cement. Temperature anomalies follow NASA’s 1951–1980 baseline; latest year provisional. |

### ⏱️ Learning Path for Today

Each loop takes about 10–15 minutes:

- [ ] Load and validate the CO₂ and temperature datasets.
- [ ] Align years and run diagnostics on ranges and nulls.
- [ ] Engineer summary metrics for evidence statements.
- [ ] Compose a two-panel narrative figure with accessibility checks.

> 👩‍🏫 **Teacher tip:** Use these checkpoints for quick formative assessment. Have students raise a colored card after each check cell to signal confidence or questions.

> ### 👩‍🏫 Teacher Sidebar
> **Suggested timing:** ~60 minutes including peer feedback.
>
> **Likely misconceptions:** Assuming correlation equals causation; misreading anomalies as absolute temperatures.
>
> **Fast finisher extension:** Invite students to add emissions per capita or mitigation milestones as context.

In [None]:
from __future__ import annotations

from pathlib import Path
from typing import Mapping, Sequence

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

try:
    import plotly.express as px  # noqa: F401 - imported for student use
except ModuleNotFoundError:  # pragma: no cover - Plotly installed in Colab
    px = None

# Render floats with two decimals in every DataFrame/Series repr.
pd.options.display.float_format = "{:.2f}".format

# Apply the seaborn theme first; the Matplotlib settings below are layered on
# top of it, one rc group at a time.
sns.set_theme(style="whitegrid", context="talk")
plt.rc("axes", titlesize=18, titleweight="bold", labelsize=13, grid=True)
plt.rc("grid", alpha=0.25)
plt.rc("figure", dpi=120)
plt.rc("axes.spines", top=False, right=False)

# Keys that every story dict must provide; story_fields_are_complete() checks
# a story against this tuple.
STORY_KEYS = (
    # headline
    "title", "subtitle",
    # argument
    "claim", "evidence", "visual", "takeaway",
    # provenance
    "source", "units",
    # figure extras
    "annotation", "alt_text",
)


def load_csv(
    path: Path | str, *, description: str = "", **read_kwargs
) -> pd.DataFrame:
    """Read a CSV file into a DataFrame and print a one-line confirmation.

    Args:
        path: Location of the CSV file, given as a ``Path`` or a plain string.
        description: Human-readable label used in the confirmation message.
            When empty, the final component of ``path`` is used instead.
        **read_kwargs: Forwarded unchanged to ``pandas.read_csv``.

    Returns:
        The loaded DataFrame.
    """
    df = pd.read_csv(path, **read_kwargs)
    # Wrap in Path so a plain string path also yields a file name; ``str`` has
    # no ``.name`` attribute. Skipped entirely when a description is supplied.
    label = description or Path(path).name
    n_rows, n_cols = df.shape
    print(f"✅ Loaded {label} with shape {n_rows} rows × {n_cols} columns.")
    return df


def validate_columns(
    df: pd.DataFrame, required: Sequence[str], *, df_name: str = "DataFrame"
) -> None:
    """Check that ``df`` contains every column listed in ``required``.

    Args:
        df: DataFrame to inspect.
        required: Column labels that must be present. A single string is
            treated as one column name rather than a sequence of characters.
        df_name: Label for ``df`` used in the printed or raised message.

    Raises:
        ValueError: If one or more required columns are absent from ``df``.
    """
    # Normalise once: a bare string would otherwise be iterated character by
    # character, and a one-shot iterable would be exhausted before the message.
    required = [required] if isinstance(required, str) else list(required)
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(f"{df_name} is missing columns: {missing}")
    # str() each label so non-string column names (e.g. ints) do not break join().
    print(
        f"✅ {df_name} includes required columns: {', '.join(map(str, required))}"
    )


def expect_rows_between(
    df: pd.DataFrame,
    lower: int,
    upper: int,
    *,
    df_name: str = "DataFrame",
) -> None:
    """Confirm that the row count of ``df`` lies in ``[lower, upper]``.

    Args:
        df: DataFrame whose length is checked.
        lower: Smallest acceptable number of rows (inclusive).
        upper: Largest acceptable number of rows (inclusive).
        df_name: Label for ``df`` used in the printed or raised message.

    Raises:
        ValueError: If the row count falls outside the inclusive range.
    """
    n_rows = len(df)
    within_bounds = lower <= n_rows <= upper
    if within_bounds:
        print(f"✅ {df_name} row count {n_rows} within [{lower}, {upper}].")
        return
    raise ValueError(
        f"{df_name} has {n_rows} rows; expected between {lower} and {upper}."
    )


def quick_null_check(df: pd.DataFrame, *, df_name: str = "DataFrame") -> pd.Series:
    """Print and return the number of missing values in each column of ``df``.

    Args:
        df: DataFrame to inspect.
        df_name: Label for ``df`` used in the printed report.

    Returns:
        A Series indexed by column name holding the per-column missing count.
    """
    missing_counts = df.isna().sum()
    report = f"{df_name} missing values per column:\n{missing_counts}"
    print(report)
    return missing_counts


def quick_preview(
    df: pd.DataFrame, *, n: int = 5, df_name: str = "DataFrame"
) -> pd.DataFrame:
    """Announce a preview and return the first ``n`` rows of ``df``.

    Args:
        df: DataFrame to preview.
        n: Number of leading rows to return.
        df_name: Label for ``df`` used in the printed header.

    Returns:
        The result of ``df.head(n)``.
    """
    header = f"🔍 Previewing {df_name} (first {n} rows):"
    print(header)
    leading_rows = df.head(n)
    return leading_rows


def numeric_sanity_check(
    series: pd.Series,
    *,
    minimum: float | None = None,
    maximum: float | None = None,
    name: str = "Series",
) -> None:
    """Verify that the extremes of ``series`` respect optional bounds.

    Args:
        series: Numeric values to check.
        minimum: Lowest acceptable value, or ``None`` to skip the lower check.
        maximum: Highest acceptable value, or ``None`` to skip the upper check.
        name: Label for ``series`` used in the printed or raised message.

    Raises:
        ValueError: If the smallest value is below ``minimum`` or the largest
            value is above ``maximum``.
    """
    # Series.min()/max() skip NaN by default, so missing values never trip
    # these checks; use quick_null_check to surface them.
    if minimum is not None and series.min() < minimum:
        raise ValueError(
            f"{name} has values below the expected minimum of {minimum}."
        )
    if maximum is not None and series.max() > maximum:
        raise ValueError(
            f"{name} has values above the expected maximum of {maximum}."
        )

    # Build the bound description from the limits actually supplied so the
    # sentence stays grammatical when only one of them is given.
    bounds = []
    if minimum is not None:
        bounds.append(f"≥ {minimum}")
    if maximum is not None:
        bounds.append(f"≤ {maximum}")
    suffix = " " + " and ".join(bounds) if bounds else ""
    print(f"✅ {name} within expected range{suffix}.")


def story_fields_are_complete(story: Mapping[str, str]) -> None:
    """Ensure every key in ``STORY_KEYS`` has a non-blank value in ``story``.

    A field counts as blank when it is absent, ``None``, or only whitespace
    once converted to a string.

    Args:
        story: Mapping holding the storytelling scaffold fields.

    Raises:
        ValueError: If any required field is blank; the message lists them in
            ``STORY_KEYS`` order.
    """

    def _is_blank(value: object) -> bool:
        # None must be tested explicitly: str(None) is the non-empty "None",
        # which would otherwise be accepted as a completed field.
        return value is None or not str(value).strip()

    missing = [key for key in STORY_KEYS if _is_blank(story.get(key))]
    if missing:
        raise ValueError(
            "Please complete the storytelling scaffold before plotting: "
            + ", ".join(missing)
        )
    print(
        "✅ Story scaffold complete (title, subtitle, claim, evidence, visual,"
        " takeaway, source, units, annotation, alt text)."
    )


def print_story_scaffold(story: Mapping[str, str]) -> None:
    """Validate ``story`` and print its claim, evidence, visual, takeaway, source.

    Args:
        story: Mapping holding the storytelling scaffold fields.

    Raises:
        ValueError: Propagated from ``story_fields_are_complete`` when a
            required field is blank.
    """
    story_fields_are_complete(story)

    labelled_fields = (
        ("Claim", "claim"),
        ("Evidence", "evidence"),
        ("Visual focus", "visual"),
        ("Takeaway", "takeaway"),
    )
    print("\n📖 Story Scaffold")
    for label, key in labelled_fields:
        print(f"{label}: {story[key]}")
    # Source and units share one line, with the units in parentheses.
    print(f"Source: {story['source']} ({story['units']})")


def apply_matplotlib_story(ax: plt.Axes, story: Mapping[str, str]) -> None:
    """Add the story title to ``ax`` and a claim/evidence footer to its figure.

    Args:
        ax: Axes that receives the left-aligned two-line title.
        story: Mapping holding the storytelling scaffold fields.

    Raises:
        ValueError: Propagated from ``story_fields_are_complete`` when a
            required field is blank.
    """
    story_fields_are_complete(story)

    heading = f"{story['title']}\n{story['subtitle']}"
    ax.set_title(heading, loc="left", pad=18)

    argument_line = (
        f"Claim: {story['claim']} | Evidence: {story['evidence']}"
        f" | Takeaway: {story['takeaway']}"
    )
    credit_line = f"Source: {story['source']} • Units: {story['units']}"
    # Footer is positioned via figure.text at y = -0.08, i.e. below the
    # figure's bottom edge.
    ax.figure.text(
        0.01,
        -0.08,
        f"{argument_line}\n{credit_line}",
        ha="left",
        fontsize=10,
    )


def annotate_callout(
    ax: plt.Axes,
    *,
    xy: tuple[float, float],
    xytext: tuple[float, float],
    text: str,
) -> None:
    """Draw a boxed text label on ``ax`` with an arrow pointing at ``xy``.

    Args:
        ax: Axes to annotate.
        xy: Point the arrow points to.
        xytext: Position of the label text.
        text: Label content.
    """
    arrow_style = {"arrowstyle": "->", "color": "black", "lw": 1}
    box_style = {
        "boxstyle": "round,pad=0.3",
        "fc": "white",
        "ec": "black",
        "alpha": 0.8,
    }
    ax.annotate(text, xy=xy, xytext=xytext, arrowprops=arrow_style, bbox=box_style)


def record_alt_text(text: str) -> None:
    """Print ``text`` as the alt text prepared for the figure.

    Args:
        text: Alt-text description to echo.
    """
    confirmation = f"📝 Alt text ready: {text}"
    print(confirmation)


def accessibility_checklist(
    *, palette: str, has_alt_text: bool, contrast_passed: bool = True
) -> None:
    """Print a three-item accessibility report for the current figure.

    Args:
        palette: Description of the colour palette in use.
        has_alt_text: Whether alt text has been written for the figure.
        contrast_passed: Whether the colour contrast was judged acceptable.
    """
    alt_status = "yes" if has_alt_text else "add alt text before sharing"
    contrast_status = "yes" if contrast_passed else "adjust colors"
    print(
        "♿ Accessibility checklist:",
        f" • Palette: {palette}",
        f" • Alt text provided: {alt_status}",
        f" • Contrast OK: {contrast_status}",
        sep="\n",
    )


def save_figure(fig: plt.Figure, filename: str) -> Path:
    """Save ``fig`` into a ``plots`` folder under the current working directory.

    The folder is created if it does not exist yet.

    Args:
        fig: Matplotlib figure to write.
        filename: Name of the output file inside the ``plots`` folder.

    Returns:
        Path of the file that was written.
    """
    target_dir = Path.cwd().joinpath("plots")
    target_dir.mkdir(exist_ok=True, parents=True)
    destination = target_dir.joinpath(filename)
    # bbox_inches="tight" sizes the saved image to the drawn content.
    fig.savefig(destination, bbox_inches="tight", dpi=300)
    print(f"💾 Saved figure to {destination}")
    return destination


def save_plotly_figure(fig, filename: str) -> Path:
    """Save ``fig`` as HTML and, on a best-effort basis, as a static image.

    Both files go into a ``plots`` folder under the current working directory,
    which is created if needed. The HTML file reuses ``filename`` with its
    final extension replaced by ``.html``.

    Args:
        fig: Object providing ``write_html`` and ``write_image`` methods
            (presumably a Plotly figure; the block only relies on those two
            methods).
        filename: Name of the static image, e.g. ``"chart.png"``.

    Returns:
        Path of the HTML file that was written.
    """
    plots_dir = Path.cwd() / "plots"
    plots_dir.mkdir(parents=True, exist_ok=True)

    # with_suffix swaps only the final extension, whatever it is. A plain
    # str.replace(".png", ".html") left non-PNG names unchanged, so the HTML
    # and the image would have been written to the very same path.
    html_path = plots_dir / Path(filename).with_suffix(".html")
    fig.write_html(html_path)
    print(f"💾 Saved interactive figure to {html_path}")

    static_path = plots_dir / filename
    if static_path == html_path:
        # filename already ends in .html: exporting an image to that path
        # would clobber the interactive file just written.
        print("⚠️ Static export skipped: filename has no image extension")
        return html_path

    try:
        fig.write_image(str(static_path))
        print(f"💾 Saved static image to {static_path}")
    except Exception as exc:  # pragma: no cover - depends on kaleido
        # Deliberately broad: the static export is optional, so any failure
        # is reported and the HTML result is still returned.
        print(f"⚠️ Static export skipped: {exc}")
    return html_path

In [None]:
from pathlib import Path

# Input and output folders, both resolved against the current working directory.
DATA_DIR, PLOTS_DIR = (Path.cwd() / folder for folder in ("data", "plots"))

# Only the plots folder is created here; the data folder is not.
PLOTS_DIR.mkdir(exist_ok=True, parents=True)

print(
    f"Data directory: {DATA_DIR}",
    f"Plots directory: {PLOTS_DIR}",
    sep="\n",
)

## Loop 1 · Load & Inspect Both Datasets
Keep schemas transparent before merging anything.

In [None]:
# Locations of the two raw CSV files inside DATA_DIR.
co2_path = DATA_DIR.joinpath("global_co2.csv")
temp_path = DATA_DIR.joinpath("GLB.Ts+dSST.csv")

df_co2 = load_csv(co2_path, description="Global fossil CO₂ emissions")

# skiprows=1 discards the first line of the file before the header is parsed,
# and usecols keeps only the "Year" and "J-D" columns.
# NOTE(review): "J-D" is presumably the annual (Jan–Dec) mean — confirm against
# the NASA file.
df_temp_raw = load_csv(
    temp_path,
    skiprows=1,
    usecols=["Year", "J-D"],
    description="NASA GISTEMP anomalies",
)

# Fail fast if either frame lacks the columns the later cells rely on.
validate_columns(df_co2, required=["Year", "CO2"], df_name="CO₂ data")
validate_columns(df_temp_raw, required=["Year", "J-D"], df_name="temperature data")

In [None]:
# A notebook cell renders only its final expression, so the first preview is
# printed explicitly; otherwise its table would be silently discarded.
print(quick_preview(df_co2, n=5, df_name="CO₂ data"))
quick_preview(df_temp_raw, n=5, df_name="temperature data")

## Loop 2 · Clean, Align, and Diagnose
Convert to numeric, align year index, and make sure both series overlap.

In [None]:
# NOTE: this cell rebinds df_co2 with "Year" moved into the index, so running
# it a second time raises KeyError on df_co2["Year"]; re-run the load cell first.

# Coerce the key columns of both frames; unparseable entries become NaN.
df_co2["Year"] = pd.to_numeric(df_co2["Year"], errors="coerce")
df_co2["CO2"] = pd.to_numeric(df_co2["CO2"], errors="coerce")

df_temp = df_temp_raw.rename(columns={"J-D": "TempAnomaly"})
df_temp["Year"] = pd.to_numeric(df_temp["Year"], errors="coerce")
df_temp["TempAnomaly"] = pd.to_numeric(df_temp["TempAnomaly"], errors="coerce")

# Drop rows only when one of the columns used below is missing, then store the
# year as int64 in both frames so the join keys share a dtype and the years
# format without a trailing ".0".
df_co2 = (
    df_co2.dropna(subset=["Year", "CO2"])
    .astype({"Year": "int64"})
    .set_index("Year")
)
df_temp = (
    df_temp.dropna(subset=["Year", "TempAnomaly"])
    .astype({"Year": "int64"})
    .set_index("Year")
)

# Inner join keeps only the years present in both sources.
df_merged = df_co2.join(df_temp, how="inner")
df_merged = df_merged[df_merged.index >= 1880]

expect_rows_between(df_merged, 120, 200, df_name="merged climate data")
numeric_sanity_check(df_merged["CO2"], minimum=0, maximum=40, name="CO₂ (Gt)")
numeric_sanity_check(
    df_merged["TempAnomaly"],
    minimum=-1.0,
    maximum=1.5,
    name="Temperature anomaly (°C)",
)
quick_null_check(df_merged, df_name="merged climate data")

In [None]:
# A notebook cell renders only its final expression, so the first preview is
# printed explicitly. quick_preview already takes the first n rows, so no extra
# .head() is needed on its input.
print(quick_preview(df_merged, n=5, df_name="merged climate data"))
quick_preview(df_merged.tail(), n=5, df_name="recent climate data")

## Loop 3 · Engineer Evidence Metrics
Quantify change for your claim-evidence statement.

In [None]:
# Reference year for the before/after comparison in the evidence statement.
baseline_year = 1960
# Cast both bounds to int: a float-typed year index would otherwise put
# "1880.0" into the subtitle, while latest_year was already cast.
start_year = int(df_merged.index.min())
latest_year = int(df_merged.index.max())

def safe_value(year: int, series: pd.Series) -> float:
    """Return the value of ``series`` at ``year``, or at the nearest usable year.

    When ``year`` is not an index label, the first label after it is used; when
    ``year`` lies beyond the last label, the last entry is used instead.

    Args:
        year: Index label to look up.
        series: Values indexed by year. The fallback relies on
            ``Index.searchsorted``, so the index is assumed to be sorted in
            ascending order.

    Returns:
        The selected value as a float.

    Raises:
        ValueError: If ``series`` is empty.
    """
    if len(series) == 0:
        raise ValueError("Cannot look up a year in an empty series.")
    if year in series.index:
        return float(series.loc[year])
    # searchsorted returns len(index) when year is past the last label, which
    # would index out of range; clamp to the final position instead.
    position = min(int(series.index.searchsorted(year)), len(series) - 1)
    return float(series.iloc[position])

# Baseline values come from baseline_year (or the fallback year chosen by
# safe_value); "latest" values are the final rows of the merged frame.
co2_baseline = safe_value(baseline_year, df_merged["CO2"])
co2_latest = float(df_merged["CO2"].iloc[-1])
temp_baseline = safe_value(baseline_year, df_merged["TempAnomaly"])
temp_latest = float(df_merged["TempAnomaly"].iloc[-1])
corr = df_merged["CO2"].corr(df_merged["TempAnomaly"])
# Growth factor is computed from the data so the evidence sentence stays true
# if baseline_year or the dataset changes.
co2_growth = co2_latest / co2_baseline

print(
    f"Since {baseline_year}, CO₂ rose from {co2_baseline:.1f} Gt to {co2_latest:.1f} Gt."
    f" Temperature anomaly increased from {temp_baseline:.2f}°C to {temp_latest:.2f}°C."
)
print(f"Pearson correlation between series: {corr:.2f}")

# Every key listed in STORY_KEYS must be filled in before plotting.
story = {
    "title": "CO₂ Emissions and Global Temperatures Rise in Lockstep",
    "subtitle": f"Global totals, {start_year}–{latest_year}",
    "claim": "Burning fossil fuels drives a steep climb in atmospheric CO₂ and global temperature anomalies.",
    "evidence": (
        f"CO₂ emissions grew {co2_growth:.1f}× after {baseline_year},"
        f" and temperature anomalies climbed ~{temp_latest - temp_baseline:.1f}°C."
        f" Correlation = {corr:.2f}."
    ),
    "visual": "Two-panel line chart sharing a timeline (top: CO₂, bottom: temperature anomaly).",
    "takeaway": "Cutting emissions is essential to stabilise temperatures within agreed climate targets.",
    "source": "Global Carbon Project & NASA GISTEMP (2024 release)",
    "units": "CO₂ in gigatonnes; temperature anomaly in °C relative to 1951–1980",
    "annotation": f"{latest_year}: {co2_latest:.1f} Gt CO₂ and {temp_latest:.2f}°C anomaly",
    "alt_text": (
        "Two aligned line charts from the late 1800s to present showing CO₂ emissions climbing from under 5 Gt"
        f" to over {co2_latest:.0f} Gt while temperature anomalies rise from near 0°C to about {temp_latest:.1f}°C."
    ),
}

print_story_scaffold(story)

## Loop 4 · Compose the Capstone Figure
Use a shared timeline, consistent storytelling scaffold, and explicit accessibility log.

In [None]:
# Two stacked panels on a shared x-axis; the top (CO₂) panel is the taller one.
fig, axes = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=True,
    figsize=(12, 9),
    gridspec_kw=dict(height_ratios=[2, 1.2], hspace=0.05),
)
ax_co2 = axes[0]
ax_temp = axes[1]

# --- Top panel: CO₂ emissions, with the span from 1950 onward shaded ---
ax_co2.plot(df_merged.index, df_merged["CO2"], linewidth=3, color="#6a4c93")
ax_co2.set_ylabel("CO₂ emissions (Gt)")
ax_co2.axvspan(
    1950,
    latest_year,
    label="Great Acceleration",
    alpha=0.15,
    color="#f6bd60",
)
ax_co2.legend(loc="upper left")

# --- Bottom panel: temperature anomaly with a dashed reference line at zero ---
ax_temp.plot(
    df_merged.index, df_merged["TempAnomaly"], linewidth=2.5, color="#ef476f"
)
ax_temp.axhline(0, linewidth=1, linestyle="--", color="black")
ax_temp.set_ylabel("Temp anomaly (°C)")
ax_temp.set_xlabel("Year")

# Title goes on the top panel; the claim/evidence footer is attached to the figure.
apply_matplotlib_story(ax_co2, story)

# Call out the latest data point in each panel.
annotate_callout(
    ax_co2,
    text=story["annotation"],
    xy=(latest_year, co2_latest),
    xytext=(1985, co2_latest - 10),
)
annotate_callout(
    ax_temp,
    text=f"{latest_year}: {temp_latest:.2f}°C anomaly",
    xy=(latest_year, temp_latest),
    xytext=(1940, temp_latest + 0.2),
)

# Accessibility log for the figure.
record_alt_text(story["alt_text"])
accessibility_checklist(
    has_alt_text=True,
    palette="Purple/rose contrast with shared timeline",
)

fig.align_ylabels(axes)

### 🧾 Reflection Prompt
- Where might the causal chain break? What other evidence would you gather?
- How could you adapt this chart for a policymaker vs. a public audience?

In [None]:
# Write the finished figure into the plots folder; the returned path is the
# cell's displayed output.
save_figure(fig, filename="day05_solution_plot.png")