## 🔗 Open This Notebook in Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DavidLangworthy/ds4s/blob/master/days/day02/notebook/day02_starter.ipynb)

# ⚡ Day 2 – Fossil Fuels vs. Renewables
### Comparing the changing global energy mix

We'll track how the world's energy sources have shifted over the past two decades and build a grouped bar chart that shows fossil fuels giving way to renewables.

#### Data card: Our World in Data – Energy mix
* **Sources:** [Our World in Data](https://ourworldindata.org/energy) (BP Statistical Review, Ember, IEA).
* **Temporal coverage:** 1965–2023 for most fuels.
* **Units:** Share of global primary energy (%).
* **Refresh cadence:** Updated annually; downloaded September 2024.
* **Caveats:** Percentages may not sum to 100 due to rounding; renewables include hydro, wind, solar, and other sources.

In [None]:
# Core imports and shared helpers
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display

import utils

# Apply the course's shared styling helper once, before any figure is created.
# NOTE(review): presumably sets matplotlib/seaborn defaults — confirm in utils.
utils.baseline_style()


## Step 1: Load the four energy share datasets
Each file contains `Entity`, `Code`, and `Year` columns plus one percentage column. We run every file through the same loader and diagnostics so each one gets identical checks.

In [None]:
# Example: run the shared diagnostics helper on a tiny hand-made table so its
# report looks familiar before the real files are loaded.
sample = pd.DataFrame(
    {
        'Year': [2000, 2001, 2002],
        'Share': [10, 11, 12],
    }
)
utils.diagnostics(
    sample,
    'Sample share table',
    expected_columns=['Year', 'Share'],
    expected_row_range=(3, 3),
)


In [None]:
# Load the four share files in a fixed order: total renewables, hydro, wind, solar.
renewables, hydro, wind, solar = (
    utils.load_data(filename)
    for filename in (
        '01 renewable-share-energy.csv',
        '06 hydro-share-energy.csv',
        '10 wind-share-energy.csv',
        '14 solar-share-energy.csv',
    )
)

# Run the same raw-data diagnostics on every table before any cleaning.
raw_tables = {
    'Renewables': renewables,
    'Hydro': hydro,
    'Wind': wind,
    'Solar': solar,
}
for name, df in raw_tables.items():
    utils.diagnostics(
        df,
        f"{name} share (raw)",
        expected_columns=['Entity', 'Code', 'Year'],
        expected_row_range=(2000, 3000),
    )


## Step 2: Isolate the global totals and clean the shares
Filter to the `World` rows, convert the `Year` and share columns to numeric values, and drop any rows where either value is missing.

In [None]:
def tidy_energy_share(
    df: pd.DataFrame,
    column_name: str,
    label: str,
    *,
    entity: str = 'World',
    min_rows: int = 40,
    max_rows: int = 70,
) -> pd.DataFrame:
    """Return a tidy ``Year`` / ``share`` / ``category`` table for one entity.

    Args:
        df: Raw table with ``Entity`` and ``Year`` columns plus the share
            column named by ``column_name``.
        column_name: Header of the percentage column to extract.
        label: Value written into the ``category`` column of every row.
        entity: Value of ``Entity`` to keep (defaults to the global total).
        min_rows: Lower bound passed to ``utils.expect_rows_between``.
        max_rows: Upper bound passed to ``utils.expect_rows_between``.

    Returns:
        A new frame with ``Year`` as nullable ``Int64``, ``share`` as numeric,
        and ``category`` set to ``label``. Rows whose year or share could not
        be parsed are dropped.

    Raises:
        KeyError: If ``Entity``, ``Year``, or ``column_name`` is not a column
            of ``df``. The message lists the columns that are available.
    """
    # Fail early with an actionable message: a misspelt share header would
    # otherwise surface as a bare pandas "not in index" error.
    missing = [col for col in ('Entity', 'Year', column_name) if col not in df.columns]
    if missing:
        raise KeyError(
            f"Missing column(s) {missing} for {label!r}; "
            f"available columns: {list(df.columns)}"
        )

    cleaned = (
        df.loc[df['Entity'] == entity, ['Year', column_name]]
        .rename(columns={column_name: 'share'})
        .assign(
            # errors='coerce' turns unparseable cells into NaN so that the
            # dropna below removes them instead of raising.
            Year=lambda frame: pd.to_numeric(frame['Year'], errors='coerce').astype('Int64'),
            share=lambda frame: pd.to_numeric(frame['share'], errors='coerce'),
            category=label,
        )
        .dropna(subset=['Year', 'share'])
    )
    # Shared row-count check; what happens on failure is defined in utils.
    utils.expect_rows_between(cleaned, min_rows, max_rows)
    return cleaned

# All four series must use the same unit (% of equivalent primary energy):
# they are later subtracted from one another and plotted on a single
# primary-energy axis, so a "% electricity" series would mix denominators.
# NOTE(review): the '*-share-energy.csv' file names suggest these headers;
# confirm them against the CSVs before merging.
world_renewables = tidy_energy_share(renewables, 'Renewables (% equivalent primary energy)', 'Renewables')
world_hydro = tidy_energy_share(hydro, 'Hydro (% equivalent primary energy)', 'Hydro')
world_wind = tidy_energy_share(wind, 'Wind (% equivalent primary energy)', 'Wind')
world_solar = tidy_energy_share(solar, 'Solar (% equivalent primary energy)', 'Solar')


## Step 3: Combine the streams into one tidy table
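Merge the categories on `Year`, calculate the implied fossil-fuel share (100 minus the renewable share, so it also absorbs nuclear), and derive an `Other renewables` bucket (renewables minus hydro, wind, and solar) so the parts add up.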

In [None]:
# Start from the total renewable share, then left-join each component on Year
# so every year present in the renewables table is kept.
energy_mix = world_renewables.rename(columns={'share': 'renewable_share'})[['Year', 'renewable_share']]
for component, share_column in [
    (world_hydro, 'hydro_share'),
    (world_wind, 'wind_share'),
    (world_solar, 'solar_share'),
]:
    energy_mix = energy_mix.merge(
        component.rename(columns={'share': share_column})[['Year', share_column]],
        on='Year',
        how='left',
    )

# Missing component values count as zero in the sum only; the component
# columns themselves keep their gaps.
named_renewables = (
    energy_mix['hydro_share'].fillna(0)
    + energy_mix['wind_share'].fillna(0)
    + energy_mix['solar_share'].fillna(0)
)

# The fossil share is simply the complement of the renewable share.
# NOTE(review): that complement also contains nuclear — confirm the label.
energy_mix['fossil_share'] = 100 - energy_mix['renewable_share']
# Clip at zero so the residual bucket is never negative.
energy_mix['other_renewables'] = (energy_mix['renewable_share'] - named_renewables).clip(lower=0)

utils.diagnostics(
    energy_mix,
    'Energy mix (combined)',
    expected_columns=['Year', 'renewable_share', 'fossil_share'],
    expected_row_range=(40, 70),
)


## Step 4: Focus on milestone years and reshape for plotting
Three checkpoints (2000, 2010, 2022) provide a clean narrative arc and make the grouped bars easy to compare.

In [None]:
focus_years = [2000, 2010, 2022]

# Wide column name -> legend label, in the order the columns are melted.
category_labels = {
    'fossil_share': 'Fossil fuels',
    'hydro_share': 'Hydro',
    'wind_share': 'Wind',
    'solar_share': 'Solar',
    'other_renewables': 'Other renewables',
}

# Keep only the milestone years and store Year as a plain int.
milestone_rows = energy_mix.loc[energy_mix['Year'].isin(focus_years)].copy()
milestone_rows['Year'] = milestone_rows['Year'].astype(int)

# Reshape to long form: one row per (Year, category) pair.
focus_mix = pd.melt(
    milestone_rows,
    id_vars='Year',
    value_vars=list(category_labels),
    var_name='category',
    value_name='share',
)
focus_mix['category'] = focus_mix['category'].map(category_labels)

utils.diagnostics(
    focus_mix,
    'Energy mix (plot ready)',
    expected_columns=['Year', 'category', 'share'],
    expected_row_range=(15, 15),
)


## Step 5: Build the grouped bar chart with story metadata
Label every chart element (title, subtitle, annotation, source, units) before rendering and finish with a clean legend.

In [None]:
# Story metadata: every text element of the chart is declared up front and
# passed through the shared validator before anything is drawn.
TITLE = 'Renewables have doubled their share of global energy since 2000'
SUBTITLE = 'Global primary energy mix, selected years'
ANNOTATION = 'Wind and solar remain small but have grown more than fivefold since 2000.'
SOURCE = 'Our World in Data, BP Statistical Review, Ember, IEA'
UNITS = 'Share of global primary energy (%)'

metadata = {
    'title': TITLE,
    'subtitle': SUBTITLE,
    'annotation': ANNOTATION,
    'source': SOURCE,
    'units': UNITS,
}
utils.validate_story_elements(metadata)

# Fixed hue order and colours so each category looks the same in every group.
order = ['Fossil fuels', 'Hydro', 'Wind', 'Solar', 'Other renewables']
palette = {
    'Fossil fuels': '#8c564b',
    'Hydro': '#1f77b4',
    'Wind': '#2ca02c',
    'Solar': '#ff7f0e',
    'Other renewables': '#9467bd',
}

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=focus_mix,
    x='Year',
    y='share',
    hue='category',
    order=sorted(focus_years),
    hue_order=order,
    palette=palette,
    ax=ax,
)
utils.apply_story_template(ax, title=TITLE, subtitle=SUBTITLE, source=SOURCE, units=UNITS)
ax.set_xlabel('Year')
ax.legend(title='', loc='upper right', frameon=False)

highlight_year = 2022
highlight_category = 'Solar'
highlight_share = focus_mix.loc[
    (focus_mix['Year'] == highlight_year) & (focus_mix['category'] == highlight_category),
    'share',
].iloc[0]

# Locate the centre of the highlighted bar instead of hard-coding an x value.
# seaborn places each year at an integer slot (0, 1, 2, ...) and spreads the
# hue bars evenly across a group that is 0.8 wide by default. The previous
# anchor (x=2.6) lay to the right of every bar and outside the default
# categorical x-limits, where matplotlib does not draw data-coordinate
# annotations.
bar_group_width = 0.8
year_slot = sorted(focus_years).index(highlight_year)
hue_slot = order.index(highlight_category)
highlight_x = year_slot + (hue_slot - (len(order) - 1) / 2) * bar_group_width / len(order)

ax.annotate(
    ANNOTATION,
    xy=(highlight_x, highlight_share + 1.5),
    xycoords='data',
    # The highlighted bar sits in the right-most group, so the text box is
    # right-aligned and extends leftward, keeping it inside the axes.
    xytext=(-15, 35),
    textcoords='offset points',
    arrowprops=dict(arrowstyle='->', color='#333333'),
    fontsize=11,
    ha='right',
    va='bottom',
    bbox=dict(boxstyle='round,pad=0.3', fc='white', ec='#555555', alpha=0.85),
)

plt.tight_layout()
utils.save_last_fig('day02_solution_plot.png')


In [None]:
# Close the story: build the claim / evidence / takeaway summary with the
# shared helper, then render it as Markdown in the notebook output.
story_points = {
    'claim': 'Fossil fuels still dominate but their share is shrinking.',
    'evidence': 'Global renewables grew from about 7% in 2000 to nearly 15% in 2022, led by wind and solar.',
    'takeaway': 'Scaling renewables requires accelerating wind and solar without neglecting other clean sources.',
}
story_summary = utils.summarize_claim(**story_points)
display(Markdown(story_summary))
