# Exploratory Data Analysis
Analysis of UK National Grid demand data from NESO (National Energy System Operator).

In [20]:
# Imports
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import pandas as pd
import hashlib
from datetime import datetime, timedelta
from supabase import create_client, Client
import os
from dotenv import load_dotenv
import numpy as np
import time
from tqdm import tqdm

## Demand Data

| Column name                 | Description                                                                                              |
| --------------------------- | -------------------------------------------------------------------------------------------------------- |
| `SETTLEMENT_DATE`           | Calendar date of the electricity settlement period (UTC).                                                |
| `SETTLEMENT_PERIOD`         | Half-hour settlement period within the day (1–48).                                                       |
| `FORECAST_ACTUAL_INDICATOR` | Indicates whether the data point is **Actual (A)** or **Forecast (F)**.                                  |
| `ND`                        | **National Demand** – estimated total GB electricity demand including embedded generation.               |
| `TSD`                       | **Transmission System Demand** – demand seen by the transmission network (excludes embedded generation). |
| `ENGLAND_WALES_DEMAND`      | Electricity demand specific to England and Wales.                                                        |
| `EMBEDDED_WIND_GENERATION`  | Electricity generated by wind connected to distribution networks (not transmission).                     |
| `EMBEDDED_WIND_CAPACITY`    | Installed capacity of embedded wind generation.                                                          |
| `EMBEDDED_SOLAR_GENERATION` | Electricity generated by embedded solar PV.                                                              |
| `EMBEDDED_SOLAR_CAPACITY`   | Installed capacity of embedded solar PV.                                                                 |
| `NON_BM_STOR`               | Non-Balancing Mechanism storage output (e.g. small-scale batteries).                                     |
| `PUMP_STORAGE_PUMPING`      | Electricity demand used to pump water into pumped-storage hydro (negative net generation).               |
| `SCOTTISH_TRANSFER`         | Net electricity transfer between Scotland and England/Wales.                                             |
| `IFA_FLOW`                  | Power flow on the **IFA** interconnector (GB–France).                                                    |
| `IFA2_FLOW`                 | Power flow on the **IFA2** interconnector (GB–France).                                                   |
| `BRITNED_FLOW`              | Power flow on the **BritNed** interconnector (GB–Netherlands).                                           |
| `MOYLE_FLOW`                | Power flow on the **Moyle** interconnector (GB–Northern Ireland).                                        |
| `EAST_WEST_FLOW`            | Power flow on the **East-West** interconnector (GB–Ireland).                                             |
| `NEMO_FLOW`                 | Power flow on the **NEMO** interconnector (GB–Belgium).                                                  |
| `NSL_FLOW`                  | Power flow on the **North Sea Link** interconnector (GB–Norway).                                         |
| `ELECLINK_FLOW`             | Power flow on the **ElecLink** interconnector (GB–France).                                               |
| `VIKING_FLOW`               | Power flow on the **Viking Link** interconnector (GB–Denmark).                                           |
| `GREENLINK_FLOW`            | Power flow on the **Greenlink** interconnector (GB–Ireland).                                             |       


### Download Historic Data
Download CSV files for each year (2020-2025) from NESO data portal.

In [None]:
load_dotenv()
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

Connected to Supabase.


In [53]:
# Configuration
BASE_PAGE = "https://www.neso.energy/data-portal/historic-demand-data/historic_demand_data_{}"
YEARS = range(2020, 2026)
OUT_DIR = Path("neso_historic_demand")
OUT_DIR.mkdir(exist_ok=True)

In [54]:
# HTTP session with custom User-Agent
session = requests.Session()
session.headers.update({"User-Agent": "academic-dashboard-project"})

In [None]:
def sha256(path: Path) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()

In [None]:
date_formats = [
    '%Y-%m-%d',
    '%d-%b-%y',
    '%d-%b-%Y',
    '%d-%B-%y',
    '%d-%B-%Y',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S',
]

In [None]:
def standardize_date(date_val):
    if pd.isna(date_val):
        return pd.NaT
    date_str = str(date_val).strip().upper()
    for fmt in date_formats:
        try:
            return pd.to_datetime(date_str, format=fmt.upper() if '%b' in fmt or '%B' in fmt else fmt)
        except (ValueError, TypeError):
            continue
    try:
        return pd.to_datetime(date_str, dayfirst=True)
    except:
        return pd.NaT

In [None]:
all_dfs = []
seen_hashes = set()

for year in YEARS:
    page_url = BASE_PAGE.format(year)
    r = session.get(page_url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    csv_url = None
    for a in soup.find_all("a", href=True):
        href = a["href"].lower()
        if "download" in href and href.endswith(".csv"):
            csv_url = a["href"]
            break

    if not csv_url:
        continue

    if csv_url.startswith("/"):
        csv_url = "https://www.neso.energy" + csv_url

    csv_path = OUT_DIR / f"historic_demand_{year}.csv"

    with session.get(csv_url, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        with open(csv_path, "wb") as f:
            for chunk in resp.iter_content(8192):
                f.write(chunk)

    file_hash = sha256(csv_path)
    if file_hash in seen_hashes:
        continue
    seen_hashes.add(file_hash)

    df_year = pd.read_csv(csv_path)
    df_year['SETTLEMENT_DATE'] = df_year['SETTLEMENT_DATE'].apply(standardize_date)
    df_year["SOURCE_YEAR"] = year
    all_dfs.append(df_year)
    print(f"{year}: {len(df_year):,} records")

df = pd.concat(all_dfs, ignore_index=True)
print(f"\nTotal: {len(df):,}")

Processing 2020...
  Loaded 17568 records (2020-01-01 to 2020-12-31)
Processing 2021...
  Loaded 17520 records (2021-01-01 to 2021-12-31)
Processing 2022...
  Loaded 17520 records (2022-01-01 to 2022-12-31)
Processing 2023...
  Loaded 17520 records (2023-01-01 to 2023-12-31)
Processing 2024...
  Loaded 17568 records (2024-01-01 to 2024-12-31)
Processing 2025...
  Loaded 16800 records (2025-01-01 to 2025-12-16)

Total records: 104496


In [None]:
df.sample(5)

Columns: ['SETTLEMENT_DATE', 'SETTLEMENT_PERIOD', 'ND', 'TSD', 'ENGLAND_WALES_DEMAND', 'EMBEDDED_WIND_GENERATION', 'EMBEDDED_WIND_CAPACITY', 'EMBEDDED_SOLAR_GENERATION', 'EMBEDDED_SOLAR_CAPACITY', 'NON_BM_STOR', 'PUMP_STORAGE_PUMPING', 'IFA_FLOW', 'IFA2_FLOW', 'BRITNED_FLOW', 'MOYLE_FLOW', 'EAST_WEST_FLOW', 'NEMO_FLOW', 'NSL_FLOW', 'ELECLINK_FLOW', 'VIKING_FLOW', 'GREENLINK_FLOW', 'SOURCE_YEAR', 'SCOTTISH_TRANSFER']
Date range: 2020-01-01 00:00:00 to 2025-12-16 00:00:00


Unnamed: 0,SETTLEMENT_DATE,SETTLEMENT_PERIOD,ND,TSD,ENGLAND_WALES_DEMAND,EMBEDDED_WIND_GENERATION,EMBEDDED_WIND_CAPACITY,EMBEDDED_SOLAR_GENERATION,EMBEDDED_SOLAR_CAPACITY,NON_BM_STOR,...,BRITNED_FLOW,MOYLE_FLOW,EAST_WEST_FLOW,NEMO_FLOW,NSL_FLOW,ELECLINK_FLOW,VIKING_FLOW,GREENLINK_FLOW,SOURCE_YEAR,SCOTTISH_TRANSFER
34286,2021-12-15,15,32405,35654,30075,2568,6527,0,14001,0,...,108,375,369,-587,693,0,0,0,2021,
80737,2024-08-09,4,17125,21825,16386,3448,6563,0,18053,0,...,0,-301,-526,-958,-429,159,0,0,2024,3247.0
90348,2025-02-25,13,26637,30627,25087,2087,6606,0,19726,0,...,-1071,-131,0,-632,1397,850,-1094,-439,2025,4377.0
32542,2021-11-08,47,24344,25771,23057,2649,6527,0,13915,0,...,924,122,-65,584,693,0,0,0,2021,
22657,2021-04-17,4,24955,26196,22744,382,6527,0,13653,0,...,0,-19,0,975,0,0,0,0,2021,


### Data Cleaning
Remove unnecessary columns and create a unified datetime column.

In [None]:
cols_to_drop = ['_id', '_full_text', '_count', 'rank', 'FORECAST_ACTUAL_INDICATOR', 'SOURCE_YEAR']
df_cleaned = df.drop(columns=[c for c in cols_to_drop if c in df.columns], errors='ignore')

Columns after cleaning: ['SETTLEMENT_DATE', 'SETTLEMENT_PERIOD', 'ND', 'TSD', 'ENGLAND_WALES_DEMAND', 'EMBEDDED_WIND_GENERATION', 'EMBEDDED_WIND_CAPACITY', 'EMBEDDED_SOLAR_GENERATION', 'EMBEDDED_SOLAR_CAPACITY', 'NON_BM_STOR', 'PUMP_STORAGE_PUMPING', 'IFA_FLOW', 'IFA2_FLOW', 'BRITNED_FLOW', 'MOYLE_FLOW', 'EAST_WEST_FLOW', 'NEMO_FLOW', 'NSL_FLOW', 'ELECLINK_FLOW', 'VIKING_FLOW', 'GREENLINK_FLOW', 'SCOTTISH_TRANSFER']


In [None]:
df_cleaned['DATETIME'] = df_cleaned.apply(
    lambda row: row['SETTLEMENT_DATE'] + timedelta(minutes=(int(row['SETTLEMENT_PERIOD']) - 1) * 30),
    axis=1
)

In [70]:
# Reorder columns and remove original date columns
cols = ['DATETIME'] + [col for col in df_cleaned.columns if col not in ['DATETIME', 'SETTLEMENT_DATE', 'SETTLEMENT_PERIOD']]
df_cleaned = df_cleaned[cols]

In [None]:
df_cleaned.head()

Final shape: (104496, 21)
Date range: 2020-01-01 00:00:00 to 2025-12-16 23:30:00


Unnamed: 0,DATETIME,ND,TSD,ENGLAND_WALES_DEMAND,EMBEDDED_WIND_GENERATION,EMBEDDED_WIND_CAPACITY,EMBEDDED_SOLAR_GENERATION,EMBEDDED_SOLAR_CAPACITY,NON_BM_STOR,PUMP_STORAGE_PUMPING,...,IFA2_FLOW,BRITNED_FLOW,MOYLE_FLOW,EAST_WEST_FLOW,NEMO_FLOW,NSL_FLOW,ELECLINK_FLOW,VIKING_FLOW,GREENLINK_FLOW,SCOTTISH_TRANSFER
0,2020-01-01 00:00:00,26340,27153,23821,1073,6465,0,13040,0,15,...,0,852,-151,-47,854,0,0,0,0,
1,2020-01-01 00:30:00,26921,27684,24393,1020,6465,0,13040,0,17,...,0,853,-146,0,854,0,0,0,0,
2,2020-01-01 01:00:00,26569,27240,24085,1010,6465,0,13040,0,18,...,0,852,-53,0,854,0,0,0,0,
3,2020-01-01 01:30:00,25754,26435,23350,1043,6465,0,13040,0,15,...,0,852,-66,0,854,0,0,0,0,
4,2020-01-01 02:00:00,25075,25824,22788,1001,6465,0,13040,0,15,...,0,853,-74,-60,854,0,0,0,0,


### Data Quality

Analyse the data quality

In [None]:
print(f"{len(df_cleaned):,} records | {df_cleaned['DATETIME'].min().date()} → {df_cleaned['DATETIME'].max().date()}")

104,496 records
From 2020-01-01 00:00:00 to 2025-12-16 23:30:00

21 columns: ['DATETIME', 'ND', 'TSD', 'ENGLAND_WALES_DEMAND', 'EMBEDDED_WIND_GENERATION', 'EMBEDDED_WIND_CAPACITY', 'EMBEDDED_SOLAR_GENERATION', 'EMBEDDED_SOLAR_CAPACITY', 'NON_BM_STOR', 'PUMP_STORAGE_PUMPING', 'IFA_FLOW', 'IFA2_FLOW', 'BRITNED_FLOW', 'MOYLE_FLOW', 'EAST_WEST_FLOW', 'NEMO_FLOW', 'NSL_FLOW', 'ELECLINK_FLOW', 'VIKING_FLOW', 'GREENLINK_FLOW', 'SCOTTISH_TRANSFER']


In [None]:
missing = df_cleaned.isnull().sum()
missing[missing > 0]

Missing values:
SCOTTISH_TRANSFER    52608
dtype: int64


In [None]:
dupes = df_cleaned[df_cleaned.duplicated(subset=['DATETIME'], keep=False)]
print(f"{len(dupes)} duplicates")

24 duplicate timestamps


In [86]:
# Remove duplicates 
df_cleaned = df_cleaned.drop_duplicates(subset=['DATETIME'], keep='first').reset_index(drop=True)

In [None]:
df_sorted = df_cleaned.sort_values('DATETIME')
time_diffs = df_sorted['DATETIME'].diff()
gaps = time_diffs[time_diffs > timedelta(minutes=30)]
print(f"{len(gaps)} gaps in time series")

Found 6 gaps in the time series


In [91]:
# Stats
df_cleaned.describe()

Unnamed: 0,DATETIME,ND,TSD,ENGLAND_WALES_DEMAND,EMBEDDED_WIND_GENERATION,EMBEDDED_WIND_CAPACITY,EMBEDDED_SOLAR_GENERATION,EMBEDDED_SOLAR_CAPACITY,NON_BM_STOR,PUMP_STORAGE_PUMPING,...,IFA2_FLOW,BRITNED_FLOW,MOYLE_FLOW,EAST_WEST_FLOW,NEMO_FLOW,NSL_FLOW,ELECLINK_FLOW,VIKING_FLOW,GREENLINK_FLOW,SCOTTISH_TRANSFER
count,104484,104484.0,104484.0,104484.0,104484.0,104484.0,104484.0,104484.0,104484.0,104484.0,...,104484.0,104484.0,104484.0,104484.0,104484.0,104484.0,104484.0,104484.0,104484.0,51882.0
mean,2022-12-24 11:59:25.165958400,26752.066833,28717.435081,24515.319637,1884.021745,6549.374163,1583.656388,15890.782924,0.421529,192.440192,...,251.905842,276.472618,-144.414886,-94.521946,409.267141,590.25603,220.793031,115.522893,-51.194355,1810.682877
min,2020-01-01 00:00:00,12803.0,15297.0,12040.0,125.0,6465.0,0.0,13040.0,0.0,0.0,...,-1030.0,-1093.0,-505.0,-585.0,-1024.0,-1455.0,-1028.0,-1465.0,-539.0,-2851.0
25%,2021-06-28 06:22:30,21879.0,24206.0,20047.0,910.0,6527.0,0.0,13721.0,0.0,7.0,...,-2.0,-104.0,-444.0,-374.0,0.0,0.0,0.0,0.0,0.0,281.25
50%,2022-12-24 11:45:00,25866.0,27788.0,23665.0,1552.0,6545.0,6.0,15029.0,0.0,11.0,...,0.0,346.0,-207.0,0.0,667.0,693.0,0.0,0.0,0.0,1664.0
75%,2024-06-20 18:07:30,30673.0,32349.25,28132.0,2622.0,6562.0,2540.25,17714.0,0.0,157.0,...,941.0,917.0,81.0,0.0,963.0,1395.0,871.0,0.0,0.0,3223.0
max,2025-12-16 23:30:00,46433.0,47760.0,42458.0,5962.0,6622.0,14035.0,20993.0,481.0,1869.0,...,1016.0,1080.0,499.0,504.0,1020.0,1419.0,1002.0,1436.0,506.0,6704.0
std,,6219.736766,5932.001775,5693.937458,1213.376734,38.49212,2484.195058,2550.415513,10.005025,362.019887,...,624.826686,656.898443,281.493949,302.518871,625.839242,702.943795,553.844066,493.30737,162.808656,1854.728008


In [None]:
for col in ['ND', 'TSD', 'ENGLAND_WALES_DEMAND']:
    if col not in df_cleaned.columns:
        continue
    q1, q3 = df_cleaned[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    outliers = df_cleaned[(df_cleaned[col] < q1 - 1.5*iqr) | (df_cleaned[col] > q3 + 1.5*iqr)]
    print(f"{col}: {len(outliers)} outliers ({len(outliers)/len(df_cleaned)*100:.1f}%)")

ND: 237 outliers (0.2%)
TSD: 509 outliers (0.5%)
ENGLAND_WALES_DEMAND: 207 outliers (0.2%)


In [None]:
expected = 48 * 365
by_year = df_cleaned.groupby(df_cleaned['DATETIME'].dt.year).size()
pd.DataFrame({'records': by_year, 'expected': expected, 'pct': (by_year / expected * 100).round(1)})

Unnamed: 0_level_0,records,expected,pct
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020,17566,17520,100.3
2021,17518,17520,100.0
2022,17518,17520,100.0
2023,17518,17520,100.0
2024,17566,17520,100.3
2025,16798,17520,95.9


In [None]:
print(f"Records: {len(df_cleaned):,} | Nulls: {df_cleaned.isnull().sum().sum()} | Dupes: {df_cleaned.duplicated(subset=['DATETIME']).sum()}")

Records: 104,484
Nulls: 52602
Dupelicates: 0
Range: 2020-01-01 → 2025-12-16


### Upload to Supabase

In [None]:
df_upload = df_cleaned.copy()
df_upload['DATETIME'] = df_upload['DATETIME'].dt.strftime('%Y-%m-%dT%H:%M:%S')
df_upload.columns = df_upload.columns.str.lower()
df_upload = df_upload.replace([np.nan, np.inf, -np.inf], None)

batch_size = 1000
for start in range(0, len(df_upload), batch_size):
    end = min(start + batch_size, len(df_upload))
    batch = df_upload.iloc[start:end].to_dict(orient='records')
    for record in batch:
        for key, val in record.items():
            if isinstance(val, float) and (np.isnan(val) or np.isinf(val)):
                record[key] = None
    supabase.table('historic_demand').insert(batch).execute()

print(f"Uploaded {len(df_upload):,} records")

Uploaded 0 to 1000
Uploaded 1000 to 2000
Uploaded 2000 to 3000
Uploaded 3000 to 4000
Uploaded 4000 to 5000
Uploaded 5000 to 6000
Uploaded 6000 to 7000
Uploaded 7000 to 8000
Uploaded 8000 to 9000
Uploaded 9000 to 10000
Uploaded 10000 to 11000
Uploaded 11000 to 12000
Uploaded 12000 to 13000
Uploaded 13000 to 14000
Uploaded 14000 to 15000
Uploaded 15000 to 16000
Uploaded 16000 to 17000
Uploaded 17000 to 18000
Uploaded 18000 to 19000
Uploaded 19000 to 20000
Uploaded 20000 to 21000
Uploaded 21000 to 22000
Uploaded 22000 to 23000
Uploaded 23000 to 24000
Uploaded 24000 to 25000
Uploaded 25000 to 26000
Uploaded 26000 to 27000
Uploaded 27000 to 28000
Uploaded 28000 to 29000
Uploaded 29000 to 30000
Uploaded 30000 to 31000
Uploaded 31000 to 32000
Uploaded 32000 to 33000
Uploaded 33000 to 34000
Uploaded 34000 to 35000
Uploaded 35000 to 36000
Uploaded 36000 to 37000
Uploaded 37000 to 38000
Uploaded 38000 to 39000
Uploaded 39000 to 40000
Uploaded 40000 to 41000
Uploaded 41000 to 42000
Uploaded 42000

# Carbon Intensity Data & Generation Mix Data

### Carbon Historic Data

Data from the National Grid Carbon Intensity API - Regional endpoint. Provides 30-minute resolution data for 17 GB regions with generation mix breakdown.

| Region ID | Region Name |
|-----------|-------------|
| 1 | North Scotland |
| 2 | South Scotland |
| 3 | North West England |
| 4 | North East England |
| 5 | South Yorkshire |
| 6 | North Wales & Merseyside |
| 7 | South Wales |
| 8 | West Midlands |
| 9 | East Midlands |
| 10 | East England |
| 11 | South West England |
| 12 | South England |
| 13 | London |
| 14 | South East England |
| 15 | England |
| 16 | Scotland |
| 17 | Wales |

In [None]:
REGIONAL_API_BASE = "https://api.carbonintensity.org.uk/regional/intensity"
START_DATE = datetime(2020, 1, 1)
END_DATE = datetime.now()

REGIONS = {
    1: "North Scotland", 2: "South Scotland", 3: "North West England",
    4: "North East England", 5: "South Yorkshire", 6: "North Wales & Merseyside",
    7: "South Wales", 8: "West Midlands", 9: "East Midlands", 10: "East England",
    11: "South West England", 12: "South England", 13: "London", 14: "South East England"
}

In [None]:
def fetch_chunk(from_dt, to_dt):
    from_str = from_dt.strftime('%Y-%m-%dT%H:%M')
    to_str = to_dt.strftime('%Y-%m-%dT%H:%M')
    url = f"{REGIONAL_API_BASE}/{from_str}/{to_str}"
    for attempt in range(3):
        try:
            r = requests.get(url, timeout=120)
            r.raise_for_status()
            return r.json(), None
        except Exception as e:
            if attempt < 2:
                time.sleep(1)
                continue
            return None, str(e)
    return None, "Failed"

chunk_days = 10
chunks = []
current = START_DATE
while current < END_DATE:
    chunk_end = min(current + timedelta(days=chunk_days), END_DATE)
    chunks.append((current, chunk_end))
    current = chunk_end

all_regional_records = []
errors = []

for start_dt, end_dt in tqdm(chunks):
    result, error = fetch_chunk(start_dt, end_dt)
    if error:
        errors.append(f"{start_dt.date()}: {error}")
    elif result:
        for entry in result.get('data', []):
            for region in entry.get('regions', []):
                if region.get('regionid') in REGIONS:
                    record = {
                        'datetime_from': entry.get('from'),
                        'datetime_to': entry.get('to'),
                        'region_id': region.get('regionid'),
                        'region_name': region.get('shortname'),
                        'forecast': region.get('intensity', {}).get('forecast'),
                        'index': region.get('intensity', {}).get('index'),
                    }
                    for gen in region.get('generationmix', []):
                        fuel = gen.get('fuel', '').lower().replace(' ', '_')
                        record[f'gen_{fuel}'] = gen.get('perc')
                    all_regional_records.append(record)

print(f"{len(all_regional_records):,} records | {len(errors)} errors")

Fetching 220 chunks...


Downloading: 100%|██████████| 220/220 [11:07<00:00,  3.04s/it]


Total: 1,454,082 records





In [None]:
df_carbon = pd.DataFrame(all_regional_records)
df_carbon.head()

Shape: (1454082, 16)
Columns: ['datetime_from', 'datetime_to', 'region_id', 'region_name', 'dno_region', 'forecast', 'index', 'gen_biomass', 'gen_coal', 'gen_imports', 'gen_gas', 'gen_nuclear', 'gen_other', 'gen_hydro', 'gen_solar', 'gen_wind']


Unnamed: 0,datetime_from,datetime_to,region_id,region_name,dno_region,forecast,index,gen_biomass,gen_coal,gen_imports,gen_gas,gen_nuclear,gen_other,gen_hydro,gen_solar,gen_wind
0,2019-12-31T23:30Z,2020-01-01T00:00Z,1,North Scotland,Scottish Hydro Electric Power Distribution,48,very low,0.0,0.0,0.0,12.2,0.0,0,27.1,0.0,60.7
1,2019-12-31T23:30Z,2020-01-01T00:00Z,2,South Scotland,SP Distribution,15,very low,2.0,0.0,0.0,3.4,50.6,0,0.6,0.0,43.4
2,2019-12-31T23:30Z,2020-01-01T00:00Z,3,North West England,Electricity North West,20,very low,0.0,0.0,0.0,5.3,77.4,0,0.0,0.0,17.3
3,2019-12-31T23:30Z,2020-01-01T00:00Z,4,North East England,NPG North East,21,very low,18.0,0.0,0.0,0.0,78.9,0,0.0,0.0,3.1
4,2019-12-31T23:30Z,2020-01-01T00:00Z,5,Yorkshire,NPG Yorkshire,254,high,41.6,1.0,0.0,50.0,0.0,0,0.0,0.0,7.4


In [None]:
df_carbon['datetime'] = pd.to_datetime(df_carbon['datetime_from'])
key_cols = ['datetime', 'region_id', 'region_name', 'forecast', 'index']
gen_cols = [c for c in df_carbon.columns if c.startswith('gen_')]
df_carbon = df_carbon[key_cols + gen_cols]
df_carbon.head()

Shape: (1454082, 14)
Columns: ['datetime', 'region_id', 'region_name', 'forecast', 'index', 'gen_biomass', 'gen_coal', 'gen_imports', 'gen_gas', 'gen_nuclear', 'gen_other', 'gen_hydro', 'gen_solar', 'gen_wind']


Unnamed: 0,datetime,region_id,region_name,forecast,index,gen_biomass,gen_coal,gen_imports,gen_gas,gen_nuclear,gen_other,gen_hydro,gen_solar,gen_wind
0,2019-12-31 23:30:00+00:00,1,North Scotland,48,very low,0.0,0.0,0.0,12.2,0.0,0,27.1,0.0,60.7
1,2019-12-31 23:30:00+00:00,2,South Scotland,15,very low,2.0,0.0,0.0,3.4,50.6,0,0.6,0.0,43.4
2,2019-12-31 23:30:00+00:00,3,North West England,20,very low,0.0,0.0,0.0,5.3,77.4,0,0.0,0.0,17.3
3,2019-12-31 23:30:00+00:00,4,North East England,21,very low,18.0,0.0,0.0,0.0,78.9,0,0.0,0.0,3.1
4,2019-12-31 23:30:00+00:00,5,Yorkshire,254,high,41.6,1.0,0.0,50.0,0.0,0,0.0,0.0,7.4


### Carbon Data Quality

In [None]:
print(f"{len(df_carbon):,} records | {df_carbon['datetime'].min().date()} → {df_carbon['datetime'].max().date()}")
print(f"{df_carbon['region_id'].nunique()} regions")
df_carbon.groupby('region_name').size().sort_values(ascending=False)

1,454,082 records
Date range: 2019-12-31 23:30:00+00:00 to 2025-12-31 23:00:00+00:00
Regions: 14

Records per region:
region_name
East England                  103863
East Midlands                 103863
London                        103863
North East England            103863
North Scotland                103863
South East England            103863
North West England            103863
South Wales                   103863
South West England            103863
South England                 103863
South Scotland                103863
West Midlands                 103863
Yorkshire                     103863
North Wales & Merseyside       98491
North Wales and Merseyside      5372
dtype: int64

No nulls


In [None]:
df_carbon['region_name'] = df_carbon['region_name'].replace('North Wales and Merseyside', 'North Wales & Merseyside')

Merged 'North Wales and Merseyside' → 'North Wales & Merseyside'
region_name
East England                103863
East Midlands               103863
London                      103863
North East England          103863
North Scotland              103863
North Wales & Merseyside    103863
North West England          103863
South East England          103863
South England               103863
South Scotland              103863
South Wales                 103863
South West England          103863
West Midlands               103863
Yorkshire                   103863
dtype: int64


In [None]:
before = len(df_carbon)
df_carbon = df_carbon.drop_duplicates(subset=['datetime', 'region_id'], keep='first').reset_index(drop=True)
print(f"Removed {before - len(df_carbon):,} duplicates → {len(df_carbon):,} records")

Removed 2,982 duplicates
Final: 1,451,100 records


In [14]:
# Stats
df_carbon.describe()

Unnamed: 0,region_id,forecast,gen_biomass,gen_coal,gen_imports,gen_gas,gen_nuclear,gen_other,gen_hydro,gen_solar,gen_wind
count,1451100.0,1451100.0,1451100.0,1451100.0,1451100.0,1451100.0,1451100.0,1451100.0,1451100.0,1451100.0,1451100.0
mean,7.5,152.2703,5.87556,0.7396009,10.29967,33.04662,15.45022,0.0,2.036391,4.394725,28.15464
std,4.03113,114.6302,10.28243,2.989674,17.08217,27.13649,17.77542,0.0,5.826089,9.823473,24.54186
min,1.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,50.0,0.0,0.0,0.0,9.1,0.3,0.0,0.0,0.0,7.3
50%,7.5,135.0,1.6,0.0,2.2,27.6,9.0,0.0,0.2,0.1,21.0
75%,11.0,238.0,6.0,0.0,11.3,53.0,24.9,0.0,1.2,3.9,44.2
max,14.0,717.0,100.0,71.1,100.0,100.0,100.0,0.0,100.0,100.0,100.0


### Upload Carbon Data to Supabase

In [16]:
# Prep for upload
df_carbon_upload = df_carbon.copy()
df_carbon_upload['datetime'] = df_carbon_upload['datetime'].dt.strftime('%Y-%m-%dT%H:%M:%S')
df_carbon_upload = df_carbon_upload.replace([np.nan, np.inf, -np.inf], None)

In [19]:
# Batch upload
batch_size = 1000
total = len(df_carbon_upload)

for start in range(0, total, batch_size):
    end = min(start + batch_size, total)
    batch = df_carbon_upload.iloc[start:end].to_dict(orient='records')
    
    # Clean any remaining nan
    for record in batch:
        for key, val in record.items():
            if isinstance(val, float) and (np.isnan(val) or np.isinf(val)):
                record[key] = None
    
    response = supabase.table('carbon_intensity').insert(batch).execute()
    
    if (start // batch_size + 1) % 100 == 0:
        print(f"Uploaded {end:,} / {total:,} ({end/total*100:.1f}%)")

print(f"Done! Uploaded {total:,} records")

Uploaded 100,000 / 1,451,100 (6.9%)
Uploaded 200,000 / 1,451,100 (13.8%)
Uploaded 300,000 / 1,451,100 (20.7%)
Uploaded 400,000 / 1,451,100 (27.6%)
Uploaded 500,000 / 1,451,100 (34.5%)
Uploaded 600,000 / 1,451,100 (41.3%)
Uploaded 700,000 / 1,451,100 (48.2%)
Uploaded 800,000 / 1,451,100 (55.1%)
Uploaded 900,000 / 1,451,100 (62.0%)
Uploaded 1,000,000 / 1,451,100 (68.9%)
Uploaded 1,100,000 / 1,451,100 (75.8%)
Uploaded 1,200,000 / 1,451,100 (82.7%)
Uploaded 1,300,000 / 1,451,100 (89.6%)
Uploaded 1,400,000 / 1,451,100 (96.5%)
Done! Uploaded 1,451,100 records


# Weather Data

### Download Historic Weather Data

Historic hourly weather data from Open-Meteo API

| Column | Description | Unit |
|--------|-------------|------|
| `datetime` | Timestamp of the observation | UTC |
| `region_id` | Grid region identifier (1-14) | - |
| `region_name` | Name of the grid region | - |
| `temperature` | Air temperature at 2m height | °C |
| `humidity` | Relative humidity at 2m height | % |
| `wind_speed` | Wind speed at 10m height | km/h |
| `cloud_cover` | Total cloud cover | % |
| `precipitation` | Total precipitation (rain, showers, snow) | mm |

In [21]:
# Region coordinates (approximate centroids)
REGION_COORDS = {
    1: ("North Scotland", 57.5, -4.5),
    2: ("South Scotland", 55.9, -3.2),
    3: ("North West England", 53.8, -2.6),
    4: ("North East England", 54.9, -1.6),
    5: ("South Yorkshire", 53.5, -1.5),
    6: ("North Wales & Merseyside", 53.2, -3.0),
    7: ("South Wales", 51.6, -3.4),
    8: ("West Midlands", 52.5, -2.0),
    9: ("East Midlands", 52.8, -1.0),
    10: ("East England", 52.2, 0.9),
    11: ("South West England", 50.7, -3.5),
    12: ("South England", 51.0, -1.3),
    13: ("London", 51.5, -0.1),
    14: ("South East England", 51.3, 0.5),
}

WEATHER_START = "2020-01-01"
WEATHER_END = "2025-12-31"

In [23]:
def fetch_weather(region_id, name, lat, lon):
    """Fetch hourly weather from Open-Meteo"""
    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": WEATHER_START,
        "end_date": WEATHER_END,
        "hourly": "temperature_2m,relative_humidity_2m,wind_speed_10m,cloud_cover,precipitation",
        "timezone": "Europe/London"
    }
    
    for attempt in range(3):
        try:
            r = requests.get(url, params=params, timeout=120)
            r.raise_for_status()
            data = r.json()
            
            hourly = data.get('hourly', {})
            records = []
            times = hourly.get('time', [])
            
            for i, t in enumerate(times):
                records.append({
                    'datetime': t,
                    'region_id': region_id,
                    'region_name': name,
                    'temperature': hourly.get('temperature_2m', [None]*len(times))[i],
                    'humidity': hourly.get('relative_humidity_2m', [None]*len(times))[i],
                    'wind_speed': hourly.get('wind_speed_10m', [None]*len(times))[i],
                    'cloud_cover': hourly.get('cloud_cover', [None]*len(times))[i],
                    'precipitation': hourly.get('precipitation', [None]*len(times))[i],
                })
            return records, None
        except Exception as e:
            if attempt < 2:
                time.sleep(2)
                continue
            return None, str(e)
    return None, "Failed"

In [24]:
all_weather = []
errors = []

for region_id, (name, lat, lon) in tqdm(REGION_COORDS.items()):
    records, error = fetch_weather(region_id, name, lat, lon)
    if error:
        errors.append(f"{name}: {error}")
        print(f"Error: {name}")
    else:
        all_weather.extend(records)
        print(f"{name}: {len(records):,} records")
    time.sleep(0.5) 

print(f"\nTotal: {len(all_weather):,} records | {len(errors)} errors")

  0%|          | 0/14 [00:00<?, ?it/s]

North Scotland: 52,608 records


  7%|▋         | 1/14 [00:19<04:10, 19.30s/it]

South Scotland: 52,608 records


 14%|█▍        | 2/14 [00:37<03:45, 18.82s/it]

North West England: 52,608 records


 21%|██▏       | 3/14 [00:56<03:24, 18.62s/it]

North East England: 52,608 records


 29%|██▊       | 4/14 [01:14<03:06, 18.64s/it]

South Yorkshire: 52,608 records


 36%|███▌      | 5/14 [01:33<02:46, 18.53s/it]

North Wales & Merseyside: 52,608 records


 43%|████▎     | 6/14 [01:50<02:26, 18.29s/it]

South Wales: 52,608 records


 50%|█████     | 7/14 [02:09<02:07, 18.25s/it]

West Midlands: 52,608 records


 57%|█████▋    | 8/14 [02:27<01:49, 18.17s/it]

East Midlands: 52,608 records


 64%|██████▍   | 9/14 [02:45<01:31, 18.26s/it]

East England: 52,608 records


 71%|███████▏  | 10/14 [03:03<01:13, 18.27s/it]

South West England: 52,608 records


 79%|███████▊  | 11/14 [03:23<00:56, 18.80s/it]

South England: 52,608 records


 86%|████████▌ | 12/14 [03:41<00:37, 18.58s/it]

London: 52,608 records


 93%|█████████▎| 13/14 [03:59<00:18, 18.26s/it]

South East England: 52,608 records


100%|██████████| 14/14 [04:17<00:00, 18.37s/it]


Total: 736,512 records | 0 errors





In [27]:
df_weather = pd.DataFrame(all_weather)
df_weather['datetime'] = pd.to_datetime(df_weather['datetime'])
df_weather.head()

Unnamed: 0,datetime,region_id,region_name,temperature,humidity,wind_speed,cloud_cover,precipitation
0,2020-01-01 00:00:00,1,North Scotland,3.0,90,10.7,87,0.0
1,2020-01-01 01:00:00,1,North Scotland,3.7,91,12.0,83,0.0
2,2020-01-01 02:00:00,1,North Scotland,4.1,91,13.3,97,0.0
3,2020-01-01 03:00:00,1,North Scotland,4.3,92,15.0,100,0.0
4,2020-01-01 04:00:00,1,North Scotland,4.8,92,16.1,99,0.0


### Weather Data Quality

In [None]:
print(f"{len(df_weather):,} records | {df_weather['datetime'].min().date()} → {df_weather['datetime'].max().date()}")
print(f"{df_weather['region_id'].nunique()} regions")

In [None]:
missing = df_weather.isnull().sum()
missing[missing > 0]

In [None]:
dupes = df_weather[df_weather.duplicated(subset=['datetime', 'region_id'], keep=False)]
print(f"{len(dupes)} duplicates")

In [None]:
df_weather.groupby('region_name').size()

In [None]:
df_weather.describe()

In [None]:
# Check for gaps in hourly data per region
for region_id in df_weather['region_id'].unique():
    region_data = df_weather[df_weather['region_id'] == region_id].sort_values('datetime')
    time_diffs = region_data['datetime'].diff()
    gaps = time_diffs[time_diffs > timedelta(hours=1)]
    if len(gaps) > 0:
        print(f"Region {region_id}: {len(gaps)} gaps")

In [None]:
# Sanity check ranges
print("Temperature range:", df_weather['temperature'].min(), "→", df_weather['temperature'].max(), "°C")
print("Humidity range:", df_weather['humidity'].min(), "→", df_weather['humidity'].max(), "%")
print("Wind speed range:", df_weather['wind_speed'].min(), "→", df_weather['wind_speed'].max(), "km/h")
print("Precipitation range:", df_weather['precipitation'].min(), "→", df_weather['precipitation'].max(), "mm")

In [None]:
expected_hourly = 24 * 365 * 6  # ~6 years of hourly data
by_region = df_weather.groupby('region_id').size()
pd.DataFrame({'records': by_region, 'expected': expected_hourly, 'pct': (by_region / expected_hourly * 100).round(1)})

In [None]:
print(f"Records: {len(df_weather):,} | Nulls: {df_weather.isnull().sum().sum()} | Dupes: {df_weather.duplicated(subset=['datetime', 'region_id']).sum()}")