# Exploratory Data Analysis
Analysis of UK National Grid demand data from NESO (National Energy System Operator).

In [None]:
# Imports
import hashlib
import os
from datetime import datetime, timedelta
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from supabase import create_client, Client

load_dotenv()

## Demand Data

| Column name                 | Description                                                                                              |
| --------------------------- | -------------------------------------------------------------------------------------------------------- |
| `SETTLEMENT_DATE`           | Calendar date of the electricity settlement period (UTC).                                                |
| `SETTLEMENT_PERIOD`         | Half-hour settlement period within the day (1–48).                                                       |
| `FORECAST_ACTUAL_INDICATOR` | Indicates whether the data point is **Actual (A)** or **Forecast (F)**.                                  |
| `ND`                        | **National Demand** – estimated total GB electricity demand including embedded generation.               |
| `TSD`                       | **Transmission System Demand** – demand seen by the transmission network (excludes embedded generation). |
| `ENGLAND_WALES_DEMAND`      | Electricity demand specific to England and Wales.                                                        |
| `EMBEDDED_WIND_GENERATION`  | Electricity generated by wind connected to distribution networks (not transmission).                     |
| `EMBEDDED_WIND_CAPACITY`    | Installed capacity of embedded wind generation.                                                          |
| `EMBEDDED_SOLAR_GENERATION` | Electricity generated by embedded solar PV.                                                              |
| `EMBEDDED_SOLAR_CAPACITY`   | Installed capacity of embedded solar PV.                                                                 |
| `NON_BM_STOR`               | Non-BM **Short-Term Operating Reserve** – reserve provided by units outside the Balancing Mechanism.     |
| `PUMP_STORAGE_PUMPING`      | Electricity demand used to pump water into pumped-storage hydro (negative net generation).               |
| `SCOTTISH_TRANSFER`         | Net electricity transfer between Scotland and England/Wales.                                             |
| `IFA_FLOW`                  | Power flow on the **IFA** interconnector (GB–France).                                                    |
| `IFA2_FLOW`                 | Power flow on the **IFA2** interconnector (GB–France).                                                   |
| `BRITNED_FLOW`              | Power flow on the **BritNed** interconnector (GB–Netherlands).                                           |
| `MOYLE_FLOW`                | Power flow on the **Moyle** interconnector (GB–Northern Ireland).                                        |
| `EAST_WEST_FLOW`            | Power flow on the **East-West** interconnector (GB–Ireland).                                             |
| `NEMO_FLOW`                 | Power flow on the **NEMO** interconnector (GB–Belgium).                                                  |
| `NSL_FLOW`                  | Power flow on the **North Sea Link** interconnector (GB–Norway).                                         |
| `ELECLINK_FLOW`             | Power flow on the **ElecLink** interconnector (GB–France).                                               |
| `VIKING_FLOW`               | Power flow on the **Viking Link** interconnector (GB–Denmark).                                           |
| `GREENLINK_FLOW`            | Power flow on the **Greenlink** interconnector (GB–Ireland).                                             |       


### Download Historic Data
Download the CSV file for each year (2020–2025) from the NESO data portal.

In [None]:
# Supabase connection
# Credentials come from the environment variables SUPABASE_URL and SUPABASE_KEY.
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_KEY")

# os.getenv returns None for an unset variable; fail here with a message that
# names the missing setting instead of letting the client constructor reject it.
_missing_settings = [
    name
    for name, value in (("SUPABASE_URL", supabase_url), ("SUPABASE_KEY", supabase_key))
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

supabase: Client = create_client(supabase_url, supabase_key)
print("Connected to Supabase.")

In [None]:
# Configuration
# URL template of the per-year dataset page; "{}" is replaced by the year.
BASE_PAGE = "https://www.neso.energy/data-portal/historic-demand-data/historic_demand_data_{}"
# range() excludes its stop value, so this covers 2020 through 2025.
YEARS = range(2020, 2026)

# Local directory that receives the downloaded CSV files (created if absent).
OUT_DIR = Path("neso_historic_demand")
OUT_DIR.mkdir(exist_ok=True)

# A single shared HTTP session so every request sends the same User-Agent.
session = requests.Session()
session.headers["User-Agent"] = "academic-dashboard-project"


Processing 2020
Loaded 17568 records from 2020
  Date range: 2020-01-01 00:00:00 to 2020-12-31 00:00:00

Processing 2021
Loaded 17520 records from 2021
  Date range: 2021-01-01 00:00:00 to 2021-12-31 00:00:00

Processing 2022
Loaded 17520 records from 2022
  Date range: 2022-01-01 00:00:00 to 2022-12-31 00:00:00

Processing 2023
Loaded 17520 records from 2023
  Date range: 2023-01-01 00:00:00 to 2023-12-31 00:00:00

Processing 2024
Loaded 17568 records from 2024
  Date range: 2024-01-01 00:00:00 to 2024-12-31 00:00:00

Processing 2025
Loaded 16800 records from 2025
  Date range: 2025-01-01 00:00:00 to 2025-12-16 00:00:00

Done.
Total unified records: 104496
Final date range: 2020-01-01 00:00:00 to 2025-12-16 00:00:00


In [None]:
# Helper functions
def sha256(path: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the file at ``path``.

    The file is read in 8192-byte blocks so its whole content never has to
    be held in memory. The digest is used to detect identical downloads.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while block := stream.read(8192):
            digest.update(block)
    return digest.hexdigest()

def standardize_date(date_val):
    """Convert a date value in one of several known layouts to a pandas Timestamp.

    Args:
        date_val: A scalar date value, typically a string read from a CSV.

    Returns:
        The parsed ``pd.Timestamp``, or ``pd.NaT`` when the value is missing
        or cannot be parsed by any of the known layouts or the fallback.
    """
    if pd.isna(date_val):
        return pd.NaT

    date_formats = (
        '%Y-%m-%d',        # 2025-06-27
        '%d-%b-%y',        # 26-Oct-23
        '%d-%b-%Y',        # 26-Oct-2023
        '%d-%B-%y',        # 26-October-23
        '%d-%B-%Y',        # 26-October-2023
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S',
    )

    date_str = str(date_val).strip()

    # The format strings are passed through unchanged: upper-casing them would
    # turn e.g. '%d-%b-%y' into the invalid '%D-%B-%Y', so those layouts could
    # never match. Month names such as 'JAN' are matched regardless of case.
    for fmt in date_formats:
        try:
            return pd.to_datetime(date_str, format=fmt)
        except (ValueError, TypeError):
            continue

    # Last resort: let pandas infer the layout, reading ambiguous values
    # day-first (e.g. 01-02-2020 is 1 February).
    try:
        return pd.to_datetime(date_str, dayfirst=True)
    except (ValueError, TypeError, OverflowError):
        return pd.NaT

In [None]:
# Download and process data for each year
# For every year: fetch the dataset page, locate its CSV link, download the
# file, skip it if an identical file was already seen, then load it.
all_dfs = []
seen_hashes = set()

for year in YEARS:
    print(f"Processing {year}...")
    page_url = BASE_PAGE.format(year)

    # Fetch dataset page
    r = session.get(page_url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # Find CSV download link: the first anchor whose href contains "download"
    # and ends with ".csv" (compared case-insensitively).
    csv_url = None
    for a in soup.find_all("a", href=True):
        href = a["href"].lower()
        if "download" in href and href.endswith(".csv"):
            csv_url = a["href"]
            break

    if not csv_url:
        print(f"  No CSV found for {year}, skipping...")
        continue

    # Resolve the link against the page URL. Absolute links pass through
    # unchanged; root-relative, relative and protocol-relative links are
    # completed correctly instead of only handling a leading "/".
    csv_url = urljoin(page_url, csv_url)

    csv_path = OUT_DIR / f"historic_demand_{year}.csv"

    # Download CSV, streaming it to disk in chunks
    with session.get(csv_url, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        with open(csv_path, "wb") as f:
            for chunk in resp.iter_content(8192):
                f.write(chunk)

    # Verify uniqueness: an identical file hash means this year's link
    # served the same content as a previously processed year.
    file_hash = sha256(csv_path)
    if file_hash in seen_hashes:
        print(f"  Duplicate detected for {year}, skipping...")
        continue
    seen_hashes.add(file_hash)

    # Load and standardize date format
    df_year = pd.read_csv(csv_path)
    df_year['SETTLEMENT_DATE'] = df_year['SETTLEMENT_DATE'].apply(standardize_date)
    df_year["SOURCE_YEAR"] = year

    all_dfs.append(df_year)
    print(f"  Loaded {len(df_year)} records ({df_year['SETTLEMENT_DATE'].min().date()} to {df_year['SETTLEMENT_DATE'].max().date()})")

# Combine all years
# pd.concat raises an unhelpful error on an empty list, so report the real
# cause when every year was skipped.
if not all_dfs:
    raise RuntimeError("No yearly CSV files were loaded; nothing to combine.")
df = pd.concat(all_dfs, ignore_index=True)
print(f"\nTotal records: {len(df)}")

In [None]:
# Preview the data
# Show the column names, the span of settlement dates, and five random rows.
print(f"Columns: {list(df.columns)}")
print(f"Date range: {df['SETTLEMENT_DATE'].min()} to {df['SETTLEMENT_DATE'].max()}")
df.sample(n=5)

Columns: ['SETTLEMENT_DATE', 'SETTLEMENT_PERIOD', 'ND', 'TSD', 'ENGLAND_WALES_DEMAND', 'EMBEDDED_WIND_GENERATION', 'EMBEDDED_WIND_CAPACITY', 'EMBEDDED_SOLAR_GENERATION', 'EMBEDDED_SOLAR_CAPACITY', 'NON_BM_STOR', 'PUMP_STORAGE_PUMPING', 'IFA_FLOW', 'IFA2_FLOW', 'BRITNED_FLOW', 'MOYLE_FLOW', 'EAST_WEST_FLOW', 'NEMO_FLOW', 'NSL_FLOW', 'ELECLINK_FLOW', 'VIKING_FLOW', 'GREENLINK_FLOW', 'SOURCE_YEAR', 'SCOTTISH_TRANSFER']
Total rows: 104496
Date range: 01-APR-2020 to 31-Oct-23


Unnamed: 0,SETTLEMENT_DATE,SETTLEMENT_PERIOD,ND,TSD,ENGLAND_WALES_DEMAND,EMBEDDED_WIND_GENERATION,EMBEDDED_WIND_CAPACITY,EMBEDDED_SOLAR_GENERATION,EMBEDDED_SOLAR_CAPACITY,NON_BM_STOR,...,BRITNED_FLOW,MOYLE_FLOW,EAST_WEST_FLOW,NEMO_FLOW,NSL_FLOW,ELECLINK_FLOW,VIKING_FLOW,GREENLINK_FLOW,SOURCE_YEAR,SCOTTISH_TRANSFER
0,01-JAN-2020,1,26340,27153,23821,1073,6465,0,13040,0,...,852,-151,-47,854,0,0,0,0,2020,
1,01-JAN-2020,2,26921,27684,24393,1020,6465,0,13040,0,...,853,-146,0,854,0,0,0,0,2020,
2,01-JAN-2020,3,26569,27240,24085,1010,6465,0,13040,0,...,852,-53,0,854,0,0,0,0,2020,
3,01-JAN-2020,4,25754,26435,23350,1043,6465,0,13040,0,...,852,-66,0,854,0,0,0,0,2020,
4,01-JAN-2020,5,25075,25824,22788,1001,6465,0,13040,0,...,853,-74,-60,854,0,0,0,0,2020,


### Data Cleaning
Remove unnecessary columns and create a unified datetime column.

In [None]:
# Remove unnecessary columns
columns_to_remove = ['_id', '_full_text', '_count', 'rank', 'FORECAST_ACTUAL_INDICATOR', 'SOURCE_YEAR']

# Filter for actual data only (not forecasts).
# The filter runs BEFORE the column drop and works on the same frame: filtering
# the original df afterwards would bring back every column that was just removed.
df_cleaned = df
if 'FORECAST_ACTUAL_INDICATOR' in df_cleaned.columns:
    df_cleaned = df_cleaned[df_cleaned['FORECAST_ACTUAL_INDICATOR'] == 'A']

# errors='ignore' skips any listed column that is not present; drop() returns
# a new frame, so df itself is left unchanged.
df_cleaned = df_cleaned.drop(columns=columns_to_remove, errors='ignore')

print(f"Columns after cleaning: {df_cleaned.columns.tolist()}")

Cleaned columns: ['SETTLEMENT_DATE', 'SETTLEMENT_PERIOD', 'ND', 'TSD', 'ENGLAND_WALES_DEMAND', 'EMBEDDED_WIND_GENERATION', 'EMBEDDED_WIND_CAPACITY', 'EMBEDDED_SOLAR_GENERATION', 'EMBEDDED_SOLAR_CAPACITY', 'NON_BM_STOR', 'PUMP_STORAGE_PUMPING', 'SCOTTISH_TRANSFER', 'IFA_FLOW', 'IFA2_FLOW', 'BRITNED_FLOW', 'MOYLE_FLOW', 'EAST_WEST_FLOW', 'NEMO_FLOW', 'NSL_FLOW', 'ELECLINK_FLOW', 'VIKING_FLOW', 'GREENLINK_FLOW']


In [None]:
# Create unified DATETIME column from SETTLEMENT_DATE and SETTLEMENT_PERIOD
# Each settlement period is 30 minutes (period 1 = 00:00-00:30)
# Vectorized: one column-wise addition instead of a Python call per row, and
# it also works on an empty frame (row-wise apply returns a DataFrame there).
# pd.to_datetime leaves an already-datetime column unchanged.
# NOTE(review): this assumes every day has periods 1-48 on a naive clock; days
# with a clock change may need separate handling - confirm against the data.
df_cleaned['DATETIME'] = pd.to_datetime(df_cleaned['SETTLEMENT_DATE']) + pd.to_timedelta(
    (df_cleaned['SETTLEMENT_PERIOD'].astype(int) - 1) * 30, unit='min'
)

# Reorder columns and remove original date columns
cols = ['DATETIME'] + [col for col in df_cleaned.columns if col not in ['DATETIME', 'SETTLEMENT_DATE', 'SETTLEMENT_PERIOD']]
df_cleaned = df_cleaned[cols]

print(f"Final shape: {df_cleaned.shape}")
print(f"Date range: {df_cleaned['DATETIME'].min()} to {df_cleaned['DATETIME'].max()}")
df_cleaned.head()

Datetime range: 2025-12-01 00:00:00 to 2026-01-13 23:30:00


### Upload to Supabase
Insert the cleaned data into the Supabase database.

In [None]:
# Prepare data for Supabase
df_to_insert = df_cleaned.copy()
df_to_insert['DATETIME'] = df_to_insert['DATETIME'].dt.strftime('%Y-%m-%dT%H:%M:%S')
df_to_insert.columns = df_to_insert.columns.str.lower()
# Missing values (NaN/NaT) are not valid JSON; send them as None (JSON null).
# The cast to object comes first so that None is kept rather than coerced back to NaN.
df_to_insert = df_to_insert.astype(object).where(df_to_insert.notna(), None)
data_to_insert = df_to_insert.to_dict(orient='records')

# Insert in batches
batch_size = 500
total_inserted = 0
upload_failed = False

print(f"Inserting {len(data_to_insert)} records...")

for i in range(0, len(data_to_insert), batch_size):
    batch = data_to_insert[i:i + batch_size]
    batch_number = i // batch_size + 1
    try:
        response = supabase.table("demand_data").upsert(batch).execute()
    except Exception as e:
        # Stop at the first failing batch; later batches are not attempted.
        print(f"Error at batch {batch_number}: {e}")
        upload_failed = True
        break
    total_inserted += len(batch)
    # Report progress every 10 batches
    if batch_number % 10 == 0:
        print(f"Progress: {total_inserted}/{len(data_to_insert)} records")

# Only claim completion when every batch went through.
if upload_failed:
    print(f"\nStopped after an error. Records inserted before failure: {total_inserted}/{len(data_to_insert)}")
else:
    print(f"\nCompleted! Total records inserted: {total_inserted}")