In [None]:
# Load library imports
import os
import sys
import glob
import torch
import random
import logging
import datetime
import numpy as np
import pandas as pd
from torch_geometric.data import Data
from sklearn.neighbors import kneighbors_graph

# Load project Imports
from src.utils.config_loader import load_project_config, deep_format, expanduser_tree

In [None]:
# Send INFO-level (and above) log records to stdout in a compact
# "LEVEL - message" layout.
# A more verbose alternative layout:
#    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    handlers=[stdout_handler],
    format='%(levelname)s - %(message)s',
    level=logging.INFO,
)

# Notebook-wide logger and the project configuration (paths and parameters)
logger = logging.getLogger(__name__)
config = load_project_config(config_path="config/project_config.yaml")
notebook = True

# Read the two root directories from the global paths section
global_paths = config["global"]["paths"]
raw_data_root, results_root = (
    global_paths["raw_data_root"],
    global_paths["results_root"],
)

# Hand both roots to deep_format to rewrite the config, then pass the result
# through expanduser_tree (presumably expands "~" in path strings — confirm
# against src.utils.config_loader)
config = expanduser_tree(
    deep_format(config, raw_data_root=raw_data_root, results_root=results_root)
)

In [None]:
# Seed every random number generator used in the notebook from one config value
pipeline_settings = config["global"]["pipeline_settings"]
random_seed = pipeline_settings["random_seed"]
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(random_seed)

# Request deterministic cuDNN behaviour and switch off its benchmark mode
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# The first configured catchment is the one demonstrated in this notebook
catchments_to_process = pipeline_settings["catchments_to_process"]
catchment = catchments_to_process[0]
run_defra_API_calls = pipeline_settings["run_defra_api"]

logger.info(f"Show Notebook Outputs: {notebook}")
logger.info(f"Notebook Demo Catchment: {catchment.capitalize()}")

### Climatology (catchment seasonal mean) ###

In [None]:
# Read only the station name and its easting/northing from the catchment's
# station list; later cells look coordinates up in this table by station name
geo_cols = ["station_name", "easting", "northing"]
geo_path = config[catchment]["paths"]["gwl_station_list_with_coords"]
stations_geo = pd.read_csv(geo_path, usecols=geo_cols)
stations_geo

In [None]:
test_station = "longtown"
omitted = "cliburn_town_bridge_1"
station_df_dir = config[catchment]["paths"]["trimmed_output_dir"]
columns_to_load = ["Unnamed: 0", "value"]

# List the station CSVs in a fixed (sorted) order. glob returns matches in
# arbitrary, filesystem-dependent order, and the order of this list becomes the
# order of the stations in every downstream structure (node index in the graph,
# indices drawn by the seeded validation split), so it must be stable for runs
# to be reproducible across machines.
all_files = sorted(glob.glob(os.path.join(station_df_dir, "*.csv")))

# Keep the held-out test station and the omitted station out of the training set
excluded_files = {f"{test_station}_trimmed.csv", f"{omitted}_trimmed.csv"}
files_to_load = [file for file in all_files if os.path.basename(file) not in excluded_files]

# Load every training station into a dict keyed by station name
loaded_dataframes = {}
for file_path in files_to_load:
    filename = os.path.basename(file_path).replace("_trimmed.csv", "")
    logger.info(f"Loading {filename}...")

    df = pd.read_csv(file_path, usecols=columns_to_load)

    # The "Unnamed: 0" column holds the timestamps; rename and parse it
    df = df.rename(columns={"Unnamed: 0": "date"})
    df['date'] = pd.to_datetime(df['date'])
    df['day_of_year'] = df['date'].dt.dayofyear

    # Put every year on a 365-day calendar: Feb 29 shares day 59 with Feb 28 ...
    leap_day_mask = (df['date'].dt.month == 2) & (df['date'].dt.day == 29)
    df.loc[leap_day_mask, 'day_of_year'] = 59  # Map Feb 29 to Feb 28

    # ... and every later day of a leap year moves back by one (Mar 1 -> 60).
    # Feb 29 was already set to 59 above, so it is not shifted a second time.
    after_leap_day_mask = (df['date'].dt.is_leap_year) & (df['day_of_year'] > 59)
    df.loc[after_leap_day_mask, 'day_of_year'] -= 1

    # Attach the station's coordinates (constant down the whole column)
    coords = stations_geo[stations_geo['station_name'] == filename]

    if not coords.empty:
        df['easting'] = coords['easting'].iloc[0]
        df['northing'] = coords['northing'].iloc[0]
    else:
        # The frame is still stored, but without 'easting'/'northing' columns
        logger.warning(f"Could not find coordinates for station: {filename}. Skipping coordinate assignment.")

    loaded_dataframes[filename] = df

# Confirm load is as expected
logger.info(f"Files loaded: {len(loaded_dataframes)}\n")

loaded_dataframes

In [None]:
# Per-station climatological baselines, keyed by station name
station_baselines = {}

for station_name, df in loaded_dataframes.items():

    # mu_j: mean of all observed levels at the station (the offset)
    mean_level = df['value'].mean()
    # m_j,k: mean observed level for each day of year k
    doy_mean_curve = df['value'].groupby(df['day_of_year']).mean()
    # s_j,k = m_j,k - mu_j: the seasonal shape with the offset removed
    shape_curve = doy_mean_curve.sub(mean_level)

    station_baselines[station_name] = dict(
        mean_level=mean_level,
        doy_mean_curve=doy_mean_curve,
        shape_curve=shape_curve,
    )

# Report how many stations now have a baseline
logger.info("Baseline calculations complete for all training stations.")
logger.info(f"Number of stations with calculated baselines: {len(station_baselines)}")

In [None]:
# Give every training station a 'baseline_pred' column: the station's own
# day-of-year mean, looked up for each row's day of year.
# The frames inside loaded_dataframes are modified in place.
for station_name, df in loaded_dataframes.items():
    doy_mean_curve = station_baselines[station_name]['doy_mean_curve']
    df['baseline_pred'] = df['day_of_year'].map(doy_mean_curve)

loaded_dataframes

### Build PyG Object ###

In [None]:
# Columns that are lagged by create_lagged_features; the same columns are also
# used un-lagged when the feature matrix is assembled
input_features = ['value', 'baseline_pred'] 
n_lags = 7  # previous 7 days as features
k_neighbors = 3 # Number of spatial neighbors for graph edges

def create_lagged_features(df, features, n_lags):
    """
    Return a copy of df extended with shifted copies of the given columns.

    For every name in features and every lag from 1 to n_lags a column
    '<name>_lag<lag>' is appended, holding that column shifted down by
    `lag` rows. Rows containing a NaN in any column are then removed, which
    takes out the leading rows that have no complete lag history (and any
    other row with a missing value). The input frame is left untouched.
    """
    lagged = df.copy()

    # Same creation order as a nested loop: feature by feature, lag 1..n_lags
    lag_plan = [(column, step) for column in features for step in range(1, n_lags + 1)]
    for column, step in lag_plan:
        lagged[f'{column}_lag{step}'] = lagged[column].shift(step)

    return lagged.dropna()

def build_pyg_data_object(loaded_dataframes, input_features, n_lags, k_neighbors):
    """
    Prepares the full dataset and builds a PyG Data object.

    Args:
        loaded_dataframes: dict of station name -> DataFrame holding 'date',
            'day_of_year', 'easting', 'northing' and every column named in
            input_features.
        input_features: names of the columns used directly and as lag sources.
        n_lags: number of lagged copies created per input feature.
        k_neighbors: neighbours per station in the kNN graph (each station
            counts as its own neighbour); capped at the number of stations.

    Returns:
        Data with
          x          : float tensor (num_timesteps, num_stations, num_features).
                       Feature order is fixed: input_features, 'day_of_year',
                       the lag columns (feature by feature, lag 1..n_lags),
                       then easting and northing.
          edge_index : long tensor (2, num_edges) from the kNN graph on the
                       station coordinates.
          pos        : float tensor (num_stations, 2) of easting/northing.

    Raises:
        ValueError: if no stations are supplied, or a station's rows do not
            match the date range taken from the first station.
    """
    if not loaded_dataframes:
        raise ValueError("loaded_dataframes is empty; no stations to build a graph from.")

    # Create lagged features for each station
    lagged_dfs = {
        name: create_lagged_features(df, input_features, n_lags)
        for name, df in loaded_dataframes.items()
    }

    # The first station defines the common daily date range
    first_df = next(iter(lagged_dfs.values()))
    common_dates = pd.date_range(start=first_df['date'].iloc[0], end=first_df['date'].iloc[-1])

    station_names = list(lagged_dfs.keys())
    num_timesteps = len(common_dates)
    num_stations = len(station_names)

    # Fix the column order explicitly instead of relying on DataFrame column
    # order, so feature indices are stable. For the notebook's input_features
    # this yields ['value', 'baseline_pred', 'day_of_year', <lags>], the same
    # order the test-station builder enforces.
    lag_cols = [
        f'{feature}_lag{lag}'
        for feature in input_features
        for lag in range(1, n_lags + 1)
    ]
    dynamic_feature_cols = list(input_features) + ['day_of_year'] + lag_cols
    num_features = len(dynamic_feature_cols) + 2  # + easting, northing

    all_features = np.zeros((num_timesteps, num_stations, num_features))

    # Static features: easting and northing are the same for all timesteps
    static_features = np.zeros((num_stations, 2))

    for i, name in enumerate(station_names):
        df = lagged_dfs[name]
        df = df[df['date'].isin(common_dates)]  # Align timesteps

        # Fail with a readable message rather than a numpy broadcast error
        if len(df) != num_timesteps:
            raise ValueError(
                f"Station '{name}' has {len(df)} rows inside the common date range "
                f"but {num_timesteps} timesteps are required; its dates do not "
                f"align with the first station."
            )

        static_features[i, :] = df[['easting', 'northing']].iloc[0].values

        # Time-varying features fill everything except the last two columns ...
        all_features[:, i, :-2] = df[dynamic_feature_cols].values
        # ... and the last two carry the coordinates, repeated for every
        # timestep (these columns were reserved before but left as zeros)
        all_features[:, i, -2:] = static_features[i]

    # Graph edges: k-nearest neighbours on the static coordinates. With
    # include_self=True sklearn needs n_neighbors <= number of stations.
    n_neighbors = min(k_neighbors, num_stations)
    adj_matrix = kneighbors_graph(static_features, n_neighbors, mode='connectivity', include_self=True)
    adj_coo = adj_matrix.tocoo()
    edge_index = torch.tensor(np.vstack((adj_coo.row, adj_coo.col)), dtype=torch.long)

    x = torch.tensor(all_features, dtype=torch.float)

    # Build PyG Data object
    pyg_data = Data(x=x, edge_index=edge_index, pos=torch.tensor(static_features, dtype=torch.float))

    return pyg_data

# Build PyG dataset from the training stations loaded above
pyg_dataset = build_pyg_data_object(loaded_dataframes, input_features, n_lags, k_neighbors)

# NOTE(review): x is laid out as (timesteps, stations, features); confirm that
# num_nodes below reports the number of stations rather than the first axis.
print(f"Successfully created a PyG Data object with {pyg_dataset.num_nodes} nodes and {pyg_dataset.num_edges} edges.")
print(f"Shape of node features (x): {pyg_dataset.x.shape}")

### Process Test Station ###

In [None]:
# build shapes/offset for given test station (no leakage)
def build_baseline_artifacts_for_test_station(test_station, stations_geo, station_baselines, k=5, p=2):
    """
    Build baseline ingredients for a station using only the other stations.

    Args:
        test_station     : name of the station to build the baseline for; its
                           own entry in station_baselines (if any) is ignored.
        stations_geo     : DataFrame with 'station_name', 'easting', 'northing'
                           covering the test station and every training station.
        station_baselines: dict name -> {'mean_level': float,
                           'shape_curve': pd.Series indexed by day of year}.
        k                : number of nearest training stations to weight.
        p                : inverse-distance-weighting exponent.

    Returns:
      bar_s   : catchment shape, the unweighted mean of all training shape
                curves (pd.Series indexed by day of year)
      mu_hat  : neighbour-weighted offset for the test station (float)
      s_reg   : regional shape, the neighbour-weighted mean of the k nearest
                shape curves (pd.Series indexed by day of year)

    Raises:
        ValueError: if there is no training station left or k < 1.
        KeyError  : if a station has no row in stations_geo.
    """
    nbr_names = [s for s in station_baselines.keys() if s != test_station]
    if not nbr_names:
        raise ValueError("No training stations available to build a baseline from.")
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}.")

    # 1) Catchment shape (exclude test station)
    shapes = [station_baselines[s]['shape_curve'] for s in nbr_names]
    bar_s = pd.concat(shapes, axis=1).mean(axis=1)

    # 2) Distances from the test station to every training station
    coords = stations_geo.set_index('station_name')
    xi, yi = coords.loc[test_station, ['easting', 'northing']]
    nbr_xy = coords.loc[nbr_names, ['easting', 'northing']].values
    d = np.linalg.norm(nbr_xy - np.array([xi, yi]), axis=1)

    # Keep the k closest stations
    k = min(k, len(nbr_names))
    idx = np.argsort(d)[:k]
    nbr_names = np.array(nbr_names)[idx]
    d_k = d[idx]

    # IDW weights. A neighbour at distance 0 would get an infinite weight and
    # turn every weight into NaN after normalisation, so in that case the
    # coincident neighbour(s) share the full weight equally instead.
    coincident = d_k == 0
    if coincident.any():
        w = coincident.astype(float)
    else:
        w = d_k ** -p
    w = w / w.sum()  # normalise so the weights sum to 1

    # 3) Offset and regional shape
    mu_hat = float(np.sum([w[m] * station_baselines[nbr_names[m]]['mean_level'] for m in range(k)]))
    s_reg = sum(w[m] * station_baselines[nbr_names[m]]['shape_curve'] for m in range(k))
    return bar_s, mu_hat, s_reg

def create_test_station_pyg_obj(station_name, df_dir, geo_df):
    """
    Build a single-node PyG Data object for the held-out test station.

    Reads '<station_name>_trimmed.csv' from df_dir, derives the 365-day
    day-of-year, attaches a baseline prediction built only from the training
    stations, adds lagged features and packs the result into tensors.

    Relies on the notebook-level names station_baselines, input_features and
    n_lags in addition to its arguments.

    Returns a Data object with x (features per timestep, one node axis added),
    y (the observed 'value' column, one trailing axis added) and pos (the
    station's easting/northing taken from geo_df).
    """
    csv_path = os.path.join(df_dir, f"{station_name}_trimmed.csv")

    # Load the series; the "Unnamed: 0" column holds the timestamps
    df = (
        pd.read_csv(csv_path, usecols=['Unnamed: 0', 'value'])
        .rename(columns={"Unnamed: 0": "date"})
    )
    df['date'] = pd.to_datetime(df['date'])

    # 365-day calendar: Feb 29 takes day 59, later leap-year days move back one
    date_parts = df['date'].dt
    df['day_of_year'] = date_parts.dayofyear
    is_feb_29 = (date_parts.month == 2) & (date_parts.day == 29)
    df.loc[is_feb_29, 'day_of_year'] = 59
    later_in_leap_year = (date_parts.is_leap_year) & (df['day_of_year'] > 59)
    df.loc[later_in_leap_year, 'day_of_year'] -= 1

    # Baseline ingredients come from the training stations only, so none of
    # the test station's own observations enter its baseline (no leakage)
    bar_s, mu_hat, s_reg = build_baseline_artifacts_for_test_station(
        station_name, geo_df, station_baselines, k=5, p=2
    )

    # Baseline in use -> (i) climatology: catchment-wide shape + site offset.
    # Alternative  -> (ii) regional seasonal: swap bar_s for s_reg below.
    seasonal_component = df['day_of_year'].map(bar_s).astype(float)
    df['baseline_pred'] = mu_hat + seasonal_component

    # Diagnostic output: draw the catchment shape and print its min / max / std
    bar_s.plot()
    print(bar_s.min(), bar_s.max(), bar_s.std())

    # Same lagging step as the training stations, for consistency
    df_with_lags = create_lagged_features(df, input_features, n_lags)

    # Explicit feature order: current values, day of year, then the lags
    feature_order = (
        ['value', 'baseline_pred', 'day_of_year']
        + [f'value_lag{l}' for l in range(1, n_lags+1)]
        + [f'baseline_pred_lag{l}' for l in range(1, n_lags+1)]
    )

    # Coordinates of the station (one row expected in geo_df)
    station_row = geo_df[geo_df['station_name'] == station_name]
    pos = torch.tensor(station_row[['easting', 'northing']].values, dtype=torch.float)

    # A single node and no edges: insert the node axis at position 1
    x = torch.tensor(df_with_lags[feature_order].values, dtype=torch.float).unsqueeze(1)
    y = torch.tensor(df_with_lags['value'].values, dtype=torch.float).unsqueeze(1)

    return Data(x=x, y=y, pos=pos)

# Execute the function to create the test station data object.
# The function also plots the catchment shape curve and prints its
# min / max / std as a side effect.
test_station_data = create_test_station_pyg_obj(test_station, station_df_dir, stations_geo)
test_station_data

In [None]:
# Size of the last axis of the training tensor
num_features = pyg_dataset.x.shape[2] 
# Earlier index computation, kept for reference:
# target_feature_idx = input_features.index('value') + (n_lags * len(input_features))
# Position along the feature axis that create_time_series_batches reads the
# target from
target_feature_idx = 0  # Using deterministically set indices now

def create_time_series_batches(data, window_size, horizon, target_idx=None):
    """
    Splits the PyG Data object into time series batches for training.

    Args:
        data       : object whose `x` attribute is a tensor shaped
                     (num_timesteps, num_nodes, num_features).
        window_size: number of consecutive timesteps in each input sample.
        horizon    : how many steps after the end of the window the target is
                     taken (1 = the timestep right after the window).
        target_idx : position of the target along the feature axis. When left
                     as None the notebook-level `target_feature_idx` is used,
                     which is what this function always read before.

    Returns:
        (X, Y) where X is (num_samples, window_size, num_nodes, num_features)
        and Y is (num_samples, num_nodes), with
        num_samples = num_timesteps - window_size - horizon + 1.

    Raises:
        ValueError: if data.x is not 3-dimensional, or the series is too short
            to yield a single (window, target) pair.
    """
    if target_idx is None:
        target_idx = target_feature_idx  # notebook-level default

    features = data.x
    num_timesteps, _, _ = features.shape  # unpacking enforces a 3-D tensor

    num_samples = num_timesteps - window_size - horizon + 1
    if num_samples <= 0:
        # Without this check torch.stack fails on an empty list with an
        # unrelated-looking message
        raise ValueError(
            f"Need at least window_size + horizon = {window_size + horizon} "
            f"timesteps to build one sample, got {num_timesteps}."
        )

    # Input: window_size consecutive timesteps of all features, per sample
    x_batches = [features[start : start + window_size, :, :] for start in range(num_samples)]
    # Target: the chosen feature at the horizon-th step after each window
    y_batches = [
        features[start + window_size + horizon - 1, :, target_idx]
        for start in range(num_samples)
    ]

    return torch.stack(x_batches), torch.stack(y_batches)

# Define your training window and prediction horizon
window_size = 7  # timesteps in each input sample
horizon = 1  # target is taken this many steps after the end of the window

# Create the batches: X stacks the input windows, Y the matching targets
X, Y = create_time_series_batches(pyg_dataset, window_size, horizon)

In [None]:
# Spatially held-out split: whole stations are reserved for validation.
# Axis 2 of X is the station axis.
n_validation_stations = 3
num_stations = X.shape[2]

# At least one station must remain for training; fail early and clearly
if num_stations <= n_validation_stations:
    raise ValueError(
        f"Need more than {n_validation_stations} stations to hold out "
        f"{n_validation_stations} for validation, got {num_stations}."
    )

# Randomly select the validation stations (draws from the seeded numpy RNG)
validation_stations = np.random.choice(
    num_stations, n_validation_stations, replace=False)

# Every remaining station index, in ascending order, is used for training
training_stations = np.setdiff1d(np.arange(num_stations), validation_stations)

X_train = X[:, :, training_stations, :]  # all samples and features for the training stations
Y_train = Y[:, training_stations]  # all targets for the training stations
X_val = X[:, :, validation_stations, :]  # all samples and features for the validation stations
Y_val = Y[:, validation_stations]  # all targets for the validation stations

# Print the shapes to confirm
logger.info(f"Training shape (time, window, nodes, features): {X_train.shape}")
logger.info(f"Validation shape (time, window, nodes, features): {X_val.shape}")
logger.info(f"Training targets shape (time, nodes): {Y_train.shape}")
logger.info(f"Validation targets shape (time, nodes): {Y_val.shape}")

#### GET TEST STATION METRICS ####

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# --- Extract observed and baseline series (unaligned) ---
y_true = test_station_data.y[:, 0].numpy()

# Prefer indexing by name if feature_names was attached to the Data object;
# otherwise fall back to index 1. Only the two expected failures are handled
# (attribute absent, or name not listed) so any other error still surfaces.
try:
    BASE_IDX = test_station_data.feature_names.index('baseline_pred')
except (AttributeError, ValueError):
    BASE_IDX = 1  # assumes you enforced ['value','baseline_pred','day_of_year', ...] earlier

y_pred_clim = test_station_data.x[:, 0, BASE_IDX].numpy()   # climatology baseline

# --- Metrics on the deployable (unaligned) baseline ---
mae  = mean_absolute_error(y_true, y_pred_clim)
rmse = np.sqrt(mean_squared_error(y_true, y_pred_clim))
# Nash-Sutcliffe efficiency: 1 - squared error / variance of the observations
nse  = 1 - np.sum((y_true - y_pred_clim)**2) / np.sum((y_true - y_true.mean())**2)

# KGE (2009 variant): alpha = std ratio
r     = np.corrcoef(y_true, y_pred_clim)[0, 1]          # linear correlation
beta  = y_pred_clim.mean() / y_true.mean()              # bias ratio (mean ratio)
alpha = y_pred_clim.std(ddof=0) / y_true.std(ddof=0)    # variability ratio
kge   = 1 - np.sqrt((r - 1)**2 + (alpha - 1)**2 + (beta - 1)**2)

logger.info("\nClimatology baseline (UNALIGNED) on held-out test station:")
logger.info(f"MAE={mae:.4f} m | RMSE={rmse:.4f} m | NSE={nse:.4f} | KGE={kge:.4f}")
logger.info(f"KGE components: r={r:.4f}, alpha={alpha:.4f}, beta={beta:.4f}")


#### PLOT RESULTS ####

In [None]:
import matplotlib.pyplot as plt

# Load the test station dates
file_path = os.path.join(station_df_dir, f"{test_station}_trimmed.csv")
test_df = pd.read_csv(file_path, usecols=['Unnamed: 0', 'value'])
test_df = test_df.rename(columns={"Unnamed: 0": "date"})
test_df['date'] = pd.to_datetime(test_df['date'])

# Align plot dates with the PyG test series: the lagging step removed the
# leading n_lags rows. The notebook-level n_lags is reused here instead of
# being hard-coded again, so the two cannot drift apart.
dates_to_plot = test_df['date'].values[n_lags:]

# The lagging step drops every row that contains a NaN, not only the leading
# ones, so confirm the date axis and the series really line up before plotting
if len(dates_to_plot) != len(y_true):
    raise ValueError(
        f"Date axis has {len(dates_to_plot)} points but the test series has "
        f"{len(y_true)}; rows other than the first {n_lags} were dropped "
        f"when building the test station object."
    )

# First-point alignment helper (for plotting only)
def first_point_align_for_plot(y_pred, y_obs):
    """
    Shift y_pred by a constant so it equals y_obs at the first position
    where both series are finite.

    Returns (shifted y_pred, offset that was subtracted, position used).
    Raises ValueError when the two series share no finite position.
    """
    both_finite = np.logical_and(np.isfinite(y_pred), np.isfinite(y_obs))
    usable_positions = np.flatnonzero(both_finite)
    if usable_positions.size == 0:
        raise ValueError("No overlapping finite values to align on.")

    i0 = usable_positions[0]
    # Offset is baseline minus observed at the first usable position
    diff = float(y_pred[i0] - y_obs[i0])
    shifted = y_pred - diff
    return shifted, diff, i0

# Shift the climatology baseline onto the first observation; this is for
# visual comparison only
y_pred_fp, diff, i0 = first_point_align_for_plot(y_pred_clim, y_true)
print(f"First-point offset applied to climatology: {diff:.3f} m at index {i0}")

# Observed series against the shifted baseline
fig, ax = plt.subplots(figsize=(15,7))
ax.plot(dates_to_plot, y_true, label='Actual', color='blue', alpha=0.7)
ax.plot(dates_to_plot, y_pred_fp, label='Climatology (first-point aligned)',
        color='red', linestyle='--')
ax.set_xlabel('Date')
ax.set_ylabel('Groundwater level (mAOD)')
ax.set_title(f'Climatology Baseline vs Actual for {test_station}')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# Saving is currently switched off; to keep a copy, call plt.savefig before
# plt.show(), e.g.
# plt.savefig(f'results/trained_models/eden/ablations/climatology_baseline/{test_station}_baseline_plot.png')

In [None]:
# Diagnostic only: the same scores as before, computed on the baseline after
# it has been shifted onto the first observation
fp_residuals = y_true - y_pred_fp
mae_fp = mean_absolute_error(y_true, y_pred_fp)
rmse_fp = np.sqrt(mean_squared_error(y_true, y_pred_fp))
nse_fp = 1 - np.sum(fp_residuals**2) / np.sum((y_true - y_true.mean())**2)

# KGE components: correlation, std ratio (alpha) and mean ratio (beta)
r_fp = np.corrcoef(y_true, y_pred_fp)[0, 1]
alpha_fp = y_pred_fp.std(ddof=0) / y_true.std(ddof=0)
beta_fp = y_pred_fp.mean() / y_true.mean()
kge_fp = 1 - np.sqrt((r_fp - 1)**2 + (alpha_fp - 1)**2 + (beta_fp - 1)**2)

logger.info("\nClimatology baseline (FIRST-POINT ALIGNED) — diagnostic only:")
logger.info(f"MAE={mae_fp:.4f} m | RMSE={rmse_fp:.4f} m | NSE={nse_fp:.4f} | KGE={kge_fp:.4f}")
logger.info(f"KGE components: r={r_fp:.4f}, alpha={alpha_fp:.4f}, beta={beta_fp:.4f}")


In [None]:
# One row per plotted date: observation and first-point-aligned climatology
df_predictions = pd.DataFrame(
    dict(
        date=dates_to_plot,
        actuals=y_true,
        predictions_first_point_aligned=y_pred_fp,
    )
)

# Write the table as CSV, creating the target folder when it is missing.
# NOTE(review): the folder name contains 'eden' literally rather than the
# selected catchment — confirm this is intended when other catchments are run.
output_dir = 'data/04_model/eden/model/test_results/baselines'
output_file = os.path.join(output_dir, f'{test_station}_climatology_baseline.csv')
os.makedirs(output_dir, exist_ok=True)
df_predictions.to_csv(output_file, index=False)

print(f"Saved climatology predictions to {output_file}")

### Regional seasonal baseline ###

In [None]:
# choose k neighbours and IDW exponent
k, p = 5, 2

coords = stations_geo.set_index('station_name')
train_names = [s for s in station_baselines.keys() if s != test_station]

# coords of test site and all training sites, and the distances between them
xi, yi = coords.loc[test_station, ['easting','northing']]
nbr_xy  = coords.loc[train_names, ['easting','northing']].values
d = np.linalg.norm(nbr_xy - np.array([xi, yi]), axis=1)

# k nearest neighbours and normalised IDW weights
k = min(k, len(train_names))
idx = np.argsort(d)[:k]
nbr_names = np.array(train_names)[idx]
# A neighbour at distance 0 would get an infinite weight and turn every weight
# into NaN after normalisation; coincident neighbours share the weight instead
coincident = d[idx] == 0
if coincident.any():
    w = coincident.astype(float)
else:
    w = (d[idx] ** -p)
w = w / w.sum()  # sum to 1

# neighbour-weighted offset (no target history)
mu_hat = float(sum(w[m] * station_baselines[nbr_names[m]]['mean_level']
                   for m in range(k)))

# neighbour-weighted seasonal shape (Series indexed by DOY)
shapes_df = pd.concat([station_baselines[n]['shape_curve'] for n in nbr_names], axis=1)
shapes_df.columns = nbr_names
s_reg = shapes_df.dot(pd.Series(w, index=nbr_names))

# DOY for the test dates, on the same 365-day calendar the shape curves were
# built with: Feb 29 shares day 59 with Feb 28 and every later day of a leap
# year moves back by one. Clipping day 366 to 365 instead would leave all
# leap-year dates after Feb 28 one day out of step with the curves.
test_doy = test_df['date'].dt.dayofyear
shift_back = test_df['date'].dt.is_leap_year & (test_doy >= 60)
test_df['doy'] = test_doy.where(~shift_back, test_doy - 1)

# regional seasonal baseline in mAOD
y_pred_reg_full = mu_hat + s_reg.reindex(test_df['doy']).to_numpy()

# align to your y_true length (you trimmed by n_lags earlier)
y_pred_reg = y_pred_reg_full[n_lags:]

In [None]:
def first_point_align_for_plot(y_pred, y_obs):
    """
    Align baseline to observations at the first index where both are finite.

    The baseline is shifted by exactly the difference found at that index, so
    the shifted baseline equals the observation there. (An extra constant of
    0.5 used to be subtracted as well; it contradicted this contract and the
    reported offset, and it biased every metric and file computed from the
    result.)

    Returns (aligned_baseline, offset_used, first_index_used).
    Raises ValueError if the two series share no finite index.
    """
    mask = np.isfinite(y_pred) & np.isfinite(y_obs)
    if not np.any(mask):
        raise ValueError("No overlapping finite values to align on.")
    i0 = np.flatnonzero(mask)[0]                 # first valid index
    diff = float(y_pred[i0] - y_obs[i0])         # baseline - observed at first valid point
    return y_pred - diff, diff, i0

# Baseline evaluated in this cell. Options:
#   y_pred_clim -> climatology baseline (aligned to dates_to_plot)
#   y_pred_reg  -> regional seasonal baseline
y_obs = y_true
y_pred = y_pred_reg

# Shift the baseline onto the first observation
y_pred_fp, diff, i0 = first_point_align_for_plot(y_pred, y_obs)
print(f"First-point offset applied (baseline - observed at {i0}): {diff:.3f} m")

# Error scores of the shifted baseline against the observations
fp_residuals = y_true - y_pred_fp
mae = mean_absolute_error(y_true, y_pred_fp)
rmse = np.sqrt(mean_squared_error(y_true, y_pred_fp))
nse = 1 - np.sum(fp_residuals**2) / np.sum((y_true - y_true.mean())**2)

# KGE components: correlation, std ratio (alpha) and mean ratio (beta)
r = np.corrcoef(y_true, y_pred_fp)[0,1]
alpha = y_pred_fp.std(ddof=0) / y_true.std(ddof=0)
beta = y_pred_fp.mean() / y_true.mean()
kge = 1 - np.sqrt((r-1)**2 + (alpha-1)**2 + (beta-1)**2)

print(f"MAE={mae:.3f} m, RMSE={rmse:.3f} m, NSE={nse:.3f}, KGE={kge:.3f}")
logger.info(f"KGE components: r={r:.4f}, alpha={alpha:.4f}, beta={beta:.4f}")

# Observed series against the shifted baseline
fig, ax = plt.subplots(figsize=(15,7))
ax.plot(dates_to_plot, y_obs, label='Actual', color='blue', alpha=0.7)
ax.plot(dates_to_plot, y_pred_fp, label='Baseline (first-point aligned)', color='red', linestyle='--')
ax.set_xlabel('Date')
ax.set_ylabel('Groundwater level (mAOD)')
ax.set_title(f'Baseline vs Actual for {test_station}')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# One row per plotted date: observation and first-point-aligned regional baseline
df_predictions = pd.DataFrame(
    dict(
        date=dates_to_plot,
        actuals=y_true,
        predictions_first_point_aligned=y_pred_fp,
    )
)

# Write the table as CSV, creating the target folder when it is missing.
# NOTE(review): the folder name contains 'eden' literally rather than the
# selected catchment — confirm this is intended when other catchments are run.
output_dir = 'data/04_model/eden/model/test_results/baselines'
output_file = os.path.join(output_dir, f'{test_station}_regional_seasonal_baseline.csv')
os.makedirs(output_dir, exist_ok=True)
df_predictions.to_csv(output_file, index=False)

print(f"Saved regional seasonal predictions to {output_file}")