In [None]:
# Load library imports
import os
import sys
import torch
import joblib
import random
import logging
import datetime
import subprocess
import numpy as np
import pandas as pd
import geopandas as gpd

# Load project Imports
from src.utils.config_loader import load_project_config, deep_format, expanduser_tree
from src.graph_building.graph_construction import build_mesh, \
    define_catchment_polygon, define_graph_adjacency
from src.preprocessing.data_partitioning import define_station_id_splits, \
    load_graph_tensors, build_pyg_object
from src.preprocessing.model_feature_engineering import preprocess_gwl_features, \
    preprocess_shared_features
from src.utils.run_manifest import save_run_manifest 

In [None]:
# Route INFO-and-above log records to stdout using a compact "LEVEL - message" layout
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format='%(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger for this notebook, the notebook display flag, and the project configuration
logger = logging.getLogger(__name__)
notebook = True
config = load_project_config(config_path="config/project_config.yaml")

# Read both root directories from the freshly loaded config
raw_data_root, results_root = (
    config["global"]["paths"][key] for key in ("raw_data_root", "results_root")
)

# Feed the roots to deep_format, then run expanduser_tree over the result
# (presumably: fill path placeholders, then expand "~" - confirm in config_loader)
config = expanduser_tree(
    deep_format(config, raw_data_root=raw_data_root, results_root=results_root)
)

In [None]:
# Seed every random number generator in use (Python, NumPy, PyTorch CPU and CUDA)
pipeline_settings = config["global"]["pipeline_settings"]
random_seed = pipeline_settings["random_seed"]
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(random_seed)

# Request deterministic cuDNN behaviour and disable its benchmark mode
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# The notebook demonstrates the first configured catchment only
catchments_to_process = pipeline_settings["catchments_to_process"]
catchment = catchments_to_process[0]
run_defra_API_calls = pipeline_settings["run_defra_api"]

logger.info(f"Show Notebook Outputs: {notebook}")
logger.info(f"Notebook Demo Catchment: {catchment.capitalize()}")

In [None]:
# Pick this catchment's polygon out of the country-wide catchment GeoDataFrame
catchment_paths = config[catchment]['paths']
define_catchment_polygon(
    england_catchment_gdf_path=catchment_paths['gis_catchment_boundary'],
    target_mncat=config[catchment]['target_mncat'],
    catchment=catchment,
    polygon_output_path=catchment_paths['gis_catchment_dir'],
)

# Build the catchment mesh at the configured grid resolution
mesh_kwargs = dict(
    shape_filepath=catchment_paths['gis_catchment_dir'],
    output_path=catchment_paths['mesh_nodes_output'],
    catchment=catchment,
    grid_resolution=config[catchment]['preprocessing']['graph_construction']['grid_resolution'],
)
mesh_nodes_table, mesh_nodes_gdf, mesh_cells_gdf_polygons, catchment_polygon = build_mesh(
    **mesh_kwargs
)

logger.info(f"Pipeline step 'Build Mesh' complete for {catchment} catchment.")

In [None]:
# Locate the directional edge weights CSV for this catchment
directional_edge_path = config[catchment]["paths"]["direction_edge_weights_path"]

# Read it and attach a sequential node_id (0..n-1 in file row order) to merge on
directional_edge_weights = pd.read_csv(directional_edge_path).assign(
    node_id=lambda frame: range(0, len(frame))
)
directional_edge_weights

In [None]:
# Inspect the mesh cell polygons returned by build_mesh
mesh_cells_gdf_polygons

### BUILD EDGE WEIGHTS ###

In [None]:
# Re-read the directional edge weights from disk so this cell starts from the raw CSV
# (this load, and the mean elevation input, are not required in the main pipeline)
graph_paths = config[catchment]["paths"]
directional_edge_path = graph_paths["direction_edge_weights_path"]
directional_edge_weights = pd.read_csv(directional_edge_path)

# Build the graph adjacency; the call returns the edge attributes first, then the edge index
adjacency_outputs = define_graph_adjacency(
    directional_edge_weights=directional_edge_weights,
    elevation_geojson_path=graph_paths['elevation_geojson_path'],
    graph_output_dir=graph_paths["graph_data_output_dir"],
    mesh_cells_gdf_polygons=mesh_cells_gdf_polygons,
    epsilon_path=config["global"]["graph"]["epsilon"],
    catchment=catchment,
)
edge_attr_tensor, edge_index_tensor = adjacency_outputs

logger.info(f"Pipeline step 'Define Graph Adjacency' complete for {catchment} catchment.\n")

In [None]:
# Inspect the edge attribute tensor returned by define_graph_adjacency
edge_attr_tensor

In [None]:
# Inspect the edge index tensor returned by define_graph_adjacency
edge_index_tensor

### Split Stations into Train/Val/Test Sets ###

In [None]:
# # Load tensors from file if needed
# edge_index_tensor, edge_attr_tensor = load_graph_tensors(
#     graph_output_dir=config[catchment]["paths"]["graph_data_output_dir"],
#     catchment=catchment
# )

# Load main_df_full from file.
# The path is built with os.path.join rather than string concatenation so it stays
# correct whether or not final_df_path ends with a path separator; this matches how
# processed_df.parquet is located under the same directory later in this notebook.
load_path = os.path.join(config[catchment]["paths"]["final_df_path"], 'final_df.csv')
main_df_full = pd.read_csv(load_path)
main_df_full

In [None]:
# --- 6a. Define Spatial Split for Observed Stations ---

# All partitioning settings live under the catchment's "data_partioning" config section
partition_cfg = config[catchment]["model"]["data_partioning"]

station_splits = define_station_id_splits(
    main_df_full=main_df_full,
    catchment=catchment,
    test_station_shortlist=partition_cfg["test_station_shortlist"],
    val_station_shortlist=partition_cfg["val_station_shortlist"],
    random_seed=config["global"]["pipeline_settings"]["random_seed"],
    output_dir=config[catchment]["paths"]["aux_dir"],
    perc_train=partition_cfg["percentage_train"],
    perc_val=partition_cfg["percentage_val"],
    perc_test=partition_cfg["percentage_test"],
)
train_station_ids, val_station_ids, test_station_ids = station_splits

logger.info(f"Pipeline Step 'define station splits' complete for {catchment} catchment.")

### Preprocess Shared Features Prior to Split

In [None]:
# --- 6b. Preprocess (Standardise, one hot encode, round to 4dp) all shared features (not GWL) ---

# Collect the call arguments first so the unpacking line below stays readable
shared_feature_kwargs = dict(
    main_df_full=main_df_full,
    catchment=catchment,
    random_seed=config["global"]["pipeline_settings"]["random_seed"],
    violin_plt_path=config[catchment]["visualisations"]["violin_plt_path"],
    scaler_dir=config[catchment]["paths"]["scalers_dir"],
    aux_dir=config[catchment]["paths"]["aux_dir"],
)
processed_df, shared_scaler, shared_encoder, gwl_feats = preprocess_shared_features(
    **shared_feature_kwargs
)

logger.info(f"Pipeline Step 'Preprocess Final Shared Features' complete for {catchment} catchment.")

### Preprocess GWL Features Using Training Data Only

In [None]:
# --- 6c. Preprocess all GWL features using training data only ---

# NOTE: this cell rebinds processed_df to the function's output, so the input is
# overwritten; re-run step 6b first if this cell needs to be executed again.
catchment_paths = config[catchment]["paths"]
processed_parquet_path = os.path.join(catchment_paths["final_df_path"], 'processed_df.parquet')

gwl_outputs = preprocess_gwl_features(
    processed_df=processed_df,
    catchment=catchment,
    train_station_ids=train_station_ids,
    val_station_ids=val_station_ids,
    test_station_ids=test_station_ids,
    sentinel_value=config["global"]["graph"]["sentinel_value"],
    scaler_dir=catchment_paths["scalers_dir"],
    parquet_path=processed_parquet_path,
)
processed_df, gwl_scaler, gwl_encoder = gwl_outputs

logger.info(f"Pipeline Step 'Preprocess Final GWL Features' complete for {catchment} catchment.")

In [None]:
# List the column index of processed_df as it stands after steps 6b and 6c
print("Columns in processed_df after preprocessing:", processed_df.columns)

In [None]:
# NOTE: the date-range slicing and column-dropping experiments below are disabled
# (commented out). The only active statement in this cell copies processed_df
# unchanged, so despite its name processed_df_test holds ALL rows and columns.

# # Shorten df to test range to reduce computation requirements

# test_start_date_str = config["global"]["data_ingestion"]["test_start_date"]
# test_end_date_str = config["global"]["data_ingestion"]["test_end_date"]

# # Convert 'timestep' column to datetime objects
# processed_df['timestep'] = pd.to_datetime(processed_df['timestep'])

# processed_df_test = processed_df.loc[
#     (processed_df['timestep'] >= test_start_date_str) &
#     (processed_df['timestep'] <= test_end_date_str)
# ].copy()

# print(f"Original processed_df shape (full data): {processed_df.shape}")
# print(f"Test processed_df_test shape (sliced from {test_start_date_str} to {test_end_date_str}): {processed_df_test.shape}")
# processed_df_test = processed_df.drop(columns=['streamflow_total_m3', 'HOST_soil_class_freely_draining_soils', 'HOST_soil_class_high_runoff_(impermeable)',
#                                                'HOST_soil_class_impeded_saturated_subsurface_flow', 'HOST_soil_class_peat_soils', 'aquifer_productivity_High',
#                                                'aquifer_productivity_Low', 'aquifer_productivity_Mixed', 'aquifer_productivity_Moderate',
#                                                'aquifer_productivity_nan']).copy()

# Active path: work on an independent copy of the full processed frame
processed_df_test = processed_df.copy()

# TESTING WITH GWL LAGS (AR Inputs) TO UNDERSTAND PERFORMANCE
# (disabled alternative that would drop the seven gwl_lag columns instead of keeping them)
# processed_df_test = processed_df.drop(columns=['gwl_lag1', 'gwl_lag2', 'gwl_lag3', 'gwl_lag4', 'gwl_lag5', 'gwl_lag6', 'gwl_lag7'])

In [None]:
# Display the working copy of the processed frame (currently the full processed_df)
processed_df_test

In [None]:
# Show the column names at positions 54-60; these positions are the ones skipped by
# the positional static-column selection (column_indices) later in this notebook
print(processed_df_test.columns[54:61])

In [None]:
# Show the column name at position 62 (the last position kept by column_indices below)
print(processed_df_test.columns[62:63])

In [None]:
# Full column index, for checking the positional indices used in the next section
processed_df_test.columns

In [None]:
# processed_df_test.describe()

### Assign validation stations by environmental similarity, with a geographic exclusion buffer

In [None]:
# Positional selection of the final preprocessed static columns
# NOTE(review): these indices depend on the exact column order of processed_df_test - confirm after any schema change
column_indices = [*range(0, 9), *range(25, 54), *range(61, 63)]
split_df = processed_df_test.iloc[:, column_indices]

# Collapse to one row per node_id, taking the first entry of each column
aggregated_df = split_df.groupby('node_id').first().reset_index()

# Keep only the listed station nodes and discard the timestep column
station_nodes = [430, 902, 1254, 1326, 1335, 1420, 1556, 1648, 1772, 1858, 1983, 2388, 2487, 2594]
is_station_node = aggregated_df['node_id'].isin(station_nodes)
station_dfs = aggregated_df[is_station_node].drop(columns='timestep')

# Reference table mapping snapped stations to mesh nodes
station_metadata = pd.read_csv("data/02_processed/eden/gwl_station_data/snapped_station_node_mapping.csv")

# Attach station name, coordinates and geometry to each station node
metadata_cols = ['node_id', 'station_name', 'easting', 'northing', 'geometry']
merged_station_df = station_dfs.merge(station_metadata[metadata_cols], on='node_id', how='left')

# Normalise station names to lower snake case, then remove the unneeded station
merged_station_df['station_name'] = (
    merged_station_df['station_name'].astype(str).str.lower().str.replace(" ", "_")
)
rows_to_drop = merged_station_df.index[merged_station_df['station_name'] == 'cliburn_town_bridge_1']
merged_station_df = merged_station_df.drop(index=rows_to_drop)

merged_station_df

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr

# Work on a copy so the station table built in the previous cell is left untouched
DF = merged_station_df.copy()
# Geographic exclusion radius; passed as buffer_km to assign_two_validations_env_only
BUFFER_KM = 5.0  # km
# Whether the geographic buffer is applied when choosing validation candidates
USE_BUFFER = True
# Constant added to the covariance diagonal before whitening; passed as ridge
RIDGE = 1e-3

# Columns not used as environmental features
EXCLUDE = {'node_id','station_name','easting','northing','geometry'}

# Get pairwise geographic distances
def pairwise_geo_km(xy_m):
    """
    Pairwise planar distances between points, in kilometres.

    xy_m is an (n, 2) array whose first column is the x coordinate and second the
    y coordinate, both in metres. Returns an (n, n) symmetric array of distances
    divided by 1000, with zeros on the diagonal.
    """
    east, north = xy_m[:, 0], xy_m[:, 1]
    # Broadcast each coordinate against itself to get all pairwise offsets
    d_east = east[:, None] - east[None, :]
    d_north = north[:, None] - north[None, :]
    return np.hypot(d_east, d_north) / 1000.0

# Calc per-fold whitening (Mahalanobis)
def whiten_per_fold(X, train_idx, ridge=1e-3):
    """
    Returns X_whitened for all rows, using mean/cov estimated on train_idx only.

    Parameters
    ----------
    X : (n, d) float array of features, one row per station.
    train_idx : boolean mask or integer index selecting the rows used to estimate
        the mean and covariance (the held-out rows are transformed but not fitted on).
    ridge : value added to the covariance diagonal before inversion, keeping the
        matrix invertible when features are collinear or the fold is small.

    Returns
    -------
    (n, d) array: X centred on the training mean and multiplied by the inverse
    square root of the ridge-regularised training covariance, so Euclidean
    distances in the output are Mahalanobis distances in the input space.
    """
    mu = X[train_idx].mean(axis=0)
    Xc = X - mu
    Xt = X[train_idx] - mu
    # Covariance on training stations. np.cov collapses to a 0-d array when X has a
    # single feature column, so force a (d, d) matrix to keep the steps below valid.
    C = np.atleast_2d(np.cov(Xt, rowvar=False))
    # ridge on diagonal (stride d+1 over the flattened matrix walks the diagonal)
    C.flat[::C.shape[0]+1] += ridge
    # Σ^{-1/2} from the SVD of the symmetric matrix C
    U, S, _ = np.linalg.svd(C, full_matrices=False)
    W = U @ np.diag(1.0/np.sqrt(S)) @ U.T
    return Xc @ W

# prepare merged_station_df matrices
# Node identifiers, one per station row
ids = DF['node_id'].to_numpy()

# Station coordinates as floats: metres (BNG)
XY = DF.loc[:, ['easting', 'northing']].to_numpy(dtype=float)

# Every column that is not an identifier/coordinate is treated as an environmental feature
feat_cols = [col for col in DF.columns if col not in EXCLUDE]
X = DF.loc[:, feat_cols].to_numpy(dtype=float)

# Station-to-station geographic distances in km
D_geo = pairwise_geo_km(XY)

# proximity vs environmental similarity diagnostic
def proximity_env_report(X, D_geo, use_whitening=True, ridge=1e-3, perms=20000, seed=42):
    """
    Print how strongly environmental dissimilarity tracks geographic distance.

    Parameters
    ----------
    X : (n, d) float array of environmental features, one row per station.
    D_geo : (n, n) array of pairwise geographic distances between the same stations.
    use_whitening : if True, features are centred and whitened with the covariance of
        ALL rows before distances are taken (diagnostic only, not used for selection);
        if False, plain Euclidean distances on X are used.
    ridge : value added to the covariance diagonal before inversion.
    perms : number of permutations for the Mantel test (default matches the
        previously hard-coded 20000).
    seed : seed for the permutation generator (default matches the previous 42).

    Returns
    -------
    (n, n) array of pairwise environmental distances.

    Notes
    -----
    Prints the Spearman correlation and a one-sided Mantel test (permuted Pearson r
    at least as large as observed) over the upper-triangle station pairs. The
    Spearman p-value treats pairs as independent, which pairwise distances are not;
    the Mantel permutation p-value is the one to rely on.
    """
    n = X.shape[0]

    # global whitening for diagnostic only (not selection)
    if use_whitening:
        Xc = X - X.mean(0)
        # atleast_2d keeps a (d, d) matrix even when X has a single feature column
        C = np.atleast_2d(np.cov(Xc, rowvar=False))
        C.flat[::C.shape[0]+1] += ridge
        U, S, _ = np.linalg.svd(C, full_matrices=False)
        W = U @ np.diag(1.0/np.sqrt(S)) @ U.T
        Xw = Xc @ W
    else:
        Xw = X

    diff = Xw[:, None, :] - Xw[None, :, :]
    D_env = np.sqrt((diff**2).sum(-1))

    # upper triangle: each unordered station pair once, diagonal excluded
    mask = np.triu(np.ones_like(D_env, dtype=bool), 1)
    rho, p_spear = spearmanr(D_env[mask], D_geo[mask])

    # quick Mantel permutation test
    def mantel(A, B, perms, seed):
        rng = np.random.default_rng(seed)
        a = A[mask]
        b = B[mask]
        obs = np.corrcoef(a, b)[0, 1]
        cnt = 0
        for _ in range(perms):
            # permute station labels of B (rows and columns together)
            p = rng.permutation(n)
            bb = B[np.ix_(p, p)][mask]
            if np.corrcoef(a, bb)[0, 1] >= obs:
                cnt += 1
        # +1 in numerator and denominator counts the observed arrangement itself
        p_val = (cnt + 1) / (perms + 1)
        return float(obs), float(p_val)

    r_mantel, p_mantel = mantel(D_env, D_geo, perms=perms, seed=seed)

    print(f"Spearman ρ(env, geo) = {rho:.3f}  (p={p_spear:.3f})")
    print(f"Mantel r = {r_mantel:.3f}  (p={p_mantel:.3f})")
    return D_env

# Run the diagnostic report on globally whitened features; this prints the Spearman
# and Mantel statistics and keeps the returned environmental distance matrix
D_env_diag = proximity_env_report(X, D_geo, use_whitening=True)

# Get assignments: two val stations by environmental similarity
def assign_two_validations_env_only(ids, X, XY, buffer_km=5.0, use_buffer=True, ridge=1e-3):
    """
    For each station i treated as the test station, choose up to two validation
    stations j by:
      - whitening the static features with mean/covariance fitted on every
        station except i (Mahalanobis distance in static space),
      - ranking the remaining stations by environmental distance only,
      - discarding candidates closer than buffer_km to the test station.

    Parameters
    ----------
    ids : (n,) array of node identifiers, aligned with the rows of X and XY.
    X : (n, d) float array of static environmental features.
    XY : (n, 2) array of station coordinates in metres.
    buffer_km : geographic exclusion radius around the test station, in km.
    use_buffer : if False (or buffer_km <= 0) every other station is a candidate.
    ridge : value added to the covariance diagonal during whitening.

    Returns
    -------
    DataFrame with one row per test station and columns test_node, val1_node,
    val2_node, env_d_val1, env_d_val2, geo_km_val1, geo_km_val2. A slot is NaN
    when no candidate is available for it.
    """
    n = len(ids)
    D_geo = pairwise_geo_km(XY)
    rows = []

    for i in range(n):
        # every station except the current test station
        not_self = np.arange(n) != i
        Xw = whiten_per_fold(X, not_self, ridge=ridge)

        # environmental distances from test i to everyone
        d_env = np.linalg.norm(Xw - Xw[i], axis=1)
        d_env[i] = np.inf

        # candidate mask
        if use_buffer and buffer_km > 0:
            cand = (D_geo[i] >= buffer_km) & not_self
            # relax if too few; clamp so the relaxed radius never exceeds the
            # requested one (otherwise buffer_km < 3 would tighten the filter)
            if cand.sum() < 2:
                relaxed_km = min(buffer_km, max(3.0, 0.6*buffer_km))
                cand = (D_geo[i] >= relaxed_km) & not_self
        else:
            cand = not_self

        idxs = np.where(cand)[0]
        order = idxs[np.argsort(d_env[idxs])]

        # either slot may be empty when the buffer leaves too few candidates
        v1 = order[0] if len(order) > 0 else None
        v2 = order[1] if len(order) > 1 else None

        rows.append({
            "test_node": ids[i],
            "val1_node": (ids[v1] if v1 is not None else np.nan),
            "val2_node": (ids[v2] if v2 is not None else np.nan),
            "env_d_val1": (float(d_env[v1]) if v1 is not None else np.nan),
            "env_d_val2": (float(d_env[v2]) if v2 is not None else np.nan),
            "geo_km_val1": (float(D_geo[i, v1]) if v1 is not None else np.nan),
            "geo_km_val2": (float(D_geo[i, v2]) if v2 is not None else np.nan)
        })
    return pd.DataFrame(rows)

# Treat each station in turn as the test site and pick its validation stations
selection_kwargs = dict(
    ids=ids,
    X=X,
    XY=XY,
    buffer_km=BUFFER_KM,
    use_buffer=USE_BUFFER,
    ridge=RIDGE,
)
assignments = assign_two_validations_env_only(**selection_kwargs)

# Display the table ordered by test node with a fresh 0..n-1 index
assignments.sort_values(by="test_node").reset_index(drop=True)

### Create Train/Val/Test PyG Objects for Model Input

In [None]:
# --- 6d. Create PyG data objects using partitioned station IDs (from 6a) ---

# Run time approx. 13.5 mins to build 4018 timesteps of objects (0.201s per timestep)
scalers_dir = config[catchment]["paths"]["scalers_dir"]

# One-hot column names saved alongside the scalers by an earlier preprocessing step
# NOTE(review): joblib.load unpickles the file - only load artefacts this pipeline wrote
gwl_ohe_cols = joblib.load(os.path.join(scalers_dir, "gwl_ohe_cols.pkl"))

pyg_kwargs = dict(
    processed_df=processed_df_test,
    sentinel_value=config["global"]["graph"]["sentinel_value"],
    train_station_ids=train_station_ids,
    val_station_ids=val_station_ids,
    test_station_ids=test_station_ids,
    gwl_feats=gwl_feats,
    gwl_ohe_cols=gwl_ohe_cols,
    edge_index_tensor=edge_index_tensor,
    edge_attr_tensor=edge_attr_tensor,
    scalers_dir=scalers_dir,
    catchment=catchment,
)
all_timesteps_list = build_pyg_object(**pyg_kwargs)

# Save all_timesteps_list to file
pyg_object_path = config[catchment]["paths"]["pyg_object_path"]
torch.save(all_timesteps_list, pyg_object_path)
logger.info(f"Pipeline Step 'Build PyG Data Objects' complete for {catchment} catchment.")

In [None]:
# Record the current git commit hash to help with reproducibility if run performance is lost
git_commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip()

# Each run gets its own timestamped directory under "runs"
run_stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
run_dir = os.path.join("runs", run_stamp)

# Shorthand for the config sections read repeatedly below
graph_dir = config[catchment]["paths"]["graph_data_output_dir"]
graph_settings = config["global"]["graph"]

save_run_manifest(
    run_dir=run_dir,
    config=config,
    git_commit=git_commit,
    all_timesteps_list=all_timesteps_list,
    temporal_features=config[catchment]["model"]["architecture"]["temporal_features"],
    scalers_dir=config[catchment]["paths"]["scalers_dir"],
    train_station_ids=train_station_ids,
    val_station_ids=val_station_ids,
    test_station_ids=test_station_ids,
    edge_index_path=os.path.join(graph_dir, "edge_index_tensor.pt"),
    edge_attr_path=os.path.join(graph_dir, "edge_attr_tensor.pt"),
    sentinel_value=graph_settings["sentinel_value"],
    epsilon=graph_settings["epsilon"],
)