In [None]:
# Load library imports
import os
import sys
import torch
import random
import logging
import numpy as np
import pandas as pd
import geopandas as gpd

# Load project Imports
from src.utils.config_loader import load_project_config, deep_format, expanduser_tree
from src.utils.config_loader import load_project_config
from src.graph_building.graph_construction import build_mesh, \
    define_catchment_polygon, build_main_df
from src.graph_building.data_merging import snap_stations_to_mesh, \
    merge_timeseries_data_to_df, load_gwl_data_for_merge, reorder_static_columns
    # aggregate_resolution
from src.visualisation.mapped_visualisations import plot_interactive_mesh_with_stations
from src.preprocessing.hydroclimatic_feature_engineering import transform_aet_data

In [None]:
# Configure root logging: INFO and above, written to stdout in a compact
# "LEVEL - message" layout. A more verbose alternative layout is
# '%(asctime)s - %(levelname)s - %(name)s - %(message)s'.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format='%(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger for this notebook, plus the project config holding paths and params
logger = logging.getLogger(__name__)
config = load_project_config(config_path="config/project_config.yaml")
notebook = True  # reported in the log output of the seeding/catchment cell

# Read the two root directories from the raw config ...
raw_data_root = config["global"]["paths"]["raw_data_root"]
results_root = config["global"]["paths"]["results_root"]

# ... then reformat the whole config tree with them and pass it through
# expanduser_tree (presumably fills root placeholders and expands '~' in
# path strings — confirm against src.utils.config_loader).
config = expanduser_tree(
    deep_format(config, raw_data_root=raw_data_root, results_root=results_root)
)

In [None]:
# Seed every random number generator used here (Python, NumPy, PyTorch CPU
# and CUDA) from the single seed stored in the project config.
pipeline_settings = config["global"]["pipeline_settings"]
random_seed = pipeline_settings["random_seed"]
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(random_seed)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# The notebook demonstrates the pipeline on the first configured catchment
catchments_to_process = pipeline_settings["catchments_to_process"]
catchment = catchments_to_process[0]
run_defra_API_calls = pipeline_settings["run_defra_api"]
pred_frequency = pipeline_settings["prediction_resolution"]

logger.info(f"Show Notebook Outputs: {notebook}")
logger.info(f"Notebook Demo Catchment: {catchment.capitalize()}")

In [None]:
# Shared lookups for this cell
catchment_paths = config[catchment]['paths']
grid_resolution = config[catchment]['preprocessing']['graph_construction']['grid_resolution']

# Select the catchment area from the country-wide GeoDataFrame; the polygon
# is written to the 'gis_catchment_dir' path, which build_mesh reads below.
define_catchment_polygon(
    england_catchment_gdf_path=catchment_paths['gis_catchment_boundary'],
    target_mncat=config[catchment]['target_mncat'],
    catchment=catchment,
    polygon_output_path=catchment_paths['gis_catchment_dir'],
)

# Build the catchment mesh at the configured grid resolution
(
    mesh_nodes_table,
    mesh_nodes_gdf,
    mesh_cells_gdf_polygons,
    catchment_polygon,
) = build_mesh(
    shape_filepath=catchment_paths['gis_catchment_dir'],
    output_path=catchment_paths['mesh_nodes_output'],
    catchment=catchment,
    grid_resolution=grid_resolution,
)

logger.info(f"Pipeline step 'Build Mesh' complete for {catchment} catchment.")

Load the centroid node CSVs

In [None]:
# NOTE: this cell is disabled. The commented-out lines below would read the
# mesh node table back from '<mesh_nodes_output>_<grid_resolution>.csv';
# the cells that follow use the mesh_nodes_gdf returned by build_mesh instead.
# input_path=config[catchment]['paths']['mesh_nodes_output']
# grid_resolution=config[catchment]['preprocessing']['graph_construction']['grid_resolution']

# mesh_input_path = input_path + '_' + str(grid_resolution) + '.csv'
# mesh_nodes = pd.read_csv(mesh_input_path)

Merge the station list with the polygon geometry using a spatial join, snapping each station to the mesh

In [None]:
# Snap each groundwater-level station onto the mesh and keep the resulting
# station-to-node mapping for the map and the final merge.
# NOTE(review): the polygon path here comes from the 'output_polygon_dir' key,
# whereas the mesh-building cell uses 'gis_catchment_dir' — confirm both keys
# resolve to the same catchment polygon output.
station_paths = config[catchment]["paths"]
station_node_mapping = snap_stations_to_mesh(
    mesh_nodes_gdf=mesh_nodes_gdf,
    catchment=catchment,
    station_list_path=station_paths["gwl_station_list_output"],
    polygon_geometry_path=station_paths['output_polygon_dir'],
    output_path=station_paths["snapped_station_node_mapping"],
)

In [None]:
# Display the station-to-node mapping returned by snap_stations_to_mesh
station_node_mapping

In [None]:
# Map styling is global; the output locations are catchment-specific
map_settings = config['global']['visualisations']['maps']
catchment_map_outputs = config[catchment]['visualisations']['maps']

# Draw the mesh nodes, catchment outline and snapped stations
mesh_map = plot_interactive_mesh_with_stations(
    mesh_nodes_gdf=mesh_nodes_gdf,
    catchment_polygon=catchment_polygon,
    stations_gdf=station_node_mapping,
    catchment=catchment,
    grid_resolution=config[catchment]['preprocessing']['graph_construction']['grid_resolution'],
    map_blue=map_settings['map_blue'],
    esri=map_settings['esri'],
    esri_attr=map_settings['esri_attr'],
    interactive=map_settings['display_interactive_map'],
    static_output_path=catchment_map_outputs['static_mesh_map_output'],
    interactive_output_path=catchment_map_outputs['interactive_station_map_output'],
)

logger.info(f"Pipeline step 'Interactive Mesh Mapping' complete for {catchment} catchment.")

# Display map in notebook
mesh_map

Create a main DataFrame into which all features are merged to form the model input

In [None]:
# Build the base frame from the mesh nodes and the model date range at the
# configured prediction resolution; later cells merge every feature into it.
ingestion_settings = config["global"]["data_ingestion"]
main_df = build_main_df(
    mesh_nodes_gdf=mesh_nodes_gdf,
    catchment=catchment,
    start_date=ingestion_settings["model_start_date"],
    end_date=ingestion_settings["model_end_date"],
    pred_frequency=config["global"]["pipeline_settings"]["prediction_resolution"],
)

Merge all time series data into the main DataFrame by station (from `{station}_trimmed.csv`)

### Merge in Timeseries Features ###

In [None]:
# Snap each catchment-level time series feature onto the graph timesteps.
# Every call after the first receives the previous result via `timeseries_df`,
# so the features accumulate into a single frame (final_merged_ts_df).

# Arguments shared by every merge_timeseries_data_to_df call below.
# pred_frequency is the 'prediction_resolution' setting read in the
# seeding/catchment cell, i.e. the same value the config lookup returns.
catchment_paths = config[catchment]["paths"]
ts_merge_kwargs = dict(
    model_start_date=config["global"]["data_ingestion"]["model_start_date"],
    model_end_date=config["global"]["data_ingestion"]["model_end_date"],
    pred_frequency=pred_frequency,
)

# Snap Precipitation, Lags and Averages to timestep
merged_ts_precipitation = merge_timeseries_data_to_df(
    feature_csv=catchment_paths["rainfall_csv_path"],
    csv_name=f'rainfall_{pred_frequency}_catchment_sum_log_transform.csv',
    feature='all_precipitation',
    **ts_merge_kwargs
)

logger.info(f"Precipitation and derived data snapped to graph timesteps ({pred_frequency} aggregates).\n")

# Snap 2m Temperature to timestep
merged_ts_tsm = merge_timeseries_data_to_df(
    feature_csv=catchment_paths["2t_csv_path"],
    csv_name=f'2m_temp_{pred_frequency}_catchment_mean.csv',
    feature='2m_temperature',
    timeseries_df=merged_ts_precipitation,
    **ts_merge_kwargs
)

logger.info(f"2m Temperature Data snapped to graph timesteps ({pred_frequency} aggregate).\n")

# Snap AET to timestep, then apply the AET transform to the merged frame
merged_ts_aet = merge_timeseries_data_to_df(
    feature_csv=catchment_paths["aet_csv_path"],
    csv_name=f'aet_{pred_frequency}_catchment_sum.csv',
    feature='aet',
    timeseries_df=merged_ts_tsm,
    **ts_merge_kwargs
)

merged_ts_aet = transform_aet_data(merged_ts_aet, catchment)

logger.info(f"Actual evapotranspiration data snapped to graph timesteps ({pred_frequency} aggregate).\n")

# Snap Surface Pressure to timestep
merged_ts_sp = merge_timeseries_data_to_df(
    feature_csv=catchment_paths["sp_csv_path"],
    csv_name=f'surface_pressure_{pred_frequency}_catchment_mean.csv',
    feature='surface_pressure',
    timeseries_df=merged_ts_aet,
    **ts_merge_kwargs
)

logger.info(f"Surface pressure data snapped to graph timesteps ({pred_frequency} aggregate).\n")

# Snap Streamflow to timestep
final_merged_ts_df = merge_timeseries_data_to_df(
    feature_csv=catchment_paths["stream_flow_csv"],
    csv_name=f'{pred_frequency}_streamflow.csv',
    feature='streamflow',
    timeseries_df=merged_ts_sp,
    **ts_merge_kwargs
)

logger.info(f"Streamflow data snapped to graph timesteps ({pred_frequency} aggregate).\n")

# Save with os.path.join so the file lands exactly where the final-merge cell
# reads it (os.path.join(final_df_path, 'final_timeseries_df.csv')), whether
# or not the configured directory ends with a path separator.
# NOTE(review): the index is written to the CSV and the final-merge cell reads
# a 'time' column back — confirm 'time' is the index name (or a column) here.
save_path = os.path.join(catchment_paths["final_df_path"], 'final_timeseries_df.csv')
final_merged_ts_df.to_csv(save_path)

logger.info(f"Final merged time series dataframe saved to {save_path}\n")

In [None]:
# Display the merged time series frame produced by the previous cell
final_merged_ts_df

### Merge in Static Features ###

In [None]:
# Build the static (time-invariant) feature table. Each source is left-joined
# onto the mesh nodes so every centroid is kept, even where a source has no
# value for it (NaN).
catchment_paths = config[catchment]["paths"]

# Snap Land Cover to Mesh (joined on the centroid coordinates)
agg_land_cover_df = pd.read_csv(catchment_paths["land_cover_csv_path"])
merged_gdf_nodes_landuse = mesh_nodes_gdf.merge(
    agg_land_cover_df[['easting', 'northing', 'land_cover_code']],
    on=['easting', 'northing'],
    how='left'
)

logger.info("Land cover data snapped to mesh nodes (centroids).\n")

# Snap Elevation to Mesh (this and all following joins are on node_id)
elevation_gdf_polygon = gpd.read_file(catchment_paths["elevation_geojson_path"])
merged_gdf_nodes_elevation = merged_gdf_nodes_landuse.merge(
    elevation_gdf_polygon[['node_id', 'mean_elevation', 'geometry']],
    on='node_id',
    how='left'
)

logger.info("Elevation data snapped to mesh nodes (centroids).\n")

# Snap Geology Maps to Mesh
geology_path = os.path.join(catchment_paths["geology_df"], 'geology_df.csv')
mesh_geology_df = pd.read_csv(geology_path)
geology_columns = [
    'geo_bedrock_type', 'geo_superficial_type', 'bedrock_flow_type',
    'bedrock_perm_avg', 'superficial_flow_type', 'superficial_perm_avg',
    'node_id',
]
merged_gdf_nodes_geology = merged_gdf_nodes_elevation.merge(
    mesh_geology_df[geology_columns],
    on='node_id',
    how='left'
)

logger.info("Geology data snapped to mesh nodes (centroids).\n")

# Snap Slope to Mesh
slope_gdf = pd.read_csv(catchment_paths["slope_path"])
merged_gdf_nodes_slope = merged_gdf_nodes_geology.merge(
    slope_gdf[['node_id', 'mean_slope_degrees', 'mean_aspect_sin', 'mean_aspect_cos']],
    on='node_id',
    how='left'
)

logger.info("Slope degrees and sinusoidal aspect data snapped to mesh nodes (centroids).\n")

# Snap Soil Hydrology to Mesh
soil_hydrology_path = os.path.join(catchment_paths["soil_csv_path"], 'soil_hydrology.csv')
soil_hydrology_df = pd.read_csv(soil_hydrology_path)
merged_gdf_nodes_soil = merged_gdf_nodes_slope.merge(
    soil_hydrology_df[['node_id', 'HOST_soil_class']],
    on='node_id',
    how='left'
)

logger.info("Soil Hydrology data snapped to mesh nodes (centroids).\n")

# Snap Aquifer Productivity to Mesh
productivity_path = os.path.join(catchment_paths["productivity_csv_path"], 'productivity_data.csv')
productivity_gdf = pd.read_csv(productivity_path)
merged_gdf_nodes_productivity = merged_gdf_nodes_soil.merge(
    productivity_gdf[['node_id', 'aquifer_productivity']],
    on='node_id',
    how='left'
)

logger.info("Aquifer Productivity data snapped to mesh nodes (centroids).\n")

# Snap Distance from River to Mesh
dist_to_river_path = os.path.join(catchment_paths["rivers_csv_path"], 'distance_to_river.csv')
dist_to_river_gdf = pd.read_csv(dist_to_river_path)
static_features = merged_gdf_nodes_productivity.merge(
    dist_to_river_gdf[['node_id', 'distance_to_river']],
    on='node_id',
    how='left'
)

logger.info("Distance from river data snapped to mesh nodes (centroids).\n")

# [FUTURE] Snap Depth to Groundwater to Mesh - Awaiting Licensing
# [FUTURE] Snap Infiltration Rate to Mesh
# [FUTURE] Snap Soil Type Maps to Mesh

# Finalise final_static_df for merge
final_static_df = reorder_static_columns(static_features)  # TODO: Update as more features added

# Persist the static table where the final-merge cell reads it back
# (<final_df_path>/final_static_df.csv). This step previously called
# static_data_ingestion.save_final_static_data, but that module is never
# imported in this notebook, so the call raised NameError on a fresh kernel.
# NOTE(review): index=False assumes node_id is a regular column after
# reorder_static_columns (it is the merge key above) — confirm, and confirm
# the original helper did nothing beyond writing this CSV.
final_static_path = os.path.join(catchment_paths["final_df_path"], 'final_static_df.csv')
final_static_df.to_csv(final_static_path, index=False)

logger.info("Full static feature dataframe finalised and ready to merge into main model dataframe.\n")

# Display in notebook (the merged table before column reordering)
static_features

### Final Merge into main_df ###

In [None]:
# Assemble the model input: static features (per node), time series features
# (per timestep) and groundwater-level observations (per node and timestep)
# are left-joined onto main_df in turn, so no main_df row is dropped.
final_df_dir = config[catchment]["paths"]["final_df_path"]

# Merge static data into main_df
static_df = pd.read_csv(os.path.join(final_df_dir, 'final_static_df.csv'))
main_df_static = main_df.merge(static_df, on='node_id', how='left')

logger.info(f"Static data successfully merged into main_df for {catchment} catchment.\n")

# Merge timeseries data into main_df. 'time' is parsed back to datetime after
# the CSV round-trip so it can be joined against 'timestep', then dropped as
# it duplicates 'timestep'.
timeseries_df = pd.read_csv(os.path.join(final_df_dir, 'final_timeseries_df.csv'))
timeseries_df['time'] = pd.to_datetime(timeseries_df['time'])
main_df_timeseries = main_df_static.merge(
    timeseries_df,
    left_on='timestep',
    right_on='time',
    how='left'
).drop(columns='time')

logger.info(f"Timeseries data successfully merged into main_df for {catchment} catchment.\n")

# Load GWL station data and merge data into main_df
station_dir = config[catchment]["paths"]["trimmed_output_dir"]
node_mapping_dir = config[catchment]["paths"]["snapped_station_node_mapping"]
gwl_data = load_gwl_data_for_merge(station_dir, node_mapping_dir)

# The seasonal encodings are joined on timestep alone (one row per timestep),
# so they reach every node; the remaining GWL columns are joined on
# (node_id, timestep) and therefore only fill rows with matching GWL data.
seasonal_df = gwl_data[['timestep', 'season_sin', 'season_cos']].drop_duplicates('timestep')
main_df_full = (
    main_df_timeseries
    .merge(seasonal_df, on='timestep', how='left')
    .merge(gwl_data.drop(columns=['season_sin', 'season_cos']), on=['node_id', 'timestep'], how='left')
)

logger.info(f"Groundwater Level data successfully merged into main_df for {catchment} catchment.\n")

# Save final dataframe to file - NB: TIME TO SAVE APPROX. 3.5 MINS (total block 4 mins)
# The path is built with os.path.join, like the two reads above, so the file
# lands inside final_df_path even when that path has no trailing separator.
final_save_path = os.path.join(final_df_dir, 'final_df.csv')
main_df_full.to_csv(final_save_path)
logger.info(f"Final merged dataframe saved to {final_save_path}")

# Display in notebook
main_df_full