In [None]:
# Load library imports
import os
import sys
import torch
import random
import logging
import numpy as np
from ruamel.yaml import YAML

# Load project Imports
from src.utils.config_loader import load_project_config
from src.graph_building.graph_construction import build_mesh, \
    define_catchment_polygon
from src.data_ingestion.timeseries_data_ingestion import find_haduk_file_names, \
    load_era5_land_data, load_rainfall_data, download_and_save_flow_data
from src.preprocessing.hydroclimatic_feature_engineering import derive_rainfall_features

In [None]:
# Set up logger config: INFO-level records written to stdout as "LEVEL - message"
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s - %(message)s",
    # Verbose alternative, kept for reference:
    # format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    handlers=[stdout_handler],
)

# Logger for this notebook, plus the project config holding paths and params
logger = logging.getLogger(__name__)
config = load_project_config(config_path="config/project_config.yaml")
notebook = True  # reported further down in this cell as "Show Notebook Outputs"

# Set up root directory paths in config
def _fill_raw_data_root(paths, root):
    """Substitute the ``{raw_data_root}`` placeholder in every string value of ``paths``, in place.

    Non-string values are left untouched.
    """
    for name, template in paths.items():
        if isinstance(template, str):
            paths[name] = template.format(raw_data_root=root)


raw_data_root = config["global"]["paths"]["raw_data_root"]

# Global paths are resolved first, then the paths of every catchment listed for processing
_fill_raw_data_root(config["global"]["paths"], raw_data_root)
catchments_to_process = config["global"]["pipeline_settings"]["catchments_to_process"]
for catchment in catchments_to_process:
    _fill_raw_data_root(config[catchment]["paths"], raw_data_root)
 
# Seed every random number source used here (Python, NumPy, torch CPU and CUDA) with the
# configured value so that global random state is defined up front
random_seed = config["global"]["pipeline_settings"]["random_seed"]
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(random_seed)

# cuDNN flags: request deterministic behaviour and switch off benchmark mode
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Define notebook demo catchment: the first configured catchment is the one used by the
# remaining cells of this notebook
pipeline_settings = config["global"]["pipeline_settings"]
catchments_to_process = pipeline_settings["catchments_to_process"]
catchment = catchments_to_process[0]
run_defra_API_calls = pipeline_settings["run_defra_api"]  # True to run API calls

logger.info("Show Notebook Outputs: %s", notebook)
logger.info("Notebook Demo Catchment: %s", catchment.capitalize())

### Build mesh ###

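To align ingested data with a common spatial grid, first extract the catchment polygon and then build a mesh over it: mesh nodes (cell centroids) and the matching cell polygons.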

In [None]:
# Select this catchment's area from the country-wide catchment GeoDataFrame;
# the return value is not used in this cell
catchment_cfg = config[catchment]
define_catchment_polygon(
    england_catchment_gdf_path=catchment_cfg["paths"]["gis_catchment_boundary"],
    target_mncat=catchment_cfg["target_mncat"],
    catchment=catchment,
    polygon_output_path=catchment_cfg["paths"]["gis_catchment_dir"],
)

In [None]:
# Build the mesh for the demo catchment at the configured grid resolution.
# NB: mesh_nodes_gdf holds the centroid coords; mesh_cells_gdf_polygons holds the cell
# polygons (e.g. for averaging over an area)
catchment_cfg = config[catchment]
mesh_nodes_table, mesh_nodes_gdf, mesh_cells_gdf_polygons, catchment_polygon = build_mesh(
    shape_filepath=catchment_cfg["paths"]["gis_catchment_dir"],
    output_path=catchment_cfg["paths"]["mesh_nodes_output"],
    catchment=catchment,
    grid_resolution=catchment_cfg["preprocessing"]["graph_construction"]["grid_resolution"],
)

### Rainfall Ingestion and Preprocessing ###

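The HadUK rainfall download below depends on CEDA access. Continue with the following once the CEDA status page (https://www.ceda.ac.uk/status/) shows the service is available.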

In [None]:
# NOTE: this whole cell is disabled (commented out). When enabled it would list the HadUK
# file URLs for the model period, point NETRC at a local credentials file and try to open
# the first URL, printing a hint about CEDA login issues on failure.
# NOTE(review): `xr` is not imported in this notebook's import cell (presumably
# `import xarray as xr` is needed) — confirm before re-enabling.
# haduk_urls = find_haduk_file_names(
#     start_date=config["global"]["data_ingestion"]["model_start_date"],
#     end_date=config["global"]["data_ingestion"]["model_end_date"],
#     base_url = config["global"]["paths"]["HAD_UK_rainfall_url"]
# )

# os.environ["NETRC"] = os.path.abspath("ceda_credentials.netrc")
# url = haduk_urls[0]

# try:
#     ds = xr.open_dataset(url, engine="netcdf4")
#     print(ds)
# except OSError as e:
#     print("Failed to open dataset. This may be due to CEDA login issues. Check https://www.ceda.ac.uk/status/")
#     print(e)

### Local Rainfall Data Ingestion and Processing (using downloads not API) ###

In [None]:
# Ingest the locally downloaded rainfall files for the demo catchment.
# No return value is captured here; the csv is saved to disk for future access
rainfall_paths = config[catchment]["paths"]
load_rainfall_data(
    rainfall_dir=rainfall_paths["rainfall_filename_dir"],
    shape_filepath=rainfall_paths["gis_catchment_dir"],
    processed_output_dir=rainfall_paths["rainfall_processed_output_dir"],
    fig_path=rainfall_paths["rainfall_fig_path"],
    required_crs=27700,
    catchment=catchment,
)

In [None]:
# Derive rainfall features over the model period. The same processed-rainfall directory is
# passed as both the csv input directory and the output directory.
rainfall_processed_dir = config[catchment]["paths"]["rainfall_processed_output_dir"]
model_period = config["global"]["data_ingestion"]

rainfall_df = derive_rainfall_features(
    csv_dir=rainfall_processed_dir,
    processed_output_dir=rainfall_processed_dir,
    start_date=model_period["model_start_date"],
    end_date=model_period["model_end_date"],
    config_path="config/project_config.yaml",
    catchment=catchment,
)

In [None]:
# Show the object returned by derive_rainfall_features as this cell's output
rainfall_df

In [None]:
# ERA5-Land total evaporation ('e'), summed and stored as feature 'aet'.
# No return value is captured here; the csv is saved to csv_path for future access.
# NOTE(review): this call passes the same arguments as the cell under the
# "AET Ingestion and Preprocessing" header further down — confirm whether both are needed.
era5_paths = config[catchment]["paths"]
global_cfg = config["global"]
model_period = global_cfg["data_ingestion"]

load_era5_land_data(
    catchment=catchment,
    shape_filepath=era5_paths["gis_catchment_dir"],
    required_crs=27700,
    cdsapi_path=global_cfg["paths"]["CDSAPI_path"],
    start_date=model_period["model_start_date"],
    end_date=model_period["model_end_date"],
    run_era5_land_api=global_cfg["pipeline_settings"]["run_era5_land_api"],
    raw_output_dir=era5_paths["aet_raw_output_dir"],
    processed_output_dir=era5_paths["aet_processed_output_dir"],
    csv_path=era5_paths["aet_csv_path"],
    fig_path=era5_paths["aet_fig_path"],
    era5_feat="e",
    era5_long="total_evaporation",
    feat_name="aet",
    aggregation_type="sum",
)

### Temperature Ingestion and Preprocessing ###

Continue with this section once the CEDA status page (https://www.ceda.ac.uk/status/) shows the service is available.

### Surface Pressure Ingestion and Preprocessing ###

In [None]:
# ERA5-Land surface pressure ('sp'), averaged and stored as feature 'surface_pressure'.
# No return value is captured here; the csv is saved to csv_path for future access.
era5_paths = config[catchment]["paths"]
global_cfg = config["global"]
model_period = global_cfg["data_ingestion"]

load_era5_land_data(
    catchment=catchment,
    shape_filepath=era5_paths["gis_catchment_dir"],
    required_crs=27700,
    cdsapi_path=global_cfg["paths"]["CDSAPI_path"],
    start_date=model_period["model_start_date"],
    end_date=model_period["model_end_date"],
    run_era5_land_api=global_cfg["pipeline_settings"]["run_era5_land_api"],
    raw_output_dir=era5_paths["sp_raw_output_dir"],
    processed_output_dir=era5_paths["sp_processed_output_dir"],
    csv_path=era5_paths["sp_csv_path"],
    fig_path=era5_paths["sp_fig_path"],
    era5_feat="sp",
    era5_long="surface_pressure",
    feat_name="surface_pressure",
    aggregation_type="mean",
)

### AET Ingestion and Preprocessing ###

In [None]:
# ERA5-Land total evaporation ('e'), summed and stored as feature 'aet'.
# No return value is captured here; the csv is saved to csv_path for future access.
era5_paths = config[catchment]["paths"]
global_cfg = config["global"]
model_period = global_cfg["data_ingestion"]

load_era5_land_data(
    catchment=catchment,
    shape_filepath=era5_paths["gis_catchment_dir"],
    required_crs=27700,
    cdsapi_path=global_cfg["paths"]["CDSAPI_path"],
    start_date=model_period["model_start_date"],
    end_date=model_period["model_end_date"],
    run_era5_land_api=global_cfg["pipeline_settings"]["run_era5_land_api"],
    raw_output_dir=era5_paths["aet_raw_output_dir"],
    processed_output_dir=era5_paths["aet_processed_output_dir"],
    csv_path=era5_paths["aet_csv_path"],
    fig_path=era5_paths["aet_fig_path"],
    era5_feat="e",
    era5_long="total_evaporation",
    feat_name="aet",
    aggregation_type="sum",
)

### 2m Surface Temperature Ingestion and Preprocessing ###

In [None]:
# ERA5-Land 2m temperature ('2t'), averaged and stored as feature '2m_temp'.
# No return value is captured here; the csv is saved to csv_path for future access.
era5_paths = config[catchment]["paths"]
global_cfg = config["global"]
model_period = global_cfg["data_ingestion"]

load_era5_land_data(
    catchment=catchment,
    shape_filepath=era5_paths["gis_catchment_dir"],
    required_crs=27700,
    cdsapi_path=global_cfg["paths"]["CDSAPI_path"],
    start_date=model_period["model_start_date"],
    end_date=model_period["model_end_date"],
    run_era5_land_api=global_cfg["pipeline_settings"]["run_era5_land_api"],
    raw_output_dir=era5_paths["2t_raw_output_dir"],
    processed_output_dir=era5_paths["2t_processed_output_dir"],
    csv_path=era5_paths["2t_csv_path"],
    fig_path=era5_paths["2t_fig_path"],
    era5_feat="2t",
    era5_long="2m_temperature",
    feat_name="2m_temp",
    aggregation_type="mean",
)

### Streamflow Data ###

In [None]:
# DEFRA Hydrology API daily total streamflow at station closest to catchment outflow
# (lumped hydrological modelling, m^3/s)
flow_paths = config[catchment]["paths"]
model_period = config["global"]["data_ingestion"]

stream_flow_df = download_and_save_flow_data(
    station_csv=flow_paths["stream_flow_station"],
    start_date=model_period["model_start_date"],
    end_date=model_period["model_end_date"],
    output_dir=flow_paths["stream_flow_csv"],
    catchment=catchment,
)

# Last expression of the cell: displayed as the cell output
stream_flow_df