### Evaluation 

This notebook serves as an example of how to evaluate the performance of a trained model and/or compare it against the summation of Q_prime (written Q` below) for your inputs. Summed Q` uses a plain summation rather than routing, and is a good indicator of the baseline performance of your model.

In [None]:
# Run imports
import logging
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import yaml

from ddr._version import __version__
from ddr.validation import (
    Config,
    Metrics,
    plot_box_fig,
    plot_cdf,
    plot_drainage_area_boxplots,
    plot_gauge_map,
)

# Notebook-level logger, named after the current module (`__name__`).
log = logging.getLogger(__name__)

In [None]:
# Load a config
# `config_path` is the single place to change which YAML file is loaded.
config_path = Path("./example_config.yaml")
# Read the YAML as UTF-8 explicitly instead of relying on the platform's default encoding.
with config_path.open(encoding="utf-8") as f:
    config = Config(**yaml.safe_load(f))

In [None]:
# Locations of the two result stores. The summed q_prime store is the comparison
# baseline: it's a good indicator of if routing is working.

# Produced by scripts/summed_q_prime.py
summed_q_prime_path = Path("./summed_q_prime.zarr")
# Produced by scripts/test.py (evaluation of a trained model)
predictions_path = Path("./model_test.zarr")

# Open both stores; the trailing bare expression displays the baseline dataset
ds_qp = xr.open_zarr(summed_q_prime_path)
ds_pred = xr.open_zarr(predictions_path)
ds_qp

In [None]:
# Get Metrics
# Select from the summed q' dataset the gauges and timestamps present in the model predictions
ds_qp_ordered = ds_qp.sel(gage_ids=ds_pred.gage_ids.values, time=ds_pred.time.values)

baseline_metrics = Metrics(
    pred=ds_qp_ordered.predictions.values,
    target=ds_qp_ordered.observations.values,
)
model_metrics = Metrics(
    pred=ds_pred.predictions.values,
    target=ds_pred.observations.values,
)

# Order matters downstream: index 0 is summed q', index 1 is the trained model
results = [baseline_metrics, model_metrics]

In [None]:
# Metrics shown in the boxplot; `xlabel` holds the axis label for each key, in the same order
key_list = ["bias", "rmse", "fhv", "flv", "nse", "kge"]
xlabel = [r"Bias ($m^3/s$)", "RMSE", "FHV", "FLV", "NSE", "KGE"]
dataset_labels = ["$\\sum$ Q` $\\delta$HBV2.0UH", f"ddrv{__version__}"]

# Assemble the boxplot input: one list per metric, each holding one array per result
clipped_metrics = ("nse", "kge")
data_box = []
for metric_name in key_list:
    per_model = []
    for result in results:
        values = dict(result)[metric_name]
        if values.size == 0:  # empty metric arrays are skipped entirely
            continue
        if metric_name in clipped_metrics:
            values = np.clip(values, -1, 1)  # keep NSE and KGE within [-1, 1]
        per_model.append(values[~np.isnan(values)])  # NaNs are dropped after clipping
    data_box.append(per_model)

# Create Box Plots
fig = plot_box_fig(
    data=data_box,
    xlabel_list=xlabel,
    legend_labels=dataset_labels,
    sharey=False,
    figsize=(20, 8),
    legend_font_size=18,
    xlabel_font_size=14,
    tick_font_size=26,
)
boxPlotName = "Model Comparison (1995/10/01 - 2010/09/30)"
fig.patch.set_facecolor("white")
fig.suptitle(boxPlotName, fontsize=30)
# Global matplotlib default, changed only after the figure above has been built
plt.rcParams["font.size"] = 22

In [None]:
# Plot a CDF of NSE for every entry in `results`; values below 0 are raised to 0 first
nse_per_model = []
for result in results:
    nse_per_model.append(np.clip(dict(result)["nse"], 0, None))

fig, ax = plot_cdf(
    data_list=nse_per_model,
    title="Model Test Performance (1995/10/01 - 2010/09/30)",
    legend_labels=dataset_labels,
    figsize=(16, 8),
    xlabel="NSE",
    ylabel="Cumulative Frequency",
    reference_line=None,
)
plt.show()

In [None]:
# If you have the training_gauges.csv file, this cell breaks the models' performance down by
# gauge drainage area.
#
# STAID is read as text so the IDs are never parsed as numbers: a numeric parse drops leading
# zeros and, if the column is inferred as float, produces strings such as "1234567.0" that can
# never match the gauge IDs. IDs are then stripped and left-padded with zeros to 8 characters.
gages_df = pd.read_csv("training_gauges.csv", dtype={"STAID": str})
gages_df["STAID"] = gages_df["STAID"].str.strip().str.zfill(8)
gages_df = gages_df.set_index("STAID")

# Keep only the evaluated gauges, in the same order as the predictions dataset.
# NOTE(review): assumes ds_pred.gage_ids holds 8-character string IDs — confirm.
selected_gages = gages_df.loc[ds_pred.gage_ids.values].reset_index()

# results[0] is the summed q' baseline and results[1] the trained model; NSE is clipped to [0, 1]
selected_gages["q_prime_NSE"] = np.clip(results[0].nse, a_min=0.0, a_max=1.0)
selected_gages["ddr_NSE"] = np.clip(results[1].nse, a_min=0.0, a_max=1.0)

fig = plot_drainage_area_boxplots(
    gages=selected_gages, metrics=["q_prime_NSE", "ddr_NSE"], model_names=dataset_labels, show_plot=True
)

In [None]:
# Plot the gauge map for the `ddr_NSE` column of the selected gauges
gauge_map_options = {
    "metric_column": "ddr_NSE",
    "title": f"ddrv{__version__}",
    "show_plot": True,
    "colormap": "plasma",
    "figsize": (16, 8),
    "point_size": 30,
}
fig = plot_gauge_map(gages=selected_gages, **gauge_map_options)