# Running HMETS on the 5797 basins of the extended CANOPEX dataset

Here we use RavenPy to configure the HMETS hydrological model, calibrate it and analyze the output. We also prepare and gather data directly from the CANOPEX dataset, which is freely available to all users.

In [None]:
# Cookie-cutter template necessary to provide the tools, packages and paths for the project. All notebooks
# need this template (or a slightly adjusted one depending on the required packages)
import datetime as dt
import glob
import json
import os
import tempfile
from pathlib import Path
from urllib.request import urlretrieve
from zipfile import ZipFile

import netCDF4 as nc
import numpy as np
import pandas as pd
import spotpy
import xarray as xr
from matplotlib import pyplot as plt

from ravenpy.new_config import commands as rc
from ravenpy.new_config.emulators import HMETS
from ravenpy.utilities.new_config.calibration import SpotSetup
from ravenpy.utilities.testdata import get_file

# Main data source: the CANOPEX dataset, reachable either through a DAP endpoint or through a
# direct file URL on the same server.
CANOPEX_DAP = "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/ets/Watersheds_5797_cfcompliant.nc"
CANOPEX_URL = "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/ets/Watersheds_5797_cfcompliant.nc"

# The DAP endpoint is the preferred way to open the dataset.
ds = xr.open_dataset(filename_or_obj=CANOPEX_DAP)

In [None]:
# Explore the dataset: display a summary of the full CANOPEX dataset opened above.
display(ds)

In [None]:
# Rather than searching the dataset for a watershed of interest, we simply use a fixed, arbitrarily
# chosen position along the "watershed" dimension:
watershedID = 5600

# Restrict the dataset to that single watershed:
ds = ds.isel(watershed=watershedID)

In [None]:
# Display the dataset again, now restricted to the selected watershed:
ds

In [None]:
# Save the single-watershed subset to a local netCDF file so that it is more efficient to retrieve:
fname = "/tmp/CANOPEX_extracted.nc"
ds.to_netcdf(path=fname)

In [None]:
# With this info, we can gather some properties from the CANOPEX database. This same database is used for
# regionalization, so let's query it there where more information is available:
tmp = pd.read_csv(get_file("regionalisation_data/gauged_catchment_properties.csv"))

# NOTE(review): this assumes the row labels of the properties table line up with the watershed
# positions in the CANOPEX dataset — confirm.
basin_area = tmp["area"][watershedID]
basin_latitude = tmp["latitude"][watershedID]
basin_longitude = tmp["longitude"][watershedID]
basin_elevation = tmp["elevation"][watershedID]

# `ds` has already been reduced to a single watershed with `isel`, so `ds.watershed` is a scalar here
# and must not be indexed with watershedID a second time.
basin_name = ds.watershed.data

print("Basin name: ", basin_name)
print("Latitude: ", basin_latitude, " °N")
print("Area: ", basin_area, " km^2")

Now, we might have the model and data, but we don't have model parameters! We need to calibrate. This next snippet shows how to do so.

In [None]:
# We calibrate on only a subset of the years for now to keep the computations faster in this notebook.
start_calib = dt.datetime(1998, 1, 1)
end_calib = dt.datetime(1999, 12, 31)

# General parameters depending on the data source. We can find them by exploring the CANOPEX dataset in the
# cells above.
data_type = ["TEMP_MAX", "TEMP_MIN", "PRECIP"]

# Name carried by each of the data types above in the netCDF file.
alt_names = {
    "TEMP_MIN": "tasmin",
    "TEMP_MAX": "tasmax",
    "PRECIP": "pr",
}

# Single HRU described by the basin properties gathered earlier.
hru = dict(
    area=basin_area,
    elevation=basin_elevation,
    latitude=basin_latitude,
    longitude=basin_longitude,
    hru_type="land",
)

# Set the evaluation metrics to be calculated by Raven
eval_metrics = ("NASH_SUTCLIFFE",)

# NOTE(review): the observations are read from the full remote dataset with a tuple station_idx, while the
# gauge reads the single-watershed file `fname` with an integer station_idx — confirm both against the
# ravenpy `from_nc` API.
model_config = HMETS(
    ObservationData=rc.ObservationData.from_nc(
        CANOPEX_DAP, alt_names="discharge", station_idx=(watershedID,)
    ),
    # Setup the gauge using the second method, i.e., using a single file that contains all meteorological inputs. As
    # you can see, a single gauge is added, but it contains all the information we need.
    Gauge=[
        rc.Gauge.from_nc(
            fname,
            data_type=data_type,  # Note that this is the list of all the variables
            alt_names=alt_names,  # Note that all variables here are mapped to their names in the netcdf file.
            extra={
                "ALL": {
                    "elevation": hru["elevation"],
                    "Latitude": hru["latitude"],
                    "Longitude": hru["longitude"],
                }
            },
            station_idx=watershedID,
        )
    ],
    HRUs=[hru],
    # Simulate over the calibration window defined at the top of this cell.
    StartDate=start_calib,
    EndDate=end_calib,
    RunName="CANOPEX_test",
    EvaluationMetrics=eval_metrics,
)

Now that the model is set up, we can focus on calibrating the parameters using SpotPy:

In [None]:
%%capture --no-display

# Calibration bounds for the 21 model parameters, written as (lower, upper) pairs in parameter order.
# The bounds can either be set independently like this, or the defaults can be used.
param_bounds = (
    (0.3, 20.0),
    (0.01, 5.0),
    (0.5, 13.0),
    (0.15, 1.5),
    (0.0, 20.0),
    (0.0, 20.0),
    (-2.0, 3.0),
    (0.01, 0.2),
    (0.0, 0.1),
    (0.01, 0.3),
    (0.005, 0.1),
    (-5.0, 2.0),
    (0.0, 5.0),
    (0.0, 1.0),
    (0.0, 3.0),
    (0.0, 1.0),
    (0.00001, 0.02),
    (0.0, 0.1),
    (0.00001, 0.01),
    (0.0, 0.5),
    (0.0, 2.0),
)

# Split the pairs back into one tuple of lower bounds and one tuple of upper bounds.
low_params, high_params = zip(*param_bounds)

# Setup the spotpy optimizer
spot_setup = SpotSetup(
    config=model_config,
    low=low_params,
    high=high_params,
    path="/tmp/CANOPEX_NB_test3/",
)
# TODO: Allow overwrite!

Finally, we can run the optimizer:

In [None]:
# Number of model evaluations. Kept small so the demo computes quickly; increase as necessary. In practice
# the random seed should be adjusted as well.
model_evaluations = 50

# Configure the spotpy DDS sampler with the setup configuration, a run name and other options. Please refer
# to the spotpy documentation for more options. We recommend sticking to this format for efficiency of most
# applications.
sampler_options = dict(
    dbname="CANOPEX_test",
    dbformat="ram",
    save_sim=False,
)
sampler = spotpy.algorithms.dds(spot_setup, **sampler_options)

# Launch the actual optimization. Multiple trials can be launched, where the entire process is repeated and
# the best overall value from all trials is returned.
sampler.sample(repetitions=model_evaluations, trials=1)

In [None]:
# Diagnostics gathered by the calibration setup
diag = spot_setup.diagnostics

# Report the Nash-Sutcliffe value stored in the diagnostics
nse = diag["DIAG_NASH_SUTCLIFFE"]
print(f"Nash-Sutcliffe value is: {nse}")

# Values recorded by the sampler for each iteration
results = sampler.getdata()

# Best parameter set, returned directly as raw results in an array (shown as the cell output)
spotpy.analyser.get_best_parameterset(results)

In [None]:
# Show the full diagnostics collected by the calibration setup (these include the NSE value).
# The previous cell stores them in `diag`; no `diagnostics` variable exists at this point.
print(diag)

At this stage, we have calibrated the model on the observations for the desired dates. Now, let's run the model on a longer time period and look at the hydrograph.

Since we requested output objects, we can simply access them directly. The diagnostics output is just a CSV file:

In [None]:
# We can analyze and plot the data directly here to see what it looks like, or we could download the data directly by
# changing the asobj=True to asobj=False in the cell above this one.
# NOTE(review): `diagnostics` is not assigned in any preceding cell visible here (the calibration cell stores
# its diagnostics in `diag`) — confirm where this variable is supposed to come from.
print(diagnostics)

In [None]:
# Print the diagnostics.
# NOTE(review): `diagnostics` is not assigned in any preceding cell visible here — confirm its source.
print(diagnostics)

The `hydrograph` and `storage` outputs are netCDF files storing the time series. These files are opened by default using `xarray`, which provides convenient and powerful time series analysis and plotting tools.

In [None]:
# Display the simulated streamflow variable of the hydrograph output.
# NOTE(review): `hydrograph` is not assigned in any preceding cell visible here — confirm which model run
# is expected to provide it.
hydrograph.q_sim

In [None]:
# Plot the simulated hydrograph
from pandas.plotting import register_matplotlib_converters

# Register pandas' converters with matplotlib before plotting the time series.
register_matplotlib_converters()
# NOTE(review): `hydrograph` is not assigned in any preceding cell visible here — confirm its source.
hydrograph.q_sim.plot()

In [None]:
# Summary statistics can be computed directly from the simulated flow.
# NOTE(review): `hydrograph` is not assigned in any preceding cell visible here — confirm its source.
q_sim = hydrograph.q_sim
print("Max: ", q_sim.max())
print("Mean: ", q_sim.mean())

# Mean of the simulated flow for each calendar month
monthly_means = q_sim.groupby(hydrograph.time.dt.month).mean(dim="time")
print("Monthly means: ", monthly_means)

For an example of how to download the data directly to analyze locally on your own computer/server, see here:

In [None]:
# Rerun the analysis of the WPS response, this time by using asobj=False.
# NOTE(review): `resp` is not assigned in any cell visible here — confirm where the response comes from.
hydrograph, storage, solution, diagnostics, rv = resp.get(asobj=False)

# Print each returned output in turn.
for output in (hydrograph, storage, solution, diagnostics, rv):
    print(output)