The purpose of this notebook is to gather historical streamflow data for the reaches defined by our GIS layers, summarize it as per-reach flow quantiles, and save the result to a parquet file.

In [1]:

#from os import error
#import sys
#import io
#import json
#import pandas
import urllib3
import requests
import xarray
from dask.distributed import Client
#from datetime import datetime
#import numpy as np

#from pathlib import Path


#import compute_rating_increments as cr

#from tqdm import tqdm
#import concurrent.futures
#from concurrent.futures import ThreadPoolExecutor

# The ArcGIS requests in this notebook are sent with verify=False; silence the
# InsecureRequestWarning so it does not flood the cell output.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
# Use try/except so the Dask client is only instantiated if it doesn't
# already exist (e.g. when this cell is re-run in the same kernel).
# Only NameError is caught: any other failure should surface instead of
# silently spawning a second cluster.
try:
    client
except NameError:
    # The client should be customized to your workstation resources.
    # This is configured for a "Large" instance on ciroh.awi.2i2c.cloud;
    # use Client() to accept the defaults instead.
    client = Client(n_workers=3, memory_limit='2GB')
print(client.dashboard_link)

http://127.0.0.1:8787/status


2025-10-16 20:57:53,993 - tornado.application - ERROR - Exception in callback <bound method SystemMonitor.update of <SystemMonitor: cpu: 16 memory: 236 MB fds: 158>>
Traceback (most recent call last):
  File "/Users/castro/Documents/work/com_res/fim/.venv/lib/python3.10/site-packages/tornado/ioloop.py", line 937, in _run
    val = self.callback()
  File "/Users/castro/Documents/work/com_res/fim/.venv/lib/python3.10/site-packages/distributed/system_monitor.py", line 168, in update
    net_ioc = psutil.net_io_counters()
  File "/Users/castro/Documents/work/com_res/fim/.venv/lib/python3.10/site-packages/psutil/__init__.py", line 2148, in net_io_counters
    rawdict = _psplatform.net_io_counters()
OSError: [Errno 12] Cannot allocate memory


In [3]:
base_url = "https://arcgis.cuahsi.org/arcgis/rest/services/CIROH-ComRes"


def collect_reaches_from_gis_layers():
    """Collect the reach identifiers published in each location's Flowlines layer.

    One query is sent per location to
    ``{base_url}/{location}/FeatureServer/{layer}/query`` asking for the
    distinct ``COMID`` / ``REACHCODE`` pairs (no geometry).

    A location whose request fails (network error, HTTP error status,
    non-JSON body) or whose response carries an ``error`` object is reported
    on stdout and left out of the result; the remaining locations are still
    processed.

    Returns
    -------
    dict
        Maps each location name to a list of
        ``{"reachid": <COMID>, "huc8": <first 8 characters of REACHCODE>}``
        records. ``huc8`` is ``None`` when the feature has no REACHCODE.
        Features without a COMID are skipped.
    """
    # the following indices correspond to the Flowlines layer
    locations = {
        "DeSoto": 0,
        "MountAscutney": 0,
        "RoaringRiver": 13,
        "SpringfieldGreeneCounty": 1,
        "TwoRiversOttauquechee": 0,
        "Windham": 0,
    }
    params = {
        "where": "1=1",
        "geometryType": "esriGeometryEnvelope",
        "spatialRel": "esriSpatialRelIntersects",
        "units": "esriSRUnit_Foot",
        "outFields": "COMID,REACHCODE",
        "returnGeometry": "false",
        "returnDistinctValues": "true",
        "returnIdsOnly": "false",
        "returnCountOnly": "false",
        "returnExtentOnly": "false",
        "returnZ": "false",
        "returnM": "false",
        "multipatchOption": "xyFootprint",
        "returnTrueCurves": "false",
        "returnExceededLimitFeatures": "false",
        "returnCentroid": "false",
        "timeReferenceUnknownClient": "false",
        "sqlFormat": "none",
        "featureEncoding": "esriDefault",
        "f": "json",
    }

    riverids = {}
    for locname, layerid in locations.items():
        try:
            # NOTE(review): TLS certificate verification is disabled here;
            # confirm this is still required for this server.
            response = requests.get(
                f"{base_url}/{locname}/FeatureServer/{layerid}/query",
                params=params,
                verify=False,
                timeout=60,  # never hang the notebook on a stalled server
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Same per-location handling as for API-level errors below:
            # report and move on so one bad layer does not lose the others.
            print(f"\n * Error: {locname} - request failed - {exc}\n")
            continue

        if "error" in data:
            err = data["error"]
            print(
                f"\n * Error: {locname} - {err.get('code')} - {err.get('message')}\n"
            )
            continue

        # The service sets this flag when the query matched more records than
        # it is willing to return in one response.
        if data.get("exceededTransferLimit"):
            print(
                f" * Warning: {locname} - server record limit exceeded, "
                "the reach list is incomplete"
            )

        comids = []
        for feature in data.get("features", []):
            attributes = feature.get("attributes") or {}
            comid = attributes.get("COMID")
            if comid is None:
                # A null COMID cannot be matched against the streamflow data.
                continue
            reachcode = attributes.get("REACHCODE")
            comids.append(
                {
                    "reachid": comid,
                    "huc8": reachcode[0:8] if reachcode else None,
                }
            )

        riverids[locname] = comids
        print(f"Found {len(comids)} rivers in {locname}")

    return riverids


In [4]:
# Query every configured location once (network access required).
# `reaches` maps each location name to its list of {"reachid", "huc8"} records.
reaches = collect_reaches_from_gis_layers()

Found 551 rivers in DeSoto
Found 1163 rivers in MountAscutney
Found 323 rivers in RoaringRiver
Found 1704 rivers in SpringfieldGreeneCounty
Found 1945 rivers in TwoRiversOttauquechee
Found 1352 rivers in Windham


In [5]:
# Location to process: used as the key into `reaches` and as the stem of the
# parquet file written at the end of the notebook.
region_name = 'MountAscutney'

In [6]:
# Pull the bare reach identifiers out of the records for the chosen region.
reach_ids = []
for record in reaches[region_name]:
    reach_ids.append(record["reachid"])
print(f'Found {len(reach_ids)} reach ids for {region_name}')

Found 1163 reach ids for MountAscutney


Select data corresponding to these reach identifiers.

In [7]:
# load historical streamflow data via retrospective
zarr_url = 's3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/chrtout.zarr'
ds = xarray.open_zarr(zarr_url, consolidated=True)

In [8]:
%%time

# Select data that matches our reach_ids, dropping any that are not found.
# `.sel` raises a KeyError for labels missing from the index (`drop=True` only
# removes scalar coordinates), so unknown ids are filtered out explicitly,
# preserving the order of `reach_ids`.
feature_index = ds.indexes["feature_id"]
matched_reach_ids = [rid for rid in reach_ids if rid in feature_index]
print(f'{len(reach_ids) - len(matched_reach_ids)} of {len(reach_ids)} reach ids not found in the dataset')

# Number of matched reaches to process. The notebook has only ever been run
# for the first reach; set to None to process every matched reach (expect a
# much longer run time and higher memory use).
reach_limit = 1
dat = ds.sel(feature_id=matched_reach_ids[:reach_limit], drop=True).streamflow

CPU times: user 900 ms, sys: 275 ms, total: 1.17 s
Wall time: 1.16 s


In [11]:
# Chunk layout for the selected data. The statistics computed next reduce over
# `time`, so that dimension is kept as a single chunk.
target_feature_chunk = 1000    # roughly matches the number of feature IDs
target_time_chunk = -1         # one big chunk over time
# Use the named settings above rather than repeating the literals.
dat = dat.chunk({'time': target_time_chunk, 'feature_id': target_feature_chunk})

In [13]:
%%time

# Flow quantiles (as fractions) to summarize each reach over the full record.
percentiles = [0.05, 0.10, 0.25, 0.75, 0.95]
q = dat.quantile(percentiles, dim="time")
mx = dat.max(dim="time")

# combine before computing
# The maximum is stored as quantile 1.0 rather than under a 'max' string
# label: mixing floats and strings on the `quantile` coordinate yields an
# object-dtype index that cannot be written to typed formats such as parquet.
final = xarray.concat([q, mx.expand_dims({'quantile': [1.0]})], dim='quantile')

CPU times: user 18.4 ms, sys: 3.41 ms, total: 21.8 ms
Wall time: 22.4 ms


In [14]:
%%time
# Evaluate the quantiles; this is where the data is actually read.
# NOTE(review): only `q` is computed here. `final`, which also holds the
# per-reach maximum, is never evaluated in the cells shown — confirm whether
# the maximum is meant to be part of the saved result.
result = q.compute()



CPU times: user 57.1 s, sys: 14.9 s, total: 1min 12s
Wall time: 7min 14s


In [18]:
# Spot check: the 0.75 quantile of streamflow for a single reach.
result.sel({"feature_id": 6090109, "quantile": 0.75}).values

array(14.07999969)

In [15]:
# Flatten the results into a table (one row per quantile/reach pair) and write
# it to a parquet file named after the region, in the working directory.
output_file = f'{region_name}.parquet'
df = result.to_dataframe()
df = df.reset_index()
df.to_parquet(output_file)