# Generate FIM Inputs

The purpose of this notebook is to generate input arguments that can be used to calculate HAND-based flood maps. The ranges and intervals used to determine the input arguments are computed from historically modeled streamflow.

In [1]:
# import io
# import json
# import pandas
# import urllib3
# import requests
# from datetime import datetime
# import numpy as np
# import matplotlib.pyplot as plt

# import creds
# import compute_rating_increments as cr

#urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

import s3fs
import xarray
import gis_features as gf
from dask.distributed import Client

In [2]:
# use a try accept loop so we only instantiate the client
# if it doesn't already exist.
try:
    print(client.dashboard_link)
except:    
    # The client should be customized to your workstation resources.
    # This is configured for a "Large" instance on ciroh.awi.2i2c.cloud
    # client = Client()
    client = Client(n_workers=2, memory_limit='8GB')
    print(client.dashboard_link)

http://127.0.0.1:8787/status


2025-10-30 10:09:49,235 - distributed.dashboard.components.scheduler - ERROR - 'open_dataset-streamflow-d41c4f8d85fac77c8d1c84af2cb5ea1a'
Traceback (most recent call last):
  File "/Users/castro/Documents/work/com_res/fim/.venv/lib/python3.10/site-packages/distributed/utils.py", line 811, in wrapper
    return func(*args, **kwargs)
  File "/Users/castro/Documents/work/com_res/fim/.venv/lib/python3.10/site-packages/distributed/dashboard/components/scheduler.py", line 2630, in update_layout
    x = max(xs[dep] for dep in dependencies[tg]) + 1
  File "/Users/castro/Documents/work/com_res/fim/.venv/lib/python3.10/site-packages/distributed/dashboard/components/scheduler.py", line 2630, in <genexpr>
    x = max(xs[dep] for dep in dependencies[tg]) + 1
KeyError: 'open_dataset-streamflow-d41c4f8d85fac77c8d1c84af2cb5ea1a'
2025-10-30 10:09:49,275 - tornado.application - ERROR - Uncaught exception GET /individual-groups (127.0.0.1)
HTTPServerRequest(protocol='http', host='127.0.0.1:8787', method=

Load retrospective modeled data.

In [3]:
%%time

# load historical streamflow data via retrospective
zarr_url = 's3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/chrtout.zarr'

# Create an anonymous S3 filesystem
fs = s3fs.S3FileSystem(anon=True)

# Pass the filesystem to xarray
ds = xarray.open_zarr(fs.get_mapper(zarr_url), consolidated=True)


CPU times: user 2.45 s, sys: 398 ms, total: 2.85 s
Wall time: 5.78 s


Load river identifiers from GIS layers. These COMIDs will be used to generate inputs for the FIM analysis.

In [4]:
reaches = gf.collect_reaches_from_gis_layers()

Found 551 rivers in DeSoto
Found 1163 rivers in MountAscutney
Found 323 rivers in RoaringRiver
Found 1704 rivers in SpringfieldGreeneCounty
Found 1945 rivers in TwoRiversOttauquechee
Found 1352 rivers in Windham


In [5]:
reaches

Unnamed: 0,reachid,huc8,region
0,3629135,07140101,DeSoto
1,3629127,07140101,DeSoto
2,3629119,07140101,DeSoto
3,3629097,07140101,DeSoto
4,3629089,07140101,DeSoto
...,...,...,...
7033,10295176,01080203,Windham
7034,10295174,01080203,Windham
7035,10295172,01080203,Windham
7036,10295170,01080203,Windham


Get the maximum modeled flow from the retrospective data and use this to determine an upper bound for FIM generation.

In [6]:
# only search data for ids that exist in the dataset
existing_ids = set(ds.feature_id.values)
valid_ids = [rid for rid in reaches.reachid if rid in existing_ids]
missing_ids = list(set(reaches.reachid) - set(valid_ids))

reaches.loc[reaches.reachid.isin(missing_ids)].to_csv('GIS_reaches_missing_BigQuery_data.csv',
                                                      index=False) 

valid_reaches = reaches.loc[reaches.reachid.isin(valid_ids)]

In [7]:
# select data that matches our reach_ids, silently drop any that are not found.
dat = ds.sel(feature_id=valid_ids, drop=True)

In [44]:
# compute the maximum streamflow for all reaches.
maxes = dat['streamflow'].max(dim='time').compute()

Join these maximum values to the reaches dataset and convert them from CMS to CFS.

In [13]:
# debugging - remove later
sub = dat['streamflow'].isel(feature_id=range(0,10))

In [14]:
sub

Unnamed: 0,Array,Chunk
Bytes,29.43 MiB,52.50 kiB
Shape,"(385704, 10)","(672, 10)"
Dask graph,574 chunks in 4 graph layers,574 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 29.43 MiB 52.50 kiB Shape (385704, 10) (672, 10) Dask graph 574 chunks in 4 graph layers Data type float64 numpy.ndarray",10  385704,

Unnamed: 0,Array,Chunk
Bytes,29.43 MiB,52.50 kiB
Shape,"(385704, 10)","(672, 10)"
Dask graph,574 chunks in 4 graph layers,574 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40 B,40 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 40 B 40 B Shape (10,) (10,) Dask graph 1 chunks in 4 graph layers Data type float32 numpy.ndarray",10  1,

Unnamed: 0,Array,Chunk
Bytes,40 B,40 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,150 B,150 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,|S15 numpy.ndarray,|S15 numpy.ndarray
"Array Chunk Bytes 150 B 150 B Shape (10,) (10,) Dask graph 1 chunks in 4 graph layers Data type |S15 numpy.ndarray",10  1,

Unnamed: 0,Array,Chunk
Bytes,150 B,150 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,|S15 numpy.ndarray,|S15 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40 B,40 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 40 B 40 B Shape (10,) (10,) Dask graph 1 chunks in 4 graph layers Data type float32 numpy.ndarray",10  1,

Unnamed: 0,Array,Chunk
Bytes,40 B,40 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40 B,40 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 40 B 40 B Shape (10,) (10,) Dask graph 1 chunks in 4 graph layers Data type float32 numpy.ndarray",10  1,

Unnamed: 0,Array,Chunk
Bytes,40 B,40 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40 B,40 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 40 B 40 B Shape (10,) (10,) Dask graph 1 chunks in 4 graph layers Data type int32 numpy.ndarray",10  1,

Unnamed: 0,Array,Chunk
Bytes,40 B,40 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray


In [15]:
# debugging - remove later
ds_rechunked = sub.chunk({"time": 50000, "feature_id": -1})
# ds_rechunked = ds.chunk({"time": 50000, "feature_id": 1000})
ds_rechunked

Unnamed: 0,Array,Chunk
Bytes,29.43 MiB,3.81 MiB
Shape,"(385704, 10)","(50000, 10)"
Dask graph,8 chunks in 5 graph layers,8 chunks in 5 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 29.43 MiB 3.81 MiB Shape (385704, 10) (50000, 10) Dask graph 8 chunks in 5 graph layers Data type float64 numpy.ndarray",10  385704,

Unnamed: 0,Array,Chunk
Bytes,29.43 MiB,3.81 MiB
Shape,"(385704, 10)","(50000, 10)"
Dask graph,8 chunks in 5 graph layers,8 chunks in 5 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40 B,40 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 40 B 40 B Shape (10,) (10,) Dask graph 1 chunks in 4 graph layers Data type float32 numpy.ndarray",10  1,

Unnamed: 0,Array,Chunk
Bytes,40 B,40 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,150 B,150 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,|S15 numpy.ndarray,|S15 numpy.ndarray
"Array Chunk Bytes 150 B 150 B Shape (10,) (10,) Dask graph 1 chunks in 4 graph layers Data type |S15 numpy.ndarray",10  1,

Unnamed: 0,Array,Chunk
Bytes,150 B,150 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,|S15 numpy.ndarray,|S15 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40 B,40 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 40 B 40 B Shape (10,) (10,) Dask graph 1 chunks in 4 graph layers Data type float32 numpy.ndarray",10  1,

Unnamed: 0,Array,Chunk
Bytes,40 B,40 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40 B,40 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 40 B 40 B Shape (10,) (10,) Dask graph 1 chunks in 4 graph layers Data type float32 numpy.ndarray",10  1,

Unnamed: 0,Array,Chunk
Bytes,40 B,40 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40 B,40 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 40 B 40 B Shape (10,) (10,) Dask graph 1 chunks in 4 graph layers Data type int32 numpy.ndarray",10  1,

Unnamed: 0,Array,Chunk
Bytes,40 B,40 B
Shape,"(10,)","(10,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray


In [11]:
# debugging - remove later

maxes = sub.max(dim='time').compute()

2025-10-30 10:19:59,226 - distributed.worker - ERROR - Failed to communicate with scheduler during heartbeat.
Traceback (most recent call last):
  File "/Users/castro/Documents/work/com_res/fim/.venv/lib/python3.10/site-packages/distributed/comm/tcp.py", line 225, in read
    frames_nosplit_nbytes_bin = await stream.read_bytes(fmt_size)
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/castro/Documents/work/com_res/fim/.venv/lib/python3.10/site-packages/distributed/worker.py", line 1269, in heartbeat
    response = await retry_operation(
  File "/Users/castro/Documents/work/com_res/fim/.venv/lib/python3.10/site-packages/distributed/utils_comm.py", line 441, in retry_operation
    return await retry(
  File "/Users/castro/Documents/work/com_res/fim/.venv/lib/python3.10/site-packages/distributed/utils_comm.py", line 420, in retry
    return await coro()
  File "/Use

KeyboardInterrupt: 

2025-10-30 10:20:03,503 - distributed.nanny - ERROR - Worker process died unexpectedly
Process Dask Worker process (from Nanny):
Traceback (most recent call last):
  File "/Users/castro/miniconda3/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/castro/miniconda3/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/castro/Documents/work/com_res/fim/.venv/lib/python3.10/site-packages/distributed/process.py", line 202, in _run
    target(*args, **kwargs)
  File "/Users/castro/Documents/work/com_res/fim/.venv/lib/python3.10/site-packages/distributed/nanny.py", line 1023, in _run
    asyncio_run(run(), loop_factory=get_loop_factory())
  File "/Users/castro/Documents/work/com_res/fim/.venv/lib/python3.10/site-packages/distributed/compatibility.py", line 236, in asyncio_run
    return loop.run_until_complete(main)
  File "/Users/castro/miniconda3/lib/python3.10/asyncio/base_e

In [None]:
maxes

In [20]:
maxes_df = maxes.to_dataframe()[["streamflow"]]

In [21]:
maxes_df

Unnamed: 0_level_0,streamflow
feature_id,Unnamed: 1_level_1
101,158.009996
179,4.61
181,8.14
183,8.26
185,8.34
843,3.49
845,13.21
847,12.02
849,38.929999
851,14.75


In [23]:
valid_reaches.loc[valid_reaches.reachid==851]

Unnamed: 0,reachid,huc8,region


In [3]:
base_url = "https://arcgis.cuahsi.org/arcgis/rest/services/CIROH-ComRes"

# the following indices correspond to the Flowlines layer
locations = {'DeSoto': 0,
             'MountAscutney': 0,
             'RoaringRiver':13,
             'SpringfieldGreeneCounty':1,
             'TwoRiversOttauquechee': 0,
             'Windham': 0}
params = {
    'where': '1=1',
    'geometryType': 'esriGeometryEnvelope',
    'spatialRel': 'esriSpatialRelIntersects',
    'units': 'esriSRUnit_Foot',
    'outFields': 'COMID,REACHCODE',
    'returnGeometry': 'false',
    'returnDistinctValues': 'true', 
    'returnIdsOnly': 'false',
    'returnCountOnly': 'false',
    'returnExtentOnly': 'false',
    'returnZ': 'false',
    'returnM': 'false',
    'multipatchOption': 'xyFootprint',
    'returnTrueCurves': 'false',
    'returnExceededLimitFeatures': 'false',
    'returnCentroid': 'false',
    'timeReferenceUnknownClient': 'false',
    'sqlFormat': 'none',
    'featureEncoding': 'esriDefault', 
    'f': 'json' 
}

riverids = {}
for locname, layerid in locations.items():
    response = requests.get(f'{base_url}/{locname}/FeatureServer/{layerid}/query', params=params, verify=False)
    
    
    data = response.json()
    if 'error' in data.keys():
        print(f"\n * Error: {locname} - {data['error']['code']} - {data['error']['message']}\n")
        continue
    
    comids = []
    if 'features' in data:
        for feature in data['features']:
            if 'attributes' in feature and 'COMID' in feature['attributes']:
                comids.append({'reachid': feature['attributes']['COMID'],
                               'huc8': feature['attributes']['REACHCODE'][0:8]})

    riverids[locname] = comids
    print(f'Found {len(comids)} rivers in {locname}')

Found 551 rivers in DeSoto
Found 1163 rivers in MountAscutney
Found 323 rivers in RoaringRiver
Found 1704 rivers in SpringfieldGreeneCounty
Found 1945 rivers in TwoRiversOttauquechee
Found 1352 rivers in Windham


Now that we have all of the river identifiers, we can collect streamflow data and compute statistics.

In [4]:
API_KEY = creds.key
API_URL = creds.url

header = {
    'x-api-key': API_KEY
}

ENDPOINT = f'{API_URL}/analysis-assim'

## Collect Data from BigQuery


In [5]:
from tqdm import tqdm
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

In [225]:
                
class GatherData():
    def __init__(self, api_key, api_url='https://nwm-api.ciroh.org/',
                 api_endpoint='analysis-assim', workers=5):
        self.url = f'{api_url}{api_endpoint}'
        self.header =  {
            'x-api-key': api_key
        }
        self.df = None
        self.workers = workers 
        
    def fetch_url(self, params):
        try:
            response = requests.get(self.url,
                                    params=params,
                                    headers=self.header)
            
            # Raise an exception for HTTP errors
            response.raise_for_status()  
            return response
            
        except requests.exceptions.RequestException as e:
            return f"Error fetching {self.url}: {e} -> {params}"

    def fetch_async(self, params_list):

        results = []
        errors = []
        
        # Use ThreadPoolExecutor to make concurrent GET requests
        # TQDM is used to provide a nice looking progress bar
        with ThreadPoolExecutor(max_workers=self.workers) as executor:
            
            # Submit all URLs to the executor
            future_to_url = {executor.submit(self.fetch_url, param): param for param in params_list}
            
            # Process the results as they complete
            for future in tqdm(concurrent.futures.as_completed(future_to_url),
                               total=len(future_to_url),
                               desc="Fetching Data",
                               unit="url",
                               colour="green",  
                               dynamic_ncols=True):
                url = future_to_url[future]
                try:
                    res = future.result()

                    # attempt to get the status code.
                    # if one is not returned, we should log 
                    # it as an error.
                    status_code = res.status_code

                    # otherwise, the 
                    results.append(res)
                    
                except Exception as e:
                    errors.append(f"Exception for {url}: {e}")
                    print(f"Exception for {url}: {e}")
            
            return results, errors
            
    def collect_analysis_data(self, comids, start_time,
                              end_time, parallel=True):
        
        # build a parameters to query
        params = [
            {'comids': comid,
             'start_time': start_time,
             'end_time': end_time,
             'output_format': 'csv'}
            for comid in comids
        ]
        
        # query the api asynchronously with the parameters defined above 
        if parallel:
            responses, errors = self.fetch_async(params)

            with open('errors.txt', 'a') as f:
                for err in errors:
                    f.write(err)
        else:
            responses = []
            for param in params:
                responses.append(self.fetch_url(params))
        
                
        # filter out only the successful responses and 
        # convert them into a single pandas dataframe
        successful_responses = [resp for resp in responses if resp.status_code == 200]
        # return responses, errors
        dfs = [pandas.read_csv(io.StringIO(res.text), sep=',') for res in successful_responses]
        df = pandas.concat(dfs, ignore_index=True)  

        # clean datetime columns and return
        df.time = pandas.to_datetime(df.time)

        self.df = df

In [217]:
riverids.keys()

dict_keys(['DeSoto', 'MountAscutney', 'RoaringRiver', 'SpringfieldGreeneCounty', 'TwoRiversOttauquechee', 'Windham'])

In [218]:
for region_name in ['RoaringRiver']:

    print(f'Collecting Data for {region_name}')
    
    reach_ids = [reach['reachid'] for reach in riverids[region_name]]
    
    gather_data = GatherData(creds.key, creds.url, workers = 2)
    
    start_time = '2016-01-01'
    end_time =  datetime.today().strftime('%Y-%m-%d')
    
    gather_data.collect_analysis_data(reach_ids, start_time, end_time)
    df = gather_data.df
    
    df.to_parquet(f'{region_name.lower()}.parquet')

Collecting Data for RoaringRiver


Fetching Data:   0%|[32m▎                                                                                                                                       [0m| 1/401 [00:15<1:41:37, 15.24s/url][0m

Exception for {'comids': 8585734, 'start_time': '2016-01-01', 'end_time': '2025-09-30', 'output_format': 'csv'}: 'str' object has no attribute 'status_code'
Exception for {'comids': 8585000, 'start_time': '2016-01-01', 'end_time': '2025-09-30', 'output_format': 'csv'}: 'str' object has no attribute 'status_code'


Fetching Data:   0%|[32m▋                                                                                                                                       [0m| 2/401 [00:28<1:35:40, 14.39s/url][0m

KeyboardInterrupt



# Load Parquet and Compute Statistics

In [52]:
# def cms_to_cfs(flow_cms):
#     return flow_cms * (3.28084**3)

# def m_to_ft(meters):
#     return meters * 3.28084


# def ft_to_m(feet):
#     return feet / 3.28084
    
# def water_year_season(month):
#         if month in [10, 11, 12]:
#             return "Fall"
#         elif month in [1, 2, 3]:
#             return "Winter" 
#         elif month in [4, 5, 6]:
#             return "Spring"
#         elif month in [7, 8, 9]:
#             return "Summer"  
#         else:
#             return None
            
# def get_record_flow(df):
#     """
#     Computes the maximum streamflow for the given series in cubic feet per second.
#     """
#     max_cms = df.streamflow.max()
#     max_cfs = cms_to_cfs(max_cms)

#     return dict(maximum_cms = max_cms,
#                 maximum_cfs = max_cfs)


# def get_seasonal_mean_flows(df):
#     """
#     Computes seasonal average streamflow in cubic feet per second.
#     """

#     # aggregate hourly data to mean daily
#     df_daily = df.resample("D").mean()

#     df_daily["season"] = df_daily.index.month.map(water_year_season)
#     df_daily["water_year"] = df_daily.index.year + (df_daily.index.month >= 10)

#     seasonal_means = (
#         df_daily.groupby(["water_year", "season"])["streamflow"]
#         .mean()
#         .unstack("season") 
#     ).mean()

#     return dict(fall_mean_cfs = cms_to_cfs(seasonal_means['Fall']),
#                 spring_mean_cfs = cms_to_cfs(seasonal_means['Spring']),
#                 summer_mean_cfs = cms_to_cfs(seasonal_means['Summer']),
#                 winter_mean_cfs = cms_to_cfs(seasonal_means['Winter']))

# def get_seasonal_median_flows(df):
#     """
#     Computes median streamflow for each season and averages them over year.
#     """
   
#     # aggregate hourly data to mean daily
#     df_daily = df.resample("D").mean()

#     df_daily["season"] = df_daily.index.month.map(water_year_season)
#     df_daily["water_year"] = df_daily.index.year + (df_daily.index.month >= 10)

#     seasonal_means = (
#         df_daily.groupby(["water_year", "season"])["streamflow"]
#         .median()
#         .unstack("season") 
#     ).mean()

#     return dict(fall_median_cfs = cms_to_cfs(seasonal_means['Fall']),
#                 spring_median_cfs = cms_to_cfs(seasonal_means['Spring']),
#                 summer_median_cfs = cms_to_cfs(seasonal_means['Summer']),
#                 winter_median_cfs = cms_to_cfs(seasonal_means['Winter']))

# def get_stats(df):
#     res = {}
#     res.update(get_record_flow(df))
#     res.update(get_seasonal_median_flows(df))
#     res.update(get_seasonal_mean_flows(df))
#     return res
    

# Analyze Roaring River data

In [236]:
# load all of the known roaring river ids into a dataframe
df_reaches = pandas.DataFrame(riverids['DeSoto'])
df_reaches

Unnamed: 0,reachid,huc8
0,3629135,07140101
1,3629127,07140101
2,3629119,07140101
3,3629097,07140101
4,3629089,07140101
...,...,...
546,3629145,07140101
547,3629143,07140101
548,3630443,07140101
549,3630493,07140101


In [237]:
# load the data collected and saved into the parquet file
df = pandas.read_parquet('desoto.parquet')
df.set_index('time', inplace=True)
df

Unnamed: 0_level_0,feature_id,streamflow,velocity
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-09-16 22:00:00,3629119,11061.540039,0.50
2018-09-16 23:00:00,3629119,11059.120117,0.50
2018-09-17 00:00:00,3629119,11056.700195,0.50
2018-09-17 01:00:00,3629119,11052.919922,0.50
2018-09-17 02:00:00,3629119,11047.740234,0.50
...,...,...,...
2025-09-10 17:00:00,3629143,0.000000,0.01
2025-09-10 18:00:00,3629143,0.000000,0.01
2025-09-10 19:00:00,3629143,0.000000,0.01
2025-09-10 20:00:00,3629143,0.000000,0.01


In [238]:
# make sure that we have data for all reach ids.
all_ids = len(df_reaches.reachid.unique())
collected_ids = len(df.feature_id.unique())
print(f'All reaches have data -> {all_ids == collected_ids}')
print(f'Total Number of Reaches = {all_ids}')
print(f'Number of Reaches that have Data = {collected_ids}')

All reaches have data -> False
Total Number of Reaches = 551
Number of Reaches that have Data = 542


In [247]:
# collect data for all missing reaches
missing_ids = df_reaches[~df_reaches.reachid.isin(df.feature_id.unique())]
if len(missing_ids) > 0:
    # TODO: Collect data here.
    for index,  row in missing_ids.iterrows():
        print(row['reachid'])
        
    ids = list(missing_ids.values[:,0])
    gather_data = GatherData(creds.key, creds.url, workers = 1)
    start_time = '2016-01-01'
    end_time =  datetime.today().strftime('%Y-%m-%d')
    gather_data.collect_analysis_data(ids, start_time, end_time)
    dat = gather_data.df
else: 
    print('No missing data to collect')

3627365
3627613
3629123
3627439
3627445
3627073
3630443
3630493
3627243


Fetching Data:  78%|[32m████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                               [0m| 7/9 [00:47<00:10,  5.31s/url][0m

Exception for {'comids': 3630443, 'start_time': '2016-01-01', 'end_time': '2025-10-01', 'output_format': 'csv'}: 'str' object has no attribute 'status_code'


Fetching Data:  89%|[32m████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               [0m| 8/9 [00:48<00:03,  3.95s/url][0m

Exception for {'comids': 3630493, 'start_time': '2016-01-01', 'end_time': '2025-10-01', 'output_format': 'csv'}: 'str' object has no attribute 'status_code'


Fetching Data: 100%|[32m████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████[0m| 9/9 [00:49<00:00,  5.54s/url][0m

Exception for {'comids': 3627243, 'start_time': '2016-01-01', 'end_time': '2025-10-01', 'output_format': 'csv'}: 'str' object has no attribute 'status_code'





In [249]:
gather_data.collect_analysis_data([3630493], start_time, end_time)

Fetching Data: 100%|[32m████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████[0m| 1/1 [00:01<00:00,  1.13s/url][0m

Exception for {'comids': 3630493, 'start_time': '2016-01-01', 'end_time': '2025-10-01', 'output_format': 'csv'}: 'str' object has no attribute 'status_code'





ValueError: No objects to concatenate

In [None]:
df

In [56]:
# apply the statistics function to every feature_id in our dataset
stats = df.groupby('feature_id').apply(get_stats, include_groups=False)

# converte the series above into a dataframe in which each statistic is represented as a column
stats = pandas.DataFrame(stats.tolist(), index=stats.index).reset_index().rename(columns={"index": "feature_id"})

stats

Unnamed: 0,feature_id,maximum_cms,maximum_cfs,fall_median_cfs,spring_median_cfs,summer_median_cfs,winter_median_cfs,fall_mean_cfs,spring_mean_cfs,summer_mean_cfs,winter_mean_cfs
0,8584816,9.420000,332.664195,0.151349,0.365759,0.044143,0.359453,0.404612,1.301210,0.160039,0.825325
1,8584818,17.789999,628.247946,0.201798,0.405698,0.044143,0.422515,0.412215,1.178485,0.190158,0.707380
2,8584830,6.110000,215.772622,0.302697,0.554945,0.088287,0.706293,0.322093,0.632222,0.118801,0.717859
3,8584832,6.060000,214.006899,0.302697,0.578068,0.088287,0.706293,0.324518,0.655108,0.132825,0.719808
4,8584836,29.099998,1027.656846,0.353147,0.838723,0.088287,0.850285,0.955086,2.943300,0.410236,1.784595
...,...,...,...,...,...,...,...,...,...,...,...
396,8589596,623.440002,22016.578021,1014.643003,2233.165155,1169.377569,1449.512673,1029.966426,2402.815532,1198.452754,1612.928333
397,8589598,624.640015,22058.956056,1016.231112,2236.408639,1170.370704,1451.360381,1032.130827,2407.585439,1199.533158,1616.157269
398,8589600,28.420000,1003.642927,0.816652,1.486159,0.264860,1.893959,1.899854,4.568732,1.062457,3.232303
399,8589606,628.429993,22192.797880,1014.831135,2238.006207,1163.096912,1456.216151,1029.866086,2402.846867,1198.503790,1613.801285


In [53]:
# compute rating curve increments based on min and max flow values.



In [144]:
from utils import ratingcurve

import importlib
importlib.reload(ratingcurve)

<module 'utils.ratingcurve' from '/Users/castro/Documents/work/com_res/fim/utils/ratingcurve.py'>

In [145]:
rc = ratingcurve.RatingCurve(11010001)

In [146]:
rc.get_flow(8589606, 1.2)

287.1842952435199

In [147]:
cms_to_cfs(287.1842952435199)

10141.818647785014

In [114]:
import time
import math

import numpy
import compute_rating_increments as rc

def round_up_to_step(x, step=0.5):
    return math.ceil(x / step) * step
    
def get_fim_increments(reachid, huc_id, max_cms, buffer=.25, step_ft=0.5):
    
    
    max_stage_ft = m_to_ft(rc.get_stage(huc_id, reachid, max_cms * (1+buffer)))

    max_stage_ft = max(max_stage_ft, 10)
    max_stage_ft = round_up_to_step(max_stage_ft, step_ft)

    print(f'Computing flow rates for {reachid}, [max={max_stage_ft}]...', end='', flush=True)
    st = time.time()

    #print('Loading rating curve...', end='', flush=True)
    dat = rc.__load_rating_curve(huc_id, reachid)
    # since each feature_id is associated with multiple HydroID's,
    # we'll just consider the first one to determine the bounds of
    # our calculation
    group = dat.groupby("HydroID").get_group(
        list(dat.groupby("HydroID").groups.keys())[0]
    )
    #print('done')
    
    fim_args = []
    
    for i in numpy.arange(step_ft, max_stage_ft + 0.1, step_ft):
        stage_in_m = ft_to_m(i)
        flow_in_m = rc.interpolate_y(group, "stage", "discharge_cms", stage_in_m)

        fim_args.append({
                            'ft': i,
                            'm': stage_in_m,
                            'cms': flow_in_m,
                            'cfs': cms_to_cfs(flow_in_m)
                        })
   
    print(f'done [elapsed {round(time.time() - st, 1)} sec]')                      
    return fim_args
    


In [200]:
#reach_id = 8584816
from utils import ratingcurve

importlib.reload(ratingcurve)

buffer = 0.1
step_ft=0.5


fim_increments = {}
# loop through huc8s
for huc_id, data in df_reaches.groupby(df_reaches.huc8):
    rc = ratingcurve.RatingCurve(huc8)

    # loop through reaches in the huc8
    for rid in data.reachid.values:
        fim_args = []
        
        # load the stats
        reach_stats = stats.loc[stats.feature_id == rid]
    
        # compute fim increments
        #fim_inc = get_fim_increments(rid, huc_id, reach_stats.maximum_cms.item())
        max_cms = reach_stats.maximum_cms.item()
        if numpy.isnan(max_cms):
            print(f'Huc8 = {huc_id}, reachid = {rid} contains Nan data. Skipping.')
            continue

        
        try:
            max_stage_ft = m_to_ft(rc.get_stage(rid, max_cms * (1+buffer)))
        except Exception as e:
            print(e)
            continue

        
        max_stage_ft = max(max_stage_ft, 10)
        max_stage_ft = round_up_to_step(max_stage_ft, step_ft)
    
        print(f'Computing flow rates for {rid}, [max={max_stage_ft}]...', end='', flush=True)
        st = time.time()
        
        
        for i in numpy.arange(step_ft, max_stage_ft + 0.1, step_ft):
            stage_in_m = ft_to_m(i)
            flow_in_cms = rc.get_flow(rid, stage_in_m)
    
            fim_args.append({
                                'ft': i,
                                'm': stage_in_m,
                                'cms': flow_in_cms,
                                'cfs': cms_to_cfs(flow_in_cms),
                                'huc8': huc_id,
                            })
       
        print(f'done [elapsed {round(time.time() - st, 1)} sec]')                      
    
        fim_increments[rid] = fim_args

Computing flow rates for 8585734, [max=13.0]...done [elapsed 0.1 sec]
Computing flow rates for 8585000, [max=13.5]...done [elapsed 0.1 sec]
Computing flow rates for 8584992, [max=14.0]...done [elapsed 0.1 sec]
Computing flow rates for 8584984, [max=10.0]...done [elapsed 0.1 sec]
Computing flow rates for 8584978, [max=10.0]...done [elapsed 0.1 sec]
Computing flow rates for 8584962, [max=10.0]...done [elapsed 0.1 sec]
Computing flow rates for 8584940, [max=10.0]...done [elapsed 0.1 sec]
Computing flow rates for 8584928, [max=10.0]...done [elapsed 0.1 sec]
Computing flow rates for 8584920, [max=10.0]...done [elapsed 0.1 sec]
Computing flow rates for 8585720, [max=10.0]...done [elapsed 0.1 sec]
Computing flow rates for 8585718, [max=10.0]...done [elapsed 0.1 sec]
Computing flow rates for 8584904, [max=10.0]...done [elapsed 0.1 sec]
Computing flow rates for 8584900, [max=10.0]...done [elapsed 0.0 sec]
Computing flow rates for 8584886, [max=10.0]...done [elapsed 0.0 sec]
Computing flow rates

In [203]:
fim_increments

{8585734: [{'ft': 0.5,
   'm': 0.15239999512320015,
   'cms': 0.8138755446260789,
   'cfs': 28.74174637044747,
   'huc8': '11010001'},
  {'ft': 1.0,
   'm': 0.3047999902464003,
   'cms': 1.6277510892521578,
   'cfs': 57.48349274089494,
   'huc8': '11010001'},
  {'ft': 1.5,
   'm': 0.4571999853696005,
   'cms': 3.847128460898506,
   'cfs': 135.86007247395023,
   'huc8': '11010001'},
  {'ft': 2.0,
   'm': 0.6095999804928006,
   'cms': 6.066505922496977,
   'cfs': 214.23665538363505,
   'huc8': '11010001'},
  {'ft': 2.5,
   'm': 0.7619999756160007,
   'cms': 9.81853271616244,
   'cfs': 346.738243852178,
   'huc8': '11010001'},
  {'ft': 3.0,
   'm': 0.914399970739201,
   'cms': 13.570559706007046,
   'cfs': 479.23983924872266,
   'huc8': '11010001'},
  {'ft': 3.5,
   'm': 1.066799965862401,
   'cms': 18.824405339823308,
   'cfs': 664.7776646247221,
   'huc8': '11010001'},
  {'ft': 4.0,
   'm': 1.2191999609856012,
   'cms': 24.07825126198881,
   'cfs': 850.3155001836798,
   'huc8': '1101000

In [206]:
# Flatten into a DataFrame
fim_inputs = pandas.concat(
    [pandas.DataFrame(v).assign(reach_id=k) for k, v in fim_increments.items()],
    ignore_index=True
)
fim_inputs

Unnamed: 0,ft,m,cms,cfs,huc8,reach_id
0,0.5,0.1524,0.813876,28.741746,11010001,8585734
1,1.0,0.3048,1.627751,57.483493,11010001,8585734
2,1.5,0.4572,3.847128,135.860072,11010001,8585734
3,2.0,0.6096,6.066506,214.236655,11010001,8585734
4,2.5,0.7620,9.818533,346.738244,11010001,8585734
...,...,...,...,...,...,...
6600,8.0,2.4384,346.418500,12233.655034,11010001,8589582
6601,8.5,2.5908,384.467744,13577.351563,11010001,8589582
6602,9.0,2.7432,422.516991,14921.048147,11010001,8589582
6603,9.5,2.8956,463.562295,16370.549535,11010001,8589582


In [211]:
# write this to an input file
with open('fim_inputs.txt', 'w') as f:
    for idx, row in fim_inputs.iterrows():
        str_ft = str(round(row.ft, 1)).replace('.','_')
        str_cfs = str(round(row.cfs, 1)).replace('.','_')
        label = f'{row.reach_id}__{str_ft}_ft__{str_cfs}_cfs'
        f.write(f'reachfim,{row.huc8},{row.reach_id},{round(row.cms,5)},{label}\n')