From c6e7e2d64ac9ac39d2bed1118af2d2ba0ff29d4a Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sat, 30 May 2026 10:15:30 -0400 Subject: [PATCH] refactor(waterdata): coherent error handling + safe cleanups for non-OGC getters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Non-OGC getter error coherence + safe cleanups. All public signatures, kwargs (incl. the upstream-driven Samples camelCase), and return shapes are unchanged — the only behavior change is the exception TYPE on HTTP errors. Error coherence: - Route the direct-httpx getters (get_samples, get_samples_summary, get_codes, ratings._search/_download_and_parse) through the module's typed _raise_for_non_200 instead of a bare response.raise_for_status(), so a 429 raises RateLimited and a 5xx raises ServiceUnavailable (with Retry-After and USGS-aware messages) — the same typed errors the OGC/stats path already raises. NOTE: this changes the exception type from httpx.HTTPStatusError; flagged for review. Safe cleanups: - Extract `_get_samples_csv` and call it from get_samples/get_samples_summary (collapses the duplicated GET + read_csv tail). - Fix get_channel docstring drift: channel_flow was labeled as units (it's the discharge value), channel_flow_unit was undocumented, and measurement_type carried last_modified's pasted RFC-3339 block. - _format_api_dates: resolve the local timezone only after the all-NA / duration / interval guards (skip the lookup when it's discarded). Tests: add typed-error regression tests (429 -> RateLimited, 5xx -> ServiceUnavailable). 118 samples/ratings/utils tests pass; ruff clean. Deliberately NOT included (kept backward-compatible / out of scope): no kwarg renames (Samples camelCase mirrors the upstream API), no get_codes return-shape change, no ssl_check additions. The _aggregate_paginated_response / _combine_chunk_responses response-merge dedup is deferred to avoid colliding with the in-flight chunking PR (#295). Co-Authored-By: Claude Opus 4.8 (1M context) --- dataretrieval/waterdata/api.py | 76 ++++++++++++------------------ dataretrieval/waterdata/ratings.py | 5 +- dataretrieval/waterdata/utils.py | 6 +-- tests/waterdata_test.py | 24 ++++++++++ 4 files changed, 61 insertions(+), 50 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 1ec9ed42..61cd88c3 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -34,6 +34,7 @@ _check_profiles, _default_headers, _get_args, + _raise_for_non_200, get_ogc_data, get_stats_data, ) @@ -2116,7 +2117,7 @@ def get_codes(code_service: CODE_SERVICES) -> pd.DataFrame: response = httpx.get(url, headers=_default_headers(), **HTTPX_DEFAULTS) - response.raise_for_status() + _raise_for_non_200(response) data_dict = json.loads(response.text) data_list = data_dict["data"] @@ -2126,6 +2127,30 @@ def get_codes(code_service: CODE_SERVICES) -> pd.DataFrame: return df +def _get_samples_csv( + url: str, params: dict, ssl_check: bool +) -> tuple[pd.DataFrame, httpx.Response]: + """Issue a Samples CSV request and parse the body into a DataFrame. + + Shared tail for the Samples getters: sends the GET with the standard + headers (including ``X-Api-Key``), raises a typed error on a non-200 + (consistent with the OGC/stats path) instead of a bare + ``HTTPStatusError``, and reads the CSV. The caller wraps the response + as metadata and applies any per-getter post-step. + """ + logger.debug("Request: %s", httpx.URL(url).copy_merge_params(params)) + response = httpx.get( + url, + params=params, + verify=ssl_check, + headers=_default_headers(), + **HTTPX_DEFAULTS, + ) + _raise_for_non_200(response) + df = pd.read_csv(StringIO(response.text), delimiter=",") + return df, response + + def get_samples( ssl_check: bool = True, service: SERVICES = "results", @@ -2340,19 +2365,7 @@ def get_samples( url = f"{SAMPLES_URL}/{service}/{profile}" - logger.debug("Request: %s", httpx.URL(url).copy_merge_params(params)) - - response = httpx.get( - url, - params=params, - verify=ssl_check, - headers=_default_headers(), - **HTTPX_DEFAULTS, - ) - - response.raise_for_status() - - df = pd.read_csv(StringIO(response.text), delimiter=",") + df, response = _get_samples_csv(url, params, ssl_check) df = _attach_datetime_columns(df) return df, BaseMetadata(response) @@ -2414,19 +2427,7 @@ def get_samples_summary( url = f"{SAMPLES_URL}/summary/{quote(monitoringLocationIdentifier, safe='')}" params = {"mimeType": "text/csv"} - logger.debug("Request: %s", httpx.URL(url).copy_merge_params(params)) - - response = httpx.get( - url, - params=params, - verify=ssl_check, - headers=_default_headers(), - **HTTPX_DEFAULTS, - ) - - response.raise_for_status() - - df = pd.read_csv(StringIO(response.text), delimiter=",") + df, response = _get_samples_csv(url, params, ssl_check) return df, BaseMetadata(response) @@ -2758,6 +2759,8 @@ def get_channel( channel_name : string or iterable of strings, optional The channel name. channel_flow : string or iterable of strings, optional + The channel discharge (flow). + channel_flow_unit : string or iterable of strings, optional The units for channel discharge. channel_width : string or iterable of strings, optional The channel width. @@ -2788,24 +2791,7 @@ def get_channel( longitudinal_velocity_description : string or iterable of strings, optional The longitudinal velocity description. measurement_type : string or iterable of strings, optional - The measurement type. - The last time a record was refreshed in our database. This may happen - due to regular operational processes and does not necessarily indicate - anything about the measurement has changed. You can query this field - using date-times or intervals, adhering to RFC 3339, or using ISO 8601 - duration objects. Intervals may be bounded or half-bounded (double-dots - at start or end). - Examples: - - * A date-time: "2018-02-12T23:20:50Z" - * A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z" - * Half-bounded intervals: "2018-02-12T00:00:00Z/.." or - "../2018-03-18T12:31:12Z" - * Duration objects: "P1M" for data from the past month or "PT36H" for the - last 36 hours - - Only features that have a last_modified that intersects the value of - datetime are selected. + The type of channel measurement. skip_geometry : boolean, optional This option can be used to skip response geometries for each feature. The returning object will be a data frame with no spatial information. diff --git a/dataretrieval/waterdata/ratings.py b/dataretrieval/waterdata/ratings.py index 0e1b503d..8bdd99b8 100644 --- a/dataretrieval/waterdata/ratings.py +++ b/dataretrieval/waterdata/ratings.py @@ -29,6 +29,7 @@ _check_monitoring_location_id, _default_headers, _format_api_dates, + _raise_for_non_200, ) logger = logging.getLogger(__name__) @@ -248,7 +249,7 @@ def _search( verify=ssl_check, **HTTPX_DEFAULTS, ) - response.raise_for_status() + _raise_for_non_200(response) return response.json().get("features", []) @@ -262,7 +263,7 @@ def _download_and_parse( response = httpx.get( url, headers=_default_headers(), verify=ssl_check, **HTTPX_DEFAULTS ) - response.raise_for_status() + _raise_for_non_200(response) if file_path is not None: with open(os.path.join(file_path, feature["id"]), "w") as f: diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 1f52b3e2..459d464a 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -263,8 +263,6 @@ def _format_api_dates( """ if datetime_input is None: return None - # Get timezone - local_timezone = datetime.now().astimezone().tzinfo # Convert single string to list for uniform processing if isinstance(datetime_input, str): @@ -295,7 +293,9 @@ def _format_api_dates( return single # Half-bounded ranges: NA endpoints render as ".."; any unparseable non-NA - # element invalidates the range. + # element invalidates the range. Resolve the local tz only now — after the + # all-NA / duration / interval guards above have had their chance to return. + local_timezone = datetime.now().astimezone().tzinfo formatted = [ _format_one(dt, date=date, local_tz=local_timezone) for dt in datetime_input ] diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 70b6e229..d8704cf0 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -128,6 +128,30 @@ def test_get_samples_summary_rejects_list(): get_samples_summary(monitoringLocationIdentifier=["USGS-04183500"]) +def test_get_samples_raises_typed_error_on_429(httpx_mock): + """Non-200 from the Samples endpoint now raises the module's typed error + (RateLimited on 429) — consistent with the OGC/stats path — instead of a + bare httpx.HTTPStatusError.""" + from dataretrieval.waterdata.chunking import RateLimited + + httpx_mock.add_response(status_code=429, headers={"Retry-After": "30"}) + with pytest.raises(RateLimited): + get_samples( + service="results", + profile="fullphyschem", + monitoringLocationIdentifier="USGS-05406500", + ) + + +def test_get_samples_summary_raises_typed_error_on_5xx(httpx_mock): + """A 5xx from the Samples summary endpoint raises ServiceUnavailable.""" + from dataretrieval.waterdata.chunking import ServiceUnavailable + + httpx_mock.add_response(status_code=503) + with pytest.raises(ServiceUnavailable): + get_samples_summary(monitoringLocationIdentifier="USGS-04183500") + + def test_check_profiles(): """Tests that correct errors are raised for invalid profiles.""" with pytest.raises(ValueError):