From 9801177a088c7d1c39120611244ebca908ea6eaa Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 31 May 2026 20:50:38 -0400 Subject: [PATCH] refactor(errors): unify request errors under a DataRetrievalError taxonomy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An HTTP failure used to surface as a different exception type depending on which module made the request: - legacy query() (wqp, nwis, ngwmn, nldi): ValueError, or NoSitesError(Exception) - waterdata: RateLimited / ServiceUnavailable (RuntimeError), RequestTooLarge (ValueError), ChunkInterrupted (RuntimeError) - nadp / streamstats: bare httpx.HTTPStatusError so no single `except` clause could catch "any dataretrieval request failure." Add a `DataRetrievalError` base and root the existing exceptions on it, so a caller can `except dataretrieval.DataRetrievalError` for any request failure. Each subclass also keeps the built-in it has historically raised (BadRequestError is a ValueError; RateLimited is a RuntimeError), so existing `except ValueError` / `except RuntimeError` handlers keep working unchanged. The taxonomy lives in a new, dependency-free `dataretrieval/exceptions.py` — the single home for it (cf. requests.exceptions, botocore.exceptions) — rather than buried in the pandas/httpx-heavy utils.py: - exceptions.py: DataRetrievalError + the legacy query() types (BadRequestError, NotFoundError, RequestTooLargeError, ServiceUnavailableError; all ValueError) + NoSitesError + the Water Data transport types (RateLimited, ServiceUnavailable, RequestTooLarge). Explicit __all__, re-exported from dataretrieval/__init__.py so the top-level export is intentional, not an accident of `import *`. - utils.py keeps the behavior: query() and a consolidated _raise_for_status() status->exception mapper; it imports the types it raises. - chunking.py keeps the chunker-specific resumable types (ChunkInterrupted and its subclasses, which carry a ChunkedCall resume handle) and imports the transport types from exceptions. The old import paths still resolve (utils.NoSitesError, waterdata.chunking.RateLimited, ...) via re-import, so nothing downstream breaks. Out of scope (follow-ups): nadp/streamstats still raise httpx.HTTPStatusError; nldi's manual non-200 ValueError isn't rooted; waterdata.utils._raise_for_non_200's catch-all for non-retryable 4xx stays a bare RuntimeError (a deliberate fatal/non-resumable signal the chunker relies on). Co-Authored-By: Claude Opus 4.8 (1M context) --- dataretrieval/__init__.py | 1 + dataretrieval/exceptions.py | 142 ++++++++++++++++++++++++++++ dataretrieval/utils.py | 69 ++++++++------ dataretrieval/waterdata/chunking.py | 71 ++------------ dataretrieval/waterdata/utils.py | 3 +- tests/utils_test.py | 70 +++++++++++++- 6 files changed, 258 insertions(+), 98 deletions(-) create mode 100644 dataretrieval/exceptions.py diff --git a/dataretrieval/__init__.py b/dataretrieval/__init__.py index ca302ce0..960de546 100644 --- a/dataretrieval/__init__.py +++ b/dataretrieval/__init__.py @@ -5,6 +5,7 @@ except PackageNotFoundError: __version__ = "version-unknown" +from dataretrieval.exceptions import * from dataretrieval.nadp import * from dataretrieval.nwis import * from dataretrieval.samples import * diff --git a/dataretrieval/exceptions.py b/dataretrieval/exceptions.py new file mode 100644 index 00000000..de9d3db6 --- /dev/null +++ b/dataretrieval/exceptions.py @@ -0,0 +1,142 @@ +"""Exception taxonomy for ``dataretrieval``. + +A failed request from any service module (``nwis``, ``wqp``, ``waterdata``, +``nldi``, ...) raises a subclass of :class:`DataRetrievalError`, so a caller can +handle any request failure with a single ``except dataretrieval.DataRetrievalError``. + +This module deliberately has no third-party dependencies, so any module can +import it without pulling in pandas/httpx. +""" + +from __future__ import annotations + +__all__ = [ + "DataRetrievalError", + "BadRequestError", + "NotFoundError", + "RequestTooLargeError", + "ServiceUnavailableError", + "NoSitesError", + "RateLimited", + "ServiceUnavailable", + "RequestTooLarge", +] + + +class DataRetrievalError(Exception): + """Base class for errors raised when a request to a USGS or EPA web + service fails. + + Every service module (``nwis``, ``wqp``, ``waterdata``, ``nldi``, ...) + raises a subclass of this when a request fails, so a caller can handle any + request failure uniformly:: + + try: + df, md = dataretrieval.wqp.get_results(...) + except dataretrieval.DataRetrievalError: + ... + + Subclasses also inherit from the built-in exception this package has + historically raised for the same condition (e.g. :class:`BadRequestError` + is also a :class:`ValueError`, :class:`RateLimited` is also a + :class:`RuntimeError`), so existing ``except ValueError`` / ``except + RuntimeError`` handlers keep working unchanged. + """ + + +# Legacy ``query()`` path: HTTP status families mapped to ValueError-compatible +# types (the type that path has always raised). +class BadRequestError(DataRetrievalError, ValueError): + """The service rejected the request parameters (HTTP 400).""" + + +class NotFoundError(DataRetrievalError, ValueError): + """The requested resource was not found; often an empty query (HTTP 404).""" + + +class RequestTooLargeError(DataRetrievalError, ValueError): + """The request URL was too long for the service (HTTP 414, or rejected + client-side before it was sent).""" + + +class ServiceUnavailableError(DataRetrievalError, ValueError): + """The service is down or returned a server error (HTTP 5xx).""" + + +class NoSitesError(DataRetrievalError): + """The selection criteria matched no sites/data.""" + + def __init__(self, url): + self.url = url + + def __str__(self): + return ( + "No sites/data found using the selection criteria specified in " + f"url: {self.url}" + ) + + +# Water Data API transport errors: retryable HTTP status families, surfaced as +# RuntimeError-compatible types the chunker detects via ``isinstance`` and wraps +# as resumable interruptions. +class _RetryableTransportError(DataRetrievalError, RuntimeError): + """ + Base for typed HTTP transport failures the chunker recognizes as + transient. + + Raised by :func:`dataretrieval.waterdata.utils._raise_for_non_200` + and walked by :func:`dataretrieval.waterdata.chunking._classify_chunk_error`. + One subclass per recoverable HTTP status family (429 → :class:`RateLimited`, + 5xx → :class:`ServiceUnavailable`); ``ChunkedCall`` wraps them as resumable + :class:`~dataretrieval.waterdata.chunking.ChunkInterrupted` subclasses. + + Parameters + ---------- + message : str + Human-readable error message. + retry_after : float, optional + Seconds to wait before retrying, parsed from the + ``Retry-After`` response header. + + Attributes + ---------- + retry_after : float or None + Seconds to wait before retrying, parsed from the + ``Retry-After`` response header. ``None`` when the header was + absent or unparseable. + """ + + def __init__(self, message: str, *, retry_after: float | None = None) -> None: + super().__init__(message) + self.retry_after = retry_after + + +class RateLimited(_RetryableTransportError): + """ + A USGS Water Data API request was rejected with HTTP 429. + + Exposed as a typed exception so callers (notably the multi-value + chunker) can detect rate-limit failures via ``isinstance`` instead + of string-matching error messages. + """ + + +class ServiceUnavailable(_RetryableTransportError): + """ + A USGS Water Data API request was rejected with HTTP 5xx. + + Surfaced as a typed exception (parallel to :class:`RateLimited`) + so ``ChunkedCall`` can treat transient server failures as + resumable interruptions rather than fatal programmer errors. + """ + + +class RequestTooLarge(DataRetrievalError, ValueError): + """ + No chunking plan fits the URL byte limit. + + Raised when even the smallest reducible plan (every list axis at + singleton chunks and the filter at one clause per sub-request) + still exceeds the server's byte limit. Shrink the input lists, + simplify the filter, or split the call manually. + """ diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py index 7bb03a69..532742f2 100644 --- a/dataretrieval/utils.py +++ b/dataretrieval/utils.py @@ -10,6 +10,13 @@ import dataretrieval from dataretrieval.codes import tz +from dataretrieval.exceptions import ( + BadRequestError, + NoSitesError, + NotFoundError, + RequestTooLargeError, + ServiceUnavailableError, +) HTTPX_DEFAULTS = { "follow_redirects": True, @@ -270,14 +277,42 @@ def __repr__(self) -> str: data_list.append(data) # append results to list""" -def _url_too_long_error(detail: str) -> ValueError: - return ValueError( +def _url_too_long_error(detail: str) -> RequestTooLargeError: + return RequestTooLargeError( "Request URL too long. Modify your query to use fewer sites. " f"{detail}. Pseudo-code example of how to split your query: " f"\n {_URL_TOO_LONG_EXAMPLE}" ) +def _raise_for_status(response: httpx.Response) -> None: + """Raise a typed :class:`DataRetrievalError` for an unsuccessful response. + + Centralizes the HTTP-status-to-exception mapping for the shared + :func:`query` path so every legacy service module (``wqp``, ``nwis``, + ``ngwmn``, ``nldi``) surfaces request failures the same way. A successful + response returns ``None``. The raised types are also :class:`ValueError` + subclasses, preserving this module's historical contract. + """ + status = response.status_code + if status == 400: + raise BadRequestError( + f"Bad Request, check that your parameters are correct. URL: {response.url}" + ) + elif status == 404: + raise NotFoundError( + "Page Not Found Error. May be the result of an empty query. " + f"URL: {response.url}" + ) + elif status == 414: + raise _url_too_long_error(f"API response reason: {response.reason_phrase}") + elif 500 <= status < 600: + raise ServiceUnavailableError( + f"Service Unavailable: {status} {response.reason_phrase}. " + f"The service at {response.url} may be down or experiencing issues." + ) + + def query(url, payload, delimiter=",", ssl_check=True): """Send a query. @@ -321,37 +356,9 @@ def query(url, payload, delimiter=",", ssl_check=True): except httpx.InvalidURL as exc: raise _url_too_long_error(f"httpx rejected the URL client-side: {exc}") from exc - if response.status_code == 400: - raise ValueError( - f"Bad Request, check that your parameters are correct. URL: {response.url}" - ) - elif response.status_code == 404: - raise ValueError( - "Page Not Found Error. May be the result of an empty query. " - + f"URL: {response.url}" - ) - elif response.status_code == 414: - raise _url_too_long_error(f"API response reason: {response.reason_phrase}") - elif 500 <= response.status_code < 600: - raise ValueError( - f"Service Unavailable: {response.status_code} {response.reason_phrase}. " - + f"The service at {response.url} may be down or experiencing issues." - ) + _raise_for_status(response) if response.text.startswith("No sites/data"): raise NoSitesError(response.url) return response - - -class NoSitesError(Exception): - """Custom error class used when selection criteria returns no sites/data.""" - - def __init__(self, url): - self.url = url - - def __str__(self): - return ( - "No sites/data found using the selection criteria specified in " - f"url: {self.url}" - ) diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py index ab079070..ffb4fad9 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/waterdata/chunking.py @@ -66,6 +66,12 @@ import pandas as pd from anyio.from_thread import start_blocking_portal +from dataretrieval.exceptions import ( + DataRetrievalError, + RateLimited, + RequestTooLarge, + ServiceUnavailable, +) from dataretrieval.utils import HTTPX_DEFAULTS from . import _progress @@ -383,70 +389,7 @@ def _passthrough_result( return frame, response -class _RetryableTransportError(RuntimeError): - """ - Base for typed HTTP transport failures the chunker recognizes as - transient. - - Raised by :func:`dataretrieval.waterdata.utils._raise_for_non_200` - and walked by :func:`_classify_chunk_error`. One subclass per - recoverable HTTP status family (429 → :class:`RateLimited`, - 5xx → :class:`ServiceUnavailable`); ``ChunkedCall`` wraps them as - resumable :class:`ChunkInterrupted` subclasses. - - Parameters - ---------- - message : str - Human-readable error message. - retry_after : float, optional - Seconds to wait before retrying, parsed from the - ``Retry-After`` response header. - - Attributes - ---------- - retry_after : float or None - Seconds to wait before retrying, parsed from the - ``Retry-After`` response header. ``None`` when the header was - absent or unparseable. - """ - - def __init__(self, message: str, *, retry_after: float | None = None) -> None: - super().__init__(message) - self.retry_after = retry_after - - -class RateLimited(_RetryableTransportError): - """ - A USGS Water Data API request was rejected with HTTP 429. - - Exposed as a typed exception so callers (notably the multi-value - chunker) can detect rate-limit failures via ``isinstance`` instead - of string-matching error messages. - """ - - -class ServiceUnavailable(_RetryableTransportError): - """ - A USGS Water Data API request was rejected with HTTP 5xx. - - Surfaced as a typed exception (parallel to :class:`RateLimited`) - so ``ChunkedCall`` can treat transient server failures as - resumable interruptions rather than fatal programmer errors. - """ - - -class RequestTooLarge(ValueError): - """ - No chunking plan fits the URL byte limit. - - Raised when even the smallest reducible plan (every list axis at - singleton chunks and the filter at one clause per sub-request) - still exceeds the server's byte limit. Shrink the input lists, - simplify the filter, or split the call manually. - """ - - -class ChunkInterrupted(RuntimeError): +class ChunkInterrupted(DataRetrievalError, RuntimeError): """ Base class for mid-stream chunk failures whose completed work is preserved and resumable. diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index dd622efd..35491c88 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -26,12 +26,11 @@ from anyio.from_thread import start_blocking_portal from dataretrieval import __version__ +from dataretrieval.exceptions import RateLimited, ServiceUnavailable from dataretrieval.utils import HTTPX_DEFAULTS, BaseMetadata from dataretrieval.waterdata import _progress, chunking from dataretrieval.waterdata.chunking import ( _QUOTA_HEADER, - RateLimited, - ServiceUnavailable, _safe_elapsed, get_active_client, ) diff --git a/tests/utils_test.py b/tests/utils_test.py index c25e1084..167acc35 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -5,7 +5,7 @@ import pandas as pd import pytest -from dataretrieval import nwis, utils +from dataretrieval import exceptions, nwis, utils class Test_query: @@ -42,6 +42,74 @@ def test_header(self): assert "user-agent" in response.request.headers +class Test_error_taxonomy: + """The unified request-error hierarchy. + + Every module's request failures are catchable as ``DataRetrievalError``, + while remaining backward-compatible with the built-in type each path + historically raised (``ValueError`` for the legacy ``query`` path, + ``RuntimeError`` for the waterdata retryable types). + """ + + @pytest.mark.parametrize( + "status, exc_name, match", + [ + (400, "BadRequestError", "Bad Request"), + (404, "NotFoundError", "Page Not Found"), + (414, "RequestTooLargeError", "Request URL too long"), + (503, "ServiceUnavailableError", "Service Unavailable: 503"), + ], + ) + def test_query_maps_status_to_typed_error( + self, httpx_mock, status, exc_name, match + ): + """``query`` maps each HTTP status family to a typed error that is both + a ``DataRetrievalError`` (new, unified) and a ``ValueError`` (the type + this path has always raised), with the message preserved.""" + exc_cls = getattr(exceptions, exc_name) + url = "https://example.invalid/x" + httpx_mock.add_response(method="GET", url=f"{url}?a=1", status_code=status) + with pytest.raises(exc_cls, match=match) as excinfo: + utils.query(url, {"a": "1"}) + assert isinstance(excinfo.value, exceptions.DataRetrievalError) + assert isinstance(excinfo.value, ValueError) # backward compatibility + + def test_query_failure_catchable_as_base(self, httpx_mock): + """A bare ``except DataRetrievalError`` catches a legacy query failure.""" + url = "https://example.invalid/y" + httpx_mock.add_response(method="GET", url=f"{url}?a=1", status_code=400) + with pytest.raises(exceptions.DataRetrievalError): + utils.query(url, {"a": "1"}) + + def test_no_sites_error_is_data_retrieval_error(self): + """``NoSitesError`` joins the root (was a bare ``Exception``).""" + assert issubclass(exceptions.NoSitesError, exceptions.DataRetrievalError) + assert not issubclass(exceptions.NoSitesError, ValueError) # unchanged + + def test_waterdata_exceptions_share_the_root(self): + """waterdata's typed exceptions are ``DataRetrievalError`` too, so one + ``except`` clause spans the legacy and waterdata subsystems — while + keeping their historical ``RuntimeError``/``ValueError`` bases.""" + from dataretrieval.waterdata.chunking import ( + ChunkInterrupted, + RateLimited, + RequestTooLarge, + ServiceUnavailable, + ) + + for cls in (RateLimited, ServiceUnavailable, RequestTooLarge, ChunkInterrupted): + assert issubclass(cls, exceptions.DataRetrievalError) + assert issubclass(RateLimited, RuntimeError) + assert issubclass(ServiceUnavailable, RuntimeError) + assert issubclass(RequestTooLarge, ValueError) + + def test_base_exported_at_top_level(self): + """Users can write ``except dataretrieval.DataRetrievalError``.""" + import dataretrieval + + assert dataretrieval.DataRetrievalError is exceptions.DataRetrievalError + + class Test_BaseMetadata: """Tests of BaseMetadata"""