From 94ac1dca04c6c5e7dff1d34c781844ad90521edd Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Mon, 6 Apr 2026 07:20:00 +0900 Subject: [PATCH 1/2] refactor: unify facet filter parsing across CLI commands Consolidate three inconsistent filter parsing implementations into a single `parse_facet_filters` returning `dict[str, list[str]]` with multi-value OR semantics. Previously, `--filter key=val key=val2` in executions silently overwrote the first value; it now correctly returns results matching either value, consistent with `--dataset-filter` in `datasets list` and `solve`. - `cli/_utils.py`: `parse_facet_filters` returns multi-value dict - `cli/datasets.py`, `cli/solve.py`: replace inline parsing with shared function - `cli/executions.py`: update `ListGroupsFilterOptions.facets` type - `models/execution.py`: update `_filter_executions_by_facets` signature, extract `_selectors_match_facet` helper - Fix wrong docstring on `datasets list-columns` --- .../climate-ref/src/climate_ref/cli/_utils.py | 25 +++--- .../src/climate_ref/cli/datasets.py | 20 ++--- .../src/climate_ref/cli/executions.py | 9 ++- .../climate-ref/src/climate_ref/cli/solve.py | 17 ++-- .../src/climate_ref/models/execution.py | 80 +++++++++++-------- .../tests/unit/cli/test_executions.py | 10 +-- .../climate-ref/tests/unit/cli/test_solve.py | 4 +- .../climate-ref/tests/unit/cli/test_utils.py | 33 +++++--- 8 files changed, 107 insertions(+), 91 deletions(-) diff --git a/packages/climate-ref/src/climate_ref/cli/_utils.py b/packages/climate-ref/src/climate_ref/cli/_utils.py index efc9fad67..cb0496a7d 100644 --- a/packages/climate-ref/src/climate_ref/cli/_utils.py +++ b/packages/climate-ref/src/climate_ref/cli/_utils.py @@ -35,19 +35,24 @@ def format_size(size_bytes: int | float) -> str: return f"{size:.1f} TB" -def parse_facet_filters(filters: list[str] | None) -> dict[str, str]: +def parse_facet_filters(filters: list[str] | None) -> dict[str, list[str]]: """ Parse facet filters from key=value format into a dictionary. + Multiple values for the same key are collected into a list (OR logic). + Different keys are ANDed together when used for filtering. + Parameters ---------- filters - List of filter strings in 'key=value' format + List of filter strings in 'key=value' format. + Multiple entries with the same key are ORed + (e.g., ``["source_id=A", "source_id=B"]`` matches A or B). Returns ------- - dict[str, str] - Dictionary mapping facet keys to values + dict[str, list[str]] + Dictionary mapping facet keys to lists of allowed values Raises ------ @@ -57,12 +62,15 @@ def parse_facet_filters(filters: list[str] | None) -> dict[str, str]: Examples -------- >>> parse_facet_filters(["source_id=GFDL-ESM4", "variable_id=tas"]) - {'source_id': 'GFDL-ESM4', 'variable_id': 'tas'} + {'source_id': ['GFDL-ESM4'], 'variable_id': ['tas']} + + >>> parse_facet_filters(["source_id=A", "source_id=B"]) + {'source_id': ['A', 'B']} """ if not filters: return {} - parsed: dict[str, str] = {} + parsed: dict[str, list[str]] = {} for filter_str in filters: if "=" not in filter_str: raise ValueError( @@ -80,10 +88,7 @@ def parse_facet_filters(filters: list[str] | None) -> dict[str, str]: if not value: raise ValueError(f"Empty value in filter: '{filter_str}'") - if key in parsed: - logger.warning(f"Filter key '{key}' specified multiple times. Using last value: '{value}'") - - parsed[key] = value + parsed.setdefault(key, []).append(value) return parsed diff --git a/packages/climate-ref/src/climate_ref/cli/datasets.py b/packages/climate-ref/src/climate_ref/cli/datasets.py index 4a2de21ca..febc80b3b 100644 --- a/packages/climate-ref/src/climate_ref/cli/datasets.py +++ b/packages/climate-ref/src/climate_ref/cli/datasets.py @@ -14,7 +14,7 @@ import typer from loguru import logger -from climate_ref.cli._utils import pretty_print_df +from climate_ref.cli._utils import parse_facet_filters, pretty_print_df from climate_ref.datasets import get_dataset_adapter from climate_ref.models import Dataset from climate_ref.solver import apply_dataset_filters @@ -60,15 +60,10 @@ def list_( # noqa: PLR0913 data_catalog = adapter.load_catalog(database, include_files=include_files, limit=limit) if dataset_filter: - parsed_filters: dict[str, list[str]] = {} - for entry in dataset_filter: - if "=" not in entry: - raise typer.BadParameter( - f"Invalid dataset filter {entry!r}. Expected key=value format.", - param_hint="--dataset-filter", - ) - key, value = entry.split("=", 1) - parsed_filters.setdefault(key, []).append(value) + try: + parsed_filters = parse_facet_filters(dataset_filter) + except ValueError as e: + raise typer.BadParameter(str(e), param_hint="--dataset-filter") for facet in parsed_filters: if facet not in data_catalog.columns: @@ -108,10 +103,7 @@ def list_columns( include_files: bool = typer.Option(False, help="Include files in the output"), ) -> None: """ - Print the current climate_ref configuration - - If a configuration directory is provided, - the configuration will attempt to load from the specified directory. + List the available columns in the data catalog for the given source type. """ database = ctx.obj.database diff --git a/packages/climate-ref/src/climate_ref/cli/executions.py b/packages/climate-ref/src/climate_ref/cli/executions.py index 32f6d567b..bee225d1e 100644 --- a/packages/climate-ref/src/climate_ref/cli/executions.py +++ b/packages/climate-ref/src/climate_ref/cli/executions.py @@ -40,8 +40,8 @@ class ListGroupsFilterOptions: provider: list[str] | None = None """Filter by provider slug (substring, case-insensitive)""" - facets: dict[str, str] | None = None - """Filter by facet key-value pairs (exact match)""" + facets: dict[str, list[str]] | None = None + """Filter by facet key-value pairs (AND across keys, OR within key)""" @app.command() @@ -486,7 +486,10 @@ def emit_no_results_warning( if filters.provider: filter_parts.append(f"provider filters: {filters.provider}") if filters.facets: - facet_strs = [f"{k}={v}" for k, v in filters.facets.items()] + facet_strs = [ + f"{k}={','.join(values)}" if len(values) > 1 else f"{k}={values[0]}" + for k, values in filters.facets.items() + ] filter_parts.append(f"facet filters: {facet_strs}") logger.warning( diff --git a/packages/climate-ref/src/climate_ref/cli/solve.py b/packages/climate-ref/src/climate_ref/cli/solve.py index bd44d12b3..d8da183a6 100644 --- a/packages/climate-ref/src/climate_ref/cli/solve.py +++ b/packages/climate-ref/src/climate_ref/cli/solve.py @@ -2,6 +2,8 @@ import typer +from climate_ref.cli._utils import parse_facet_filters + app = typer.Typer() @@ -85,17 +87,10 @@ def solve( # noqa: PLR0913 config = ctx.obj.config db = ctx.obj.database - parsed_dataset_filters: dict[str, list[str]] | None = None - if dataset_filter: - parsed_dataset_filters = {} - for entry in dataset_filter: - if "=" not in entry: - raise typer.BadParameter( - f"Invalid dataset filter {entry!r}. Expected key=value format.", - param_hint="--dataset-filter", - ) - key, value = entry.split("=", 1) - parsed_dataset_filters.setdefault(key, []).append(value) + try: + parsed_dataset_filters = parse_facet_filters(dataset_filter) or None + except ValueError as e: + raise typer.BadParameter(str(e), param_hint="--dataset-filter") filters = SolveFilterOptions( diagnostic=diagnostic, diff --git a/packages/climate-ref/src/climate_ref/models/execution.py b/packages/climate-ref/src/climate_ref/models/execution.py index a6d029340..d5149910e 100644 --- a/packages/climate-ref/src/climate_ref/models/execution.py +++ b/packages/climate-ref/src/climate_ref/models/execution.py @@ -406,9 +406,39 @@ def get_execution_group_and_latest( return query # type: ignore +def _selectors_match_facet( + selectors: dict[str, list[list[str]]], + facet_key: str, + facet_values: list[str], +) -> bool: + """ + Check if an execution group's selectors match a single facet filter. + + Parameters + ---------- + selectors + The execution group's selectors dict (dataset_type -> list of [key, value] pairs) + facet_key + Facet key, optionally prefixed with ``dataset_type.`` to scope to one type + facet_values + Allowed values (OR logic -- any match is sufficient) + """ + if "." in facet_key: + dataset_type, key = facet_key.split(".", 1) + if dataset_type in selectors: + return any([key, fv] in selectors[dataset_type] for fv in facet_values) + return False + + # Bare key: search across all dataset types + for ds_type_selectors in selectors.values(): + if any([facet_key, fv] in ds_type_selectors for fv in facet_values): + return True + return False + + def _filter_executions_by_facets( results: Sequence[tuple[ExecutionGroup, Execution | None]], - facet_filters: dict[str, str], + facet_filters: dict[str, list[str]], ) -> list[tuple[ExecutionGroup, Execution | None]]: """ Filter execution groups and their latest executions based on facet key-value pairs. @@ -416,13 +446,13 @@ def _filter_executions_by_facets( This is a relatively expensive operation as it requires iterating over all results. This should be replaced once we have normalised the selectors into a separate table. - Parameters ---------- results List of tuples containing ExecutionGroup and its latest Execution (or None) facet_filters - Dictionary of facet key-value pairs to filter by (AND logic, exact match) + Dictionary mapping facet keys to lists of allowed values. + Different keys are ANDed; multiple values for the same key are ORed. Returns ------- @@ -434,41 +464,24 @@ def _filter_executions_by_facets( or dataset_type.key=value (searches specific dataset type) - Key=value filters search across all dataset types - dataset_type.key=value filters only search within the specified dataset type - - Multiple values within same filter type use OR logic - - All specified facets must match for an execution group to be included (AND logic) - """ - filtered_results = [] - for eg, execution in results: - all_filters_match = True - for facet_key, facet_value in facet_filters.items(): - filter_match = False - if "." in facet_key: - # Handle dataset_type.key=value format - dataset_type, key = facet_key.split(".", 1) - if dataset_type in eg.selectors: - if [key, facet_value] in eg.selectors[dataset_type]: - filter_match = True - break - else: - # Handle key=value format (search across all dataset types) - for ds_type_selectors in eg.selectors.values(): - if [facet_key, facet_value] in ds_type_selectors: - filter_match = True - break - - if not filter_match: - all_filters_match = False - break - if all_filters_match: - filtered_results.append((eg, execution)) - return filtered_results + - Multiple values for the same key use OR logic + - All specified keys must match for an execution group to be included (AND logic) + """ + return [ + (eg, execution) + for eg, execution in results + if all( + _selectors_match_facet(eg.selectors, facet_key, facet_values) + for facet_key, facet_values in facet_filters.items() + ) + ] def get_execution_group_and_latest_filtered( # noqa: PLR0913 session: Session, diagnostic_filters: list[str] | None = None, provider_filters: list[str] | None = None, - facet_filters: dict[str, str] | None = None, + facet_filters: dict[str, list[str]] | None = None, dirty: bool | None = None, successful: bool | None = None, ) -> list[tuple[ExecutionGroup, Execution | None]]: @@ -484,7 +497,8 @@ def get_execution_group_and_latest_filtered( # noqa: PLR0913 provider_filters List of provider slug substrings (OR logic, case-insensitive) facet_filters - Dictionary of facet key-value pairs (AND logic, exact match) + Dictionary mapping facet keys to lists of allowed values. + Different keys are ANDed; multiple values for the same key are ORed. dirty If True, only return dirty execution groups. If False, only return clean execution groups. diff --git a/packages/climate-ref/tests/unit/cli/test_executions.py b/packages/climate-ref/tests/unit/cli/test_executions.py index b714f89dc..e38877b8f 100644 --- a/packages/climate-ref/tests/unit/cli/test_executions.py +++ b/packages/climate-ref/tests/unit/cli/test_executions.py @@ -252,8 +252,8 @@ def test_filter_empty_results_warning(self, db_with_groups, invoke_cli): assert "Applied filters: facet filters: ['source_id=NONEXISTENT']" in result.stderr assert "id" in result.stdout # Ensure empty table headers are still printed - def test_facet_warning_multiple_same_key(self, db_with_groups, invoke_cli): - # This functionality might be useful in future, but not today + def test_facet_multiple_same_key_returns_both(self, db_with_groups, invoke_cli): + # Multiple values for the same key are ORed (both should appear) result = invoke_cli( [ "executions", @@ -264,12 +264,8 @@ def test_facet_warning_multiple_same_key(self, db_with_groups, invoke_cli): "source_id=ACCESS-ESM1-5", ] ) - assert ( - "Filter key 'source_id' specified multiple times. Using last value: 'ACCESS-ESM1-5'" - in result.stderr - ) assert "ACCESS-ESM1-5" in result.stdout - assert "GFDL-ESM4" not in result.stdout + assert "GFDL-ESM4" in result.stdout def test_filter_successful(self, db_with_groups, invoke_cli): result = invoke_cli(["executions", "list-groups", "--successful"]) diff --git a/packages/climate-ref/tests/unit/cli/test_solve.py b/packages/climate-ref/tests/unit/cli/test_solve.py index 57a44da00..879d7c810 100644 --- a/packages/climate-ref/tests/unit/cli/test_solve.py +++ b/packages/climate-ref/tests/unit/cli/test_solve.py @@ -94,8 +94,8 @@ def test_solve_with_invalid_dataset_filter(self, sample_data_dir, db, invoke_cli expected_exit_code=2, ) - assert "Invalid dataset filter" in result.stderr - assert "Expected key=value format" in result.stderr + assert "Invalid filter format" in result.stderr + assert "Expected format: 'key=value'" in result.stderr def test_solve_with_limit(self, sample_data_dir, db, invoke_cli, mocker): mock_solve = mocker.patch("climate_ref.solver.solve_required_executions") diff --git a/packages/climate-ref/tests/unit/cli/test_utils.py b/packages/climate-ref/tests/unit/cli/test_utils.py index fb30461e4..1132a123b 100644 --- a/packages/climate-ref/tests/unit/cli/test_utils.py +++ b/packages/climate-ref/tests/unit/cli/test_utils.py @@ -46,7 +46,7 @@ def test_format_size_boundary_values(self): def test_parse_facet_filters_valid_input(): filters = ["source_id=GFDL-ESM4", "variable_id=tas"] - expected = {"source_id": "GFDL-ESM4", "variable_id": "tas"} + expected = {"source_id": ["GFDL-ESM4"], "variable_id": ["tas"]} assert parse_facet_filters(filters) == expected @@ -60,17 +60,15 @@ def test_parse_facet_filters_none_input(): def test_parse_facet_filters_with_whitespace(): filters = [" key1 = value1 ", "key2=value2 "] - expected = {"key1": "value1", "key2": "value2"} + expected = {"key1": ["value1"], "key2": ["value2"]} assert parse_facet_filters(filters) == expected -def test_parse_facet_filters_duplicate_key(caplog): +def test_parse_facet_filters_multiple_values_same_key(): + """Multiple values for the same key are collected into a list (OR semantics).""" filters = ["key=value1", "key=value2"] - expected = {"key": "value2"} - with caplog.at_level("WARNING"): - result = parse_facet_filters(filters) - assert result == expected - assert "Filter key 'key' specified multiple times. Using last value: 'value2'" in caplog.text + result = parse_facet_filters(filters) + assert result == {"key": ["value1", "value2"]} def test_parse_facet_filters_invalid_format_no_equals(): @@ -90,12 +88,25 @@ def test_parse_facet_filters_empty_value(): def test_parse_facet_filters_value_with_equals(): filters = ["query=some_key=some_value"] - expected = {"query": "some_key=some_value"} + expected = {"query": ["some_key=some_value"]} assert parse_facet_filters(filters) == expected -def test_parse_facet_filters_mixed_valid_and_invalid(caplog): +def test_parse_facet_filters_mixed_valid_and_invalid(): filters = ["key1=value1", "invalid", "key2=value2"] with pytest.raises(ValueError, match="Invalid filter format: 'invalid'"): parse_facet_filters(filters) - assert not caplog.text + + +def test_parse_facet_filters_mixed_single_and_multi(): + """Different keys can have different numbers of values.""" + filters = ["source_id=A", "source_id=B", "variable_id=tas"] + result = parse_facet_filters(filters) + assert result == {"source_id": ["A", "B"], "variable_id": ["tas"]} + + +def test_parse_facet_filters_dotted_key(): + """Dotted keys (dataset_type.facet) are preserved as-is.""" + filters = ["cmip6.source_id=ACCESS-CM2"] + result = parse_facet_filters(filters) + assert result == {"cmip6.source_id": ["ACCESS-CM2"]} From 450759f6ea3957df973a53ae99e12e7bb4790651 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Mon, 6 Apr 2026 07:22:24 +0900 Subject: [PATCH 2/2] docs: add changelog entry for PR #613 --- changelog/613.improvement.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog/613.improvement.md diff --git a/changelog/613.improvement.md b/changelog/613.improvement.md new file mode 100644 index 000000000..f30ec1d3a --- /dev/null +++ b/changelog/613.improvement.md @@ -0,0 +1 @@ +Unify facet filter parsing across CLI commands. `--filter` in `executions list-groups` and `delete-groups` now supports multiple values for the same key with OR semantics (e.g., `--filter source_id=A --filter source_id=B`), consistent with `--dataset-filter` in `datasets list` and `solve`.