From 41d07f5a42841e4bae68aed51c6e1f9fd09a0258 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Fri, 6 Mar 2026 17:46:34 +0100 Subject: [PATCH 1/5] Fetch recipes in advance --- .../src/climate_ref_esmvaltool/__init__.py | 31 ++++++++++++++----- .../diagnostics/base.py | 6 ++-- .../src/climate_ref_esmvaltool/recipe.py | 18 +++-------- .../tests/unit/test_provider.py | 10 +++--- 4 files changed, 37 insertions(+), 28 deletions(-) diff --git a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py index 4e849a14c..966de5de3 100644 --- a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py +++ b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py @@ -19,21 +19,26 @@ ) from climate_ref_core.providers import CondaDiagnosticProvider from climate_ref_esmvaltool._version import __version__ -from climate_ref_esmvaltool.recipe import _ESMVALCORE_URL, _ESMVALTOOL_URL +from climate_ref_esmvaltool.diagnostics.base import _DATASETS_REGISTRY_NAME +from climate_ref_esmvaltool.recipe import ( + _ESMVALCORE_URL, + _ESMVALTOOL_URL, + _RECIPES_REGISTRY_NAME, + _RECIPES_URL, +) if TYPE_CHECKING: from climate_ref.config import Config -_REGISTRY_NAME = "esmvaltool" - class ESMValToolProvider(CondaDiagnosticProvider): """Provider for ESMValTool diagnostics.""" def fetch_data(self, config: Config) -> None: """Fetch ESMValTool reference data.""" - registry = dataset_registry_manager[_REGISTRY_NAME] - fetch_all_files(registry, _REGISTRY_NAME, output_dir=None) + for registry_name in [_DATASETS_REGISTRY_NAME, _RECIPES_REGISTRY_NAME]: + registry = dataset_registry_manager[registry_name] + fetch_all_files(registry, registry_name, output_dir=None) def validate_setup(self, config: Config) -> bool: """Validate conda environment and data checksums.""" @@ -42,8 +47,9 @@ def validate_setup(self, config: Config) -> bool: return False # Then check data checksums - registry = dataset_registry_manager[_REGISTRY_NAME] - errors = validate_registry_cache(registry, _REGISTRY_NAME) + errors = [] + for registry_name in [_DATASETS_REGISTRY_NAME, _RECIPES_REGISTRY_NAME]: + errors.extend(validate_registry_cache(dataset_registry_manager[registry_name], registry_name)) if errors: for error in errors: logger.error(f"{self.slug} validation failed: {error}") @@ -73,8 +79,17 @@ def get_data_path(self) -> Path | None: # Register OBS, OBS6, and raw data dataset_registry_manager.register( - "esmvaltool", + name=_DATASETS_REGISTRY_NAME, base_url=DATASET_URL, package="climate_ref_esmvaltool.dataset_registry", resource="data.txt", + cache_name=f"climate_ref_esmvaltool/{_DATASETS_REGISTRY_NAME}", +) +# Register the ESMValTool recipes. +dataset_registry_manager.register( + name=_RECIPES_REGISTRY_NAME, + base_url=_RECIPES_URL, + package="climate_ref_esmvaltool", + resource="recipes.txt", + cache_name=f"climate_ref_esmvaltool/{_RECIPES_REGISTRY_NAME}", ) diff --git a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/diagnostics/base.py b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/diagnostics/base.py index 43e78fcf5..125f5aabc 100644 --- a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/diagnostics/base.py +++ b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/diagnostics/base.py @@ -29,6 +29,8 @@ ) from climate_ref_esmvaltool.types import MetricBundleArgs, OutputBundleArgs, Recipe +_DATASETS_REGISTRY_NAME = "esmvaltool-datasets" + def get_cmip_source_type( input_files: dict[SourceDatasetType, pandas.DataFrame], @@ -202,13 +204,13 @@ def build_cmd(self, definition: ExecutionDefinition) -> Iterable[str]: } # Configure the paths to OBS/OBS6/native6 and non-compliant obs4MIPs data - registry = dataset_registry_manager["esmvaltool"] + registry = dataset_registry_manager[_DATASETS_REGISTRY_NAME] data_dir = registry.abspath / "ESMValTool" # type: ignore[attr-defined] if not data_dir.exists(): logger.warning( "ESMValTool observational and reanalysis data is not available " f"in {data_dir}, you may want to run the command " - "`ref datasets fetch-data --registry esmvaltool`." + f"`ref datasets fetch-data --registry {_DATASETS_REGISTRY_NAME}`." ) else: config["projects"]["OBS"] = { diff --git a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py index dd819b9bd..64d4858d3 100644 --- a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py +++ b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py @@ -1,15 +1,14 @@ from __future__ import annotations -import importlib.resources from collections.abc import Iterator from pathlib import Path from typing import TYPE_CHECKING, Any import cftime import pandas as pd -import pooch import yaml +from climate_ref_core.dataset_registry import dataset_registry_manager from climate_ref_esmvaltool.types import Recipe if TYPE_CHECKING: @@ -295,17 +294,10 @@ def get_child_and_parent_dataset( _ESMVALCORE_COMMIT = "da81d5f67158f3d2603831b56ab6b4fb8a388d86" _ESMVALCORE_URL = f"git+https://github.com/ESMValGroup/ESMValCore.git@{_ESMVALCORE_COMMIT}" -_RECIPES = pooch.create( - path=pooch.os_cache("climate_ref_esmvaltool"), - # TODO: use a released version - # base_url="https://raw.githubusercontent.com/ESMValGroup/ESMValTool/refs/tags/v{version}/esmvaltool/recipes/", - # version=_ESMVALTOOL_VERSION, - base_url=f"https://raw.githubusercontent.com/ESMValGroup/ESMValTool/{_ESMVALTOOL_COMMIT}/esmvaltool/recipes/", - env="REF_METRICS_ESMVALTOOL_DATA_DIR", - retry_if_failed=10, +_RECIPES_URL = ( + f"https://raw.githubusercontent.com/ESMValGroup/ESMValTool/{_ESMVALTOOL_COMMIT}/esmvaltool/recipes/" ) -with importlib.resources.files("climate_ref_esmvaltool").joinpath("recipes.txt").open("rb") as _buffer: - _RECIPES.load_registry(_buffer) +_RECIPES_REGISTRY_NAME = f"esmvaltool-recipes-v{_ESMVALTOOL_VERSION}" def fix_annual_statistics_keep_year(recipe: Recipe) -> None: @@ -348,7 +340,7 @@ def load_recipe(recipe: str) -> Recipe: ------- The loaded recipe. """ - filename = _RECIPES.fetch(recipe) + filename = dataset_registry_manager[_RECIPES_REGISTRY_NAME].fetch(recipe) def normalize(obj: Any) -> Any: # Ensure objects in the recipe are not shared. diff --git a/packages/climate-ref-esmvaltool/tests/unit/test_provider.py b/packages/climate-ref-esmvaltool/tests/unit/test_provider.py index 1c66d1c7d..3ff969002 100644 --- a/packages/climate-ref-esmvaltool/tests/unit/test_provider.py +++ b/packages/climate-ref-esmvaltool/tests/unit/test_provider.py @@ -2,7 +2,7 @@ from pathlib import Path import pooch -from climate_ref_esmvaltool import ESMValToolProvider, __version__, provider +from climate_ref_esmvaltool import _DATASETS_REGISTRY_NAME, ESMValToolProvider, __version__, provider def test_provider(): @@ -40,11 +40,11 @@ def test_fetch_data(self, mocker): provider.fetch_data(mock_config) - mock_fetch.assert_called_once() + mock_fetch.assert_called() # Check it's using the right registry name - call_args = mock_fetch.call_args - assert call_args[0][1] == "esmvaltool" - assert call_args[1]["output_dir"] is None + call = mock_fetch.mock_calls[0] + assert call.args[1] == _DATASETS_REGISTRY_NAME + assert call.kwargs["output_dir"] is None def test_validate_setup_env_missing(self, mocker): """Test validate_setup returns False when conda env is missing.""" From 06cf52bb2195010ec1c090ef309986fd77d14639 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Fri, 6 Mar 2026 17:49:14 +0100 Subject: [PATCH 2/5] Add changelog --- changelog/582.improvement.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog/582.improvement.md diff --git a/changelog/582.improvement.md b/changelog/582.improvement.md new file mode 100644 index 000000000..d2e8fc719 --- /dev/null +++ b/changelog/582.improvement.md @@ -0,0 +1 @@ +Fetch ESMValTool recipes when installing the provider. From 65f9ed9dc374a74d7940853db1a82d0bdfdb966b Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Wed, 11 Mar 2026 11:13:14 +0100 Subject: [PATCH 3/5] Put everything under ~/.cache/climate_ref --- .../src/climate_ref_core/dataset_registry.py | 10 +++-- .../test_dataset_registry.py | 38 +++++++++++++++++-- .../src/climate_ref_esmvaltool/__init__.py | 4 +- 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py b/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py index 9e29a7516..b653ed539 100644 --- a/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py +++ b/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py @@ -245,14 +245,18 @@ def register( # noqa: PLR0913 This defaults to the value of `name` if not provided. """ if cache_name is None: - cache_name = "climate_ref" + cache_name = name + + if env_cache_dir := os.environ.get("REF_DATASET_CACHE_DIR"): + cache_dir = pathlib.Path(os.path.expandvars(env_cache_dir)).expanduser() + else: + cache_dir = pooch.os_cache("climate_ref") registry = pooch.create( - path=pooch.os_cache(cache_name), + path=cache_dir / cache_name, base_url=base_url, version=version, retry_if_failed=10, - env="REF_DATASET_CACHE_DIR", ) registry.load_registry(str(importlib.resources.files(package) / resource)) self._registries[name] = registry diff --git a/packages/climate-ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py b/packages/climate-ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py index ed058aafe..59ec88daf 100644 --- a/packages/climate-ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py +++ b/packages/climate-ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py @@ -85,7 +85,7 @@ def test_getitem(self, mocker, fake_registry_file): assert retrieved_registry == mock_pooch_instance @pytest.mark.parametrize( - "cache_name, expected", [(None, "climate_ref"), ("custom_cache", "custom_cache")] + "cache_name, expected", [(None, "test_registry"), ("custom_cache", "custom_cache")] ) def test_with_cache_name(self, mocker, fake_registry_file, cache_name, expected): registry = DatasetRegistryManager() @@ -93,13 +93,45 @@ def test_with_cache_name(self, mocker, fake_registry_file, cache_name, expected) base_url = "http://example.com" mock_pooch = mocker.patch("climate_ref_core.dataset_registry.pooch") + mock_pooch.os_cache.return_value = Path("/path/to/climate_ref") package, resource = self.setup_registry_file(fake_registry_file) registry.register(name, base_url, package, resource, cache_name=cache_name) - mock_pooch.os_cache.assert_called_with(expected) + mock_pooch.os_cache.assert_called_with("climate_ref") assert name in registry._registries - mock_pooch.create.assert_called_once() + expected_kwargs = { + "base_url": "http://example.com", + "path": Path("/path/to/climate_ref", expected), + "retry_if_failed": 10, + "version": None, + } + mock_pooch.create.assert_called_once_with(**expected_kwargs) + + @pytest.mark.parametrize("env", [None, "", "/some/other/path"]) + def test_with_environment_variable(self, monkeypatch, mocker, fake_registry_file, env): + if env is not None: + monkeypatch.setenv("REF_DATASET_CACHE_DIR", env) + expected_path = Path(env) / "test_registry" if env else Path("/path/to/climate_ref") / "test_registry" + + registry = DatasetRegistryManager() + name = "test_registry" + base_url = "http://example.com" + + mock_pooch = mocker.patch("climate_ref_core.dataset_registry.pooch") + mock_pooch.os_cache.return_value = Path("/path/to/climate_ref") + package, resource = self.setup_registry_file(fake_registry_file) + + registry.register(name, base_url, package, resource) + + assert name in registry._registries + expected_kwargs = { + "path": expected_path, + "base_url": "http://example.com", + "retry_if_failed": 10, + "version": None, + } + mock_pooch.create.assert_called_once_with(**expected_kwargs) @pytest.mark.parametrize("symlink", [True, False]) diff --git a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py index 966de5de3..2d6a1286e 100644 --- a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py +++ b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py @@ -83,7 +83,7 @@ def get_data_path(self) -> Path | None: base_url=DATASET_URL, package="climate_ref_esmvaltool.dataset_registry", resource="data.txt", - cache_name=f"climate_ref_esmvaltool/{_DATASETS_REGISTRY_NAME}", + cache_name=_DATASETS_REGISTRY_NAME.replace("-", "/"), ) # Register the ESMValTool recipes. dataset_registry_manager.register( @@ -91,5 +91,5 @@ def get_data_path(self) -> Path | None: base_url=_RECIPES_URL, package="climate_ref_esmvaltool", resource="recipes.txt", - cache_name=f"climate_ref_esmvaltool/{_RECIPES_REGISTRY_NAME}", + cache_name=_RECIPES_REGISTRY_NAME.replace("-", "/"), ) From 30e5a7a43de7512fcf8d1d45f470acf645e99354 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Sat, 11 Apr 2026 12:20:16 +1000 Subject: [PATCH 4/5] feat(core): add cache migration helper for registry layout changes Extract _resolve_cache_dir and _migrate_cache into DatasetRegistryManager so registries can declare legacy cache directories. Files found at old locations are moved to the new path on registration, avoiding re-downloads after cache layout changes. Wire up legacy_cache_dirs for ESMValTool datasets (previously flat under os_cache("climate_ref")) and recipes (previously under os_cache("climate_ref_esmvaltool")). Also fix test_with_environment_variable for the empty-string env case. --- .../src/climate_ref_core/dataset_registry.py | 78 +++++++++- .../test_dataset_registry.py | 145 +++++++++++++++++- .../src/climate_ref_esmvaltool/__init__.py | 9 ++ 3 files changed, 224 insertions(+), 8 deletions(-) diff --git a/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py b/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py index b653ed539..bbfdc558a 100644 --- a/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py +++ b/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py @@ -206,6 +206,66 @@ def keys(self) -> list[str]: """ return list(self._registries.keys()) + @staticmethod + def _resolve_cache_dir(cache_name: str) -> pathlib.Path: + """ + Resolve the cache directory for a registry. + + If the ``REF_DATASET_CACHE_DIR`` environment variable is set, + use that as the root. Otherwise, fall back to the OS cache + under ``climate_ref``. + + Parameters + ---------- + cache_name + Subdirectory name within the cache root. + + Returns + ------- + The resolved cache directory path. + """ + if env_cache_dir := os.environ.get("REF_DATASET_CACHE_DIR"): + cache_dir = pathlib.Path(os.path.expandvars(env_cache_dir)).expanduser() + else: + cache_dir = pooch.os_cache("climate_ref") + + return cache_dir / cache_name + + @staticmethod + def _migrate_cache( + registry: pooch.Pooch, + legacy_cache_dirs: list[pathlib.Path], + ) -> None: + """ + Migrate cached files from legacy cache directories to the current location. + + For each file in the registry, + if it does not already exist at the + current cache path but is found in one of the legacy directories, + move it to the new location. + + Parameters + ---------- + registry + The Pooch registry whose cache may need migrating. + legacy_cache_dirs + Directories where files may have been cached previously. + """ + new_root: pathlib.Path = registry.abspath # type: ignore[attr-defined] + + for key in registry.registry: + new_path = new_root / key + if new_path.exists(): + continue + + for legacy_dir in legacy_cache_dirs: + old_path = legacy_dir / key + if old_path.exists(): + new_path.parent.mkdir(parents=True, exist_ok=True) + logger.info(f"Migrating cached file {key}: {old_path} -> {new_path}") + shutil.move(str(old_path), str(new_path)) + break + def register( # noqa: PLR0913 self, name: str, @@ -214,6 +274,7 @@ def register( # noqa: PLR0913 resource: str, cache_name: str | None = None, version: str | None = None, + legacy_cache_dirs: list[pathlib.Path] | None = None, ) -> None: """ Register a new dataset registry @@ -243,22 +304,29 @@ def register( # noqa: PLR0913 Name to use to generate the cache directory. This defaults to the value of `name` if not provided. + legacy_cache_dirs + Previous cache directories to migrate files from. + + If provided, any files that exist in a legacy directory but not in + the current cache will be moved to the new location. This avoids + re-downloading data after a cache layout change. """ if cache_name is None: cache_name = name - if env_cache_dir := os.environ.get("REF_DATASET_CACHE_DIR"): - cache_dir = pathlib.Path(os.path.expandvars(env_cache_dir)).expanduser() - else: - cache_dir = pooch.os_cache("climate_ref") + cache_path = self._resolve_cache_dir(cache_name) registry = pooch.create( - path=cache_dir / cache_name, + path=cache_path, base_url=base_url, version=version, retry_if_failed=10, ) registry.load_registry(str(importlib.resources.files(package) / resource)) + + if legacy_cache_dirs: + self._migrate_cache(registry, legacy_cache_dirs) + self._registries[name] = registry diff --git a/packages/climate-ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py b/packages/climate-ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py index 59ec88daf..b06d83a55 100644 --- a/packages/climate-ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py +++ b/packages/climate-ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py @@ -108,11 +108,17 @@ def test_with_cache_name(self, mocker, fake_registry_file, cache_name, expected) } mock_pooch.create.assert_called_once_with(**expected_kwargs) - @pytest.mark.parametrize("env", [None, "", "/some/other/path"]) - def test_with_environment_variable(self, monkeypatch, mocker, fake_registry_file, env): + @pytest.mark.parametrize( + "env, expected_path", + [ + (None, Path("/path/to/climate_ref") / "test_registry"), + ("", Path("/path/to/climate_ref") / "test_registry"), + ("/some/other/path", Path("/some/other/path") / "test_registry"), + ], + ) + def test_with_environment_variable(self, monkeypatch, mocker, fake_registry_file, env, expected_path): if env is not None: monkeypatch.setenv("REF_DATASET_CACHE_DIR", env) - expected_path = Path(env) / "test_registry" if env else Path("/path/to/climate_ref") / "test_registry" registry = DatasetRegistryManager() name = "test_registry" @@ -134,6 +140,139 @@ def test_with_environment_variable(self, monkeypatch, mocker, fake_registry_file mock_pooch.create.assert_called_once_with(**expected_kwargs) +class TestMigrateCache: + """Tests for DatasetRegistryManager._migrate_cache.""" + + def test_migrate_moves_files_from_legacy_dir(self, tmp_path): + """Files in a legacy dir are moved to the new cache location.""" + legacy_dir = tmp_path / "old_cache" + legacy_dir.mkdir() + (legacy_dir / "file1.txt").write_text("data1") + (legacy_dir / "subdir").mkdir() + (legacy_dir / "subdir" / "file2.txt").write_text("data2") + + new_dir = tmp_path / "new_cache" + new_dir.mkdir() + + mock_registry = type( + "R", + (), + { + "abspath": new_dir, + "registry": {"file1.txt": "sha256:a", "subdir/file2.txt": "sha256:b"}, + }, + )() + + DatasetRegistryManager._migrate_cache(mock_registry, [legacy_dir]) + + assert (new_dir / "file1.txt").read_text() == "data1" + assert (new_dir / "subdir" / "file2.txt").read_text() == "data2" + assert not (legacy_dir / "file1.txt").exists() + assert not (legacy_dir / "subdir" / "file2.txt").exists() + + def test_migrate_skips_existing_files(self, tmp_path): + """Files already in the new cache are not overwritten.""" + legacy_dir = tmp_path / "old_cache" + legacy_dir.mkdir() + (legacy_dir / "file1.txt").write_text("old_data") + + new_dir = tmp_path / "new_cache" + new_dir.mkdir() + (new_dir / "file1.txt").write_text("new_data") + + mock_registry = type( + "R", + (), + { + "abspath": new_dir, + "registry": {"file1.txt": "sha256:a"}, + }, + )() + + DatasetRegistryManager._migrate_cache(mock_registry, [legacy_dir]) + + assert (new_dir / "file1.txt").read_text() == "new_data" + assert (legacy_dir / "file1.txt").read_text() == "old_data" + + def test_migrate_first_legacy_dir_wins(self, tmp_path): + """When a file exists in multiple legacy dirs, the first one is used.""" + legacy1 = tmp_path / "old1" + legacy1.mkdir() + (legacy1 / "file1.txt").write_text("from_first") + + legacy2 = tmp_path / "old2" + legacy2.mkdir() + (legacy2 / "file1.txt").write_text("from_second") + + new_dir = tmp_path / "new_cache" + new_dir.mkdir() + + mock_registry = type( + "R", + (), + { + "abspath": new_dir, + "registry": {"file1.txt": "sha256:a"}, + }, + )() + + DatasetRegistryManager._migrate_cache(mock_registry, [legacy1, legacy2]) + + assert (new_dir / "file1.txt").read_text() == "from_first" + assert (legacy2 / "file1.txt").read_text() == "from_second" + + def test_migrate_no_legacy_files(self, tmp_path): + """No error when legacy dirs are empty or missing.""" + legacy_dir = tmp_path / "nonexistent" + new_dir = tmp_path / "new_cache" + new_dir.mkdir() + + mock_registry = type( + "R", + (), + { + "abspath": new_dir, + "registry": {"file1.txt": "sha256:a"}, + }, + )() + + DatasetRegistryManager._migrate_cache(mock_registry, [legacy_dir]) + + assert not (new_dir / "file1.txt").exists() + + def test_register_with_legacy_cache_dirs(self, tmp_path, mocker, fake_registry_file): + """Integration test: register() with legacy_cache_dirs triggers migration.""" + legacy_dir = tmp_path / "old_cache" + legacy_dir.mkdir() + (legacy_dir / "file1.txt").write_text("legacy_data") + (legacy_dir / "file2.txt").write_text("legacy_data2") + + new_dir = tmp_path / "new_cache" / "test_registry" + + mocker.patch.object( + DatasetRegistryManager, + "_resolve_cache_dir", + return_value=new_dir, + ) + + registry_path, package, resource = fake_registry_file + with registry_path.open("w") as f: + f.write("file1.txt sha256:checksum1\n") + f.write("file2.txt sha256:checksum2\n") + + manager = DatasetRegistryManager() + manager.register( + "test_registry", + "http://example.com", + package, + resource, + legacy_cache_dirs=[legacy_dir], + ) + + assert (new_dir / "file1.txt").read_text() == "legacy_data" + assert (new_dir / "file2.txt").read_text() == "legacy_data2" + + @pytest.mark.parametrize("symlink", [True, False]) @pytest.mark.parametrize("verify", [True, False]) def test_fetch_all_files(mocker, tmp_path, symlink, verify): diff --git a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py index 2d6a1286e..6add5b691 100644 --- a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py +++ b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py @@ -13,6 +13,7 @@ import climate_ref_esmvaltool.diagnostics from climate_ref_core.dataset_registry import ( DATASET_URL, + DatasetRegistryManager, dataset_registry_manager, fetch_all_files, validate_registry_cache, @@ -84,6 +85,10 @@ def get_data_path(self) -> Path | None: package="climate_ref_esmvaltool.dataset_registry", resource="data.txt", cache_name=_DATASETS_REGISTRY_NAME.replace("-", "/"), + legacy_cache_dirs=[ + # As of v0.12.3, cached under pooch.os_cache("climate_ref") with no subdirectory. + DatasetRegistryManager._resolve_cache_dir("climate_ref") + ], ) # Register the ESMValTool recipes. dataset_registry_manager.register( @@ -92,4 +97,8 @@ def get_data_path(self) -> Path | None: package="climate_ref_esmvaltool", resource="recipes.txt", cache_name=_RECIPES_REGISTRY_NAME.replace("-", "/"), + legacy_cache_dirs=[ + # As of v0.12.3, cached under pooch.os_cache("climate_ref_esmvaltool"). + Path(pooch.os_cache("climate_ref_esmvaltool")) + ], ) From 0fa7bf21a2d93c00737b025650088c1ec6b2132d Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Sat, 11 Apr 2026 12:57:16 +1000 Subject: [PATCH 5/5] refactor: improve cache migration handling in DatasetRegistryManager --- .../src/climate_ref_core/dataset_registry.py | 13 +++++++++++-- .../src/climate_ref_esmvaltool/__init__.py | 5 ----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py b/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py index bbfdc558a..77d3c641f 100644 --- a/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py +++ b/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py @@ -316,6 +316,15 @@ def register( # noqa: PLR0913 cache_path = self._resolve_cache_dir(cache_name) + # Before v0.13.0 everything was cached directly under + # pooch.os_cache("climate_ref") with no per-registry subdirectory. + # Always include that as a legacy location so files are migrated + # automatically, regardless of whether the caller passes extra dirs. + default_legacy = pathlib.Path(pooch.os_cache("climate_ref")) + all_legacy_dirs = [default_legacy] + if legacy_cache_dirs: + all_legacy_dirs.extend(legacy_cache_dirs) + registry = pooch.create( path=cache_path, base_url=base_url, @@ -324,8 +333,8 @@ def register( # noqa: PLR0913 ) registry.load_registry(str(importlib.resources.files(package) / resource)) - if legacy_cache_dirs: - self._migrate_cache(registry, legacy_cache_dirs) + if cache_path != default_legacy: + self._migrate_cache(registry, all_legacy_dirs) self._registries[name] = registry diff --git a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py index 6add5b691..2685b3a42 100644 --- a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py +++ b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py @@ -13,7 +13,6 @@ import climate_ref_esmvaltool.diagnostics from climate_ref_core.dataset_registry import ( DATASET_URL, - DatasetRegistryManager, dataset_registry_manager, fetch_all_files, validate_registry_cache, @@ -85,10 +84,6 @@ def get_data_path(self) -> Path | None: package="climate_ref_esmvaltool.dataset_registry", resource="data.txt", cache_name=_DATASETS_REGISTRY_NAME.replace("-", "/"), - legacy_cache_dirs=[ - # As of v0.12.3, cached under pooch.os_cache("climate_ref") with no subdirectory. - DatasetRegistryManager._resolve_cache_dir("climate_ref") - ], ) # Register the ESMValTool recipes. dataset_registry_manager.register(