diff --git a/changelog/582.improvement.md b/changelog/582.improvement.md new file mode 100644 index 000000000..d2e8fc719 --- /dev/null +++ b/changelog/582.improvement.md @@ -0,0 +1 @@ +Fetch ESMValTool recipes when installing the provider. diff --git a/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py b/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py index 9e29a7516..77d3c641f 100644 --- a/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py +++ b/packages/climate-ref-core/src/climate_ref_core/dataset_registry.py @@ -206,6 +206,66 @@ def keys(self) -> list[str]: """ return list(self._registries.keys()) + @staticmethod + def _resolve_cache_dir(cache_name: str) -> pathlib.Path: + """ + Resolve the cache directory for a registry. + + If the ``REF_DATASET_CACHE_DIR`` environment variable is set, + use that as the root. Otherwise, fall back to the OS cache + under ``climate_ref``. + + Parameters + ---------- + cache_name + Subdirectory name within the cache root. + + Returns + ------- + The resolved cache directory path. + """ + if env_cache_dir := os.environ.get("REF_DATASET_CACHE_DIR"): + cache_dir = pathlib.Path(os.path.expandvars(env_cache_dir)).expanduser() + else: + cache_dir = pooch.os_cache("climate_ref") + + return cache_dir / cache_name + + @staticmethod + def _migrate_cache( + registry: pooch.Pooch, + legacy_cache_dirs: list[pathlib.Path], + ) -> None: + """ + Migrate cached files from legacy cache directories to the current location. + + For each file in the registry, + if it does not already exist at the + current cache path but is found in one of the legacy directories, + move it to the new location. + + Parameters + ---------- + registry + The Pooch registry whose cache may need migrating. + legacy_cache_dirs + Directories where files may have been cached previously. + """ + new_root: pathlib.Path = registry.abspath # type: ignore[attr-defined] + + for key in registry.registry: + new_path = new_root / key + if new_path.exists(): + continue + + for legacy_dir in legacy_cache_dirs: + old_path = legacy_dir / key + if old_path.exists(): + new_path.parent.mkdir(parents=True, exist_ok=True) + logger.info(f"Migrating cached file {key}: {old_path} -> {new_path}") + shutil.move(str(old_path), str(new_path)) + break + def register( # noqa: PLR0913 self, name: str, @@ -214,6 +274,7 @@ def register( # noqa: PLR0913 resource: str, cache_name: str | None = None, version: str | None = None, + legacy_cache_dirs: list[pathlib.Path] | None = None, ) -> None: """ Register a new dataset registry @@ -243,18 +304,38 @@ def register( # noqa: PLR0913 Name to use to generate the cache directory. This defaults to the value of `name` if not provided. + legacy_cache_dirs + Previous cache directories to migrate files from. + + If provided, any files that exist in a legacy directory but not in + the current cache will be moved to the new location. This avoids + re-downloading data after a cache layout change. """ if cache_name is None: - cache_name = "climate_ref" + cache_name = name + + cache_path = self._resolve_cache_dir(cache_name) + + # Before v0.13.0 everything was cached directly under + # pooch.os_cache("climate_ref") with no per-registry subdirectory. + # Always include that as a legacy location so files are migrated + # automatically, regardless of whether the caller passes extra dirs. + default_legacy = pathlib.Path(pooch.os_cache("climate_ref")) + all_legacy_dirs = [default_legacy] + if legacy_cache_dirs: + all_legacy_dirs.extend(legacy_cache_dirs) registry = pooch.create( - path=pooch.os_cache(cache_name), + path=cache_path, base_url=base_url, version=version, retry_if_failed=10, - env="REF_DATASET_CACHE_DIR", ) registry.load_registry(str(importlib.resources.files(package) / resource)) + + if cache_path != default_legacy: + self._migrate_cache(registry, all_legacy_dirs) + self._registries[name] = registry diff --git a/packages/climate-ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py b/packages/climate-ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py index ed058aafe..b06d83a55 100644 --- a/packages/climate-ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py +++ b/packages/climate-ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py @@ -85,7 +85,7 @@ def test_getitem(self, mocker, fake_registry_file): assert retrieved_registry == mock_pooch_instance @pytest.mark.parametrize( - "cache_name, expected", [(None, "climate_ref"), ("custom_cache", "custom_cache")] + "cache_name, expected", [(None, "test_registry"), ("custom_cache", "custom_cache")] ) def test_with_cache_name(self, mocker, fake_registry_file, cache_name, expected): registry = DatasetRegistryManager() @@ -93,13 +93,184 @@ def test_with_cache_name(self, mocker, fake_registry_file, cache_name, expected) base_url = "http://example.com" mock_pooch = mocker.patch("climate_ref_core.dataset_registry.pooch") + mock_pooch.os_cache.return_value = Path("/path/to/climate_ref") package, resource = self.setup_registry_file(fake_registry_file) registry.register(name, base_url, package, resource, cache_name=cache_name) - mock_pooch.os_cache.assert_called_with(expected) + mock_pooch.os_cache.assert_called_with("climate_ref") assert name in registry._registries - mock_pooch.create.assert_called_once() + expected_kwargs = { + "base_url": "http://example.com", + "path": Path("/path/to/climate_ref", expected), + "retry_if_failed": 10, + "version": None, + } + mock_pooch.create.assert_called_once_with(**expected_kwargs) + + @pytest.mark.parametrize( + "env, expected_path", + [ + (None, Path("/path/to/climate_ref") / "test_registry"), + ("", Path("/path/to/climate_ref") / "test_registry"), + ("/some/other/path", Path("/some/other/path") / "test_registry"), + ], + ) + def test_with_environment_variable(self, monkeypatch, mocker, fake_registry_file, env, expected_path): + if env is not None: + monkeypatch.setenv("REF_DATASET_CACHE_DIR", env) + + registry = DatasetRegistryManager() + name = "test_registry" + base_url = "http://example.com" + + mock_pooch = mocker.patch("climate_ref_core.dataset_registry.pooch") + mock_pooch.os_cache.return_value = Path("/path/to/climate_ref") + package, resource = self.setup_registry_file(fake_registry_file) + + registry.register(name, base_url, package, resource) + + assert name in registry._registries + expected_kwargs = { + "path": expected_path, + "base_url": "http://example.com", + "retry_if_failed": 10, + "version": None, + } + mock_pooch.create.assert_called_once_with(**expected_kwargs) + + +class TestMigrateCache: + """Tests for DatasetRegistryManager._migrate_cache.""" + + def test_migrate_moves_files_from_legacy_dir(self, tmp_path): + """Files in a legacy dir are moved to the new cache location.""" + legacy_dir = tmp_path / "old_cache" + legacy_dir.mkdir() + (legacy_dir / "file1.txt").write_text("data1") + (legacy_dir / "subdir").mkdir() + (legacy_dir / "subdir" / "file2.txt").write_text("data2") + + new_dir = tmp_path / "new_cache" + new_dir.mkdir() + + mock_registry = type( + "R", + (), + { + "abspath": new_dir, + "registry": {"file1.txt": "sha256:a", "subdir/file2.txt": "sha256:b"}, + }, + )() + + DatasetRegistryManager._migrate_cache(mock_registry, [legacy_dir]) + + assert (new_dir / "file1.txt").read_text() == "data1" + assert (new_dir / "subdir" / "file2.txt").read_text() == "data2" + assert not (legacy_dir / "file1.txt").exists() + assert not (legacy_dir / "subdir" / "file2.txt").exists() + + def test_migrate_skips_existing_files(self, tmp_path): + """Files already in the new cache are not overwritten.""" + legacy_dir = tmp_path / "old_cache" + legacy_dir.mkdir() + (legacy_dir / "file1.txt").write_text("old_data") + + new_dir = tmp_path / "new_cache" + new_dir.mkdir() + (new_dir / "file1.txt").write_text("new_data") + + mock_registry = type( + "R", + (), + { + "abspath": new_dir, + "registry": {"file1.txt": "sha256:a"}, + }, + )() + + DatasetRegistryManager._migrate_cache(mock_registry, [legacy_dir]) + + assert (new_dir / "file1.txt").read_text() == "new_data" + assert (legacy_dir / "file1.txt").read_text() == "old_data" + + def test_migrate_first_legacy_dir_wins(self, tmp_path): + """When a file exists in multiple legacy dirs, the first one is used.""" + legacy1 = tmp_path / "old1" + legacy1.mkdir() + (legacy1 / "file1.txt").write_text("from_first") + + legacy2 = tmp_path / "old2" + legacy2.mkdir() + (legacy2 / "file1.txt").write_text("from_second") + + new_dir = tmp_path / "new_cache" + new_dir.mkdir() + + mock_registry = type( + "R", + (), + { + "abspath": new_dir, + "registry": {"file1.txt": "sha256:a"}, + }, + )() + + DatasetRegistryManager._migrate_cache(mock_registry, [legacy1, legacy2]) + + assert (new_dir / "file1.txt").read_text() == "from_first" + assert (legacy2 / "file1.txt").read_text() == "from_second" + + def test_migrate_no_legacy_files(self, tmp_path): + """No error when legacy dirs are empty or missing.""" + legacy_dir = tmp_path / "nonexistent" + new_dir = tmp_path / "new_cache" + new_dir.mkdir() + + mock_registry = type( + "R", + (), + { + "abspath": new_dir, + "registry": {"file1.txt": "sha256:a"}, + }, + )() + + DatasetRegistryManager._migrate_cache(mock_registry, [legacy_dir]) + + assert not (new_dir / "file1.txt").exists() + + def test_register_with_legacy_cache_dirs(self, tmp_path, mocker, fake_registry_file): + """Integration test: register() with legacy_cache_dirs triggers migration.""" + legacy_dir = tmp_path / "old_cache" + legacy_dir.mkdir() + (legacy_dir / "file1.txt").write_text("legacy_data") + (legacy_dir / "file2.txt").write_text("legacy_data2") + + new_dir = tmp_path / "new_cache" / "test_registry" + + mocker.patch.object( + DatasetRegistryManager, + "_resolve_cache_dir", + return_value=new_dir, + ) + + registry_path, package, resource = fake_registry_file + with registry_path.open("w") as f: + f.write("file1.txt sha256:checksum1\n") + f.write("file2.txt sha256:checksum2\n") + + manager = DatasetRegistryManager() + manager.register( + "test_registry", + "http://example.com", + package, + resource, + legacy_cache_dirs=[legacy_dir], + ) + + assert (new_dir / "file1.txt").read_text() == "legacy_data" + assert (new_dir / "file2.txt").read_text() == "legacy_data2" @pytest.mark.parametrize("symlink", [True, False]) diff --git a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py index 4e849a14c..2685b3a42 100644 --- a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py +++ b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/__init__.py @@ -19,21 +19,26 @@ ) from climate_ref_core.providers import CondaDiagnosticProvider from climate_ref_esmvaltool._version import __version__ -from climate_ref_esmvaltool.recipe import _ESMVALCORE_URL, _ESMVALTOOL_URL +from climate_ref_esmvaltool.diagnostics.base import _DATASETS_REGISTRY_NAME +from climate_ref_esmvaltool.recipe import ( + _ESMVALCORE_URL, + _ESMVALTOOL_URL, + _RECIPES_REGISTRY_NAME, + _RECIPES_URL, +) if TYPE_CHECKING: from climate_ref.config import Config -_REGISTRY_NAME = "esmvaltool" - class ESMValToolProvider(CondaDiagnosticProvider): """Provider for ESMValTool diagnostics.""" def fetch_data(self, config: Config) -> None: """Fetch ESMValTool reference data.""" - registry = dataset_registry_manager[_REGISTRY_NAME] - fetch_all_files(registry, _REGISTRY_NAME, output_dir=None) + for registry_name in [_DATASETS_REGISTRY_NAME, _RECIPES_REGISTRY_NAME]: + registry = dataset_registry_manager[registry_name] + fetch_all_files(registry, registry_name, output_dir=None) def validate_setup(self, config: Config) -> bool: """Validate conda environment and data checksums.""" @@ -42,8 +47,9 @@ def validate_setup(self, config: Config) -> bool: return False # Then check data checksums - registry = dataset_registry_manager[_REGISTRY_NAME] - errors = validate_registry_cache(registry, _REGISTRY_NAME) + errors = [] + for registry_name in [_DATASETS_REGISTRY_NAME, _RECIPES_REGISTRY_NAME]: + errors.extend(validate_registry_cache(dataset_registry_manager[registry_name], registry_name)) if errors: for error in errors: logger.error(f"{self.slug} validation failed: {error}") @@ -73,8 +79,21 @@ def get_data_path(self) -> Path | None: # Register OBS, OBS6, and raw data dataset_registry_manager.register( - "esmvaltool", + name=_DATASETS_REGISTRY_NAME, base_url=DATASET_URL, package="climate_ref_esmvaltool.dataset_registry", resource="data.txt", + cache_name=_DATASETS_REGISTRY_NAME.replace("-", "/"), +) +# Register the ESMValTool recipes. +dataset_registry_manager.register( + name=_RECIPES_REGISTRY_NAME, + base_url=_RECIPES_URL, + package="climate_ref_esmvaltool", + resource="recipes.txt", + cache_name=_RECIPES_REGISTRY_NAME.replace("-", "/"), + legacy_cache_dirs=[ + # As of v0.12.3, cached under pooch.os_cache("climate_ref_esmvaltool"). + Path(pooch.os_cache("climate_ref_esmvaltool")) + ], ) diff --git a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/diagnostics/base.py b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/diagnostics/base.py index 43e78fcf5..125f5aabc 100644 --- a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/diagnostics/base.py +++ b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/diagnostics/base.py @@ -29,6 +29,8 @@ ) from climate_ref_esmvaltool.types import MetricBundleArgs, OutputBundleArgs, Recipe +_DATASETS_REGISTRY_NAME = "esmvaltool-datasets" + def get_cmip_source_type( input_files: dict[SourceDatasetType, pandas.DataFrame], @@ -202,13 +204,13 @@ def build_cmd(self, definition: ExecutionDefinition) -> Iterable[str]: } # Configure the paths to OBS/OBS6/native6 and non-compliant obs4MIPs data - registry = dataset_registry_manager["esmvaltool"] + registry = dataset_registry_manager[_DATASETS_REGISTRY_NAME] data_dir = registry.abspath / "ESMValTool" # type: ignore[attr-defined] if not data_dir.exists(): logger.warning( "ESMValTool observational and reanalysis data is not available " f"in {data_dir}, you may want to run the command " - "`ref datasets fetch-data --registry esmvaltool`." + f"`ref datasets fetch-data --registry {_DATASETS_REGISTRY_NAME}`." ) else: config["projects"]["OBS"] = { diff --git a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py index f6d71c2df..8bb244be4 100644 --- a/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py +++ b/packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py @@ -1,15 +1,14 @@ from __future__ import annotations -import importlib.resources from collections.abc import Iterator from pathlib import Path from typing import TYPE_CHECKING, Any import cftime import pandas as pd -import pooch import yaml +from climate_ref_core.dataset_registry import dataset_registry_manager from climate_ref_esmvaltool.types import Recipe if TYPE_CHECKING: @@ -304,17 +303,10 @@ def get_child_and_parent_dataset( _ESMVALCORE_COMMIT = "da81d5f67158f3d2603831b56ab6b4fb8a388d86" _ESMVALCORE_URL = f"git+https://github.com/ESMValGroup/ESMValCore.git@{_ESMVALCORE_COMMIT}" -_RECIPES = pooch.create( - path=pooch.os_cache("climate_ref_esmvaltool"), - # TODO: use a released version - # base_url="https://raw.githubusercontent.com/ESMValGroup/ESMValTool/refs/tags/v{version}/esmvaltool/recipes/", - # version=_ESMVALTOOL_VERSION, - base_url=f"https://raw.githubusercontent.com/ESMValGroup/ESMValTool/{_ESMVALTOOL_COMMIT}/esmvaltool/recipes/", - env="REF_METRICS_ESMVALTOOL_DATA_DIR", - retry_if_failed=10, +_RECIPES_URL = ( + f"https://raw.githubusercontent.com/ESMValGroup/ESMValTool/{_ESMVALTOOL_COMMIT}/esmvaltool/recipes/" ) -with importlib.resources.files("climate_ref_esmvaltool").joinpath("recipes.txt").open("rb") as _buffer: - _RECIPES.load_registry(_buffer) +_RECIPES_REGISTRY_NAME = f"esmvaltool-recipes-v{_ESMVALTOOL_VERSION}" def fix_annual_statistics_keep_year(recipe: Recipe) -> None: @@ -357,7 +349,7 @@ def load_recipe(recipe: str) -> Recipe: ------- The loaded recipe. """ - filename = _RECIPES.fetch(recipe) + filename = dataset_registry_manager[_RECIPES_REGISTRY_NAME].fetch(recipe) def normalize(obj: Any) -> Any: # Ensure objects in the recipe are not shared. diff --git a/packages/climate-ref-esmvaltool/tests/unit/test_provider.py b/packages/climate-ref-esmvaltool/tests/unit/test_provider.py index 1c66d1c7d..3ff969002 100644 --- a/packages/climate-ref-esmvaltool/tests/unit/test_provider.py +++ b/packages/climate-ref-esmvaltool/tests/unit/test_provider.py @@ -2,7 +2,7 @@ from pathlib import Path import pooch -from climate_ref_esmvaltool import ESMValToolProvider, __version__, provider +from climate_ref_esmvaltool import _DATASETS_REGISTRY_NAME, ESMValToolProvider, __version__, provider def test_provider(): @@ -40,11 +40,11 @@ def test_fetch_data(self, mocker): provider.fetch_data(mock_config) - mock_fetch.assert_called_once() + mock_fetch.assert_called() # Check it's using the right registry name - call_args = mock_fetch.call_args - assert call_args[0][1] == "esmvaltool" - assert call_args[1]["output_dir"] is None + call = mock_fetch.mock_calls[0] + assert call.args[1] == _DATASETS_REGISTRY_NAME + assert call.kwargs["output_dir"] is None def test_validate_setup_env_missing(self, mocker): """Test validate_setup returns False when conda env is missing."""