Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog/582.improvement.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fetch ESMValTool recipes when installing the provider.
87 changes: 84 additions & 3 deletions packages/climate-ref-core/src/climate_ref_core/dataset_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,66 @@ def keys(self) -> list[str]:
"""
return list(self._registries.keys())

@staticmethod
def _resolve_cache_dir(cache_name: str) -> pathlib.Path:
    """
    Resolve the cache directory for a registry.

    The ``REF_DATASET_CACHE_DIR`` environment variable, when set to a
    non-empty value, takes precedence as the cache root (with environment
    variables and ``~`` expanded). Otherwise the OS-specific cache
    directory for ``climate_ref`` is used.

    Parameters
    ----------
    cache_name
        Subdirectory name within the cache root.

    Returns
    -------
    The resolved cache directory path.
    """
    env_root = os.environ.get("REF_DATASET_CACHE_DIR")
    if not env_root:
        # No (or empty) override: fall back to the OS cache location.
        return pooch.os_cache("climate_ref") / cache_name

    root = pathlib.Path(os.path.expandvars(env_root)).expanduser()
    return root / cache_name

@staticmethod
def _migrate_cache(
    registry: pooch.Pooch,
    legacy_cache_dirs: list[pathlib.Path],
) -> None:
    """
    Migrate cached files from legacy cache directories to the current location.

    Each file listed in the registry that is missing from the current
    cache path, but present in one of the legacy directories, is moved
    to the new location. Files already present at the new location are
    left untouched.

    Parameters
    ----------
    registry
        The Pooch registry whose cache may need migrating.
    legacy_cache_dirs
        Directories where files may have been cached previously,
        searched in order; the first match wins.
    """
    destination_root: pathlib.Path = registry.abspath  # type: ignore[attr-defined]

    for key in registry.registry:
        destination = destination_root / key
        if destination.exists():
            # Already cached at the current location; nothing to do.
            continue

        # Use the first legacy copy found, honouring the given order.
        source = next(
            (legacy_dir / key for legacy_dir in legacy_cache_dirs if (legacy_dir / key).exists()),
            None,
        )
        if source is not None:
            destination.parent.mkdir(parents=True, exist_ok=True)
            logger.info(f"Migrating cached file {key}: {source} -> {destination}")
            shutil.move(str(source), str(destination))

def register( # noqa: PLR0913
self,
name: str,
Expand All @@ -214,6 +274,7 @@ def register( # noqa: PLR0913
resource: str,
cache_name: str | None = None,
version: str | None = None,
legacy_cache_dirs: list[pathlib.Path] | None = None,
) -> None:
"""
Register a new dataset registry
Expand Down Expand Up @@ -243,18 +304,38 @@ def register( # noqa: PLR0913
Name to use to generate the cache directory.

This defaults to the value of `name` if not provided.
legacy_cache_dirs
Previous cache directories to migrate files from.

If provided, any files that exist in a legacy directory but not in
the current cache will be moved to the new location. This avoids
re-downloading data after a cache layout change.
"""
if cache_name is None:
cache_name = "climate_ref"
cache_name = name

cache_path = self._resolve_cache_dir(cache_name)

# Before v0.13.0 everything was cached directly under
# pooch.os_cache("climate_ref") with no per-registry subdirectory.
# Always include that as a legacy location so files are migrated
# automatically, regardless of whether the caller passes extra dirs.
default_legacy = pathlib.Path(pooch.os_cache("climate_ref"))
all_legacy_dirs = [default_legacy]
if legacy_cache_dirs:
all_legacy_dirs.extend(legacy_cache_dirs)

registry = pooch.create(
path=pooch.os_cache(cache_name),
path=cache_path,
base_url=base_url,
version=version,
retry_if_failed=10,
env="REF_DATASET_CACHE_DIR",
)
registry.load_registry(str(importlib.resources.files(package) / resource))

if cache_path != default_legacy:
self._migrate_cache(registry, all_legacy_dirs)

self._registries[name] = registry


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,21 +85,192 @@ def test_getitem(self, mocker, fake_registry_file):
assert retrieved_registry == mock_pooch_instance

@pytest.mark.parametrize(
    "cache_name, expected", [(None, "test_registry"), ("custom_cache", "custom_cache")]
)
def test_with_cache_name(self, mocker, fake_registry_file, cache_name, expected):
    """register() uses ``cache_name`` as the cache subdirectory, defaulting to ``name``."""
    registry = DatasetRegistryManager()
    name = "test_registry"
    base_url = "http://example.com"

    mock_pooch = mocker.patch("climate_ref_core.dataset_registry.pooch")
    mock_pooch.os_cache.return_value = Path("/path/to/climate_ref")
    package, resource = self.setup_registry_file(fake_registry_file)

    registry.register(name, base_url, package, resource, cache_name=cache_name)

    # The OS cache root is always queried with the fixed "climate_ref" name;
    # the per-registry cache_name only affects the subdirectory.
    mock_pooch.os_cache.assert_called_with("climate_ref")
    assert name in registry._registries
    mock_pooch.create.assert_called_once()
    expected_kwargs = {
        "base_url": "http://example.com",
        "path": Path("/path/to/climate_ref", expected),
        "retry_if_failed": 10,
        "version": None,
    }
    mock_pooch.create.assert_called_once_with(**expected_kwargs)

@pytest.mark.parametrize(
    "env, expected_path",
    [
        (None, Path("/path/to/climate_ref") / "test_registry"),
        ("", Path("/path/to/climate_ref") / "test_registry"),
        ("/some/other/path", Path("/some/other/path") / "test_registry"),
    ],
)
def test_with_environment_variable(self, monkeypatch, mocker, fake_registry_file, env, expected_path):
    """REF_DATASET_CACHE_DIR overrides the cache root; unset or empty falls back to the OS cache."""
    if env is not None:
        monkeypatch.setenv("REF_DATASET_CACHE_DIR", env)

    pooch_mock = mocker.patch("climate_ref_core.dataset_registry.pooch")
    pooch_mock.os_cache.return_value = Path("/path/to/climate_ref")
    package, resource = self.setup_registry_file(fake_registry_file)

    manager = DatasetRegistryManager()
    manager.register("test_registry", "http://example.com", package, resource)

    assert "test_registry" in manager._registries
    # The registry is created at the resolved cache path with default options.
    pooch_mock.create.assert_called_once_with(
        path=expected_path,
        base_url="http://example.com",
        retry_if_failed=10,
        version=None,
    )


class TestMigrateCache:
    """Tests for DatasetRegistryManager._migrate_cache."""

    @staticmethod
    def _make_registry(new_dir, entries):
        """
        Build a minimal stand-in for a ``pooch.Pooch`` registry.

        Only the two attributes ``_migrate_cache`` reads are provided:
        ``abspath`` (the current cache root) and ``registry``
        (a mapping of file key -> checksum).
        """
        return type("R", (), {"abspath": new_dir, "registry": entries})()

    def test_migrate_moves_files_from_legacy_dir(self, tmp_path):
        """Files in a legacy dir are moved to the new cache location."""
        legacy_dir = tmp_path / "old_cache"
        legacy_dir.mkdir()
        (legacy_dir / "file1.txt").write_text("data1")
        (legacy_dir / "subdir").mkdir()
        (legacy_dir / "subdir" / "file2.txt").write_text("data2")

        new_dir = tmp_path / "new_cache"
        new_dir.mkdir()

        mock_registry = self._make_registry(
            new_dir, {"file1.txt": "sha256:a", "subdir/file2.txt": "sha256:b"}
        )

        DatasetRegistryManager._migrate_cache(mock_registry, [legacy_dir])

        # Files (including nested ones) are moved, not copied.
        assert (new_dir / "file1.txt").read_text() == "data1"
        assert (new_dir / "subdir" / "file2.txt").read_text() == "data2"
        assert not (legacy_dir / "file1.txt").exists()
        assert not (legacy_dir / "subdir" / "file2.txt").exists()

    def test_migrate_skips_existing_files(self, tmp_path):
        """Files already in the new cache are not overwritten."""
        legacy_dir = tmp_path / "old_cache"
        legacy_dir.mkdir()
        (legacy_dir / "file1.txt").write_text("old_data")

        new_dir = tmp_path / "new_cache"
        new_dir.mkdir()
        (new_dir / "file1.txt").write_text("new_data")

        mock_registry = self._make_registry(new_dir, {"file1.txt": "sha256:a"})

        DatasetRegistryManager._migrate_cache(mock_registry, [legacy_dir])

        # The existing file wins; the legacy copy is left in place.
        assert (new_dir / "file1.txt").read_text() == "new_data"
        assert (legacy_dir / "file1.txt").read_text() == "old_data"

    def test_migrate_first_legacy_dir_wins(self, tmp_path):
        """When a file exists in multiple legacy dirs, the first one is used."""
        legacy1 = tmp_path / "old1"
        legacy1.mkdir()
        (legacy1 / "file1.txt").write_text("from_first")

        legacy2 = tmp_path / "old2"
        legacy2.mkdir()
        (legacy2 / "file1.txt").write_text("from_second")

        new_dir = tmp_path / "new_cache"
        new_dir.mkdir()

        mock_registry = self._make_registry(new_dir, {"file1.txt": "sha256:a"})

        DatasetRegistryManager._migrate_cache(mock_registry, [legacy1, legacy2])

        assert (new_dir / "file1.txt").read_text() == "from_first"
        # The copy in the later legacy dir is left untouched.
        assert (legacy2 / "file1.txt").read_text() == "from_second"

    def test_migrate_no_legacy_files(self, tmp_path):
        """No error when legacy dirs are empty or missing."""
        legacy_dir = tmp_path / "nonexistent"
        new_dir = tmp_path / "new_cache"
        new_dir.mkdir()

        mock_registry = self._make_registry(new_dir, {"file1.txt": "sha256:a"})

        DatasetRegistryManager._migrate_cache(mock_registry, [legacy_dir])

        assert not (new_dir / "file1.txt").exists()

    def test_register_with_legacy_cache_dirs(self, tmp_path, mocker, fake_registry_file):
        """Integration test: register() with legacy_cache_dirs triggers migration."""
        legacy_dir = tmp_path / "old_cache"
        legacy_dir.mkdir()
        (legacy_dir / "file1.txt").write_text("legacy_data")
        (legacy_dir / "file2.txt").write_text("legacy_data2")

        new_dir = tmp_path / "new_cache" / "test_registry"

        # Pin the resolved cache dir so migration targets a known location.
        mocker.patch.object(
            DatasetRegistryManager,
            "_resolve_cache_dir",
            return_value=new_dir,
        )

        registry_path, package, resource = fake_registry_file
        with registry_path.open("w") as f:
            f.write("file1.txt sha256:checksum1\n")
            f.write("file2.txt sha256:checksum2\n")

        manager = DatasetRegistryManager()
        manager.register(
            "test_registry",
            "http://example.com",
            package,
            resource,
            legacy_cache_dirs=[legacy_dir],
        )

        assert (new_dir / "file1.txt").read_text() == "legacy_data"
        assert (new_dir / "file2.txt").read_text() == "legacy_data2"


@pytest.mark.parametrize("symlink", [True, False])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,26 @@
)
from climate_ref_core.providers import CondaDiagnosticProvider
from climate_ref_esmvaltool._version import __version__
from climate_ref_esmvaltool.recipe import _ESMVALCORE_URL, _ESMVALTOOL_URL
from climate_ref_esmvaltool.diagnostics.base import _DATASETS_REGISTRY_NAME
from climate_ref_esmvaltool.recipe import (
_ESMVALCORE_URL,
_ESMVALTOOL_URL,
_RECIPES_REGISTRY_NAME,
_RECIPES_URL,
)

if TYPE_CHECKING:
from climate_ref.config import Config

_REGISTRY_NAME = "esmvaltool"


class ESMValToolProvider(CondaDiagnosticProvider):
"""Provider for ESMValTool diagnostics."""

def fetch_data(self, config: Config) -> None:
    """
    Fetch ESMValTool reference data.

    Downloads all files from both the reference-datasets registry and the
    recipes registry into the local cache.

    Parameters
    ----------
    config
        The REF configuration (required by the provider interface).
    """
    for registry_name in [_DATASETS_REGISTRY_NAME, _RECIPES_REGISTRY_NAME]:
        registry = dataset_registry_manager[registry_name]
        fetch_all_files(registry, registry_name, output_dir=None)

def validate_setup(self, config: Config) -> bool:
"""Validate conda environment and data checksums."""
Expand All @@ -42,8 +47,9 @@ def validate_setup(self, config: Config) -> bool:
return False

# Then check data checksums
registry = dataset_registry_manager[_REGISTRY_NAME]
errors = validate_registry_cache(registry, _REGISTRY_NAME)
errors = []
for registry_name in [_DATASETS_REGISTRY_NAME, _RECIPES_REGISTRY_NAME]:
errors.extend(validate_registry_cache(dataset_registry_manager[registry_name], registry_name))
if errors:
for error in errors:
logger.error(f"{self.slug} validation failed: {error}")
Expand Down Expand Up @@ -73,8 +79,21 @@ def get_data_path(self) -> Path | None:

# Register OBS, OBS6, and raw data
dataset_registry_manager.register(
    name=_DATASETS_REGISTRY_NAME,
    base_url=DATASET_URL,
    package="climate_ref_esmvaltool.dataset_registry",
    resource="data.txt",
    cache_name=_DATASETS_REGISTRY_NAME.replace("-", "/"),
)
# Register the ESMValTool recipes.
dataset_registry_manager.register(
    name=_RECIPES_REGISTRY_NAME,
    base_url=_RECIPES_URL,
    package="climate_ref_esmvaltool",
    resource="recipes.txt",
    cache_name=_RECIPES_REGISTRY_NAME.replace("-", "/"),
    legacy_cache_dirs=[
        # As of v0.12.3, cached under pooch.os_cache("climate_ref_esmvaltool").
        Path(pooch.os_cache("climate_ref_esmvaltool"))
    ],
)
Loading
Loading