diff --git a/.github/workflows/scheduled_workflow.yml b/.github/workflows/scheduled_workflow.yml
new file mode 100644
index 0000000000..77a789597e
--- /dev/null
+++ b/.github/workflows/scheduled_workflow.yml
@@ -0,0 +1,24 @@
+name: ASPIRE Python Scheduled Workflow
+
+on:
+  schedule:
+    - cron: '15 0 * * 0' # Every Sunday at 00:15 UTC
+
+
+jobs:
+  scheduled-tests:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: develop
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+      - name: Install Dependencies
+        run: |
+          pip install -e ".[dev]"
+      - name: Scheduled Tests
+        run: pytest -m scheduled
diff --git a/src/aspire/downloader/__init__.py b/src/aspire/downloader/__init__.py
index be0d375878..714bba0e53 100644
--- a/src/aspire/downloader/__init__.py
+++ b/src/aspire/downloader/__init__.py
@@ -4,6 +4,7 @@
 # isort: on
 from .data_fetcher import (
     available_downloads,
+    download_all,
     emdb_2484,
     emdb_2660,
     emdb_2824,
diff --git a/src/aspire/downloader/data_fetcher.py b/src/aspire/downloader/data_fetcher.py
index f655d06ab3..146eb97448 100644
--- a/src/aspire/downloader/data_fetcher.py
+++ b/src/aspire/downloader/data_fetcher.py
@@ -1,3 +1,4 @@
+import logging
 import shutil
 
 import numpy as np
@@ -10,6 +11,9 @@
 from aspire.utils import Rotation
 from aspire.volume import Volume
 
+logger = logging.getLogger(__name__)
+
+
 # Initialize pooch data fetcher instance.
 _data_fetcher = pooch.create(
     # Set the cache path defined in the config. By default, the cache
@@ -33,14 +37,29 @@ def fetch_data(dataset_name):
     file in local storage doesn’t match the one in the registry, will download a
     new copy of the file. This is considered a sign that the file was updated in
     the remote storage. If the hash of the downloaded file still doesn’t match the
-    one in the registry, will raise an exception to warn of possible file corruption.
+    one in the registry, will warn the user of possible file corruption.
 
     :param dataset_name: The file name (as appears in the registry) to
         fetch from local storage.
     :return: The absolute path (including the file name) of the file in
         local storage.
     """
-    return _data_fetcher.fetch(dataset_name)
+    try:
+        return _data_fetcher.fetch(dataset_name)
+    except ValueError:
+        logger.warning(
+            f"Hash mismatch for {dataset_name}, proceeding with download. "
+            "Source file may have been updated."
+        )
+
+        # Force a fresh download without a hash check.
+        url = _data_fetcher.get_url(dataset_name)
+        return pooch.retrieve(
+            url=url,
+            known_hash=None,
+            fname=dataset_name,
+            path=_data_fetcher.path,
+        )
 
 
 def download_all():
diff --git a/src/aspire/downloader/registry.py b/src/aspire/downloader/registry.py
index 467ad3c772..d96536d068 100644
--- a/src/aspire/downloader/registry.py
+++ b/src/aspire/downloader/registry.py
@@ -10,7 +10,7 @@
     "emdb_5778.map": "877cbe37b86561c3dfb255aa2308fefcdd8f51f91928b17c2ef5c8dd3afaaef7",
     "emdb_6287.map": "81463aa6d024c80efcd19aa9b5ac58f3b3464af56e1ef0f104bd25071acc9204",
     "emdb_2824.map": "7682e1ef6e5bc9f2de9edcf824a03e454ef9cb1ca33bc12920633559f7f826e4",
-    "emdb_14621.map": "b45774245c2bd5e1a44e801b8fb1705a44d5850631838d060294be42e34a6900",
+    "emdb_14621.map": "98363ae950229243131025995b5ba0486857ccb1256b3df8d25c1c282155238c",
     "emdb_2484.map": "6a324e23352bea101c191d5e854026162a5a9b0b8fc73ac5a085cc22038e1999",
     "emdb_6458.map": "645208af6d36bbd3d172c549e58d387b81142fd320e064bc66105be0eae540d1",
     "simulated_channelspin.npz": "c0752674acb85417f6a77a28ac55280c1926c73fda9e25ce0a9940728b1dfcc8",
diff --git a/tests/test_downloader.py b/tests/test_downloader.py
new file mode 100644
index 0000000000..cddc1cf177
--- /dev/null
+++ b/tests/test_downloader.py
@@ -0,0 +1,46 @@
+from pathlib import Path
+
+import pytest
+
+from aspire.downloader import download_all
+from aspire.downloader.data_fetcher import _data_fetcher, fetch_data
+
+
+@pytest.mark.scheduled
+def test_download_all(caplog):
+    """Fail if a hash mismatch warning is logged during download."""
+    caplog.clear()
+    with caplog.at_level("WARNING"):
+        _ = download_all()
+
+    if "Hash mismatch" in caplog.text:
+        pytest.fail(f"Hash mismatch warning was logged.\nCaptured logs:\n{caplog.text}")
+
+
+@pytest.mark.scheduled
+def test_fetch_data_warning(caplog):
+    """Test that we get the expected warning on hash mismatch."""
+    # Use the smallest dataset in the registry
+    dataset_name = "emdb_3645.map"
+
+    # Remove file from cache if it exists
+    cached_path = Path(_data_fetcher.path) / dataset_name
+    if cached_path.exists():
+        cached_path.unlink()
+
+    # Save original hash from the registry
+    original_hash = _data_fetcher.registry.get(dataset_name)
+    assert original_hash is not None
+
+    # Temporarily override the hash to simulate a mismatch
+    _data_fetcher.registry[dataset_name] = "md5:invalidhash123"
+
+    try:
+        caplog.clear()
+        with caplog.at_level("WARNING"):
+            path = fetch_data(dataset_name)
+        assert path  # Should return the path to the downloaded file
+        assert f"Hash mismatch for {dataset_name}" in caplog.text
+    finally:
+        # Restore original hash
+        _data_fetcher.registry[dataset_name] = original_hash
diff --git a/tox.ini b/tox.ini
index afa6003cc5..3183b3c0ef 100644
--- a/tox.ini
+++ b/tox.ini
@@ -92,7 +92,8 @@ line_length = 88
 testpaths = tests
 markers =
     expensive: mark a test as a long running test.
-addopts = -m "not expensive"
+    scheduled: tests that should only run in the scheduled workflow
+addopts = -m "not expensive and not scheduled"
 
 [gh-actions]
 python =
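A minimal sketch of the caller-facing behavior this patch changes, assuming a local
environment with this branch installed and network access; `emdb_3645.map` is the
same small registry entry the new test exercises:

    import logging

    from aspire.downloader.data_fetcher import fetch_data

    # Surface the downloader module's warnings on the console.
    logging.basicConfig(level=logging.WARNING)

    # On a registry hash match this returns the cached path as before; on a
    # mismatch, fetch_data now logs the "Hash mismatch" warning and re-downloads
    # the file with the hash check disabled, instead of raising an exception.
    path = fetch_data("emdb_3645.map")
    print(path)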