Skip to content
Merged
24 changes: 24 additions & 0 deletions .github/workflows/scheduled_workflow.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Weekly scheduled CI: runs only the tests marked `scheduled` (which normal
# CI deselects via tox.ini addopts) against the develop branch.
name: ASPIRE Python Scheduled Workflow

on:
  schedule:
    - cron: '15 0 * * 0' # Every Sunday at 00:15 UTC


jobs:
  scheduled-tests:
    runs-on: ubuntu-latest

    steps:
      # Check out develop explicitly; scheduled workflows otherwise run on
      # the repository's default branch.
      - uses: actions/checkout@v4
        with:
          ref: develop
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install Dependencies
        run: |
          pip install -e ".[dev]"
      # Select only the `scheduled`-marked tests.
      - name: Scheduled Tests
        run: pytest -m scheduled
1 change: 1 addition & 0 deletions src/aspire/downloader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# isort: on
from .data_fetcher import (
available_downloads,
download_all,
emdb_2484,
emdb_2660,
emdb_2824,
Expand Down
23 changes: 21 additions & 2 deletions src/aspire/downloader/data_fetcher.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import shutil

import numpy as np
Expand All @@ -10,6 +11,9 @@
from aspire.utils import Rotation
from aspire.volume import Volume

logger = logging.getLogger(__name__)


# Initialize pooch data fetcher instance.
_data_fetcher = pooch.create(
# Set the cache path defined in the config. By default, the cache
Expand All @@ -33,14 +37,29 @@
file in local storage doesn’t match the one in the registry, will download a
new copy of the file. This is considered a sign that the file was updated in
the remote storage. If the hash of the downloaded file still doesn’t match the
one in the registry, will raise an exception to warn of possible file corruption.
one in the registry, will warn user of possible file corruption.

:param dataset_name: The file name (as appears in the registry) to
fetch from local storage.
:return: The absolute path (including the file name) of the file in
local storage.
"""
return _data_fetcher.fetch(dataset_name)
try:
return _data_fetcher.fetch(dataset_name)
except ValueError:
logger.warning(

Check warning on line 50 in src/aspire/downloader/data_fetcher.py

View check run for this annotation

Codecov / codecov/patch

src/aspire/downloader/data_fetcher.py#L49-L50

Added lines #L49 - L50 were not covered by tests
f"Hash mismatch for {dataset_name}, proceeding with download. "
"Source file may have been updated."
)

# force download without hash check
url = _data_fetcher.get_url(dataset_name)
return pooch.retrieve(

Check warning on line 57 in src/aspire/downloader/data_fetcher.py

View check run for this annotation

Codecov / codecov/patch

src/aspire/downloader/data_fetcher.py#L56-L57

Added lines #L56 - L57 were not covered by tests
url=url,
known_hash=None,
fname=dataset_name,
path=_data_fetcher.path,
)


def download_all():
Expand Down
2 changes: 1 addition & 1 deletion src/aspire/downloader/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"emdb_5778.map": "877cbe37b86561c3dfb255aa2308fefcdd8f51f91928b17c2ef5c8dd3afaaef7",
"emdb_6287.map": "81463aa6d024c80efcd19aa9b5ac58f3b3464af56e1ef0f104bd25071acc9204",
"emdb_2824.map": "7682e1ef6e5bc9f2de9edcf824a03e454ef9cb1ca33bc12920633559f7f826e4",
"emdb_14621.map": "b45774245c2bd5e1a44e801b8fb1705a44d5850631838d060294be42e34a6900",
"emdb_14621.map": "98363ae950229243131025995b5ba0486857ccb1256b3df8d25c1c282155238c",
"emdb_2484.map": "6a324e23352bea101c191d5e854026162a5a9b0b8fc73ac5a085cc22038e1999",
"emdb_6458.map": "645208af6d36bbd3d172c549e58d387b81142fd320e064bc66105be0eae540d1",
"simulated_channelspin.npz": "c0752674acb85417f6a77a28ac55280c1926c73fda9e25ce0a9940728b1dfcc8",
Expand Down
46 changes: 46 additions & 0 deletions tests/test_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from pathlib import Path

import pytest

from aspire.downloader import download_all
from aspire.downloader.data_fetcher import _data_fetcher, fetch_data


@pytest.mark.scheduled
def test_download_all(caplog):
    """Download every registry file and fail if any hash mismatch is logged.

    ``fetch_data`` emits a "Hash mismatch ..." warning when a registry hash
    disagrees with the downloaded file; catching that here keeps the
    registry entries in sync with the remote data.
    """
    caplog.clear()
    with caplog.at_level("WARNING"):
        download_all()

    mismatch_logged = "Hash mismatch" in caplog.text
    if mismatch_logged:
        pytest.fail(f"Hash mismatch warning was logged.\nCaptured logs:\n{caplog.text}")


@pytest.mark.scheduled
def test_fetch_data_warning(caplog):
    """Test that we get expected warning on hash mismatch.

    Simulates a registry/remote disagreement by temporarily replacing the
    registry hash with a bogus value, then confirms ``fetch_data`` logs a
    warning and still returns a file path instead of raising.
    """
    # Use the smallest dataset in the registry
    dataset_name = "emdb_3645.map"

    # Remove any cached copy so fetch_data must download and hash-check the
    # file. `missing_ok=True` avoids the exists()/unlink() race of LBYL.
    cached_path = Path(_data_fetcher.path) / dataset_name
    cached_path.unlink(missing_ok=True)

    # Save original hash from the registry
    original_hash = _data_fetcher.registry.get(dataset_name)
    assert original_hash is not None

    # Temporarily override the hash to simulate a mismatch
    _data_fetcher.registry[dataset_name] = "md5:invalidhash123"

    try:
        caplog.clear()
        with caplog.at_level("WARNING"):
            path = fetch_data(dataset_name)
        assert path  # Should return the path to the downloaded file
        assert f"Hash mismatch for {dataset_name}" in caplog.text
    finally:
        # Restore original hash
        _data_fetcher.registry[dataset_name] = original_hash
        # Drop the file that was fetched with hash checking disabled so
        # later tests re-download and verify against the genuine hash.
        cached_path.unlink(missing_ok=True)
3 changes: 2 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ line_length = 88
testpaths = tests
markers =
expensive: mark a test as a long running test.
addopts = -m "not expensive"
scheduled: tests that should only run in the scheduled workflow
addopts = -m "not expensive and not scheduled"

[gh-actions]
python =
Expand Down
Loading