Skip to content
Merged
24 changes: 24 additions & 0 deletions .github/workflows/scheduled_workflow.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Weekly scheduled CI: runs only the tests marked `scheduled` (which normal
# CI deselects via tox.ini addopts) against the develop branch.
name: ASPIRE Python Scheduled Workflow

on:
  schedule:
    - cron: '15 0 * * 0' # Every Sunday at 00:15 UTC


jobs:
  scheduled-tests:
    runs-on: ubuntu-latest

    steps:
      # Check out develop explicitly; scheduled workflows otherwise run on
      # the repository's default branch.
      - uses: actions/checkout@v4
        with:
          ref: develop
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install Dependencies
        run: |
          pip install -e ".[dev]"
      # Select only the `scheduled`-marked tests.
      - name: Scheduled Tests
        run: pytest -m scheduled
1 change: 1 addition & 0 deletions src/aspire/downloader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# isort: on
from .data_fetcher import (
available_downloads,
download_all,
emdb_2484,
emdb_2660,
emdb_2824,
Expand Down
23 changes: 21 additions & 2 deletions src/aspire/downloader/data_fetcher.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import shutil

import numpy as np
Expand All @@ -10,6 +11,9 @@
from aspire.utils import Rotation
from aspire.volume import Volume

logger = logging.getLogger(__name__)


# Initialize pooch data fetcher instance.
_data_fetcher = pooch.create(
# Set the cache path defined in the config. By default, the cache
Expand All @@ -33,14 +37,29 @@
file in local storage doesn’t match the one in the registry, will download a
new copy of the file. This is considered a sign that the file was updated in
the remote storage. If the hash of the downloaded file still doesn’t match the
one in the registry, will raise an exception to warn of possible file corruption.
one in the registry, will warn user of possible file corruption.

:param dataset_name: The file name (as appears in the registry) to
fetch from local storage.
:return: The absolute path (including the file name) of the file in
local storage.
"""
return _data_fetcher.fetch(dataset_name)
try:
return _data_fetcher.fetch(dataset_name)
except ValueError:
logger.warning(

Check warning on line 50 in src/aspire/downloader/data_fetcher.py

View check run for this annotation

Codecov / codecov/patch

src/aspire/downloader/data_fetcher.py#L49-L50

Added lines #L49 - L50 were not covered by tests
f"Hash mismatch for {dataset_name}, proceeding with download. "
"Source file may have been updated."
)

# force download without hash check
url = _data_fetcher.get_url(dataset_name)
return pooch.retrieve(

Check warning on line 57 in src/aspire/downloader/data_fetcher.py

View check run for this annotation

Codecov / codecov/patch

src/aspire/downloader/data_fetcher.py#L56-L57

Added lines #L56 - L57 were not covered by tests
url=url,
known_hash=None,
fname=dataset_name,
path=_data_fetcher.path,
)


def download_all():
Expand Down
2 changes: 1 addition & 1 deletion src/aspire/downloader/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"emdb_5778.map": "877cbe37b86561c3dfb255aa2308fefcdd8f51f91928b17c2ef5c8dd3afaaef7",
"emdb_6287.map": "81463aa6d024c80efcd19aa9b5ac58f3b3464af56e1ef0f104bd25071acc9204",
"emdb_2824.map": "7682e1ef6e5bc9f2de9edcf824a03e454ef9cb1ca33bc12920633559f7f826e4",
"emdb_14621.map": "b45774245c2bd5e1a44e801b8fb1705a44d5850631838d060294be42e34a6900",
"emdb_14621.map": "98363ae950229243131025995b5ba0486857ccb1256b3df8d25c1c282155238c",
"emdb_2484.map": "6a324e23352bea101c191d5e854026162a5a9b0b8fc73ac5a085cc22038e1999",
"emdb_6458.map": "645208af6d36bbd3d172c549e58d387b81142fd320e064bc66105be0eae540d1",
"simulated_channelspin.npz": "c0752674acb85417f6a77a28ac55280c1926c73fda9e25ce0a9940728b1dfcc8",
Expand Down
46 changes: 46 additions & 0 deletions tests/test_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from pathlib import Path

import pytest

from aspire.downloader import download_all
from aspire.downloader.data_fetcher import _data_fetcher, fetch_data


@pytest.mark.scheduled
def test_download_all(caplog):
    """Download every registry file and fail if any hash mismatch is logged.

    ``fetch_data`` emits a "Hash mismatch ..." warning when a registry hash
    disagrees with the downloaded file; catching that here keeps the
    registry entries in sync with the remote data.
    """
    caplog.clear()
    with caplog.at_level("WARNING"):
        download_all()

    mismatch_logged = "Hash mismatch" in caplog.text
    if mismatch_logged:
        pytest.fail(f"Hash mismatch warning was logged.\nCaptured logs:\n{caplog.text}")


@pytest.mark.scheduled
def test_fetch_data_warning(caplog):
    """Test that we get expected warning on hash mismatch.

    Simulates a registry/remote disagreement by temporarily replacing the
    registry hash with a bogus value, then confirms ``fetch_data`` logs a
    warning and still returns a file path instead of raising.
    """
    # Use the smallest dataset in the registry
    dataset_name = "emdb_3645.map"

    # Remove any cached copy so fetch_data must download and hash-check the
    # file. `missing_ok=True` avoids the exists()/unlink() race of LBYL.
    cached_path = Path(_data_fetcher.path) / dataset_name
    cached_path.unlink(missing_ok=True)

    # Save original hash from the registry
    original_hash = _data_fetcher.registry.get(dataset_name)
    assert original_hash is not None

    # Temporarily override the hash to simulate a mismatch
    _data_fetcher.registry[dataset_name] = "md5:invalidhash123"

    try:
        caplog.clear()
        with caplog.at_level("WARNING"):
            path = fetch_data(dataset_name)
        assert path  # Should return the path to the downloaded file
        assert f"Hash mismatch for {dataset_name}" in caplog.text
    finally:
        # Restore original hash
        _data_fetcher.registry[dataset_name] = original_hash
        # Drop the file that was fetched with hash checking disabled so
        # later tests re-download and verify against the genuine hash.
        cached_path.unlink(missing_ok=True)
3 changes: 2 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ line_length = 88
testpaths = tests
markers =
expensive: mark a test as a long running test.
addopts = -m "not expensive"
scheduled: tests that should only run in the scheduled workflow
addopts = -m "not expensive and not scheduled"

[gh-actions]
python =
Expand Down
Loading