From c991a22b405a255824b4cc2e10a14801dc0cda48 Mon Sep 17 00:00:00 2001 From: Josh Carmichael Date: Thu, 15 May 2025 15:38:20 -0400 Subject: [PATCH 01/11] proceed with download on hash mismatch, with warning. --- src/aspire/downloader/__init__.py | 1 + src/aspire/downloader/data_fetcher.py | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/aspire/downloader/__init__.py b/src/aspire/downloader/__init__.py index be0d375878..714bba0e53 100644 --- a/src/aspire/downloader/__init__.py +++ b/src/aspire/downloader/__init__.py @@ -4,6 +4,7 @@ # isort: on from .data_fetcher import ( available_downloads, + download_all, emdb_2484, emdb_2660, emdb_2824, diff --git a/src/aspire/downloader/data_fetcher.py b/src/aspire/downloader/data_fetcher.py index f655d06ab3..4b81092e6c 100644 --- a/src/aspire/downloader/data_fetcher.py +++ b/src/aspire/downloader/data_fetcher.py @@ -1,4 +1,5 @@ import shutil +import warnings import numpy as np import pooch @@ -40,7 +41,21 @@ def fetch_data(dataset_name): :return: The absolute path (including the file name) of the file in local storage. """ - return _data_fetcher.fetch(dataset_name) + try: + return _data_fetcher.fetch(dataset_name) + except ValueError as e: + warnings.warn( + f"Hash mismatch for {dataset_name}, proceeding with download. " + "Source file may have been updated." 
+ ) + # force download without hash check + url = _data_fetcher.get_url(dataset_name) + return pooch.retrieve( + url=url, + known_hash=None, + fname=dataset_name, + path=_data_fetcher.path, + ) def download_all(): From 2cc6eb845ff4937bcb642f976d3cda63bf0d89ca Mon Sep 17 00:00:00 2001 From: Josh Carmichael Date: Thu, 15 May 2025 15:45:17 -0400 Subject: [PATCH 02/11] update hash in registry --- src/aspire/downloader/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aspire/downloader/registry.py b/src/aspire/downloader/registry.py index 467ad3c772..d96536d068 100644 --- a/src/aspire/downloader/registry.py +++ b/src/aspire/downloader/registry.py @@ -10,7 +10,7 @@ "emdb_5778.map": "877cbe37b86561c3dfb255aa2308fefcdd8f51f91928b17c2ef5c8dd3afaaef7", "emdb_6287.map": "81463aa6d024c80efcd19aa9b5ac58f3b3464af56e1ef0f104bd25071acc9204", "emdb_2824.map": "7682e1ef6e5bc9f2de9edcf824a03e454ef9cb1ca33bc12920633559f7f826e4", - "emdb_14621.map": "b45774245c2bd5e1a44e801b8fb1705a44d5850631838d060294be42e34a6900", + "emdb_14621.map": "98363ae950229243131025995b5ba0486857ccb1256b3df8d25c1c282155238c", "emdb_2484.map": "6a324e23352bea101c191d5e854026162a5a9b0b8fc73ac5a085cc22038e1999", "emdb_6458.map": "645208af6d36bbd3d172c549e58d387b81142fd320e064bc66105be0eae540d1", "simulated_channelspin.npz": "c0752674acb85417f6a77a28ac55280c1926c73fda9e25ce0a9940728b1dfcc8", From b58cada8d46ab491fe9dac7818f19a84a24dd3eb Mon Sep 17 00:00:00 2001 From: Josh Carmichael Date: Fri, 16 May 2025 10:57:59 -0400 Subject: [PATCH 03/11] stack_level. 
unused variable --- src/aspire/downloader/data_fetcher.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/aspire/downloader/data_fetcher.py b/src/aspire/downloader/data_fetcher.py index 4b81092e6c..df194c9432 100644 --- a/src/aspire/downloader/data_fetcher.py +++ b/src/aspire/downloader/data_fetcher.py @@ -34,7 +34,7 @@ def fetch_data(dataset_name): file in local storage doesn’t match the one in the registry, will download a new copy of the file. This is considered a sign that the file was updated in the remote storage. If the hash of the downloaded file still doesn’t match the - one in the registry, will raise an exception to warn of possible file corruption. + one in the registry, will warn user of possible file corruption. :param dataset_name: The file name (as appears in the registry) to fetch from local storage. @@ -43,11 +43,14 @@ def fetch_data(dataset_name): """ try: return _data_fetcher.fetch(dataset_name) - except ValueError as e: + except ValueError: warnings.warn( f"Hash mismatch for {dataset_name}, proceeding with download. " - "Source file may have been updated." + "Source file may have been updated.", + UserWarning, + stacklevel=1, ) + # force download without hash check url = _data_fetcher.get_url(dataset_name) return pooch.retrieve( From 23a8b1e7c67fdc7238b06e813164f79bd38b4846 Mon Sep 17 00:00:00 2001 From: Josh Carmichael Date: Fri, 16 May 2025 14:27:45 -0400 Subject: [PATCH 04/11] add scheduled_workflow and scheduled downloader test. 
--- .github/workflows/scheduled_workflow.yml | 38 ++++++++++++++++++++++++ tests/test_downloader.py | 9 ++++++ tox.ini | 3 +- 3 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/scheduled_workflow.yml create mode 100644 tests/test_downloader.py diff --git a/.github/workflows/scheduled_workflow.yml b/.github/workflows/scheduled_workflow.yml new file mode 100644 index 0000000000..ac8bac0976 --- /dev/null +++ b/.github/workflows/scheduled_workflow.yml @@ -0,0 +1,38 @@ +name: ASPIRE Python Scheduled Workflow + +on: + workflow_dispatch: # Manual "Run workflow" button + schedule: + - cron: '0 0 * * 0' # Every Sunday at 00:00 UTC + + +jobs: + check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.9' + - name: Install dependencies + run: | + pip install tox + - name: Run Tox Check + run: tox -e check + + scheduled-tests: + needs: check + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Install Dependencies + run: | + pip install -e ".[dev]" + - name: Scheduled Tests + run: PYTHONWARNINGS=error python -m pytest -m scheduled diff --git a/tests/test_downloader.py b/tests/test_downloader.py new file mode 100644 index 0000000000..cfb32666c8 --- /dev/null +++ b/tests/test_downloader.py @@ -0,0 +1,9 @@ +import pytest + +from aspire.downloader import download_all + + +@pytest.mark.scheduled +def test_download_all(): + """This test will throw a warning if any hashes have changed""" + _ = download_all() diff --git a/tox.ini b/tox.ini index afa6003cc5..3183b3c0ef 100644 --- a/tox.ini +++ b/tox.ini @@ -92,7 +92,8 @@ line_length = 88 testpaths = tests markers = expensive: mark a test as a long running test. 
-addopts = -m "not expensive" + scheduled: tests that should only run in the scheduled workflow +addopts = -m "not expensive and not scheduled" [gh-actions] python = From cb237b778bae6ddea7c7a54380bc6ed2d419c571 Mon Sep 17 00:00:00 2001 From: Josh Carmichael Date: Fri, 16 May 2025 15:21:24 -0400 Subject: [PATCH 05/11] same py version --- .github/workflows/scheduled_workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scheduled_workflow.yml b/.github/workflows/scheduled_workflow.yml index ac8bac0976..0763244184 100644 --- a/.github/workflows/scheduled_workflow.yml +++ b/.github/workflows/scheduled_workflow.yml @@ -14,7 +14,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.10' - name: Install dependencies run: | pip install tox From fa1b431e34a09a652a4daa1cda31fc036d4645a3 Mon Sep 17 00:00:00 2001 From: Josh Carmichael Date: Tue, 20 May 2025 10:31:04 -0400 Subject: [PATCH 06/11] Use logger warning. Update test to fail on hash mismatch warning. --- src/aspire/downloader/data_fetcher.py | 11 ++++++----- tests/test_downloader.py | 11 ++++++++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/aspire/downloader/data_fetcher.py b/src/aspire/downloader/data_fetcher.py index df194c9432..146eb97448 100644 --- a/src/aspire/downloader/data_fetcher.py +++ b/src/aspire/downloader/data_fetcher.py @@ -1,5 +1,5 @@ +import logging import shutil -import warnings import numpy as np import pooch @@ -11,6 +11,9 @@ from aspire.utils import Rotation from aspire.volume import Volume +logger = logging.getLogger(__name__) + + # Initialize pooch data fetcher instance. _data_fetcher = pooch.create( # Set the cache path defined in the config. 
By default, the cache @@ -44,11 +47,9 @@ def fetch_data(dataset_name): try: return _data_fetcher.fetch(dataset_name) except ValueError: - warnings.warn( + logger.warning( f"Hash mismatch for {dataset_name}, proceeding with download. " - "Source file may have been updated.", - UserWarning, - stacklevel=1, + "Source file may have been updated." ) # force download without hash check diff --git a/tests/test_downloader.py b/tests/test_downloader.py index cfb32666c8..68e20b51fa 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -4,6 +4,11 @@ @pytest.mark.scheduled -def test_download_all(): - """This test will throw a warning if any hashes have changed""" - _ = download_all() +def test_download_all(caplog): + """Fail if a hash mismatch warning is logged during download.""" + caplog.clear() + with caplog.at_level("WARNING"): + _ = download_all() + + if "Hash mismatch" in caplog.text: + pytest.fail(f"Hash mismatch warning was logged.\nCaptured logs:\n{caplog.text}") From 6cbbf33000081b0adac8b011d13dc2b769394d96 Mon Sep 17 00:00:00 2001 From: Josh Carmichael Date: Tue, 20 May 2025 10:39:11 -0400 Subject: [PATCH 07/11] Workflow updates: Run on develop, remove check, remove fail on warnings. 
--- .github/workflows/scheduled_workflow.yml | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/.github/workflows/scheduled_workflow.yml b/.github/workflows/scheduled_workflow.yml index 0763244184..e4a1507a9e 100644 --- a/.github/workflows/scheduled_workflow.yml +++ b/.github/workflows/scheduled_workflow.yml @@ -2,25 +2,15 @@ name: ASPIRE Python Scheduled Workflow on: workflow_dispatch: # Manual "Run workflow" button + branches: + - develop schedule: - cron: '0 0 * * 0' # Every Sunday at 00:00 UTC + branches: + - develop jobs: - check: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - name: Install dependencies - run: | - pip install tox - - name: Run Tox Check - run: tox -e check - scheduled-tests: needs: check runs-on: ubuntu-latest @@ -35,4 +25,4 @@ jobs: run: | pip install -e ".[dev]" - name: Scheduled Tests - run: PYTHONWARNINGS=error python -m pytest -m scheduled + run: pytest -m scheduled From c48ad9fc2a0b67a3b155823eab36add9f0ff5e72 Mon Sep 17 00:00:00 2001 From: Josh Carmichael Date: Tue, 20 May 2025 10:59:29 -0400 Subject: [PATCH 08/11] Workflow updates: Checkout develop, set cron off-hour, remove needs field. 
--- .github/workflows/scheduled_workflow.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/scheduled_workflow.yml b/.github/workflows/scheduled_workflow.yml index e4a1507a9e..d4d82f80df 100644 --- a/.github/workflows/scheduled_workflow.yml +++ b/.github/workflows/scheduled_workflow.yml @@ -5,14 +5,12 @@ on: branches: - develop schedule: - - cron: '0 0 * * 0' # Every Sunday at 00:00 UTC - branches: - - develop + - cron: '15 0 * * 0' # Every Sunday at 00:15 UTC jobs: scheduled-tests: - needs: check + if: github.ref == 'refs/heads/develop' # Ensure only runs on develop runs-on: ubuntu-latest steps: From 679b2bf50e83df305aa56a87fcf2e7442abf9ab5 Mon Sep 17 00:00:00 2001 From: Josh Carmichael Date: Tue, 20 May 2025 14:38:49 -0400 Subject: [PATCH 09/11] test hash mismatch warning works --- tests/test_downloader.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 68e20b51fa..dfbc135c61 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -1,6 +1,9 @@ +from pathlib import Path + import pytest from aspire.downloader import download_all +from aspire.downloader.data_fetcher import _data_fetcher, fetch_data @pytest.mark.scheduled @@ -12,3 +15,31 @@ def test_download_all(caplog): if "Hash mismatch" in caplog.text: pytest.fail(f"Hash mismatch warning was logged.\nCaptured logs:\n{caplog.text}") + + +def test_fetch_data_warning(caplog): + """Test that we get expected warning on hash mismatch.""" + # Use the smallest dataset in the registry + dataset_name = "emdb_3645.map" + + # Remove file from cache if it exists + cached_path = Path(_data_fetcher.path) / dataset_name + if cached_path.exists(): + cached_path.unlink() + + # Save original hash from the registry + original_hash = _data_fetcher.registry.get(dataset_name) + assert original_hash is not None + + # Temporarily override the hash to simulate a mismatch + 
_data_fetcher.registry[dataset_name] = "md5:invalidhash123" + + try: + caplog.clear() + with caplog.at_level("WARNING"): + path = fetch_data(dataset_name) + assert path # Should return the path to the downloaded file + assert f"Hash mismatch for {dataset_name}" in caplog.text + finally: + # Restore original hash + _data_fetcher.registry[dataset_name] = original_hash From 77c9ad7d1ffb72998e7c3c775253241be26ad6cd Mon Sep 17 00:00:00 2001 From: Josh Carmichael Date: Wed, 21 May 2025 08:48:44 -0400 Subject: [PATCH 10/11] checkout on develop. mark test as scheduled. --- .github/workflows/scheduled_workflow.yml | 3 ++- tests/test_downloader.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/scheduled_workflow.yml b/.github/workflows/scheduled_workflow.yml index d4d82f80df..8f96eaa30e 100644 --- a/.github/workflows/scheduled_workflow.yml +++ b/.github/workflows/scheduled_workflow.yml @@ -10,11 +10,12 @@ on: jobs: scheduled-tests: - if: github.ref == 'refs/heads/develop' # Ensure only runs on develop runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + with: + ref: develop - name: Set up Python uses: actions/setup-python@v5 with: diff --git a/tests/test_downloader.py b/tests/test_downloader.py index dfbc135c61..cddc1cf177 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -17,6 +17,7 @@ def test_download_all(caplog): pytest.fail(f"Hash mismatch warning was logged.\nCaptured logs:\n{caplog.text}") +@pytest.mark.scheduled def test_fetch_data_warning(caplog): """Test that we get expected warning on hash mismatch.""" # Use the smallest dataset in the registry From 363f02ea11a97a211dae0098ed255ac0c0d707ef Mon Sep 17 00:00:00 2001 From: Josh Carmichael Date: Wed, 21 May 2025 11:17:57 -0400 Subject: [PATCH 11/11] remove manual run-workflow button --- .github/workflows/scheduled_workflow.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/scheduled_workflow.yml 
b/.github/workflows/scheduled_workflow.yml index 8f96eaa30e..77a789597e 100644 --- a/.github/workflows/scheduled_workflow.yml +++ b/.github/workflows/scheduled_workflow.yml @@ -1,9 +1,6 @@ name: ASPIRE Python Scheduled Workflow on: - workflow_dispatch: # Manual "Run workflow" button - branches: - - develop schedule: - cron: '15 0 * * 0' # Every Sunday at 00:15 UTC