From 8132509bd7306c53899602c8db190cedac5792df Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Fri, 5 Sep 2025 13:26:15 -0400 Subject: [PATCH 1/5] simplify example [skip ci] --- gallery/experiments/legacy_preprocessing.py | 44 +++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 gallery/experiments/legacy_preprocessing.py diff --git a/gallery/experiments/legacy_preprocessing.py b/gallery/experiments/legacy_preprocessing.py new file mode 100644 index 000000000..d0fb4ba6f --- /dev/null +++ b/gallery/experiments/legacy_preprocessing.py @@ -0,0 +1,44 @@ +""" +Legacy Preprocessing Pipeline +============================= +This notebook demonstrates reproducing preprocessing results from the legacy ASPIRE MATLAB workflow system for the EMPIAR 10028 ribosome dataset. + +https://www.ebi.ac.uk/empiar/EMPIAR-10028 +""" + +# %% +# Source data and Preprocessing +# ----------------------------- +# +# Load the data and apply preprocessing steps corresponding to the MATLAB prompts: +# +# >>> Phaseflip projections? (Y/N)? [Y] Y +# >>> Enter pixel size in Angstrom (-1 to read from STAR file): [-1.000000] +# >>> Number of projections to read? [105247] +# >>> Crop? (Y/N)? [Y] Y +# >>> Crop to size? [360]359 +# >>> Downsample? (Y/N)? [Y] Y +# >>> Downsample to size? [360]179 +# >>> Normalize background of images to variance 1? (Y/N)? [Y] +# >>> Prewhiten? (Y/N)? [Y] +# >>> Split data into groups? (Y/N)? [Y] N + +from aspire.source import RelionSource + +# Inputs +# Note the published ``shiny_2sets.star`` requires removal of a stray '9' character on line 5476. +starfile_in = "10028/data/shiny_2sets_fixed9.star" +# Caching, while not required, will increase speed in exchange for potentially increased memory usage. +src = RelionSource(starfile_in).cache() + +# Define preprocessing steps. +src = src.phase_flip().cache() +src = src.crop_pad(359).cache() +src = src.legacy_downsample(179).cache() +src = src.legacy_normalize_background().cache() +src = src.legacy_whiten().cache() +src = src.invert_contrast().cache() + +# Save the preprocessed images. +# `save_mode=single` will save a STAR file and single mrcs holding the image stack. +src.save("10028_legacy_preprocessed_179px.star", save_mode="single", overwrite=True) From bad4c98915c49e7f23c4c64b0381d200258b4b73 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Wed, 10 Sep 2025 11:38:52 -0400 Subject: [PATCH 2/5] Cleanup normalize_psd option --- src/aspire/noise/noise.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/aspire/noise/noise.py b/src/aspire/noise/noise.py index eb3aed385..7f2bccc01 100644 --- a/src/aspire/noise/noise.py +++ b/src/aspire/noise/noise.py @@ -361,7 +361,7 @@ class LegacyNoiseEstimator(NoiseEstimator): Isotropic noise estimator ported from MATLAB `cryo_noise_estimation`. """ - def __init__(self, src, bg_radius=None, max_d=None, batch_size=512): + def __init__(self, src, bg_radius=None, max_d=None, batch_size=512, normalize_psd=False): """ Given an `ImageSource`, prepares for estimation of noise spectrum. @@ -375,6 +375,8 @@ def __init__(self, src, bg_radius=None, max_d=None, batch_size=512): :param max_d: Max computed correlation distance as a proportion of `src.L`. Default of `None` uses `np.floor(src.L/3) / L`. :param batch_size: The size of the batches in which to compute the variance estimate. + :param normalize_psd: Optionally normalize PSD in way that reproduces MATLAB intermediates. + Only useful if utilizing the `PSD` for applications outside of built-in legacy whitening. """ if bg_radius is None: @@ -386,6 +388,8 @@ def __init__(self, src, bg_radius=None, max_d=None, batch_size=512): if self.max_d is None: self.max_d = np.floor(src.L / 3) / src.L + self.normalize_psd = bool(normalize_psd) + def estimate(self): """ :return: The estimated noise variance of the images. @@ -414,6 +418,7 @@ def _estimate_noise_psd(self): samples_idx, max_d_pixels, batch_size=self.batch_size, + normalize_psd=self.normalize_psd, )[0] return psd @@ -552,7 +557,7 @@ def _estimate_power_spectrum_distribution_2d( Default of `None` yields `np.floor(L / 3)`. :param batch_size: The size of the batches in which to compute the variance estimate. :normalize_psd: Optionally normalize returned PSD. - Disabled by default because it will typiccally be + Disabled by default because it will typically be renormalized later in preperation for the convolution application in `Image.legacy_whiten`. Enable to reproduce legacy PSD. @@ -567,9 +572,6 @@ def _estimate_power_spectrum_distribution_2d( L = samples_idx.shape[-1] batch_size = min(batch_size, n_img) - # Migrate mask to GPU as needed - _samples_idx = xp.asarray(samples_idx) - # Correlations more than `max_d` pixels apart are not computed. if max_d is None: max_d = np.floor(L / 3) @@ -612,25 +614,24 @@ def _estimate_power_spectrum_distribution_2d( # to estimate it. E = 0.0 # Total energy of the noise samples used to estimate the power spectrum. - samples = xp.zeros((batch_size, L, L), dtype=np.float64) + n_samples_per_img = int(np.count_nonzero(samples_idx)) + samples = xp.zeros((batch_size, n_samples_per_img), dtype=np.float64) for start in trange( 0, n_img, batch_size, desc="Estimating image noise energy" ): end = min(n_img, start + batch_size) cnt = end - start - samples[:cnt, _samples_idx] = images[start:end].asnumpy()[0][ - samples_idx - ] + samples[:cnt] = xp.asarray(images[start:end].asnumpy()[:,samples_idx]) E += xp.sum( ( samples[:cnt] - - xp.mean(samples[:cnt], axis=(1, 2)).reshape(cnt, 1, 1) + - xp.mean(samples[:cnt],axis=-1).reshape(cnt, 1) ) ** 2 ) + # Mean energy of the noise samples - n_samples_per_img = xp.count_nonzero(_samples_idx) meanE = E / (n_samples_per_img * n_img) # Normalize P2 such that its mean energy is preserved and is equal to From 598769db1e3c3e4c5853f03c50e12ea240c000ff Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Wed, 10 Sep 2025 11:44:58 -0400 Subject: [PATCH 3/5] String cleanup --- gallery/experiments/legacy_preprocessing.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gallery/experiments/legacy_preprocessing.py b/gallery/experiments/legacy_preprocessing.py index d0fb4ba6f..e6595d5db 100644 --- a/gallery/experiments/legacy_preprocessing.py +++ b/gallery/experiments/legacy_preprocessing.py @@ -1,7 +1,9 @@ """ Legacy Preprocessing Pipeline ============================= -This notebook demonstrates reproducing preprocessing results from the legacy ASPIRE MATLAB workflow system for the EMPIAR 10028 ribosome dataset. +This notebook demonstrates reproducing preprocessing results from the +legacy ASPIRE MATLAB workflow system for the EMPIAR 10028 ribosome +dataset. https://www.ebi.ac.uk/empiar/EMPIAR-10028 """ From 2355edd644d9e979e88eee5ee0d57ee818b0d69c Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Wed, 10 Sep 2025 12:27:51 -0400 Subject: [PATCH 4/5] Tox cleanup --- src/aspire/noise/noise.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/aspire/noise/noise.py b/src/aspire/noise/noise.py index 7f2bccc01..d9e136bf7 100644 --- a/src/aspire/noise/noise.py +++ b/src/aspire/noise/noise.py @@ -361,7 +361,9 @@ class LegacyNoiseEstimator(NoiseEstimator): Isotropic noise estimator ported from MATLAB `cryo_noise_estimation`. """ - def __init__(self, src, bg_radius=None, max_d=None, batch_size=512, normalize_psd=False): + def __init__( + self, src, bg_radius=None, max_d=None, batch_size=512, normalize_psd=False + ): """ Given an `ImageSource`, prepares for estimation of noise spectrum. @@ -622,12 +624,9 @@ def _estimate_power_spectrum_distribution_2d( end = min(n_img, start + batch_size) cnt = end - start - samples[:cnt] = xp.asarray(images[start:end].asnumpy()[:,samples_idx]) + samples[:cnt] = xp.asarray(images[start:end].asnumpy()[:, samples_idx]) E += xp.sum( - ( - samples[:cnt] - - xp.mean(samples[:cnt],axis=-1).reshape(cnt, 1) - ) + (samples[:cnt] - xp.mean(samples[:cnt], axis=-1).reshape(cnt, 1)) ** 2 ) From 392ad8e76843dae6f5ef5328a966fc40c8f152b6 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Wed, 10 Sep 2025 14:39:05 -0400 Subject: [PATCH 5/5] Gallery formatting --- gallery/experiments/legacy_preprocessing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery/experiments/legacy_preprocessing.py b/gallery/experiments/legacy_preprocessing.py index e6595d5db..e3cd44136 100644 --- a/gallery/experiments/legacy_preprocessing.py +++ b/gallery/experiments/legacy_preprocessing.py @@ -1,6 +1,7 @@ """ Legacy Preprocessing Pipeline ============================= + This notebook demonstrates reproducing preprocessing results from the legacy ASPIRE MATLAB workflow system for the EMPIAR 10028 ribosome dataset.