From 1df557e1c2bd87e35581b1029cd9571262ce4e28 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 9 May 2024 10:21:18 -0400
Subject: [PATCH 01/65] add simple basis benchmark and plotting script

[skip ci]
---
 bbenchmark/bbenchmark.py      |  57 ++++++++++++++++++++++++++++++
 bbenchmark/benchmark_gpu0.pkl | Bin 0 -> 307 bytes
 bbenchmark/benchmark_host.pkl | Bin 0 -> 307 bytes
 bbenchmark/plot_bb.py         |  64 ++++++++++++++++++++++++++++++++++
 4 files changed, 121 insertions(+)
 create mode 100644 bbenchmark/bbenchmark.py
 create mode 100644 bbenchmark/benchmark_gpu0.pkl
 create mode 100644 bbenchmark/benchmark_host.pkl
 create mode 100644 bbenchmark/plot_bb.py

diff --git a/bbenchmark/bbenchmark.py b/bbenchmark/bbenchmark.py
new file mode 100644
index 0000000000..6c6320c101
--- /dev/null
+++ b/bbenchmark/bbenchmark.py
@@ -0,0 +1,57 @@
+import os
+import pickle
+from pprint import pprint
+from time import perf_counter, time
+
+import matplotlib.pyplot as plt
+import numpy as np
+from aspire.basis import FFBBasis2D, FLEBasis2D
+from aspire.downloader import emdb_2660
+from aspire.noise import WhiteNoiseAdder
+from aspire.source import ArrayImageSource, Simulation
+
+# Download and cache volume map
+vol = emdb_2660().astype(np.float64)  # doubles
+cached_image_fn = "simulated_images.npy"
+
+if os.path.exists(cached_image_fn):
+    print(f"Loading cached image source from {cached_image_fn}.")
+    sim = ArrayImageSource(np.load(cached_image_fn))
+else:
+    print("Generating Simulated Datatset")
+    sim = Simulation(
+        n=512, C=1, vols=vol, noise_adder=WhiteNoiseAdder.from_snr(0.1)
+    ).cache()
+    print(f"Saving to {cached_image_fn}")
+    np.save(cached_image_fn, sim.images[:].asnumpy())
+
+
+TIMES = {}
+for L in [32, 64, 128, 256]:
+    print(f"Begin L={L}")
+    src = sim.downsample(L)
+    imgs = src.images[:]
+    TIMES[L] = {}
+    for basis_type in [FFBBasis2D, FLEBasis2D]:
+        # Construct basis
+        TIMES[L][basis_type.__name__] = {}
+        basis = basis_type(L, dtype=src.dtype)
+
+        # Time expanding into basis
+        tic = perf_counter()
+        coef = basis.evaluate_t(imgs)
+        toc = perf_counter()
+        TIMES[L][basis_type.__name__]["evaluate_t"] = toc - tic
+
+        # Time expanding back into images
+        tic = perf_counter()
+        _ = coef.evaluate()
+        toc = perf_counter()
+        TIMES[L][basis_type.__name__]["evaluate"] = toc - tic
+
+
+pprint(TIMES)
+
+
+with open(f"benchmark_{int(time())}.pkl", "wb") as fh:
+    pickle.dump(TIMES, fh)
diff --git a/bbenchmark/benchmark_gpu0.pkl b/bbenchmark/benchmark_gpu0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..e702dd442dced27acb26fcc3b3e395ce9f465585
GIT binary patch
literal 307
zcmZo*nX19a00y;FG`tmnL=Tsno0C&wab~fR%M>s_wJb5GG_fQ#zGRBK{fB=m-#lPo
z=;45g0>v(0*=`CnqZFvs#}!Fy28+7`_dcgM4hDt{R(A)1g!$SUKxL)g4nT7=m_P)J
zyZseA`*#jt74}bVm#$9$s>oo2$T<k@u$$8Z(Jkt9*ZK}vcLTESZ-p~N4IsLIOg-)u
zfUMi}RJdCWM7P=Y|AtRNx_ucKk+eIwdfq!S3!+41Yt11RWbKukocN6vfSq1>FJo@?
J0igC$Jpf$(W#0e*

literal 0
HcmV?d00001

diff --git a/bbenchmark/benchmark_host.pkl b/bbenchmark/benchmark_host.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..dc0dd2a1769fc52c9470986e74eb64864f59e7fe
GIT binary patch
literal 307
zcmZo*nX19a00y;FG`tmnL=Tsno0C&wab~fR%M>s_wJb5GG_fQ#zGRBK{l>F_wm|hg
z957L!*k;SlN}yONP^*tClGY3scLzS3waO9<3>mEM4m>kTmTLf&m3lh>&COr}5iIWZ
zmm4fH4}ewJUv3m%3o<B!4I<|ty7tqn6o_u2PZg>vVBHPKy1(9iTm;ktG~fPnyM$W-
zvToDPIg_qJbek#on}>jO`!X;hX?O6pRZ8-OC=s5uZo?jA?UmgRJ_s6sonHA^-k?wb
IsJ&DV0PQbeIsgCw

literal 0
HcmV?d00001

diff --git a/bbenchmark/plot_bb.py b/bbenchmark/plot_bb.py
new file mode 100644
index 0000000000..05f5350f4b
--- /dev/null
+++ b/bbenchmark/plot_bb.py
@@ -0,0 +1,64 @@
+import os
+import pickle
+from pprint import pprint
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+host_fn = "benchmark_host.pkl"
+gpu_fn = "benchmark_gpu0.pkl"
+
+
+with open(host_fn, "rb") as fh:
+    host_times = pickle.load(fh)
+
+with open(gpu_fn, "rb") as fh:
+    gpu_times = pickle.load(fh)
+
+markers = {"FFBBasis2D": "8", "FLEBasis2D": "s"}
+
+# Evaluate_t
+Ls = list(host_times.keys())
+for basis_type in markers.keys():
+    plt.plot(
+        Ls,
+        [host_times[L][basis_type]["evaluate_t"] for L in Ls],
+        marker=markers[basis_type],
+        color="blue",
+        label=basis_type + "-host",
+    )
+    plt.plot(
+        Ls,
+        [gpu_times[L][basis_type]["evaluate_t"] for L in Ls],
+        marker=markers[basis_type],
+        color="green",
+        label=basis_type + "-gpu",
+    )
+plt.title("Basis `evaluate_t` Permformance - Batch of 512 Images")
+plt.xlabel("Image Pixel L (LxL)")
+plt.ylabel("Time (seconds)")
+plt.legend()
+plt.savefig("evaluate_t.png")
+plt.show()
+
+for basis_type in markers.keys():
+    plt.plot(
+        Ls,
+        [host_times[L][basis_type]["evaluate"] for L in Ls],
+        marker=markers[basis_type],
+        color="blue",
+        label=basis_type + "-host",
+    )
+    plt.plot(
+        Ls,
+        [gpu_times[L][basis_type]["evaluate"] for L in Ls],
+        marker=markers[basis_type],
+        color="green",
+        label=basis_type + "-gpu",
+    )
+plt.title("Basis `evaluate` Permformance - Batch of 512 Images")
+plt.xlabel("Image Pixel L (LxL)")
+plt.ylabel("Time (seconds)")
+plt.legend()
+plt.savefig("evaluate.png")
+plt.show()

From 3a9e509aa43952403bf12c1f3e732fd2a4a1dcd2 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 4 Jun 2024 10:26:19 -0400
Subject: [PATCH 02/65] convert cufinufft towards cupy, keeping result on dvice

---
 pyproject.toml                | 10 ++++-----
 src/aspire/nufft/cufinufft.py | 40 +++++++++++++++++------------------
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3cd57981ef..fcc0f7cf4f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -61,11 +61,11 @@ dependencies = [
 "Source" = "https://github.com/ComputationalCryoEM/ASPIRE-Python"
 
 [project.optional-dependencies]
-gpu-102 = ["pycuda", "cupy-cuda102", "cufinufft==1.3"]
-gpu-110 = ["pycuda", "cupy-cuda110", "cufinufft==1.3"]
-gpu-111 = ["pycuda", "cupy-cuda111", "cufinufft==1.3"]
-gpu-11x = ["pycuda", "cupy-cuda11x", "cufinufft==1.3"]
-gpu-12x = ["pycuda", "cupy-cuda12x", "cufinufft==2.2.0"]
+gpu-102 = ["cupy-cuda102", "cufinufft==1.3"]
+gpu-110 = ["cupy-cuda110", "cufinufft==1.3"]
+gpu-111 = ["cupy-cuda111", "cufinufft==1.3"]
+gpu-11x = ["cupy-cuda11x", "cufinufft==1.3"]
+gpu-12x = ["cupy-cuda12x", "cufinufft==2.2.0"]
 dev = [
     "black",
     "bumpversion",
diff --git a/src/aspire/nufft/cufinufft.py b/src/aspire/nufft/cufinufft.py
index 465c0b23f9..2dceb08b80 100644
--- a/src/aspire/nufft/cufinufft.py
+++ b/src/aspire/nufft/cufinufft.py
@@ -1,9 +1,7 @@
 import logging
 
+import cupy as cp
 import numpy as np
-import pycuda.autoinit  # noqa: F401
-import pycuda.driver as cuda  # noqa: F401
-import pycuda.gpuarray as gpuarray  # noqa: F401
 from cufinufft import Plan as cufPlan
 
 from aspire.nufft import Plan
@@ -85,7 +83,7 @@ def __init__(self, sz, fourier_pts, epsilon=1e-8, ntransforms=1, **kwargs):
 
         # Note, I store self.fourier_pts_gpu so the GPUArrray life
         #   is tied to instance, instead of this method.
-        self.fourier_pts_gpu = gpuarray.to_gpu(self.fourier_pts)
+        self.fourier_pts_gpu = cp.array(self.fourier_pts)
 
         self._transform_plan.setpts(*self.fourier_pts_gpu)
         self._adjoint_plan.setpts(*self.fourier_pts_gpu)
@@ -99,7 +97,7 @@ def transform(self, signal):
         For a batch, signal should have shape `(*sz, ntransforms)`.
 
         :returns: Transformed signal of shape `num_pts` or
-        `(ntransforms, num_pts)`.
+        `(ntransforms, num_pts)` as CuPy array.
         """
 
         # Check we're not forcing a dtype workaround for ASPIRE-Python/703,
@@ -113,6 +111,8 @@ def transform(self, signal):
                 " In the future this will be an error."
             )
 
+        signal = cp.asarray(signal, dtype=self.complex_dtype)
+
         sig_shape = signal.shape
         res_shape = self.num_pts
         # Note, there is a corner case for ntransforms == 1.
@@ -134,17 +134,16 @@ def transform(self, signal):
             sig_shape == self.sz
         ), f"Signal frame to be transformed must have shape {self.sz}"
 
-        signal_gpu = gpuarray.to_gpu(
-            np.ascontiguousarray(signal, dtype=self.complex_dtype)
-        )
+        result = cp.empty(res_shape, dtype=self.complex_dtype)
 
-        result_gpu = gpuarray.GPUArray(res_shape, dtype=self.complex_dtype)
+        if signal.dtype != self.complex_dtype:
+            signal = signal.astype(self.complex_dtype)
 
-        self._transform_plan.execute(signal_gpu, out=result_gpu)
+        self._transform_plan.execute(signal, out=result)
 
-        result = result_gpu.get()
         # ASPIRE-Python/703
-        result = result.astype(complex_type(self._original_dtype), copy=False)
+        if result.dtype != complex_type(self._original_dtype):
+            result = result.astype(complex_type(self._original_dtype))
 
         return result
 
@@ -156,7 +155,7 @@ def adjoint(self, signal):
         this should be a a 1D array of len `num_pts`.
         For a batch, signal should have shape `(ntransforms, num_pts)`.
 
-        :returns: Transformed signal `(sz)` or `(sz, ntransforms)`.
+        :returns: Transformed signal `(sz)` or `(sz, ntransforms)` as CuPy array.
         """
 
         # Check we're not forcing a dtype workaround for ASPIRE-Python/703,
@@ -170,6 +169,8 @@ def adjoint(self, signal):
                 " In the future this will be an error."
             )
 
+        signal = cp.asarray(signal, dtype=self.complex_dtype)
+
         res_shape = self.sz
         # Note, there is a corner case for ntransforms == 1.
         if self.ntransforms > 1 or (self.ntransforms == 1 and len(signal.shape) == 2):
@@ -181,16 +182,15 @@ def adjoint(self, signal):
             ), "For multiple transforms, signal stack length should match ntransforms {self.ntransforms}."
             res_shape = (self.ntransforms, *self.sz)
 
-        signal_gpu = gpuarray.to_gpu(
-            np.ascontiguousarray(signal, dtype=self.complex_dtype)
-        )
+        result = cp.empty(res_shape, dtype=self.complex_dtype)
 
-        result_gpu = gpuarray.GPUArray(res_shape, dtype=self.complex_dtype)
+        if signal.dtype != self.complex_dtype:
+            signal = signal.astype(self.complex_dtype)
 
-        self._adjoint_plan.execute(signal_gpu, out=result_gpu)
+        self._adjoint_plan.execute(signal, out=result)
 
-        result = result_gpu.get()
         # ASPIRE-Python/703
-        result = result.astype(complex_type(self._original_dtype), copy=False)
+        if result.dtype != complex_type(self._original_dtype):
+            result = result.astype(complex_type(self._original_dtype))
 
         return result

From 4269f10ab3c88d2c10782e04187ea6260608c1b1 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 4 Jun 2024 10:49:11 -0400
Subject: [PATCH 03/65] convert anufft and nufft towards detecting whether to
 keep array on gpu

[skip ci]
---
 src/aspire/nufft/__init__.py | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/src/aspire/nufft/__init__.py b/src/aspire/nufft/__init__.py
index aa7c3a4adf..f748a10d84 100644
--- a/src/aspire/nufft/__init__.py
+++ b/src/aspire/nufft/__init__.py
@@ -2,6 +2,11 @@
 
 import numpy as np
 
+try:
+    import cupy as cp
+except ModuleNotFoundError:
+    cp = None
+
 from aspire import config
 from aspire.utils import LogFilterByCount, complex_type, real_type
 
@@ -152,6 +157,9 @@ def anufft(sig_f, fourier_pts, sz, real=False, epsilon=1e-8):
 
     Selects best available package from `nfft` `backends` configuration list.
 
+    When sig_f is provided as a CuPy gpu array with a cufinufft
+    backend, result is maintained on GPU.
+
     :param sig_f: Array representing the signal(s) in Fourier space to be transformed. \
     sig_f either matches length of fourier_pts or sig_f.shape is stack of (`ntransforms`, ...).
     :param fourier_pts: The points in Fourier space where the Fourier transform is to be calculated,
@@ -162,6 +170,10 @@ def anufft(sig_f, fourier_pts, sz, real=False, epsilon=1e-8):
 
     """
 
+    on_gpu = False
+    if cp and isinstance(sig_f, cp.ndarray):
+        on_gpu = True
+
     if fourier_pts.dtype != real_type(sig_f.dtype):
         raise RuntimeError(
             "anufft passed inconsistent dtypes."
@@ -181,7 +193,13 @@ def anufft(sig_f, fourier_pts, sz, real=False, epsilon=1e-8):
         sz=sz, fourier_pts=fourier_pts, ntransforms=ntransforms, epsilon=epsilon
     )
     adjoint = plan.adjoint(sig_f)
-    return np.real(adjoint) if real else adjoint
+
+    adjoint = adjoint.real if real else adjoint
+
+    if not on_gpu:
+        adjoint = adjoint.get()
+
+    return adjoint
 
 
 def nufft(sig_f, fourier_pts, real=False, epsilon=1e-8):
@@ -191,6 +209,9 @@ def nufft(sig_f, fourier_pts, real=False, epsilon=1e-8):
 
     Selects best available package from `nfft` `backends` configuration list.
 
+    When sig_f is provided as a CuPy gpu array with a cufinufft
+    backend, result is maintained on GPU.
+
     :param sig_f: Array representing the signal(s) in real space to be transformed. \
     sig_f either matches `sz` or sig_f.shape is stack of (..., `ntransforms`).
     :param fourier_pts: The points in Fourier space where the Fourier transform is to be calculated,
@@ -200,6 +221,10 @@ def nufft(sig_f, fourier_pts, real=False, epsilon=1e-8):
 
     """
 
+    on_gpu = False
+    if cp and isinstance(sig_f, cp.ndarray):
+        on_gpu = True
+
     if fourier_pts.dtype != real_type(sig_f.dtype):
         raise RuntimeError(
             "nufft passed inconsistent dtypes."
@@ -229,4 +254,10 @@ def nufft(sig_f, fourier_pts, real=False, epsilon=1e-8):
         sz=sz, fourier_pts=fourier_pts, ntransforms=ntransforms, epsilon=epsilon
     )
     transform = plan.transform(sig_f)
-    return np.real(transform) if real else transform
+
+    transform = transform.real if real else transform
+
+    if not on_gpu:
+        transform = transform.get()
+
+    return transform

From 99d60d1883975f5df944cf563cd277d7e47a5961 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 5 Jun 2024 10:17:06 -0400
Subject: [PATCH 04/65] whitespace

---
 bbenchmark/bbenchmark.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bbenchmark/bbenchmark.py b/bbenchmark/bbenchmark.py
index 6c6320c101..01aac6e7eb 100644
--- a/bbenchmark/bbenchmark.py
+++ b/bbenchmark/bbenchmark.py
@@ -5,6 +5,7 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
+
 from aspire.basis import FFBBasis2D, FLEBasis2D
 from aspire.downloader import emdb_2660
 from aspire.noise import WhiteNoiseAdder

From 967e3da6eae3b03c1d4d6fee308bf544e9157317 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 5 Jun 2024 10:16:54 -0400
Subject: [PATCH 05/65] add sparse cupy gpu wrapper and tests for methods in
 use

---
 src/aspire/numeric/__init__.py | 19 ++++++++++++
 tests/test_numeric_sparse.py   | 54 ++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 tests/test_numeric_sparse.py

diff --git a/src/aspire/numeric/__init__.py b/src/aspire/numeric/__init__.py
index d298f131e4..95283e5d87 100644
--- a/src/aspire/numeric/__init__.py
+++ b/src/aspire/numeric/__init__.py
@@ -35,3 +35,22 @@ def fft_object(which):
 
 
 fft = fft_object(config["common"]["fft"].as_str())
+
+
+# Configure `sparse` in tandem with `numeric` as the arrays generally will need to interoperate.
+def sparse_object(which):
+    if which == "cupy":
+        from cupyx.scipy import sparse as SparseClass
+
+        # CuPy imports don't work the same as scipy
+        from cupyx.scipy.sparse.linalg import eigsh
+
+        SparseClass.linalg.eigsh = eigsh
+    elif which == "numpy":
+        from scipy import sparse as SparseClass
+    else:
+        raise RuntimeError(f"Invalid selection for sparse module: {which}")
+    return SparseClass
+
+
+sparse = sparse_object(config["common"]["numeric"].as_str())
diff --git a/tests/test_numeric_sparse.py b/tests/test_numeric_sparse.py
new file mode 100644
index 0000000000..3964419e21
--- /dev/null
+++ b/tests/test_numeric_sparse.py
@@ -0,0 +1,54 @@
+import numpy as np
+import pytest
+
+from aspire.numeric import numeric_object, sparse_object
+
+# If cupy is not available, skip this entire test module
+pytest.importorskip("cupy")
+
+NUMERICS = ["numpy", "cupy"]
+
+
+@pytest.fixture(params=NUMERICS, ids=lambda x: f"{x}", scope="module")
+def backends(request):
+    xp = numeric_object(request.param)
+    sparse = sparse_object(request.param)
+    return xp, sparse
+
+
+def test_csr_matrix(backends):
+    """
+    Create csr_matrix and multiply with an `xp` array.
+    """
+    xp, sparse = backends
+
+    m, n = 10, 10
+    jdx = xp.arange(10)
+    idx = xp.arange(10)
+    vals = xp.random.random(10)
+
+    # Compute dense matmul
+    _A = np.diag(xp.asnumpy(vals))
+    _B = np.random.random((10, 20))
+    _C = _A @ _B
+
+    # Compute matmul using sparse csr
+    A = sparse.csr_matrix((vals, (jdx, idx)), shape=(m, n), dtype=np.float64)
+    B = xp.array(_B)
+    C = A @ B
+
+    # Compare
+    np.testing.assert_allclose(_C, xp.asnumpy(C))
+
+
+def test_eigsh(backends):
+    """
+    Invoke sparse eigsh call with `xp` arrays.
+    """
+    xp, sparse = backends
+
+    A = xp.eye(123)
+
+    lamb, _ = sparse.linalg.eigsh(A)
+    np.testing.assert_allclose(xp.asnumpy(lamb), 1.0)
+    print(lamb)

From f131f73cd45eb93e65f20a18089011e5bfb3a5b1 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 5 Jun 2024 15:11:20 -0400
Subject: [PATCH 06/65] fixup mn

---
 tests/test_numeric_sparse.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_numeric_sparse.py b/tests/test_numeric_sparse.py
index 3964419e21..45c28bef31 100644
--- a/tests/test_numeric_sparse.py
+++ b/tests/test_numeric_sparse.py
@@ -23,13 +23,13 @@ def test_csr_matrix(backends):
     xp, sparse = backends
 
     m, n = 10, 10
-    jdx = xp.arange(10)
-    idx = xp.arange(10)
+    jdx = xp.arange(m)
+    idx = xp.arange(n)
     vals = xp.random.random(10)
 
     # Compute dense matmul
     _A = np.diag(xp.asnumpy(vals))
-    _B = np.random.random((10, 20))
+    _B = np.random.random((n, 20))
     _C = _A @ _B
 
     # Compute matmul using sparse csr

From 1afb05811e4d2df84fbb0c3add5cf1965f8bb174 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 5 Jun 2024 10:46:35 -0400
Subject: [PATCH 07/65] first pass migrating FLE to cupy via xp

---
 src/aspire/basis/fle_2d.py       | 37 ++++++++++++++++++--------------
 src/aspire/basis/fle_2d_utils.py | 16 ++++++++------
 src/aspire/config_default.yaml   |  2 +-
 3 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/src/aspire/basis/fle_2d.py b/src/aspire/basis/fle_2d.py
index 423d37c093..ce6f04d3a2 100644
--- a/src/aspire/basis/fle_2d.py
+++ b/src/aspire/basis/fle_2d.py
@@ -1,7 +1,6 @@
 import logging
 
 import numpy as np
-import scipy.sparse as sparse
 from scipy.fft import dct, idct
 from scipy.special import jv
 
@@ -13,7 +12,7 @@
     transform_complex_to_real,
 )
 from aspire.nufft import anufft, nufft
-from aspire.numeric import fft
+from aspire.numeric import fft, sparse, xp
 from aspire.operators import DiagMatrix
 from aspire.utils import complex_type, grid_2d
 
@@ -440,7 +439,7 @@ def _create_basis_functions(self):
         """
         Generate the actual basis functions as Python lambda operators
         """
-        norm_constants = np.zeros(self.count)
+        norm_constants = xp.zeros(self.count)
         basis_functions = [None] * self.count
         for i in range(self.count):
             # parameters defining the basis function: bessel order and which bessel root
@@ -537,13 +536,14 @@ def _step2_t(self, z):
         num_img = z.shape[0]
         # Compute FFT along angular nodes
         betas = fft.fft(z, axis=2) / self.num_angular_nodes
+        betas = xp.asarray(betas)  # RM
         betas = betas[:, :, self.nus]
-        betas = np.conj(betas)
-        betas = np.swapaxes(betas, 0, 2)
+        betas = betas.conj()
+        betas = betas.swapaxes(0, 2)
         betas = betas.reshape(-1, self.num_radial_nodes * num_img)
         betas = self.c2r_nus @ betas
         betas = betas.reshape(-1, self.num_radial_nodes, num_img)
-        betas = np.real(np.swapaxes(betas, 0, 2))
+        betas = betas.swapaxes(0, 2).real
         return betas
 
     def _step3_t(self, betas):
@@ -554,18 +554,20 @@ def _step3_t(self, betas):
         """
         num_img = betas.shape[0]
         if self.num_interp > self.num_radial_nodes:
+            betas = xp.asnumpy(betas)
             betas = dct(betas, axis=1, type=2) / (2 * self.num_radial_nodes)
             zeros = np.zeros(betas.shape)
             betas = np.concatenate((betas, zeros), axis=1)
             betas = idct(betas, axis=1, type=2) * 2 * betas.shape[1]
-        betas = np.moveaxis(betas, 0, -1)
+            betas = xp.asarray(betas)
+        betas = xp.moveaxis(betas, 0, -1)
 
-        coefs = np.zeros((self.count, num_img), dtype=np.float64)
+        coefs = xp.zeros((self.count, num_img), dtype=np.float64)
         for i in range(self.ell_p_max + 1):
             coefs[self.idx_list[i]] = self.A3[i] @ betas[:, i, :]
         coefs = coefs.T
 
-        return coefs * self.norm_constants / self.h
+        return xp.asnumpy(coefs * self.norm_constants / self.h)
 
     def _step3(self, coefs):
         """
@@ -574,19 +576,20 @@ def _step3(self, coefs):
         Uses barycenteric interpolation in reverse to compute values of Betas
             at Chebyshev nodes, given an array of FLE coefficients.
         """
-        coefs = coefs.copy().reshape(-1, self.count)
+        coefs = xp.asarray(coefs.reshape(-1, self.count))
         num_img = coefs.shape[0]
         coefs *= self.h * self.norm_constants
         coefs = coefs.T
 
-        out = np.zeros(
+        out = xp.zeros(
             (self.num_interp, 2 * self.max_ell + 1, num_img),
             dtype=np.float64,
         )
         for i in range(self.ell_p_max + 1):
             out[:, i, :] = self.A3_T[i] @ coefs[self.idx_list[i]]
-        out = np.moveaxis(out, -1, 0)
+        out = xp.moveaxis(out, -1, 0)
         if self.num_interp > self.num_radial_nodes:
+            out = xp.asnumpy(out)  # RM
             out = dct(out, axis=1, type=2)
             out = out[:, : self.num_radial_nodes, :]
             out = idct(out, axis=1, type=2)
@@ -599,19 +602,21 @@ def _step2(self, betas):
             to images).
         Uses the IFFT to convert Beta values into Fourier-space images.
         """
+        betas = xp.asarray(betas)
         num_img = betas.shape[0]
-        tmp = np.zeros(
+        tmp = xp.zeros(
             (num_img, self.num_radial_nodes, self.num_angular_nodes),
             dtype=np.complex128,
         )
 
-        betas = np.swapaxes(betas, 0, 2)
+        betas = betas.swapaxes(0, 2)
         betas = betas.reshape(-1, self.num_radial_nodes * num_img)
         betas = self.r2c_nus @ betas
         betas = betas.reshape(-1, self.num_radial_nodes, num_img)
-        betas = np.swapaxes(betas, 0, 2)
+        betas = betas.swapaxes(0, 2)
 
-        tmp[:, :, self.nus] = np.conj(betas)
+        tmp[:, :, self.nus] = betas.conj()
+        tmp = xp.asnumpy(tmp)  # rm
         z = fft.ifft(tmp, axis=2)
 
         return z
diff --git a/src/aspire/basis/fle_2d_utils.py b/src/aspire/basis/fle_2d_utils.py
index cde0cd11bf..23d1441a68 100644
--- a/src/aspire/basis/fle_2d_utils.py
+++ b/src/aspire/basis/fle_2d_utils.py
@@ -1,5 +1,6 @@
 import numpy as np
-import scipy.sparse as sparse
+
+from aspire.numeric import sparse, xp
 
 
 def transform_complex_to_real(B, ells):
@@ -43,9 +44,9 @@ def precomp_transform_complex_to_real(ells):
     """
     count = len(ells)
     num_nonzero = np.sum(ells == 0) + 2 * np.sum(ells != 0)
-    idx = np.zeros(num_nonzero, dtype=int)
-    jdx = np.zeros(num_nonzero, dtype=int)
-    vals = np.zeros(num_nonzero, dtype=np.complex128)
+    idx = xp.zeros(num_nonzero, dtype=int)
+    jdx = xp.zeros(num_nonzero, dtype=int)
+    vals = xp.zeros(num_nonzero, dtype=np.complex128)
 
     k = 0
     for i in range(count):
@@ -190,9 +191,10 @@ def barycentric_interp_sparse(target_points, known_points, numsparse):
     # note that const cancels in numerator and denominator
     vals = vals / denom.reshape(-1, 1)
 
-    vals = vals.flatten()
-    idx = idx.flatten()
-    jdx = jdx.flatten()
+    # TODO, migrate more of this method towards `xp`
+    vals = xp.array(vals.flatten())
+    idx = xp.array(idx.flatten())
+    jdx = xp.array(jdx.flatten())
     # A is the linear operator mapping the function values from the fixed source
     # points to the fixed target points.
     # A(i,j) = \ell(x[i] ) w_j/(x[i] - xs[j]), with the notation in Eq. 3.3
diff --git a/src/aspire/config_default.yaml b/src/aspire/config_default.yaml
index def78983c0..26176a97ac 100644
--- a/src/aspire/config_default.yaml
+++ b/src/aspire/config_default.yaml
@@ -1,7 +1,7 @@
 version: 0.12.3
 common:
     # numeric module to use - one of numpy/cupy
-    numeric: numpy
+    numeric: cupy
     # fft backend to use - one of pyfftw/scipy/cupy/mkl
     fft: scipy
 

From 257d69f94d157a230647211fc50eba256ecd4d63 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 5 Jun 2024 11:02:32 -0400
Subject: [PATCH 08/65] add dct/idct to pyfftw, scipy, cupy wrappers

---
 src/aspire/basis/fle_2d.py       | 13 ++++++-------
 src/aspire/numeric/cupy_fft.py   |  6 ++++++
 src/aspire/numeric/pyfftw_fft.py |  6 ++++++
 src/aspire/numeric/scipy_fft.py  |  6 ++++++
 4 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/src/aspire/basis/fle_2d.py b/src/aspire/basis/fle_2d.py
index ce6f04d3a2..5675becf86 100644
--- a/src/aspire/basis/fle_2d.py
+++ b/src/aspire/basis/fle_2d.py
@@ -1,7 +1,6 @@
 import logging
 
 import numpy as np
-from scipy.fft import dct, idct
 from scipy.special import jv
 
 from aspire.basis import Coef, FBBasisMixin, SteerableBasis2D
@@ -555,10 +554,10 @@ def _step3_t(self, betas):
         num_img = betas.shape[0]
         if self.num_interp > self.num_radial_nodes:
             betas = xp.asnumpy(betas)
-            betas = dct(betas, axis=1, type=2) / (2 * self.num_radial_nodes)
+            betas = fft.dct(betas, axis=1, type=2) / (2 * self.num_radial_nodes)
             zeros = np.zeros(betas.shape)
             betas = np.concatenate((betas, zeros), axis=1)
-            betas = idct(betas, axis=1, type=2) * 2 * betas.shape[1]
+            betas = fft.idct(betas, axis=1, type=2) * 2 * betas.shape[1]
             betas = xp.asarray(betas)
         betas = xp.moveaxis(betas, 0, -1)
 
@@ -590,9 +589,9 @@ def _step3(self, coefs):
         out = xp.moveaxis(out, -1, 0)
         if self.num_interp > self.num_radial_nodes:
             out = xp.asnumpy(out)  # RM
-            out = dct(out, axis=1, type=2)
+            out = fft.dct(out, axis=1, type=2)
             out = out[:, : self.num_radial_nodes, :]
-            out = idct(out, axis=1, type=2)
+            out = fft.idct(out, axis=1, type=2)
 
         return out
 
@@ -736,10 +735,10 @@ def _radial_convolve_weights(self, b):
         b = np.squeeze(b)
         b = np.array(b)
         if self.num_interp > self.num_radial_nodes:
-            b = dct(b, axis=0, type=2) / (2 * self.num_radial_nodes)
+            b = fft.dct(b, axis=0, type=2) / (2 * self.num_radial_nodes)
             bz = np.zeros(b.shape)
             b = np.concatenate((b, bz), axis=0)
-            b = idct(b, axis=0, type=2) * 2 * b.shape[0]
+            b = fft.idct(b, axis=0, type=2) * 2 * b.shape[0]
         a = np.zeros(self.count, dtype=np.float64)
         y = [None] * (self.ell_p_max + 1)
         for i in range(self.ell_p_max + 1):
diff --git a/src/aspire/numeric/cupy_fft.py b/src/aspire/numeric/cupy_fft.py
index 4f45f92117..3327c78f7d 100644
--- a/src/aspire/numeric/cupy_fft.py
+++ b/src/aspire/numeric/cupy_fft.py
@@ -33,3 +33,9 @@ def fftshift(self, x, axes=None):
 
     def ifftshift(self, x, axes=None):
         return cp.fft.ifftshift(x, axes=axes)
+
+    def dct(self, *args, **kwargs):
+        return cp.fft.dct(*args, **kwargs)
+
+    def idct(self, *args, **kwargs):
+        return cp.fft.idct(*args, **kwargs)
diff --git a/src/aspire/numeric/pyfftw_fft.py b/src/aspire/numeric/pyfftw_fft.py
index 9cfdd45210..afcad98d28 100644
--- a/src/aspire/numeric/pyfftw_fft.py
+++ b/src/aspire/numeric/pyfftw_fft.py
@@ -159,3 +159,9 @@ def fftshift(self, a, axes=None):
 
     def ifftshift(self, a, axes=None):
         return scipy_fft.ifftshift(a, axes=axes)
+
+    def dct(self, *args, **kwargs):
+        return scipy_fft.dct(*args, **kwargs)
+
+    def idct(self, *args, **kwargs):
+        return scipy_fft.idct(*args, **kwargs)
diff --git a/src/aspire/numeric/scipy_fft.py b/src/aspire/numeric/scipy_fft.py
index c5a392f96b..d78e463803 100644
--- a/src/aspire/numeric/scipy_fft.py
+++ b/src/aspire/numeric/scipy_fft.py
@@ -33,3 +33,9 @@ def fftshift(self, x, axes=None):
 
     def ifftshift(self, x, axes=None):
         return sp.fft.ifftshift(x, axes=axes)
+
+    def dct(self, *args, **kwargs):
+        return sp.fft.dct(*args, **kwargs)
+
+    def idct(self, *args, **kwargs):
+        return sp.fft.idct(*args, **kwargs)

From 64578372fe61448106eb4f06808606778dc0d04d Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 5 Jun 2024 11:52:13 -0400
Subject: [PATCH 09/65] phase 2, fle internals

---
 src/aspire/basis/fle_2d.py     | 15 ++++-----------
 src/aspire/config_default.yaml |  2 +-
 src/aspire/image/image.py      |  7 +++++--
 src/aspire/numeric/__init__.py | 15 +++++++++++++++
 src/aspire/numeric/cupy_fft.py |  5 +++--
 src/aspire/numeric/numpy.py    |  7 ++++++-
 6 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/src/aspire/basis/fle_2d.py b/src/aspire/basis/fle_2d.py
index 5675becf86..6215baaca8 100644
--- a/src/aspire/basis/fle_2d.py
+++ b/src/aspire/basis/fle_2d.py
@@ -496,7 +496,7 @@ def _evaluate_t(self, imgs):
             coefficients.
         """
         # See Section 3.5
-        imgs = imgs.copy()
+        imgs = xp.array(imgs)  # Copy here, mutating.
         imgs[:, self.radial_mask] = 0
         z = self._step1_t(imgs)
         b = self._step2_t(z)
@@ -513,7 +513,7 @@ def _step1_t(self, im):
         """
         im = im.reshape(-1, self.nres, self.nres).astype(complex_type(self.dtype))
         num_img = im.shape[0]
-        z = np.zeros(
+        z = xp.zeros(
             (num_img, self.num_radial_nodes, self.num_angular_nodes),
             dtype=complex_type(self.dtype),
         )
@@ -535,7 +535,6 @@ def _step2_t(self, z):
         num_img = z.shape[0]
         # Compute FFT along angular nodes
         betas = fft.fft(z, axis=2) / self.num_angular_nodes
-        betas = xp.asarray(betas)  # RM
         betas = betas[:, :, self.nus]
         betas = betas.conj()
         betas = betas.swapaxes(0, 2)
@@ -553,12 +552,9 @@ def _step3_t(self, betas):
         """
         num_img = betas.shape[0]
         if self.num_interp > self.num_radial_nodes:
-            betas = xp.asnumpy(betas)
             betas = fft.dct(betas, axis=1, type=2) / (2 * self.num_radial_nodes)
-            zeros = np.zeros(betas.shape)
-            betas = np.concatenate((betas, zeros), axis=1)
+            betas = xp.concatenate((betas, xp.zeros(betas.shape)), axis=1)
             betas = fft.idct(betas, axis=1, type=2) * 2 * betas.shape[1]
-            betas = xp.asarray(betas)
         betas = xp.moveaxis(betas, 0, -1)
 
         coefs = xp.zeros((self.count, num_img), dtype=np.float64)
@@ -588,7 +584,6 @@ def _step3(self, coefs):
             out[:, i, :] = self.A3_T[i] @ coefs[self.idx_list[i]]
         out = xp.moveaxis(out, -1, 0)
         if self.num_interp > self.num_radial_nodes:
-            out = xp.asnumpy(out)  # RM
             out = fft.dct(out, axis=1, type=2)
             out = out[:, : self.num_radial_nodes, :]
             out = fft.idct(out, axis=1, type=2)
@@ -601,7 +596,6 @@ def _step2(self, betas):
             to images).
         Uses the IFFT to convert Beta values into Fourier-space images.
         """
-        betas = xp.asarray(betas)
         num_img = betas.shape[0]
         tmp = xp.zeros(
             (num_img, self.num_radial_nodes, self.num_angular_nodes),
@@ -615,7 +609,6 @@ def _step2(self, betas):
         betas = betas.swapaxes(0, 2)
 
         tmp[:, :, self.nus] = betas.conj()
-        tmp = xp.asnumpy(tmp)  # rm
         z = fft.ifft(tmp, axis=2)
 
         return z
@@ -639,7 +632,7 @@ def _step1(self, z):
         im = im.reshape(num_img, self.nres, self.nres)
         im[:, self.radial_mask] = 0
 
-        return im
+        return xp.asnumpy(im)
 
     def _create_dense_matrix(self):
         """
diff --git a/src/aspire/config_default.yaml b/src/aspire/config_default.yaml
index 26176a97ac..fed4cea50a 100644
--- a/src/aspire/config_default.yaml
+++ b/src/aspire/config_default.yaml
@@ -3,7 +3,7 @@ common:
     # numeric module to use - one of numpy/cupy
     numeric: cupy
     # fft backend to use - one of pyfftw/scipy/cupy/mkl
-    fft: scipy
+    fft: cupy
 
     # Set cache directory for ASPIRE example data.
     # By default the cache location will be set by pooch.os_cache(),
diff --git a/src/aspire/image/image.py b/src/aspire/image/image.py
index fd160d1644..892b6accd0 100644
--- a/src/aspire/image/image.py
+++ b/src/aspire/image/image.py
@@ -357,11 +357,14 @@ def downsample(self, ds_res):
         im = self.stack_reshape(-1)
 
         # compute FT with centered 0-frequency
-        fx = fft.centered_fft2(im._data)
+        fx = xp.asnumpy(fft.centered_fft2(xp.asarray(im._data)))
         # crop 2D Fourier transform for each image
         crop_fx = np.array([crop_pad_2d(fx[i], ds_res) for i in range(self.n_images)])
         # take back to real space, discard complex part, and scale
-        out = np.real(fft.centered_ifft2(crop_fx)) * (ds_res**2 / self.resolution**2)
+        out = fft.centered_ifft2(xp.asarray(crop_fx)).real * (
+            ds_res**2 / self.resolution**2
+        )
+        out = xp.asnumpy(out)
 
         return self.__class__(out).stack_reshape(original_stack_shape)
 
diff --git a/src/aspire/numeric/__init__.py b/src/aspire/numeric/__init__.py
index 95283e5d87..be88775498 100644
--- a/src/aspire/numeric/__init__.py
+++ b/src/aspire/numeric/__init__.py
@@ -36,6 +36,21 @@ def fft_object(which):
 
 fft = fft_object(config["common"]["fft"].as_str())
 
+# Sanity check.
+if (config["common"]["numeric"].as_str() == "cupy") and (
+    config["common"]["fft"].as_str() != "cupy"
+):
+    raise RuntimeError(
+        "Using `cupy` numeric backend without `cupy` fft is unsupported."
+    )
+
+if (config["common"]["fft"].as_str() == "cupy") and (
+    config["common"]["numeric"].as_str() != "cupy"
+):
+    raise RuntimeError(
+        "Using `cupy` fft without `cupy` numeric backend is unsupported."
+    )
+
 
 # Configure `sparse` in tandem with `numeric` as the arrays generally will need to interoperate.
 def sparse_object(which):
diff --git a/src/aspire/numeric/cupy_fft.py b/src/aspire/numeric/cupy_fft.py
index 3327c78f7d..29939e504c 100644
--- a/src/aspire/numeric/cupy_fft.py
+++ b/src/aspire/numeric/cupy_fft.py
@@ -1,4 +1,5 @@
 import cupy as cp
+import cupyx.scipy.fft as cufft
 
 from aspire.numeric.base_fft import FFT
 
@@ -35,7 +36,7 @@ def ifftshift(self, x, axes=None):
         return cp.fft.ifftshift(x, axes=axes)
 
     def dct(self, *args, **kwargs):
-        return cp.fft.dct(*args, **kwargs)
+        return cufft.dct(*args, **kwargs)
 
     def idct(self, *args, **kwargs):
-        return cp.fft.idct(*args, **kwargs)
+        return cufft.idct(*args, **kwargs)
diff --git a/src/aspire/numeric/numpy.py b/src/aspire/numeric/numpy.py
index 3237c2c3ad..9367409c78 100644
--- a/src/aspire/numeric/numpy.py
+++ b/src/aspire/numeric/numpy.py
@@ -1,8 +1,13 @@
+import cupy as cp
 import numpy as np
 
 
 class Numpy:
-    asnumpy = staticmethod(lambda x: x)
+    @staticmethod
+    def asnumpy(x):
+        if isinstance(x, cp.ndarray):
+            x = x.get()
+        return x
 
     def __getattr__(self, item):
         """

From eb5ffedde05b077d2c98554dde7c1e3951969238 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 5 Jun 2024 12:20:05 -0400
Subject: [PATCH 10/65] mem cleanup workaround

---
 src/aspire/basis/fle_2d.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/aspire/basis/fle_2d.py b/src/aspire/basis/fle_2d.py
index 6215baaca8..4278331e1a 100644
--- a/src/aspire/basis/fle_2d.py
+++ b/src/aspire/basis/fle_2d.py
@@ -1,3 +1,4 @@
+import gc
 import logging
 
 import numpy as np
@@ -18,6 +19,19 @@
 logger = logging.getLogger(__name__)
 
 
+def _cleanup():
+    """
+    Utility for informing python+cupy to cleanup memory held by old vars.
+    """
+    gc.collect()
+    try:
+        import cupy
+
+        cupy.get_default_memory_pool().free_all_blocks()
+    except ModuleNotFoundError:
+        pass
+
+
 class FLEBasis2D(SteerableBasis2D, FBBasisMixin):
     """
     Define a derived class for Fast Fourier Bessel 2D expansion using interpolation
@@ -499,12 +513,20 @@ def _evaluate_t(self, imgs):
         imgs = xp.array(imgs)  # Copy here, mutating.
         imgs[:, self.radial_mask] = 0
         z = self._step1_t(imgs)
+        del imgs
+        _cleanup()
+
         b = self._step2_t(z)
+        del z
+        _cleanup()
+
         coefs = self._step3_t(b)
+        del b
+        _cleanup()
 
         # return in FB order
         coefs = coefs[..., self._fle_to_fb_indices]
-        return coefs.astype(self.coefficient_dtype, copy=False)
+        return xp.asnumpy(coefs.astype(self.coefficient_dtype))
 
     def _step1_t(self, im):
         """
@@ -562,7 +584,7 @@ def _step3_t(self, betas):
             coefs[self.idx_list[i]] = self.A3[i] @ betas[:, i, :]
         coefs = coefs.T
 
-        return xp.asnumpy(coefs * self.norm_constants / self.h)
+        return coefs * self.norm_constants / self.h
 
     def _step3(self, coefs):
         """

From e24c1acfa5ce5e58b599c910a469827ee12c1ab4 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 5 Jun 2024 14:54:57 -0400
Subject: [PATCH 11/65] cupy eigvals needs large problem or nans...

---
 tests/test_numeric_sparse.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_numeric_sparse.py b/tests/test_numeric_sparse.py
index 45c28bef31..120a7de49f 100644
--- a/tests/test_numeric_sparse.py
+++ b/tests/test_numeric_sparse.py
@@ -47,7 +47,7 @@ def test_eigsh(backends):
     """
     xp, sparse = backends
 
-    A = xp.eye(123)
+    A = xp.eye(1234)
 
     lamb, _ = sparse.linalg.eigsh(A)
     np.testing.assert_allclose(xp.asnumpy(lamb), 1.0)

From d3f5993bb4bdbd9ed61930d8ad3c4e5e63ad9451 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 5 Jun 2024 14:55:34 -0400
Subject: [PATCH 12/65] crop pad updates

---
 src/aspire/image/image.py      |  6 +++---
 src/aspire/utils/coor_trans.py | 14 +++++++++-----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/aspire/image/image.py b/src/aspire/image/image.py
index 892b6accd0..d04bfe4e25 100644
--- a/src/aspire/image/image.py
+++ b/src/aspire/image/image.py
@@ -357,11 +357,11 @@ def downsample(self, ds_res):
         im = self.stack_reshape(-1)
 
         # compute FT with centered 0-frequency
-        fx = xp.asnumpy(fft.centered_fft2(xp.asarray(im._data)))
+        fx = fft.centered_fft2(xp.asarray(im._data))
         # crop 2D Fourier transform for each image
-        crop_fx = np.array([crop_pad_2d(fx[i], ds_res) for i in range(self.n_images)])
+        crop_fx = crop_pad_2d(fx, ds_res)
         # take back to real space, discard complex part, and scale
-        out = fft.centered_ifft2(xp.asarray(crop_fx)).real * (
+        out = fft.centered_ifft2(crop_fx).real * (
             ds_res**2 / self.resolution**2
         )
         out = xp.asnumpy(out)
diff --git a/src/aspire/utils/coor_trans.py b/src/aspire/utils/coor_trans.py
index e909e2f394..844f218551 100644
--- a/src/aspire/utils/coor_trans.py
+++ b/src/aspire/utils/coor_trans.py
@@ -8,6 +8,7 @@
 from numpy.linalg import norm
 from scipy.linalg import svd
 
+from aspire.numeric import xp
 from aspire.utils.random import Random
 from aspire.utils.rotation import Rotation
 
@@ -368,23 +369,26 @@ def rots_to_clmatrix(rots, n_theta):
 
 def crop_pad_2d(im, size, fill_value=0):
     """
-    :param im: A 2-dimensional numpy array
+    :param im: A >=2-dimensional numpy array
     :param size: Integer size of cropped/padded output
-    :return: A numpy array of shape (size, size)
+    :return: A numpy array of shape (..., size, size)
     """
 
-    im_y, im_x = im.shape
+    im_y, im_x = im.shape[-2:]
     # shift terms
     start_x = math.floor(im_x / 2) - math.floor(size / 2)
     start_y = math.floor(im_y / 2) - math.floor(size / 2)
 
     # cropping
     if size <= min(im_y, im_x):
-        return im[start_y : start_y + size, start_x : start_x + size]
+        return im[..., start_y : start_y + size, start_x : start_x + size]
     # padding
     elif size >= max(im_y, im_x):
+        # Determine shape
+        shape = list(im.shape[:-2])
+        shape.extend([size,size])
         # ensure that we return in the same dtype as the input
-        to_return = fill_value * np.ones((size, size), dtype=im.dtype)
+        to_return = xp.full(shape, fill_value, dtype=im.dtype)
         # when padding, start_x and start_y are negative since size is larger
         # than im_x and im_y; the below line calculates where the original image
         # is placed in relation to the (now-larger) box size

From 1c5c17f96f219f457966f1a555860ef8bf40f26a Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 5 Jun 2024 14:56:56 -0400
Subject: [PATCH 13/65] tox cleanup

[skip ci]
---
 src/aspire/image/image.py      | 4 +---
 src/aspire/utils/coor_trans.py | 2 +-
 tests/test_numeric_sparse.py   | 1 -
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/aspire/image/image.py b/src/aspire/image/image.py
index d04bfe4e25..a0a954f2d9 100644
--- a/src/aspire/image/image.py
+++ b/src/aspire/image/image.py
@@ -361,9 +361,7 @@ def downsample(self, ds_res):
         # crop 2D Fourier transform for each image
         crop_fx = crop_pad_2d(fx, ds_res)
         # take back to real space, discard complex part, and scale
-        out = fft.centered_ifft2(crop_fx).real * (
-            ds_res**2 / self.resolution**2
-        )
+        out = fft.centered_ifft2(crop_fx).real * (ds_res**2 / self.resolution**2)
         out = xp.asnumpy(out)
 
         return self.__class__(out).stack_reshape(original_stack_shape)
diff --git a/src/aspire/utils/coor_trans.py b/src/aspire/utils/coor_trans.py
index 844f218551..dfb1c630f1 100644
--- a/src/aspire/utils/coor_trans.py
+++ b/src/aspire/utils/coor_trans.py
@@ -386,7 +386,7 @@ def crop_pad_2d(im, size, fill_value=0):
     elif size >= max(im_y, im_x):
         # Determine shape
         shape = list(im.shape[:-2])
-        shape.extend([size,size])
+        shape.extend([size, size])
         # ensure that we return in the same dtype as the input
         to_return = xp.full(shape, fill_value, dtype=im.dtype)
         # when padding, start_x and start_y are negative since size is larger
diff --git a/tests/test_numeric_sparse.py b/tests/test_numeric_sparse.py
index 120a7de49f..288e41a176 100644
--- a/tests/test_numeric_sparse.py
+++ b/tests/test_numeric_sparse.py
@@ -51,4 +51,3 @@ def test_eigsh(backends):
 
     lamb, _ = sparse.linalg.eigsh(A)
     np.testing.assert_allclose(xp.asnumpy(lamb), 1.0)
-    print(lamb)

From 02504556ccc212e65e606463e660ce89c632d812 Mon Sep 17 00:00:00 2001
From: "Joshua C. Carmichael" <jc5485@della-gpu.princeton.edu>
Date: Tue, 4 Jun 2024 16:06:28 -0400
Subject: [PATCH 14/65] evaluate_t on gpu.

---
 src/aspire/basis/ffb_2d.py | 37 +++++++++++++++++--------------------
 src/aspire/image/image.py  |  2 +-
 2 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/src/aspire/basis/ffb_2d.py b/src/aspire/basis/ffb_2d.py
index 5a5c7c3f27..e9e6ab90a8 100644
--- a/src/aspire/basis/ffb_2d.py
+++ b/src/aspire/basis/ffb_2d.py
@@ -193,56 +193,53 @@ def _evaluate_t(self, x):
         n_images = x.shape[0]
 
         # resamping x in a polar Fourier gird using nonuniform discrete Fourier transform
-        pf = nufft(x, 2 * pi * freqs)
-        pf = np.reshape(pf, (n_images, n_r, n_theta))
+        pf = nufft(xp.array(x), 2 * pi * freqs)
+        pf = pf.reshape(n_images, n_r, n_theta)
 
         # Recover "negative" frequencies from "positive" half plane.
-        pf = np.concatenate((pf, pf.conjugate()), axis=2)
+        pf = xp.concatenate((pf, pf.conjugate()), axis=2)
 
         # evaluate radial integral using the Gauss-Legendre quadrature rule
-        for i_r in range(0, n_r):
-            pf[:, i_r, :] = pf[:, i_r, :] * (
-                self._precomp["gl_weights"][i_r] * self._precomp["gl_nodes"][i_r]
-            )
+        pf = pf * (xp.array(self._precomp["gl_weights"]) * xp.array(self._precomp["gl_nodes"]))[None, :, None]
 
         #  1D FFT on the angular dimension for each concentric circle
-        pf = 2 * pi / (2 * n_theta) * xp.asnumpy(fft.fft(xp.asarray(pf)))
+        pf = 2 * xp.pi / (2 * n_theta) * fft.fft(pf)
 
         # This only makes it easier to slice the array later.
-        v = np.zeros((n_images, self.count), dtype=x.dtype)
+        v = xp.zeros((n_images, self.count), dtype=x.dtype)
 
         # go through each basis function and find the corresponding coefficient
         ind = 0
-        idx = ind + np.arange(self.k_max[0])
+        idx = ind + xp.arange(self.k_max[0])
 
         # include the normalization factor of angular part into radial part
-        radial_norm = self._precomp["radial"] / np.expand_dims(self.angular_norms, 1)
+        radial_norm = xp.array(self._precomp["radial"] / np.expand_dims(self.angular_norms, 1))
         v[:, self._zero_angular_inds] = pf[:, :, 0].real @ radial_norm[idx].T
-        ind = ind + np.size(idx)
+        ind = ind + idx.size
 
         ind_pos = ind
         for ell in range(1, self.ell_max + 1):
-            idx = ind + np.arange(self.k_max[ell])
-            idx_pos = ind_pos + np.arange(self.k_max[ell])
+            idx = ind + xp.arange(self.k_max[ell])
+            idx_pos = ind_pos + xp.arange(self.k_max[ell])
             idx_neg = idx_pos + self.k_max[ell]
 
             v_ell = pf[:, :, ell] @ radial_norm[idx].T
 
             if np.mod(ell, 2) == 0:
-                v_pos = np.real(v_ell)
-                v_neg = -np.imag(v_ell)
+                v_pos = v_ell.real
+                v_neg = -v_ell.imag
             else:
-                v_pos = np.imag(v_ell)
-                v_neg = np.real(v_ell)
+                v_pos = v_ell.imag
+                v_neg = v_ell.real
 
             v[:, idx_pos] = v_pos
             v[:, idx_neg] = v_neg
 
-            ind = ind + np.size(idx)
+            ind = ind + idx.size
 
             ind_pos = ind_pos + 2 * self.k_max[ell]
 
-        return v
+        return xp.asnumpy(v)
 
     def filter_to_basis_mat(self, f, **kwargs):
         """
diff --git a/src/aspire/image/image.py b/src/aspire/image/image.py
index a0a954f2d9..d0727eb71f 100644
--- a/src/aspire/image/image.py
+++ b/src/aspire/image/image.py
@@ -364,7 +364,7 @@ def downsample(self, ds_res):
         out = fft.centered_ifft2(crop_fx).real * (ds_res**2 / self.resolution**2)
         out = xp.asnumpy(out)
 
-        return self.__class__(out).stack_reshape(original_stack_shape)
+        return self.__class__(np.array(out.get())).stack_reshape(original_stack_shape)
 
     def filter(self, filter):
         """

From 51459ce059c45cd0153e504a6f1857906b8f5b85 Mon Sep 17 00:00:00 2001
From: "Joshua C. Carmichael" <jc5485@della-gpu.princeton.edu>
Date: Wed, 5 Jun 2024 16:06:21 -0400
Subject: [PATCH 15/65] Optimize ffb2d for gpu.

---
 src/aspire/basis/ffb_2d.py | 56 ++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/src/aspire/basis/ffb_2d.py b/src/aspire/basis/ffb_2d.py
index e9e6ab90a8..996e44f95a 100644
--- a/src/aspire/basis/ffb_2d.py
+++ b/src/aspire/basis/ffb_2d.py
@@ -105,6 +105,7 @@ def _evaluate(self, v):
             coordinate basis. This is Image instance with resolution of `self.sz`
             and the first dimension correspond to remaining dimension of `v`.
         """
+        v = xp.array(v)
         sz_roll = v.shape[:-1]
         v = v.reshape(-1, self.count)
 
@@ -112,27 +113,29 @@ def _evaluate(self, v):
         n_data = v.shape[0]
 
         # get information on polar grids from precomputed data
-        n_theta = np.size(self._precomp["freqs"], 2)
-        n_r = np.size(self._precomp["freqs"], 1)
+        n_theta = self._precomp["freqs"].shape[2]
+        n_r = self._precomp["freqs"].shape[1]
 
         # go through  each basis function and find corresponding coefficient
-        pf = np.zeros((n_data, 2 * n_theta, n_r), dtype=complex_type(self.dtype))
+        pf = xp.zeros((n_data, 2 * n_theta, n_r), dtype=complex_type(self.dtype))
 
         ind = 0
 
-        idx = ind + np.arange(self.k_max[0], dtype=int)
+        idx = ind + xp.arange(self.k_max[0], dtype=int)
 
         # include the normalization factor of angular part into radial part
-        radial_norm = self._precomp["radial"] / np.expand_dims(self.angular_norms, 1)
-        pf[:, 0, :] = v[:, self._zero_angular_inds] @ radial_norm[idx]
-        ind = ind + np.size(idx)
+        radial_norm = xp.array(self._precomp["radial"]) / xp.array(
+            np.expand_dims(self.angular_norms, 1)
+        )
+        pf[:, 0, :] = v[:, xp.array(self._zero_angular_inds)] @ radial_norm[idx]
+        ind = ind + idx.size
 
         ind_pos = ind
 
         for ell in range(1, self.ell_max + 1):
-            idx = ind + np.arange(self.k_max[ell], dtype=int)
-            idx_pos = ind_pos + np.arange(self.k_max[ell], dtype=int)
-            idx_neg = idx_pos + self.k_max[ell]
+            idx = ind + xp.arange(self.k_max[ell], dtype=int)
+            idx_pos = ind_pos + xp.arange(self.k_max[ell], dtype=int)
+            idx_neg = idx_pos + xp.array(self.k_max[ell])
 
             v_ell = (v[:, idx_pos] - 1j * v[:, idx_neg]) / 2.0
 
@@ -147,22 +150,19 @@ def _evaluate(self, v):
             else:
                 pf[:, 2 * n_theta - ell, :] = -pf_ell.conjugate()
 
-            ind = ind + np.size(idx)
-            ind_pos = ind_pos + 2 * self.k_max[ell]
+            ind = ind + idx.size
+            ind_pos = ind_pos + 2 * xp.array(self.k_max[ell])
 
         # 1D inverse FFT in the degree of polar angle
-        pf = 2 * pi * xp.asnumpy(fft.ifft(xp.asarray(pf), axis=1))
+        pf = 2 * xp.pi * fft.ifft(xp.asarray(pf), axis=1)
 
         # Only need "positive" frequencies.
-        hsize = int(np.size(pf, 1) / 2)
+        hsize = int(pf.shape[1] / 2)
         pf = pf[:, 0:hsize, :]
-
-        for i_r in range(0, n_r):
-            pf[..., i_r] = pf[..., i_r] * (
-                self._precomp["gl_weights"][i_r] * self._precomp["gl_nodes"][i_r]
-            )
-
-        pf = np.reshape(pf, (n_data, n_r * n_theta))
+        pf *= (
+            xp.array(self._precomp["gl_weights"]) * xp.array(self._precomp["gl_nodes"])
+        )[None, None, :]
+        pf = pf.reshape(n_data, n_r * n_theta)
 
         # perform inverse non-uniformly FFT transform back to 2D coordinate basis
         freqs = m_reshape(self._precomp["freqs"], (2, n_r * n_theta))
@@ -172,7 +172,7 @@ def _evaluate(self, v):
         # Return X as Image instance with the last two dimensions as *self.sz
         x = x.reshape((*sz_roll, *self.sz))
 
-        return x
+        return xp.asnumpy(x)
 
     def _evaluate_t(self, x):
         """
@@ -200,7 +200,13 @@ def _evaluate_t(self, x):
         pf = xp.concatenate((pf, pf.conjugate()), axis=2)
 
         # evaluate radial integral using the Gauss-Legendre quadrature rule
-        pf = pf * (xp.array(self._precomp["gl_weights"]) * xp.array(self._precomp["gl_nodes"]))[None, :, None]
+        pf = (
+            pf
+            * (
+                xp.array(self._precomp["gl_weights"])
+                * xp.array(self._precomp["gl_nodes"])
+            )[None, :, None]
+        )
 
         #  1D FFT on the angular dimension for each concentric circle
         pf = 2 * xp.pi / (2 * n_theta) * fft.fft(pf)
@@ -213,7 +219,9 @@ def _evaluate_t(self, x):
         idx = ind + xp.arange(self.k_max[0])
 
         # include the normalization factor of angular part into radial part
-        radial_norm = xp.array(self._precomp["radial"] / np.expand_dims(self.angular_norms, 1))
+        radial_norm = xp.array(
+            self._precomp["radial"] / np.expand_dims(self.angular_norms, 1)
+        )
         v[:, self._zero_angular_inds] = pf[:, :, 0].real @ radial_norm[idx].T
         ind = ind + idx.size
 

From da85a5da84a87871a3ec3c19e07e821bc216a128 Mon Sep 17 00:00:00 2001
From: Josh Carmichael <carmichael@princeton.edu>
Date: Tue, 11 Jun 2024 11:01:51 -0400
Subject: [PATCH 16/65] downsample return

---
 src/aspire/image/image.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/aspire/image/image.py b/src/aspire/image/image.py
index d0727eb71f..a0a954f2d9 100644
--- a/src/aspire/image/image.py
+++ b/src/aspire/image/image.py
@@ -364,7 +364,7 @@ def downsample(self, ds_res):
         out = fft.centered_ifft2(crop_fx).real * (ds_res**2 / self.resolution**2)
         out = xp.asnumpy(out)
 
-        return self.__class__(np.array(out.get())).stack_reshape(original_stack_shape)
+        return self.__class__(out).stack_reshape(original_stack_shape)
 
     def filter(self, filter):
         """

From d9095611f80d8c0a9db0cf4f11aa7ebb29c2c41c Mon Sep 17 00:00:00 2001
From: Josh Carmichael <carmichael@princeton.edu>
Date: Tue, 11 Jun 2024 11:43:52 -0400
Subject: [PATCH 17/65] remove unnecessary xp.array

---
 src/aspire/basis/ffb_2d.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/aspire/basis/ffb_2d.py b/src/aspire/basis/ffb_2d.py
index 996e44f95a..4971b44f77 100644
--- a/src/aspire/basis/ffb_2d.py
+++ b/src/aspire/basis/ffb_2d.py
@@ -154,7 +154,7 @@ def _evaluate(self, v):
             ind_pos = ind_pos + 2 * xp.array(self.k_max[ell])
 
         # 1D inverse FFT in the degree of polar angle
-        pf = 2 * xp.pi * fft.ifft(xp.asarray(pf), axis=1)
+        pf = 2 * xp.pi * fft.ifft(pf, axis=1)
 
         # Only need "positive" frequencies.
         hsize = int(pf.shape[1] / 2)

From f1c296da759020b7e335e94b739fd5757c23c53b Mon Sep 17 00:00:00 2001
From: Josh Carmichael <carmichael@princeton.edu>
Date: Tue, 11 Jun 2024 15:24:56 -0400
Subject: [PATCH 18/65] convert pf to complex

---
 src/aspire/basis/ffb_2d.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/aspire/basis/ffb_2d.py b/src/aspire/basis/ffb_2d.py
index 4971b44f77..821800ee54 100644
--- a/src/aspire/basis/ffb_2d.py
+++ b/src/aspire/basis/ffb_2d.py
@@ -167,7 +167,9 @@ def _evaluate(self, v):
         # perform inverse non-uniformly FFT transform back to 2D coordinate basis
         freqs = m_reshape(self._precomp["freqs"], (2, n_r * n_theta))
 
-        x = 2 * anufft(pf, 2 * pi * freqs, self.sz, real=True)
+        x = 2 * anufft(
+            pf.astype(complex_type(self.dtype)), 2 * pi * freqs, self.sz, real=True
+        )
 
         # Return X as Image instance with the last two dimensions as *self.sz
         x = x.reshape((*sz_roll, *self.sz))

From 5d02abf7994daa552f69313407fb185dfa663f7d Mon Sep 17 00:00:00 2001
From: Josh Carmichael <carmichael@princeton.edu>
Date: Thu, 13 Jun 2024 08:59:56 -0400
Subject: [PATCH 19/65] precompute radial_norm and gl_weighted_nodes in build.

---
 src/aspire/basis/ffb_2d.py | 38 +++++++++++++++++---------------------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/src/aspire/basis/ffb_2d.py b/src/aspire/basis/ffb_2d.py
index 821800ee54..891ab2600d 100644
--- a/src/aspire/basis/ffb_2d.py
+++ b/src/aspire/basis/ffb_2d.py
@@ -58,6 +58,16 @@ def _build(self):
         # precompute the basis functions in 2D grids
         self._precomp = self._precomp()
 
+        # include the normalization factor of angular part into radial part
+        self.radial_norm = xp.array(self._precomp["radial"]) / xp.array(
+            np.expand_dims(self.angular_norms, 1)
+        )
+
+        # precompute weighted nodes
+        self.gl_weighted_nodes = xp.array(self._precomp["gl_weights"]) * xp.array(
+            self._precomp["gl_nodes"]
+        )
+
     def _precomp(self):
         """
         Precomute the basis functions on a polar Fourier grid
@@ -123,11 +133,7 @@ def _evaluate(self, v):
 
         idx = ind + xp.arange(self.k_max[0], dtype=int)
 
-        # include the normalization factor of angular part into radial part
-        radial_norm = xp.array(self._precomp["radial"]) / xp.array(
-            np.expand_dims(self.angular_norms, 1)
-        )
-        pf[:, 0, :] = v[:, xp.array(self._zero_angular_inds)] @ radial_norm[idx]
+        pf[:, 0, :] = v[:, xp.array(self._zero_angular_inds)] @ self.radial_norm[idx]
         ind = ind + idx.size
 
         ind_pos = ind
@@ -142,7 +148,7 @@ def _evaluate(self, v):
             if np.mod(ell, 2) == 1:
                 v_ell = 1j * v_ell
 
-            pf_ell = v_ell @ radial_norm[idx]
+            pf_ell = v_ell @ self.radial_norm[idx]
             pf[:, ell, :] = pf_ell
 
             if np.mod(ell, 2) == 0:
@@ -159,9 +165,7 @@ def _evaluate(self, v):
         # Only need "positive" frequencies.
         hsize = int(pf.shape[1] / 2)
         pf = pf[:, 0:hsize, :]
-        pf *= (
-            xp.array(self._precomp["gl_weights"]) * xp.array(self._precomp["gl_nodes"])
-        )[None, None, :]
+        pf *= self.gl_weighted_nodes[None, None, :]
         pf = pf.reshape(n_data, n_r * n_theta)
 
         # perform inverse non-uniformly FFT transform back to 2D coordinate basis
@@ -202,13 +206,7 @@ def _evaluate_t(self, x):
         pf = xp.concatenate((pf, pf.conjugate()), axis=2)
 
         # evaluate radial integral using the Gauss-Legendre quadrature rule
-        pf = (
-            pf
-            * (
-                xp.array(self._precomp["gl_weights"])
-                * xp.array(self._precomp["gl_nodes"])
-            )[None, :, None]
-        )
+        pf *= self.gl_weighted_nodes[None, :, None]
 
         #  1D FFT on the angular dimension for each concentric circle
         pf = 2 * xp.pi / (2 * n_theta) * fft.fft(pf)
@@ -221,10 +219,8 @@ def _evaluate_t(self, x):
         idx = ind + xp.arange(self.k_max[0])
 
         # include the normalization factor of angular part into radial part
-        radial_norm = xp.array(
-            self._precomp["radial"] / np.expand_dims(self.angular_norms, 1)
-        )
-        v[:, self._zero_angular_inds] = pf[:, :, 0].real @ radial_norm[idx].T
+
+        v[:, self._zero_angular_inds] = pf[:, :, 0].real @ self.radial_norm[idx].T
         ind = ind + idx.size
 
         ind_pos = ind
@@ -233,7 +229,7 @@ def _evaluate_t(self, x):
             idx_pos = ind_pos + xp.arange(self.k_max[ell])
             idx_neg = idx_pos + self.k_max[ell]
 
-            v_ell = pf[:, :, ell] @ radial_norm[idx].T
+            v_ell = pf[:, :, ell] @ self.radial_norm[idx].T
 
             if np.mod(ell, 2) == 0:
                 v_pos = v_ell.real

From 6a832a746a844b499a7722ca6030b7a87234cd1d Mon Sep 17 00:00:00 2001
From: Josh Carmichael <carmichael@princeton.edu>
Date: Thu, 13 Jun 2024 09:07:46 -0400
Subject: [PATCH 20/65] remove comment

---
 src/aspire/basis/ffb_2d.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/aspire/basis/ffb_2d.py b/src/aspire/basis/ffb_2d.py
index 891ab2600d..1c7ed8cad1 100644
--- a/src/aspire/basis/ffb_2d.py
+++ b/src/aspire/basis/ffb_2d.py
@@ -218,8 +218,6 @@ def _evaluate_t(self, x):
         ind = 0
         idx = ind + xp.arange(self.k_max[0])
 
-        # include the normalization factor of angular part into radial part
-
         v[:, self._zero_angular_inds] = pf[:, :, 0].real @ self.radial_norm[idx].T
         ind = ind + idx.size
 

From 1c27b06a8dae11aae8a22d107c9b6deab6d61b54 Mon Sep 17 00:00:00 2001
From: Josh Carmichael <carmichael@princeton.edu>
Date: Thu, 13 Jun 2024 10:42:35 -0400
Subject: [PATCH 21/65] use asarray

---
 src/aspire/basis/ffb_2d.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/aspire/basis/ffb_2d.py b/src/aspire/basis/ffb_2d.py
index 1c7ed8cad1..f44e959149 100644
--- a/src/aspire/basis/ffb_2d.py
+++ b/src/aspire/basis/ffb_2d.py
@@ -59,12 +59,12 @@ def _build(self):
         self._precomp = self._precomp()
 
         # include the normalization factor of angular part into radial part
-        self.radial_norm = xp.array(self._precomp["radial"]) / xp.array(
+        self.radial_norm = xp.asarray(self._precomp["radial"]) / xp.asarray(
             np.expand_dims(self.angular_norms, 1)
         )
 
         # precompute weighted nodes
-        self.gl_weighted_nodes = xp.array(self._precomp["gl_weights"]) * xp.array(
+        self.gl_weighted_nodes = xp.asarray(self._precomp["gl_weights"]) * xp.asarray(
             self._precomp["gl_nodes"]
         )
 
@@ -115,7 +115,7 @@ def _evaluate(self, v):
             coordinate basis. This is Image instance with resolution of `self.sz`
             and the first dimension correspond to remaining dimension of `v`.
         """
-        v = xp.array(v)
+        v = xp.asarray(v)
         sz_roll = v.shape[:-1]
         v = v.reshape(-1, self.count)
 
@@ -133,7 +133,7 @@ def _evaluate(self, v):
 
         idx = ind + xp.arange(self.k_max[0], dtype=int)
 
-        pf[:, 0, :] = v[:, xp.array(self._zero_angular_inds)] @ self.radial_norm[idx]
+        pf[:, 0, :] = v[:, xp.asarray(self._zero_angular_inds)] @ self.radial_norm[idx]
         ind = ind + idx.size
 
         ind_pos = ind
@@ -141,7 +141,7 @@ def _evaluate(self, v):
         for ell in range(1, self.ell_max + 1):
             idx = ind + xp.arange(self.k_max[ell], dtype=int)
             idx_pos = ind_pos + xp.arange(self.k_max[ell], dtype=int)
-            idx_neg = idx_pos + xp.array(self.k_max[ell])
+            idx_neg = idx_pos + xp.asarray(self.k_max[ell])
 
             v_ell = (v[:, idx_pos] - 1j * v[:, idx_neg]) / 2.0
 
@@ -157,7 +157,7 @@ def _evaluate(self, v):
                 pf[:, 2 * n_theta - ell, :] = -pf_ell.conjugate()
 
             ind = ind + idx.size
-            ind_pos = ind_pos + 2 * xp.array(self.k_max[ell])
+            ind_pos = ind_pos + 2 * xp.asarray(self.k_max[ell])
 
         # 1D inverse FFT in the degree of polar angle
         pf = 2 * xp.pi * fft.ifft(pf, axis=1)
@@ -199,7 +199,7 @@ def _evaluate_t(self, x):
         n_images = x.shape[0]
 
         # resamping x in a polar Fourier gird using nonuniform discrete Fourier transform
-        pf = nufft(xp.array(x), 2 * pi * freqs)
+        pf = nufft(xp.asarray(x), 2 * pi * freqs)
         pf = pf.reshape(n_images, n_r, n_theta)
 
         # Recover "negative" frequencies from "positive" half plane.

From 69701a6e19372ea9f660838a433c97e55d77943f Mon Sep 17 00:00:00 2001
From: Josh Carmichael <carmichael@princeton.edu>
Date: Thu, 13 Jun 2024 13:36:16 -0400
Subject: [PATCH 22/65] Remove cupy.fill culprit. un-cupy indices.

---
 src/aspire/basis/ffb_2d.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/aspire/basis/ffb_2d.py b/src/aspire/basis/ffb_2d.py
index f44e959149..116e009757 100644
--- a/src/aspire/basis/ffb_2d.py
+++ b/src/aspire/basis/ffb_2d.py
@@ -141,7 +141,7 @@ def _evaluate(self, v):
         for ell in range(1, self.ell_max + 1):
             idx = ind + xp.arange(self.k_max[ell], dtype=int)
             idx_pos = ind_pos + xp.arange(self.k_max[ell], dtype=int)
-            idx_neg = idx_pos + xp.asarray(self.k_max[ell])
+            idx_neg = idx_pos + self.k_max[ell]
 
             v_ell = (v[:, idx_pos] - 1j * v[:, idx_neg]) / 2.0
 
@@ -157,7 +157,7 @@ def _evaluate(self, v):
                 pf[:, 2 * n_theta - ell, :] = -pf_ell.conjugate()
 
             ind = ind + idx.size
-            ind_pos = ind_pos + 2 * xp.asarray(self.k_max[ell])
+            ind_pos = ind_pos + 2 * self.k_max[ell]
 
         # 1D inverse FFT in the degree of polar angle
         pf = 2 * xp.pi * fft.ifft(pf, axis=1)

From 0302ab68e57c923cb5b43f7a437c034b9261d1fe Mon Sep 17 00:00:00 2001
From: Josh Carmichael <carmichael@princeton.edu>
Date: Thu, 13 Jun 2024 14:10:48 -0400
Subject: [PATCH 23/65] cupy.fill culprit in fle_2d. sparse indices.

---
 src/aspire/basis/fle_2d_utils.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/aspire/basis/fle_2d_utils.py b/src/aspire/basis/fle_2d_utils.py
index 23d1441a68..33f237165e 100644
--- a/src/aspire/basis/fle_2d_utils.py
+++ b/src/aspire/basis/fle_2d_utils.py
@@ -44,9 +44,9 @@ def precomp_transform_complex_to_real(ells):
     """
     count = len(ells)
     num_nonzero = np.sum(ells == 0) + 2 * np.sum(ells != 0)
-    idx = xp.zeros(num_nonzero, dtype=int)
-    jdx = xp.zeros(num_nonzero, dtype=int)
-    vals = xp.zeros(num_nonzero, dtype=np.complex128)
+    idx = np.zeros(num_nonzero, dtype=int)
+    jdx = np.zeros(num_nonzero, dtype=int)
+    vals = np.zeros(num_nonzero, dtype=np.complex128)
 
     k = 0
     for i in range(count):
@@ -86,7 +86,11 @@ def precomp_transform_complex_to_real(ells):
             jdx[k] = i + 1
             k = k + 1
 
-    A = sparse.csr_matrix((vals, (idx, jdx)), shape=(count, count), dtype=np.complex128)
+    A = sparse.csr_matrix(
+        (xp.asarray(vals), (xp.asarray(idx), xp.asarray(jdx))),
+        shape=(count, count),
+        dtype=np.complex128,
+    )
 
     return A.conjugate()
 

From 4794493737107d4de22fad6cfbd1d729280ee21a Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 13 Jun 2024 10:58:04 -0400
Subject: [PATCH 24/65] bare min vol hack

---
 src/aspire/volume/volume.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/aspire/volume/volume.py b/src/aspire/volume/volume.py
index b6c100db36..7883f59190 100644
--- a/src/aspire/volume/volume.py
+++ b/src/aspire/volume/volume.py
@@ -475,15 +475,16 @@ def downsample(self, ds_res, mask=None):
         v = self.stack_reshape(-1)
 
         # take 3D Fourier transform of each volume in the stack
-        fx = fft.fftshift(fft.fftn(v._data, axes=(1, 2, 3)))
+        fx = xp.asnumpy(fft.fftshift(fft.fftn(xp.asarray(v._data), axes=(1, 2, 3))))
         # crop each volume to the desired resolution in frequency space
         crop_fx = (
             np.array([crop_pad_3d(fx[i, :, :, :], ds_res) for i in range(self.n_vols)])
             * mask
         )
         # inverse Fourier transform of each volume
-        out = fft.ifftn(fft.ifftshift(crop_fx), axes=(1, 2, 3)) * (
-            ds_res**3 / self.resolution**3
+        out = xp.asnumpy(
+            fft.ifftn(fft.ifftshift(xp.asarray(crop_fx)), axes=(1, 2, 3))
+            * (ds_res**3 / self.resolution**3)
         )
         # returns a new Volume object
         return self.__class__(

From e6ee1bd569d5e703ed16196fe5ad670e5837cae6 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 13 Jun 2024 11:33:04 -0400
Subject: [PATCH 25/65] bare min ffb3d hacks

[skip ci]
---
 src/aspire/basis/ffb_3d.py | 52 ++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/src/aspire/basis/ffb_3d.py b/src/aspire/basis/ffb_3d.py
index 6362a9a703..1ac5fd62ff 100644
--- a/src/aspire/basis/ffb_3d.py
+++ b/src/aspire/basis/ffb_3d.py
@@ -6,6 +6,7 @@
 from aspire.basis import FBBasis3D
 from aspire.basis.basis_utils import lgwt, norm_assoc_legendre, sph_bessel
 from aspire.nufft import anufft, nufft
+from aspire.numeric import xp
 from aspire.utils.matlab_compat import m_flatten, m_reshape
 
 logger = logging.getLogger(__name__)
@@ -146,10 +147,10 @@ def _precomp(self):
         )
 
         return {
-            "radial_wtd": radial_wtd,
-            "ang_phi_wtd_even": ang_phi_wtd_even,
-            "ang_phi_wtd_odd": ang_phi_wtd_odd,
-            "ang_theta_wtd": ang_theta_wtd,
+            "radial_wtd": xp.asarray(radial_wtd),
+            "ang_phi_wtd_even": [xp.asarray(x) for x in ang_phi_wtd_even],
+            "ang_phi_wtd_odd": [xp.asarray(x) for x in ang_phi_wtd_odd],
+            "ang_theta_wtd": xp.asarray(ang_theta_wtd),
             "fourier_pts": fourier_pts,
         }
 
@@ -163,6 +164,7 @@ def _evaluate(self, v):
             coordinate basis. This is an array whose last three dimensions equal
             `self.sz` and the remaining dimensions correspond to `v`.
         """
+        v = xp.asarray(v)
         # roll dimensions of v
         sz_roll = v.shape[:-1]
         v = v.reshape((-1, self.count))
@@ -175,7 +177,7 @@ def _evaluate(self, v):
         # number of 3D image samples
         n_data = v.shape[0]
 
-        u_even = np.zeros(
+        u_even = xp.zeros(
             (
                 n_r,
                 int(2 * self.ell_max + 1),
@@ -184,7 +186,7 @@ def _evaluate(self, v):
             ),
             dtype=v.dtype,
         )
-        u_odd = np.zeros(
+        u_odd = xp.zeros(
             (n_r, int(2 * self.ell_max + 1), n_data, int(np.ceil(self.ell_max / 2))),
             dtype=v.dtype,
         )
@@ -216,10 +218,10 @@ def _evaluate(self, v):
                     int((ell - 1) / 2),
                 ] = v_ell
 
-        u_even = np.transpose(u_even, (3, 0, 1, 2))
-        u_odd = np.transpose(u_odd, (3, 0, 1, 2))
-        w_even = np.zeros((n_phi, n_r, n_data, 2 * self.ell_max + 1), dtype=v.dtype)
-        w_odd = np.zeros((n_phi, n_r, n_data, 2 * self.ell_max + 1), dtype=v.dtype)
+        u_even = xp.transpose(u_even, (3, 0, 1, 2))
+        u_odd = xp.transpose(u_odd, (3, 0, 1, 2))
+        w_even = xp.zeros((n_phi, n_r, n_data, 2 * self.ell_max + 1), dtype=v.dtype)
+        w_odd = xp.zeros((n_phi, n_r, n_data, 2 * self.ell_max + 1), dtype=v.dtype)
 
         # evaluate the phi parts
         for m in range(0, self.ell_max + 1):
@@ -252,8 +254,8 @@ def _evaluate(self, v):
                 w_even[:, :, :, self.ell_max + sgn * m] = w_m_even
                 w_odd[:, :, :, self.ell_max + sgn * m] = w_m_odd
 
-        w_even = np.transpose(w_even, (3, 0, 1, 2))
-        w_odd = np.transpose(w_odd, (3, 0, 1, 2))
+        w_even = xp.transpose(w_even, (3, 0, 1, 2))
+        w_odd = xp.transpose(w_odd, (3, 0, 1, 2))
         u_even = w_even
         u_odd = w_odd
 
@@ -266,7 +268,7 @@ def _evaluate(self, v):
 
         pf = w_even + 1j * w_odd
         pf = m_reshape(pf, (n_theta * n_phi * n_r, n_data))
-        pf = np.moveaxis(pf, 0, -1)
+        pf = xp.moveaxis(pf, 0, -1)
 
         # perform inverse non-uniformly FFT transformation back to 3D rectangular coordinates
         freqs = m_reshape(self._precomp["fourier_pts"], (3, n_r * n_theta * n_phi))
@@ -275,7 +277,7 @@ def _evaluate(self, v):
         # Roll, return the x with the last three dimensions as self.sz
         # Higher dimensions should be like v.
         x = x.reshape((*sz_roll, *self.sz))
-        return x
+        return xp.asnumpy(x)
 
     def _evaluate_t(self, x):
         """
@@ -288,6 +290,7 @@ def _evaluate_t(self, x):
             `self.count` and whose remaining dimensions correspond to higher
             dimensions of `x`.
         """
+        x = xp.asarray(x)
         # roll dimensions
         sz_roll = x.shape[:-3]
         x = x.reshape((-1, *self.sz))
@@ -303,20 +306,21 @@ def _evaluate_t(self, x):
         pf = m_reshape(pf.T, (n_theta, n_phi * n_r * n_data))
 
         # evaluate the theta parts
-        u_even = self._precomp["ang_theta_wtd"].T @ np.real(pf)
-        u_odd = self._precomp["ang_theta_wtd"].T @ np.imag(pf)
+        tmp = self._precomp["ang_theta_wtd"].T
+        u_even = tmp @ xp.real(pf)
+        u_odd = tmp @ xp.imag(pf)
 
         u_even = m_reshape(u_even, (2 * self.ell_max + 1, n_phi, n_r, n_data))
         u_odd = m_reshape(u_odd, (2 * self.ell_max + 1, n_phi, n_r, n_data))
 
-        u_even = np.transpose(u_even, (1, 2, 3, 0))
-        u_odd = np.transpose(u_odd, (1, 2, 3, 0))
+        u_even = xp.transpose(u_even, (1, 2, 3, 0))
+        u_odd = xp.transpose(u_odd, (1, 2, 3, 0))
 
-        w_even = np.zeros(
+        w_even = xp.zeros(
             (int(np.floor(self.ell_max / 2) + 1), n_r, 2 * self.ell_max + 1, n_data),
             dtype=x.dtype,
         )
-        w_odd = np.zeros(
+        w_odd = xp.zeros(
             (int(np.ceil(self.ell_max / 2)), n_r, 2 * self.ell_max + 1, n_data),
             dtype=x.dtype,
         )
@@ -351,11 +355,11 @@ def _evaluate_t(self, x):
                 end = np.size(w_odd, 0)
                 w_odd[end - n_odd_ell : end, :, self.ell_max + sgn * m, :] = w_m_odd
 
-        w_even = np.transpose(w_even, (1, 2, 3, 0))
-        w_odd = np.transpose(w_odd, (1, 2, 3, 0))
+        w_even = xp.transpose(w_even, (1, 2, 3, 0))
+        w_odd = xp.transpose(w_odd, (1, 2, 3, 0))
 
         # evaluate the radial parts
-        v = np.zeros((n_data, self.count), dtype=x.dtype)
+        v = xp.zeros((n_data, self.count), dtype=x.dtype)
         for ell in range(0, self.ell_max + 1):
             k_max_ell = self.k_max[ell]
             radial_wtd = self._precomp["radial_wtd"][:, 0:k_max_ell, ell]
@@ -388,4 +392,4 @@ def _evaluate_t(self, x):
         # Roll dimensions, last dimension should be self.count,
         # Higher dimensions like x.
         v = v.reshape((*sz_roll, self.count))
-        return v
+        return xp.asnumpy(v)

From c852070792181fa56b28674702a00e29c2af51cf Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 13 Jun 2024 14:58:00 -0400
Subject: [PATCH 26/65] better style

---
 src/aspire/basis/ffb_3d.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/aspire/basis/ffb_3d.py b/src/aspire/basis/ffb_3d.py
index 1ac5fd62ff..5740900a34 100644
--- a/src/aspire/basis/ffb_3d.py
+++ b/src/aspire/basis/ffb_3d.py
@@ -218,8 +218,8 @@ def _evaluate(self, v):
                     int((ell - 1) / 2),
                 ] = v_ell
 
-        u_even = xp.transpose(u_even, (3, 0, 1, 2))
-        u_odd = xp.transpose(u_odd, (3, 0, 1, 2))
+        u_even = u_even.transpose((3, 0, 1, 2))
+        u_odd = u_odd.transpose((3, 0, 1, 2))
         w_even = xp.zeros((n_phi, n_r, n_data, 2 * self.ell_max + 1), dtype=v.dtype)
         w_odd = xp.zeros((n_phi, n_r, n_data, 2 * self.ell_max + 1), dtype=v.dtype)
 
@@ -254,8 +254,8 @@ def _evaluate(self, v):
                 w_even[:, :, :, self.ell_max + sgn * m] = w_m_even
                 w_odd[:, :, :, self.ell_max + sgn * m] = w_m_odd
 
-        w_even = xp.transpose(w_even, (3, 0, 1, 2))
-        w_odd = xp.transpose(w_odd, (3, 0, 1, 2))
+        w_even = w_even.transpose((3, 0, 1, 2))
+        w_odd = w_odd.transpose((3, 0, 1, 2))
         u_even = w_even
         u_odd = w_odd
 
@@ -307,14 +307,14 @@ def _evaluate_t(self, x):
 
         # evaluate the theta parts
         tmp = self._precomp["ang_theta_wtd"].T
-        u_even = tmp @ xp.real(pf)
-        u_odd = tmp @ xp.imag(pf)
+        u_even = tmp @ pf.real
+        u_odd = tmp @ pf.imag
 
         u_even = m_reshape(u_even, (2 * self.ell_max + 1, n_phi, n_r, n_data))
         u_odd = m_reshape(u_odd, (2 * self.ell_max + 1, n_phi, n_r, n_data))
 
-        u_even = xp.transpose(u_even, (1, 2, 3, 0))
-        u_odd = xp.transpose(u_odd, (1, 2, 3, 0))
+        u_even = u_even.transpose((1, 2, 3, 0))
+        u_odd = u_odd.transpose((1, 2, 3, 0))
 
         w_even = xp.zeros(
             (int(np.floor(self.ell_max / 2) + 1), n_r, 2 * self.ell_max + 1, n_data),
@@ -355,8 +355,8 @@ def _evaluate_t(self, x):
                 end = np.size(w_odd, 0)
                 w_odd[end - n_odd_ell : end, :, self.ell_max + sgn * m, :] = w_m_odd
 
-        w_even = xp.transpose(w_even, (1, 2, 3, 0))
-        w_odd = xp.transpose(w_odd, (1, 2, 3, 0))
+        w_even = w_even.transpose((1, 2, 3, 0))
+        w_odd = w_odd.transpose((1, 2, 3, 0))
 
         # evaluate the radial parts
         v = xp.zeros((n_data, self.count), dtype=x.dtype)

From 75acfb19cea42396615e99ab3e9a162ac3f8f21e Mon Sep 17 00:00:00 2001
From: Josh Carmichael <carmichael@princeton.edu>
Date: Thu, 13 Jun 2024 15:11:04 -0400
Subject: [PATCH 27/65] last cupy fill

---
 src/aspire/basis/fle_2d.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/aspire/basis/fle_2d.py b/src/aspire/basis/fle_2d.py
index 4278331e1a..df1d66c608 100644
--- a/src/aspire/basis/fle_2d.py
+++ b/src/aspire/basis/fle_2d.py
@@ -452,7 +452,7 @@ def _create_basis_functions(self):
         """
         Generate the actual basis functions as Python lambda operators
         """
-        norm_constants = xp.zeros(self.count)
+        norm_constants = np.zeros(self.count)
         basis_functions = [None] * self.count
         for i in range(self.count):
             # parameters defining the basis function: bessel order and which bessel root
@@ -481,7 +481,7 @@ def _create_basis_functions(self):
 
             norm_constants[i] = c
 
-        self.norm_constants = norm_constants
+        self.norm_constants = xp.asarray(norm_constants)
         self.basis_functions = basis_functions
 
     def _evaluate(self, coefs):

From ab8ca85713186e9ef370a1309ab434112c0eb527 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 18 Jun 2024 09:39:42 -0400
Subject: [PATCH 28/65] revert config to numpy/scipy

---
 src/aspire/config_default.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/aspire/config_default.yaml b/src/aspire/config_default.yaml
index fed4cea50a..def78983c0 100644
--- a/src/aspire/config_default.yaml
+++ b/src/aspire/config_default.yaml
@@ -1,9 +1,9 @@
 version: 0.12.3
 common:
     # numeric module to use - one of numpy/cupy
-    numeric: cupy
+    numeric: numpy
     # fft backend to use - one of pyfftw/scipy/cupy/mkl
-    fft: cupy
+    fft: scipy
 
     # Set cache directory for ASPIRE example data.
     # By default the cache location will be set by pooch.os_cache(),

From 54289abc29d8f7cf65cc2d62e20382ba9211b9ed Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 18 Jun 2024 16:13:28 -0400
Subject: [PATCH 29/65] fft host array preservation

---
 src/aspire/numeric/cupy_fft.py | 45 +++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 4 deletions(-)

diff --git a/src/aspire/numeric/cupy_fft.py b/src/aspire/numeric/cupy_fft.py
index 29939e504c..7d73367bc7 100644
--- a/src/aspire/numeric/cupy_fft.py
+++ b/src/aspire/numeric/cupy_fft.py
@@ -1,9 +1,36 @@
+import functools
+
 import cupy as cp
 import cupyx.scipy.fft as cufft
 
 from aspire.numeric.base_fft import FFT
 
 
+def _preserve_host(func):
+    """
+    Method decorator that returns a numpy/cupy array result when passed a numpy/cupy array input.
+
+    This improves the flexibility of our FFT wrappers by allowing for incremental code changes.
+    """
+
+    @functools.wraps(func)  # Pass metadata (eg name and doctrings) from `func`
+    def wrapper(self, x, *args, **kwargs):
+
+        _host = False
+        if not isinstance(x, cp.ndarray):
+            _host = True
+            x = cp.asarray(x)
+
+        res = func(self, x, *args, **kwargs)
+
+        if _host:
+            res = res.get()
+
+        return res
+
+    return wrapper
+
+
 class CupyFFT(FFT):
     """
     Define a unified wrapper class for Cupy FFT functions
@@ -11,32 +38,42 @@ class CupyFFT(FFT):
     To be consistent with Scipy and Pyfftw, not all arguments are included.
     """
 
+    @_preserve_host
     def fft(self, x, axis=-1, workers=-1):
         return cp.fft.fft(x, axis=axis)
 
+    @_preserve_host
     def ifft(self, x, axis=-1, workers=-1):
         return cp.fft.ifft(x, axis=axis)
 
+    @_preserve_host
     def fft2(self, x, axes=(-2, -1), workers=-1):
         return cp.fft.fft2(x, axes=axes)
 
+    @_preserve_host
     def ifft2(self, x, axes=(-2, -1), workers=-1):
         return cp.fft.ifft2(x, axes=axes)
 
+    @_preserve_host
     def fftn(self, x, axes=None, workers=-1):
         return cp.fft.fftn(x, axes=axes)
 
+    @_preserve_host
     def ifftn(self, x, axes=None, workers=-1):
         return cp.fft.ifftn(x, axes=axes)
 
+    @_preserve_host
     def fftshift(self, x, axes=None):
         return cp.fft.fftshift(x, axes=axes)
 
+    @_preserve_host
     def ifftshift(self, x, axes=None):
         return cp.fft.ifftshift(x, axes=axes)
 
-    def dct(self, *args, **kwargs):
-        return cufft.dct(*args, **kwargs)
+    @_preserve_host
+    def dct(self, x, **kwargs):
+        return cufft.dct(x, **kwargs)
 
-    def idct(self, *args, **kwargs):
-        return cufft.idct(*args, **kwargs)
+    @_preserve_host
+    def idct(self, x, **kwargs):
+        return cufft.idct(x, **kwargs)

From fc15024dcdc8441026eb692d7aef7a5aec16b01a Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 20 Jun 2024 09:29:34 -0400
Subject: [PATCH 30/65] interop crop_pad_2d

---
 src/aspire/utils/coor_trans.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/aspire/utils/coor_trans.py b/src/aspire/utils/coor_trans.py
index dfb1c630f1..771c5bd5af 100644
--- a/src/aspire/utils/coor_trans.py
+++ b/src/aspire/utils/coor_trans.py
@@ -369,6 +369,11 @@ def rots_to_clmatrix(rots, n_theta):
 
 def crop_pad_2d(im, size, fill_value=0):
     """
+    Crop/pads `im` according to `size`.
+
+    Padding will use `fill_value`.
+    Return's host/gpu array based on `im`.
+
     :param im: A >=2-dimensional numpy array
     :param size: Integer size of cropped/padded output
     :return: A numpy array of shape (..., size, size)
@@ -387,8 +392,16 @@ def crop_pad_2d(im, size, fill_value=0):
         # Determine shape
         shape = list(im.shape[:-2])
         shape.extend([size, size])
-        # ensure that we return in the same dtype as the input
-        to_return = xp.full(shape, fill_value, dtype=im.dtype)
+
+        # Ensure that we return in the same dtype as the input
+        _full = np.full  # Default to numpy array
+        if isinstance(im, xp.ndarray):
+            # Use cupy when `im` _and_ xp are cupy ndarray
+            # Avoids having to handle when cupy is not installed
+            _full = xp.full
+
+        to_return = _full(shape, fill_value, dtype=im.dtype)
+        
         # when padding, start_x and start_y are negative since size is larger
         # than im_x and im_y; the below line calculates where the original image
         # is placed in relation to the (now-larger) box size

From 0bb40179ce4c1d7e3a1468812f943a76bebd1e14 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 20 Jun 2024 10:11:49 -0400
Subject: [PATCH 31/65] interop fle radial convolve

---
 src/aspire/basis/fle_2d.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/aspire/basis/fle_2d.py b/src/aspire/basis/fle_2d.py
index df1d66c608..332b84f64d 100644
--- a/src/aspire/basis/fle_2d.py
+++ b/src/aspire/basis/fle_2d.py
@@ -721,10 +721,12 @@ def radial_convolve(self, coefs, radial_img):
                 "`radial_convolve` currently only implemented for 1D stacks."
             )
 
-        coefs = coefs.asnumpy()
+        # Potentially migrate to GPU
+        coefs = xp.asarray(coefs.asnumpy())
+        radial_img = xp.asarray(radial_img)
 
         num_img = coefs.shape[0]
-        coefs_conv = np.zeros(coefs.shape)
+        coefs_conv = xp.zeros(coefs.shape)
 
         # Convert to internal FLE indices ordering
         coefs = coefs[..., self._fb_to_fle_indices]
@@ -736,25 +738,26 @@ def radial_convolve(self, coefs, radial_img):
             weights = self._radial_convolve_weights(b)
             b = weights / (self.h**2)
             b = b.reshape(self.count)
-            coefs_conv[k, :] = np.real(self.c2r @ (b * (self.r2c @ _coefs).flatten()))
+            coefs_conv[k, :] = (self.c2r @ (b * (self.r2c @ _coefs).flatten())).real
 
         # Convert from internal FLE ordering to FB convention
         coefs_conv = coefs_conv[..., self._fle_to_fb_indices]
 
-        return Coef(self, coefs_conv)
+        # Return as Coef on host
+        return Coef(self, xp.asnumpy(coefs_conv))
 
     def _radial_convolve_weights(self, b):
         """
         Helper function for step 3 of convolving with a radial function.
         """
-        b = np.squeeze(b)
-        b = np.array(b)
+        b = xp.squeeze(b)
+        b = xp.array(b)  # implies copy
         if self.num_interp > self.num_radial_nodes:
             b = fft.dct(b, axis=0, type=2) / (2 * self.num_radial_nodes)
-            bz = np.zeros(b.shape)
-            b = np.concatenate((b, bz), axis=0)
+            bz = xp.zeros(b.shape)
+            b = xp.concatenate((b, bz), axis=0)
             b = fft.idct(b, axis=0, type=2) * 2 * b.shape[0]
-        a = np.zeros(self.count, dtype=np.float64)
+        a = xp.zeros(self.count, dtype=np.float64)
         y = [None] * (self.ell_p_max + 1)
         for i in range(self.ell_p_max + 1):
             y[i] = (self.A3[i] @ b[:, 0]).flatten()

From 48b93e72df07c490f5a1fa6e6b9af8a04b4a94a2 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 20 Jun 2024 14:05:33 -0400
Subject: [PATCH 32/65] cleanup

---
 src/aspire/utils/coor_trans.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/aspire/utils/coor_trans.py b/src/aspire/utils/coor_trans.py
index 771c5bd5af..ef7857c994 100644
--- a/src/aspire/utils/coor_trans.py
+++ b/src/aspire/utils/coor_trans.py
@@ -401,7 +401,7 @@ def crop_pad_2d(im, size, fill_value=0):
             _full = xp.full
 
         to_return = _full(shape, fill_value, dtype=im.dtype)
-        
+
         # when padding, start_x and start_y are negative since size is larger
         # than im_x and im_y; the below line calculates where the original image
         # is placed in relation to the (now-larger) box size

From c6a5c1616d803b5f8424c4ca8fd6dbb027a3c271 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 20 Jun 2024 14:06:43 -0400
Subject: [PATCH 33/65] remove bbenchmark code, hackathon over

---
 bbenchmark/bbenchmark.py      |  58 ------------------------------
 bbenchmark/benchmark_gpu0.pkl | Bin 307 -> 0 bytes
 bbenchmark/benchmark_host.pkl | Bin 307 -> 0 bytes
 bbenchmark/plot_bb.py         |  64 ----------------------------------
 4 files changed, 122 deletions(-)
 delete mode 100644 bbenchmark/bbenchmark.py
 delete mode 100644 bbenchmark/benchmark_gpu0.pkl
 delete mode 100644 bbenchmark/benchmark_host.pkl
 delete mode 100644 bbenchmark/plot_bb.py

diff --git a/bbenchmark/bbenchmark.py b/bbenchmark/bbenchmark.py
deleted file mode 100644
index 01aac6e7eb..0000000000
--- a/bbenchmark/bbenchmark.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import os
-import pickle
-from pprint import pprint
-from time import perf_counter, time
-
-import matplotlib.pyplot as plt
-import numpy as np
-
-from aspire.basis import FFBBasis2D, FLEBasis2D
-from aspire.downloader import emdb_2660
-from aspire.noise import WhiteNoiseAdder
-from aspire.source import ArrayImageSource, Simulation
-
-# Download and cache volume map
-vol = emdb_2660().astype(np.float64)  # doubles
-cached_image_fn = "simulated_images.npy"
-
-if os.path.exists(cached_image_fn):
-    print(f"Loading cached image source from {cached_image_fn}.")
-    sim = ArrayImageSource(np.load(cached_image_fn))
-else:
-    print("Generating Simulated Datatset")
-    sim = Simulation(
-        n=512, C=1, vols=vol, noise_adder=WhiteNoiseAdder.from_snr(0.1)
-    ).cache()
-    print(f"Saving to {cached_image_fn}")
-    np.save(cached_image_fn, sim.images[:].asnumpy())
-
-
-TIMES = {}
-for L in [32, 64, 128, 256]:
-    print(f"Begin L={L}")
-    src = sim.downsample(L)
-    imgs = src.images[:]
-    TIMES[L] = {}
-    for basis_type in [FFBBasis2D, FLEBasis2D]:
-        # Construct basis
-        TIMES[L][basis_type.__name__] = {}
-        basis = basis_type(L, dtype=src.dtype)
-
-        # Time expanding into basis
-        tic = perf_counter()
-        coef = basis.evaluate_t(imgs)
-        toc = perf_counter()
-        TIMES[L][basis_type.__name__]["evaluate_t"] = toc - tic
-
-        # Time expanding back into images
-        tic = perf_counter()
-        _ = coef.evaluate()
-        toc = perf_counter()
-        TIMES[L][basis_type.__name__]["evaluate"] = toc - tic
-
-
-pprint(TIMES)
-
-
-with open(f"benchmark_{int(time())}.pkl", "wb") as fh:
-    pickle.dump(TIMES, fh)
diff --git a/bbenchmark/benchmark_gpu0.pkl b/bbenchmark/benchmark_gpu0.pkl
deleted file mode 100644
index e702dd442dced27acb26fcc3b3e395ce9f465585..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 307
zcmZo*nX19a00y;FG`tmnL=Tsno0C&wab~fR%M>s_wJb5GG_fQ#zGRBK{fB=m-#lPo
z=;45g0>v(0*=`CnqZFvs#}!Fy28+7`_dcgM4hDt{R(A)1g!$SUKxL)g4nT7=m_P)J
zyZseA`*#jt74}bVm#$9$s>oo2$T<k@u$$8Z(Jkt9*ZK}vcLTESZ-p~N4IsLIOg-)u
zfUMi}RJdCWM7P=Y|AtRNx_ucKk+eIwdfq!S3!+41Yt11RWbKukocN6vfSq1>FJo@?
J0igC$Jpf$(W#0e*

diff --git a/bbenchmark/benchmark_host.pkl b/bbenchmark/benchmark_host.pkl
deleted file mode 100644
index dc0dd2a1769fc52c9470986e74eb64864f59e7fe..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 307
zcmZo*nX19a00y;FG`tmnL=Tsno0C&wab~fR%M>s_wJb5GG_fQ#zGRBK{l>F_wm|hg
z957L!*k;SlN}yONP^*tClGY3scLzS3waO9<3>mEM4m>kTmTLf&m3lh>&COr}5iIWZ
zmm4fH4}ewJUv3m%3o<B!4I<|ty7tqn6o_u2PZg>vVBHPKy1(9iTm;ktG~fPnyM$W-
zvToDPIg_qJbek#on}>jO`!X;hX?O6pRZ8-OC=s5uZo?jA?UmgRJ_s6sonHA^-k?wb
IsJ&DV0PQbeIsgCw

diff --git a/bbenchmark/plot_bb.py b/bbenchmark/plot_bb.py
deleted file mode 100644
index 05f5350f4b..0000000000
--- a/bbenchmark/plot_bb.py
+++ /dev/null
@@ -1,64 +0,0 @@
-import os
-import pickle
-from pprint import pprint
-
-import matplotlib.pyplot as plt
-import numpy as np
-
-host_fn = "benchmark_host.pkl"
-gpu_fn = "benchmark_gpu0.pkl"
-
-
-with open(host_fn, "rb") as fh:
-    host_times = pickle.load(fh)
-
-with open(gpu_fn, "rb") as fh:
-    gpu_times = pickle.load(fh)
-
-markers = {"FFBBasis2D": "8", "FLEBasis2D": "s"}
-
-# Evaluate_t
-Ls = list(host_times.keys())
-for basis_type in markers.keys():
-    plt.plot(
-        Ls,
-        [host_times[L][basis_type]["evaluate_t"] for L in Ls],
-        marker=markers[basis_type],
-        color="blue",
-        label=basis_type + "-host",
-    )
-    plt.plot(
-        Ls,
-        [gpu_times[L][basis_type]["evaluate_t"] for L in Ls],
-        marker=markers[basis_type],
-        color="green",
-        label=basis_type + "-gpu",
-    )
-plt.title("Basis `evaluate_t` Permformance - Batch of 512 Images")
-plt.xlabel("Image Pixel L (LxL)")
-plt.ylabel("Time (seconds)")
-plt.legend()
-plt.savefig("evaluate_t.png")
-plt.show()
-
-for basis_type in markers.keys():
-    plt.plot(
-        Ls,
-        [host_times[L][basis_type]["evaluate"] for L in Ls],
-        marker=markers[basis_type],
-        color="blue",
-        label=basis_type + "-host",
-    )
-    plt.plot(
-        Ls,
-        [gpu_times[L][basis_type]["evaluate"] for L in Ls],
-        marker=markers[basis_type],
-        color="green",
-        label=basis_type + "-gpu",
-    )
-plt.title("Basis `evaluate` Permformance - Batch of 512 Images")
-plt.xlabel("Image Pixel L (LxL)")
-plt.ylabel("Time (seconds)")
-plt.legend()
-plt.savefig("evaluate.png")
-plt.show()

From d968df3f22bac472cfaed66b6d671e7391ef7795 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 20 Jun 2024 14:25:30 -0400
Subject: [PATCH 34/65] fix interop cp check

---
 src/aspire/nufft/__init__.py | 12 +++++++-----
 src/aspire/numeric/numpy.py  |  9 +++++++--
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/aspire/nufft/__init__.py b/src/aspire/nufft/__init__.py
index f748a10d84..c2db4f2e8e 100644
--- a/src/aspire/nufft/__init__.py
+++ b/src/aspire/nufft/__init__.py
@@ -2,13 +2,15 @@
 
 import numpy as np
 
+from aspire import config
+from aspire.utils import LogFilterByCount, complex_type, real_type
+
+cp = None
 try:
     import cupy as cp
 except ModuleNotFoundError:
-    cp = None
+    pass
 
-from aspire import config
-from aspire.utils import LogFilterByCount, complex_type, real_type
 
 logger = logging.getLogger(__name__)
 
@@ -196,7 +198,7 @@ def anufft(sig_f, fourier_pts, sz, real=False, epsilon=1e-8):
 
     adjoint = adjoint.real if real else adjoint
 
-    if not on_gpu:
+    if cp and not on_gpu:
         adjoint = adjoint.get()
 
     return adjoint
@@ -257,7 +259,7 @@ def nufft(sig_f, fourier_pts, real=False, epsilon=1e-8):
 
     transform = transform.real if real else transform
 
-    if not on_gpu:
+    if cp and not on_gpu:
         transform = transform.get()
 
     return transform
diff --git a/src/aspire/numeric/numpy.py b/src/aspire/numeric/numpy.py
index 9367409c78..07627399f9 100644
--- a/src/aspire/numeric/numpy.py
+++ b/src/aspire/numeric/numpy.py
@@ -1,11 +1,16 @@
-import cupy as cp
 import numpy as np
 
+cp = None
+try:
+    import cupy as cp
+except ModuleNotFoundError:
+    pass
+
 
 class Numpy:
     @staticmethod
     def asnumpy(x):
-        if isinstance(x, cp.ndarray):
+        if cp and isinstance(x, cp.ndarray):
             x = x.get()
         return x
 

From 29b990c9d194791d5aa0009311cd77ef2a98f430 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 20 Jun 2024 15:22:03 -0400
Subject: [PATCH 35/65] use cupy modes on ampere_gpu jobs

---
 .github/workflows/workflow.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml
index f2d5472e52..24528a6b72 100644
--- a/.github/workflows/workflow.yml
+++ b/.github/workflows/workflow.yml
@@ -148,6 +148,8 @@ jobs:
         echo "WORK_DIR=${WORK_DIR}" >> $GITHUB_ENV
         echo -e "ray:\n    temp_dir: ${WORK_DIR}\n" > ${WORK_DIR}/config.yaml
         echo -e "common:\n    cache_dir: ${CI_CACHE_DIR}\n" >> ${WORK_DIR}/config.yaml
+        echo -e "    numeric: cupy\n" >> ${WORK_DIR}/config.yaml
+        echo -e "    fft: cupy\n" >> ${WORK_DIR}/config.yaml
         echo "Log the config: ${WORK_DIR}/config.yaml"
         cat ${WORK_DIR}/config.yaml
     - name: Run

From 3baf067414a8a5b71bcd25b7bf92a9a73db1d787 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 20 Jun 2024 15:24:25 -0400
Subject: [PATCH 36/65] ws cleanup in gha config gen

---
 .github/workflows/workflow.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml
index 24528a6b72..fce5a7f6d4 100644
--- a/.github/workflows/workflow.yml
+++ b/.github/workflows/workflow.yml
@@ -147,8 +147,8 @@ jobs:
         echo "Stash the WORK_DIR to GitHub env so we can clean it up later."
         echo "WORK_DIR=${WORK_DIR}" >> $GITHUB_ENV
         echo -e "ray:\n    temp_dir: ${WORK_DIR}\n" > ${WORK_DIR}/config.yaml
-        echo -e "common:\n    cache_dir: ${CI_CACHE_DIR}\n" >> ${WORK_DIR}/config.yaml
-        echo -e "    numeric: cupy\n" >> ${WORK_DIR}/config.yaml
+        echo -e "common:\n    cache_dir: ${CI_CACHE_DIR}" >> ${WORK_DIR}/config.yaml
+        echo -e "    numeric: cupy" >> ${WORK_DIR}/config.yaml
         echo -e "    fft: cupy\n" >> ${WORK_DIR}/config.yaml
         echo "Log the config: ${WORK_DIR}/config.yaml"
         cat ${WORK_DIR}/config.yaml

From eecfbdfe069911e352173bc6d76f1be2a41cfa19 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 20 Jun 2024 17:14:46 -0400
Subject: [PATCH 37/65] remove older GPU environments.

---
 pyproject.toml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index fcc0f7cf4f..c9c25a9976 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -61,10 +61,6 @@ dependencies = [
 "Source" = "https://github.com/ComputationalCryoEM/ASPIRE-Python"
 
 [project.optional-dependencies]
-gpu-102 = ["cupy-cuda102", "cufinufft==1.3"]
-gpu-110 = ["cupy-cuda110", "cufinufft==1.3"]
-gpu-111 = ["cupy-cuda111", "cufinufft==1.3"]
-gpu-11x = ["cupy-cuda11x", "cufinufft==1.3"]
 gpu-12x = ["cupy-cuda12x", "cufinufft==2.2.0"]
 dev = [
     "black",

From 795905208ac89f9d362b4b4053b68505b44b4ee8 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 21 Jun 2024 09:47:04 -0400
Subject: [PATCH 38/65] fle basis to mat xp conversion

---
 src/aspire/basis/fle_2d.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/aspire/basis/fle_2d.py b/src/aspire/basis/fle_2d.py
index 332b84f64d..e82dbb5d2f 100644
--- a/src/aspire/basis/fle_2d.py
+++ b/src/aspire/basis/fle_2d.py
@@ -786,20 +786,26 @@ def filter_to_basis_mat(self, f, **kwargs):
 
         # get 2D grid in polar coordinate
         k_vals, wts = lgwt(n_k, 0, 0.5, dtype=self.dtype)
-        k, theta = np.meshgrid(
-            k_vals, np.arange(n_theta) * 2 * np.pi / (2 * n_theta), indexing="ij"
+        k, theta = xp.meshgrid(
+            xp.asarray(k_vals),
+            xp.arange(n_theta) * 2 * np.pi / (2 * n_theta),
+            indexing="ij",
         )
 
         # Get function values in polar 2D grid and average out angle contribution
         # NOTE: should probably just let the ctf objects handle this...
-        omegax = k * np.cos(theta)
-        omegay = k * np.sin(theta)
-        omega = 2 * np.pi * np.vstack((omegax.flatten("C"), omegay.flatten("C")))
-
-        h_vals2d = h_fun(omega).reshape(n_k, n_theta).astype(self.dtype)
-        h_vals = np.sum(h_vals2d, axis=1) / n_theta
+        omegax = k * xp.cos(theta)
+        omegay = k * xp.sin(theta)
+        omega = 2 * xp.pi * xp.vstack((omegax.flatten("C"), omegay.flatten("C")))
+
+        h_vals2d = (
+            xp.asarray(h_fun(omega))
+            .reshape(n_k, n_theta)
+            .astype(self.dtype, copy=False)
+        )
+        h_vals = xp.sum(h_vals2d, axis=1) / n_theta
 
-        h_basis = np.zeros(self.count, dtype=self.dtype)
+        h_basis = xp.zeros(self.count, dtype=self.dtype)
         # For now we just need to handle 1D (stack of one ctf)
         for j in range(self.ell_p_max + 1):
             h_basis[self.idx_list[j]] = self.A3[j] @ h_vals
@@ -807,4 +813,4 @@ def filter_to_basis_mat(self, f, **kwargs):
         # Convert from internal FLE ordering to FB convention
         h_basis = h_basis[self._fle_to_fb_indices]
 
-        return DiagMatrix(h_basis)
+        return DiagMatrix(xp.asnumpy(h_basis))

From c429d0243e3cb28e8e8a8397a2733c8eb2a4e74c Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 21 Jun 2024 10:14:39 -0400
Subject: [PATCH 39/65] better eigsh sanity check

---
 tests/test_numeric_sparse.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/test_numeric_sparse.py b/tests/test_numeric_sparse.py
index 288e41a176..5a8227fe47 100644
--- a/tests/test_numeric_sparse.py
+++ b/tests/test_numeric_sparse.py
@@ -47,7 +47,8 @@ def test_eigsh(backends):
     """
     xp, sparse = backends
 
-    A = xp.eye(1234)
+    n = 123
+    A = xp.diag(xp.arange(1, n + 1, dtype=np.float64))
 
-    lamb, _ = sparse.linalg.eigsh(A)
-    np.testing.assert_allclose(xp.asnumpy(lamb), 1.0)
+    lamb, _ = sparse.linalg.eigsh(A, k=1)
+    np.testing.assert_allclose(xp.asnumpy(lamb), n)

From bb8a913ddf86afa079ef5434778022a1cad1966a Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 24 Jun 2024 10:50:24 -0400
Subject: [PATCH 40/65] cupy fft accuracy casting work around

---
 src/aspire/numeric/cupy_fft.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/aspire/numeric/cupy_fft.py b/src/aspire/numeric/cupy_fft.py
index 7d73367bc7..043780c7a7 100644
--- a/src/aspire/numeric/cupy_fft.py
+++ b/src/aspire/numeric/cupy_fft.py
@@ -2,6 +2,7 @@
 
 import cupy as cp
 import cupyx.scipy.fft as cufft
+import numpy as np
 
 from aspire.numeric.base_fft import FFT
 
@@ -16,6 +17,17 @@ def _preserve_host(func):
     @functools.wraps(func)  # Pass metadata (eg name and doctrings) from `func`
     def wrapper(self, x, *args, **kwargs):
 
+        # CuPy's single precision FFT appears to be too inaccurate for
+        # many of our unit tests, so the signal is upcast and recast
+        # on return.
+        _singles = False
+        if x.dtype == np.float32:
+            _singles = True
+            x = x.astype(np.float64)
+        elif x.dtype == np.complex64:
+            _singles = True
+            x = x.astype(np.complex128)
+
         _host = False
         if not isinstance(x, cp.ndarray):
             _host = True
@@ -26,6 +38,12 @@ def wrapper(self, x, *args, **kwargs):
         if _host:
             res = res.get()
 
+        # Recast if needed.
+        if _singles and res.dtype == np.float64:
+            res = res.astype(np.float32)
+        elif _singles and res.dtype == np.complex128:
+            res = res.astype(np.complex64)
+
         return res
 
     return wrapper

From 4204efe5a210ab750550935bb164d1d0f7e96503 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 24 Jun 2024 10:51:01 -0400
Subject: [PATCH 41/65] some numpy/cupy interop tweaks

---
 src/aspire/image/image.py   | 15 ++++++++-------
 src/aspire/volume/volume.py |  8 ++++----
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/aspire/image/image.py b/src/aspire/image/image.py
index a0a954f2d9..13c6e122de 100644
--- a/src/aspire/image/image.py
+++ b/src/aspire/image/image.py
@@ -377,17 +377,18 @@ def filter(self, filter):
 
         im = self.stack_reshape(-1)
 
-        filter_values = filter.evaluate_grid(self.resolution)
+        filter_values = xp.asarray(filter.evaluate_grid(self.resolution))
 
-        im_f = xp.asnumpy(fft.centered_fft2(xp.asarray(im._data)))
+        im_f = fft.centered_fft2(xp.asarray(im._data))
 
         # TODO: why are these different? Doesn't the broadcast work?
         if im_f.ndim > filter_values.ndim:
             im_f *= filter_values
         else:
             im_f = filter_values * im_f
-        im = xp.asnumpy(fft.centered_ifft2(xp.asarray(im_f)))
-        im = np.real(im)
+
+        im = fft.centered_ifft2(im_f)
+        im = xp.asnumpy(im.real)
 
         return self.__class__(im).stack_reshape(original_stack_shape)
 
@@ -461,7 +462,7 @@ def _im_translate(self, shifts):
         shifts = shifts.astype(self.dtype)
 
         L = self.resolution
-        im_f = xp.asnumpy(fft.fft2(xp.asarray(im)))
+        im_f = xp.asnumpy(fft.fft2(xp.asarray(im)))  # todo
         grid_shifted = fft.ifftshift(
             xp.asarray(np.ceil(np.arange(-L / 2, L / 2, dtype=self.dtype)))
         )
@@ -477,8 +478,8 @@ def _im_translate(self, shifts):
         )
         mult_f = np.exp(-1j * phase_shifts)
         im_translated_f = im_f * mult_f
-        im_translated = xp.asnumpy(fft.ifft2(xp.asarray(im_translated_f)))
-        im_translated = np.real(im_translated)
+        im_translated = fft.ifft2(xp.asarray(im_translated_f))
+        im_translated = xp.asnumpy(im_translated.real)
 
         # Reshape to stack shape
         return self.__class__(im_translated).stack_reshape(stack_shape)
diff --git a/src/aspire/volume/volume.py b/src/aspire/volume/volume.py
index 7883f59190..b7e4245ede 100644
--- a/src/aspire/volume/volume.py
+++ b/src/aspire/volume/volume.py
@@ -342,13 +342,13 @@ def project(self, rot_matrices):
         if rot_matrices.ndim == 2:
             rot_matrices = np.expand_dims(rot_matrices, axis=0)
 
-        data = self._data
+        data = xp.asarray(self._data)
         n_rots = rot_matrices.shape[0]
         pts_rot = rotated_grids(self.resolution, rot_matrices)
 
         if n_rots == self.n_vols:
             # Apply rotations to Volumes element-wise.
-            im_f = np.empty(
+            im_f = xp.empty(
                 (self.n_vols, self.resolution**2), dtype=complex_type(self.dtype)
             )
             pts_rot = pts_rot.reshape((3, n_rots, self.resolution**2))
@@ -370,9 +370,9 @@ def project(self, rot_matrices):
             im_f[:, 0, :] = 0
             im_f[:, :, 0] = 0
 
-        im_f = xp.asnumpy(fft.centered_ifft2(xp.asarray(im_f)))
+        im_f = fft.centered_ifft2(im_f)
 
-        return aspire.image.Image(np.real(im_f))
+        return aspire.image.Image(xp.asnumpy(im_f.real))
 
     def to_vec(self):
         """Returns an N x resolution ** 3 array."""

From c9d3a92253c6234adabbc5a44cb3700f56963321 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 24 Jun 2024 10:57:14 -0400
Subject: [PATCH 42/65] more image interop tweaks

---
 src/aspire/image/image.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/aspire/image/image.py b/src/aspire/image/image.py
index 13c6e122de..c8cc9cbe24 100644
--- a/src/aspire/image/image.py
+++ b/src/aspire/image/image.py
@@ -459,15 +459,15 @@ def _im_translate(self, shifts):
             n_shifts == 1 or n_shifts == self.n_images
         ), "number of shifts must be 1 or match the number of images"
         # Cast shifts to this instance's internal dtype
-        shifts = shifts.astype(self.dtype)
+        shifts = xp.asarray(shifts, dtype=self.dtype)
 
         L = self.resolution
-        im_f = xp.asnumpy(fft.fft2(xp.asarray(im)))  # todo
+        im_f = fft.fft2(xp.asarray(im))
         grid_shifted = fft.ifftshift(
-            xp.asarray(np.ceil(np.arange(-L / 2, L / 2, dtype=self.dtype)))
+            xp.ceil(xp.arange(-L / 2, L / 2, dtype=self.dtype))
         )
-        grid_1d = xp.asnumpy(grid_shifted) * 2 * np.pi / L
-        om_x, om_y = np.meshgrid(grid_1d, grid_1d, indexing="ij")
+        grid_1d = grid_shifted * 2 * xp.pi / L
+        om_x, om_y = xp.meshgrid(grid_1d, grid_1d, indexing="ij")
 
         phase_shifts_x = -shifts[:, 0].reshape((n_shifts, 1, 1))
         phase_shifts_y = -shifts[:, 1].reshape((n_shifts, 1, 1))
@@ -476,9 +476,9 @@ def _im_translate(self, shifts):
             om_x[np.newaxis, :, :] * phase_shifts_x
             + om_y[np.newaxis, :, :] * phase_shifts_y
         )
-        mult_f = np.exp(-1j * phase_shifts)
+        mult_f = xp.exp(-1j * phase_shifts)
         im_translated_f = im_f * mult_f
-        im_translated = fft.ifft2(xp.asarray(im_translated_f))
+        im_translated = fft.ifft2(im_translated_f)
         im_translated = xp.asnumpy(im_translated.real)
 
         # Reshape to stack shape

From 59ff57effb120bad53e0a6bf05616b21acc5110e Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 24 Jun 2024 14:40:29 -0400
Subject: [PATCH 43/65] misc xp/numeric wrapper cleanup

---
 src/aspire/nufft/__init__.py     | 12 ++++++------
 src/aspire/numeric/cupy_fft.py   | 12 ++++++++++--
 src/aspire/numeric/numpy.py      |  4 ++++
 src/aspire/numeric/pyfftw_fft.py |  8 ++++----
 src/aspire/numeric/scipy_fft.py  |  8 ++++----
 5 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/src/aspire/nufft/__init__.py b/src/aspire/nufft/__init__.py
index c2db4f2e8e..953f55ce3e 100644
--- a/src/aspire/nufft/__init__.py
+++ b/src/aspire/nufft/__init__.py
@@ -172,9 +172,9 @@ def anufft(sig_f, fourier_pts, sz, real=False, epsilon=1e-8):
 
     """
 
-    on_gpu = False
+    _on_gpu = False
     if cp and isinstance(sig_f, cp.ndarray):
-        on_gpu = True
+        _on_gpu = True
 
     if fourier_pts.dtype != real_type(sig_f.dtype):
         raise RuntimeError(
@@ -198,7 +198,7 @@ def anufft(sig_f, fourier_pts, sz, real=False, epsilon=1e-8):
 
     adjoint = adjoint.real if real else adjoint
 
-    if cp and not on_gpu:
+    if cp and not _on_gpu:
         adjoint = adjoint.get()
 
     return adjoint
@@ -223,9 +223,9 @@ def nufft(sig_f, fourier_pts, real=False, epsilon=1e-8):
 
     """
 
-    on_gpu = False
+    _on_gpu = False
     if cp and isinstance(sig_f, cp.ndarray):
-        on_gpu = True
+        _on_gpu = True
 
     if fourier_pts.dtype != real_type(sig_f.dtype):
         raise RuntimeError(
@@ -259,7 +259,7 @@ def nufft(sig_f, fourier_pts, real=False, epsilon=1e-8):
 
     transform = transform.real if real else transform
 
-    if cp and not on_gpu:
+    if cp and not _on_gpu:
         transform = transform.get()
 
     return transform
diff --git a/src/aspire/numeric/cupy_fft.py b/src/aspire/numeric/cupy_fft.py
index 043780c7a7..f67937813f 100644
--- a/src/aspire/numeric/cupy_fft.py
+++ b/src/aspire/numeric/cupy_fft.py
@@ -7,11 +7,16 @@
 from aspire.numeric.base_fft import FFT
 
 
+# This improves the flexibility of our FFT wrappers by allowing for
+# incremental code changes and testing.
 def _preserve_host(func):
     """
-    Method decorator that returns a numpy/cupy array result when passed a numpy/cupy array input.
+    Method decorator that returns a numpy/cupy array result when
+    passed a numpy/cupy array input respectively.
 
-    This improves the flexibility of our FFT wrappers by allowing for incremental code changes.
+    At the time of writing this wrapper will also upcast cupy FFT
+    operations to doubles as the precision in singles can cause
+    accuracy issues.
     """
 
     @functools.wraps(func)  # Pass metadata (eg name and doctrings) from `func`
@@ -20,6 +25,9 @@ def wrapper(self, x, *args, **kwargs):
         # CuPy's single precision FFT appears to be too inaccurate for
         # many of our unit tests, so the signal is upcast and recast
         # on return.
+        # Todo, discuss with Joakim whether we want this upcasting
+        # business configurable or keep singles, both in conjunction
+        # with xfailing the tests.
         _singles = False
         if x.dtype == np.float32:
             _singles = True
diff --git a/src/aspire/numeric/numpy.py b/src/aspire/numeric/numpy.py
index 07627399f9..ddc8355816 100644
--- a/src/aspire/numeric/numpy.py
+++ b/src/aspire/numeric/numpy.py
@@ -8,8 +8,12 @@
 
 
 class Numpy:
+    # This can be required when mixing nufft/fft/numpy backend combinations.
     @staticmethod
     def asnumpy(x):
+        """
+        Ensure `asnumpy` is always available and returns a numpy array.
+        """
         if cp and isinstance(x, cp.ndarray):
             x = x.get()
         return x
diff --git a/src/aspire/numeric/pyfftw_fft.py b/src/aspire/numeric/pyfftw_fft.py
index afcad98d28..95a8ea80f7 100644
--- a/src/aspire/numeric/pyfftw_fft.py
+++ b/src/aspire/numeric/pyfftw_fft.py
@@ -160,8 +160,8 @@ def fftshift(self, a, axes=None):
     def ifftshift(self, a, axes=None):
         return scipy_fft.ifftshift(a, axes=axes)
 
-    def dct(self, *args, **kwargs):
-        return scipy_fft.dct(*args, **kwargs)
+    def dct(self, x, **kwargs):
+        return scipy_fft.dct(x, **kwargs)
 
-    def idct(self, *args, **kwargs):
-        return scipy_fft.idct(*args, **kwargs)
+    def idct(self, x, **kwargs):
+        return scipy_fft.idct(x, **kwargs)
diff --git a/src/aspire/numeric/scipy_fft.py b/src/aspire/numeric/scipy_fft.py
index d78e463803..3891d45671 100644
--- a/src/aspire/numeric/scipy_fft.py
+++ b/src/aspire/numeric/scipy_fft.py
@@ -34,8 +34,8 @@ def fftshift(self, x, axes=None):
     def ifftshift(self, x, axes=None):
         return sp.fft.ifftshift(x, axes=axes)
 
-    def dct(self, *args, **kwargs):
-        return sp.fft.dct(*args, **kwargs)
+    def dct(self, x, **kwargs):
+        return sp.fft.dct(x, **kwargs)
 
-    def idct(self, *args, **kwargs):
-        return sp.fft.idct(*args, **kwargs)
+    def idct(self, x, **kwargs):
+        return sp.fft.idct(x, **kwargs)

From f62e93549783087180c40ca60f7b451aa6ba083b Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 26 Jun 2024 14:20:20 -0400
Subject: [PATCH 44/65] precache fle x y grids on gpu

---
 src/aspire/basis/fle_2d.py    | 22 ++++++++++------------
 src/aspire/nufft/cufinufft.py | 14 +++++---------
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/src/aspire/basis/fle_2d.py b/src/aspire/basis/fle_2d.py
index e82dbb5d2f..2ca41a2994 100644
--- a/src/aspire/basis/fle_2d.py
+++ b/src/aspire/basis/fle_2d.py
@@ -308,12 +308,13 @@ def _compute_nufft_points(self):
             * np.arange(self.num_angular_nodes // 2, dtype=self.dtype)
             / self.num_angular_nodes
         )
-        x = np.cos(phi).reshape(1, self.num_angular_nodes // 2)
-        y = np.sin(phi).reshape(1, self.num_angular_nodes // 2)
-        x = x * nodes * h
-        y = y * nodes * h
-        self.grid_x = x.flatten()
-        self.grid_y = y.flatten()
+        grid_xy = np.empty(
+            (2, self.num_radial_nodes, self.num_angular_nodes // 2), dtype=self.dtype
+        )
+        grid_xy[0] = np.cos(phi)  # x
+        grid_xy[1] = np.sin(phi)  # y
+        grid_xy *= nodes * h
+        self.grid_xy = xp.asarray(grid_xy.reshape(2, -1))
 
     def _build_interpolation_matrix(self):
         """
@@ -531,7 +532,7 @@ def _evaluate_t(self, imgs):
     def _step1_t(self, im):
         """
         Step 1 of the adjoint transformation (images to coefficients).
-        Calculates the NUFFT of the image on gridpoints `self.grid_x` and `self.grid_y`.
+        Calculates the NUFFT of the image on gridpoints `grid_x` and `grid_y`.
         """
         im = im.reshape(-1, self.nres, self.nres).astype(complex_type(self.dtype))
         num_img = im.shape[0]
@@ -539,10 +540,7 @@ def _step1_t(self, im):
             (num_img, self.num_radial_nodes, self.num_angular_nodes),
             dtype=complex_type(self.dtype),
         )
-        _z = (
-            nufft(im, np.stack((self.grid_x, self.grid_y)), epsilon=self.epsilon)
-            * self.h**2
-        )
+        _z = nufft(im, self.grid_xy, epsilon=self.epsilon) * self.h**2
         _z = _z.reshape(num_img, self.num_radial_nodes, self.num_angular_nodes // 2)
         z[:, :, : self.num_angular_nodes // 2] = _z
         z[:, :, self.num_angular_nodes // 2 :] = np.conj(_z)
@@ -645,7 +643,7 @@ def _step1(self, z):
         z = z[:, :, : self.num_angular_nodes // 2].reshape(num_img, -1)
         im = anufft(
             z.astype(complex_type(self.dtype)),
-            np.stack((self.grid_x, self.grid_y)),
+            self.grid_xy,
             (self.nres, self.nres),
             epsilon=self.epsilon,
         )
diff --git a/src/aspire/nufft/cufinufft.py b/src/aspire/nufft/cufinufft.py
index 2dceb08b80..c1d15ff686 100644
--- a/src/aspire/nufft/cufinufft.py
+++ b/src/aspire/nufft/cufinufft.py
@@ -51,11 +51,11 @@ def __init__(self, sz, fourier_pts, epsilon=1e-8, ntransforms=1, **kwargs):
                 "cufinufft has caught a non C_CONTIGUOUS array,"
                 " `fourier_pts` will be copied to C_CONTIGUOUS."
             )
-        self.fourier_pts = np.ascontiguousarray(
-            np.mod(fourier_pts + np.pi, 2 * np.pi) - np.pi, dtype=self.dtype
+        self.fourier_pts = cp.ascontiguousarray(
+            cp.mod(cp.asarray(fourier_pts, dtype=self.dtype) + cp.pi, 2 * cp.pi) - cp.pi
         )
 
-        self.num_pts = fourier_pts.shape[1]
+        self.num_pts = self.fourier_pts.shape[1]
         self.epsilon = max(epsilon, np.finfo(self.dtype).eps)
 
         self._transform_plan = cufPlan(
@@ -81,12 +81,8 @@ def __init__(self, sz, fourier_pts, epsilon=1e-8, ntransforms=1, **kwargs):
             **self.adjoint_opts,
         )
 
-        # Note, I store self.fourier_pts_gpu so the GPUArrray life
-        #   is tied to instance, instead of this method.
-        self.fourier_pts_gpu = cp.array(self.fourier_pts)
-
-        self._transform_plan.setpts(*self.fourier_pts_gpu)
-        self._adjoint_plan.setpts(*self.fourier_pts_gpu)
+        self._transform_plan.setpts(*self.fourier_pts)
+        self._adjoint_plan.setpts(*self.fourier_pts)
 
     def transform(self, signal):
         """

From 3f1208baff9e9b99cadbce3d48e2749401e8d7f5 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 27 Jun 2024 15:32:28 -0400
Subject: [PATCH 45/65] Rm unneeded gc call

---
 src/aspire/basis/fle_2d.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/aspire/basis/fle_2d.py b/src/aspire/basis/fle_2d.py
index 2ca41a2994..ba7d4636e9 100644
--- a/src/aspire/basis/fle_2d.py
+++ b/src/aspire/basis/fle_2d.py
@@ -1,4 +1,3 @@
-import gc
 import logging
 
 import numpy as np
@@ -21,9 +20,8 @@
 
 def _cleanup():
     """
-    Utility for informing python+cupy to cleanup memory held by old vars.
+    Utility for informing cupy to cleanup memory held by old vars.
     """
-    gc.collect()
     try:
         import cupy
 
@@ -511,18 +509,18 @@ def _evaluate_t(self, imgs):
             coefficients.
         """
         # See Section 3.5
-        imgs = xp.array(imgs)  # Copy here, mutating.
+        imgs = xp.array(imgs)  # Intentionally copying here, mutating.
         imgs[:, self.radial_mask] = 0
         z = self._step1_t(imgs)
-        del imgs
+        del imgs  # inform python we're done with imgs
         _cleanup()
 
         b = self._step2_t(z)
-        del z
+        del z  # inform python we're done with z
         _cleanup()
 
         coefs = self._step3_t(b)
-        del b
+        del b  # inform python we're done with b
         _cleanup()
 
         # return in FB order

From cba8d7c5ee6179722de14b39f190440798494d9b Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 28 Jun 2024 08:50:25 -0400
Subject: [PATCH 46/65] Add cupy GPU options to config tutorial

---
 gallery/tutorials/configuration.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/gallery/tutorials/configuration.py b/gallery/tutorials/configuration.py
index 819ff9b675..75354bc422 100644
--- a/gallery/tutorials/configuration.py
+++ b/gallery/tutorials/configuration.py
@@ -102,6 +102,36 @@
     time.sleep(1)
 print("Done Loop 2\n")
 
+# %%
+# Enabling GPU Acceleration
+# -------------------------
+# Enabling GPU acceleration requires installing supporting software
+# packages and small config changes. Installing the supporting
+# software is most easily accomplished by installing ASPIRE with one
+# of the published GPU extensions, for example ``pip install
+# "aspire[dev,gpu_12x]"``.  Once the packages are installed users
+# should automatically find that the NUFFT calls are running on the
+# GPU.  Additional acceleration is achieved by enabling `cupy` for
+# `numeric` and `fft` components.
+#
+#    .. code-block:: yaml
+#
+#      common:
+#          # numeric module to use - one of numpy/cupy
+#          numeric: cupy
+#          # fft backend to use - one of pyfftw/scipy/cupy/mkl
+#          fft: cupy
+#
+# Alternatively, like other config options, this can be changed
+# dynamically with code.
+#
+#    .. code-block:: python
+#
+#      from aspire import config
+#
+#      config["common"]["numeric"] = "cupy"
+#      config["common"]["fft"] = "cupy"
+#
 
 # %%
 # Resolution

From f9cc8925a5cdee1b9891b75b3d3bbeeb5e5b1939 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 28 Jun 2024 09:12:55 -0400
Subject: [PATCH 47/65] update GPU install docs

---
 docs/source/installation.rst | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 4a48e3a505..5fa608ecdf 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -129,10 +129,10 @@ an M1 laptop:
 Installing GPU Extensions
 *************************
 
-ASPIRE does support GPUs, depending on several external packages.  The
-collection of GPU extensions can be installed using ``pip``.
-Extensions are grouped based on CUDA versions.  To find the CUDA
-driver version, run ``nvidia-smi`` on the intended system.
+ASPIRE does support using a GPU, depending on several external
+packages.  The collection of GPU extensions can be installed using
+``pip``.  Extensions are grouped based on CUDA versions.  To find the
+CUDA driver version, run ``nvidia-smi`` on the intended system.
 
 .. list-table:: CUDA GPU Extension Versions
    :widths: 25 25
@@ -140,14 +140,6 @@ driver version, run ``nvidia-smi`` on the intended system.
 
    * - CUDA Version
      - ASPIRE Extension
-   * - 10.2
-     - gpu-102
-   * - 11.0
-     - gpu-110
-   * - 11.1
-     - gpu-111
-   * - >=11.2
-     - gpu-11x
    * - >=12
      - gpu-12x
 
@@ -164,12 +156,15 @@ the command below would install GPU packages required for ASPIRE.
 
     
 By default if the required GPU extensions are correctly installed,
-ASPIRE should automatically begin using the GPU for select components
-(such as those using ``nufft``).
-
-Because GPU extensions depend on several third party packages and
-libraries, we can only offer limited support if one of the packages
-has a problem on your system.
+ASPIRE should automatically begin using the GPU calls to our ``nufft`` module.
+
+Using GPU in other areas of the code is still an experimental feature
+and requires a minor configuration setting to enable ``cupy``. See the
+:ref:`sphx_glr_auto_tutorials_configuration.py` for details. Because
+GPU extensions depend on several third party softwares and machines
+vary wildly, we can only offer limited support if one of the packages
+has a problem on your system.  We are currently expanding GPU code
+coverage.
 
 Generating Documentation
 ************************

From 7675e6638e6e29918fde691cdf44d52bd943ad3a Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 28 Jun 2024 10:14:57 -0400
Subject: [PATCH 48/65] improve crop 3d xp interop

---
 src/aspire/utils/coor_trans.py | 63 ++++++++++++++++++++++++----------
 1 file changed, 45 insertions(+), 18 deletions(-)

diff --git a/src/aspire/utils/coor_trans.py b/src/aspire/utils/coor_trans.py
index ef7857c994..457e29f9f8 100644
--- a/src/aspire/utils/coor_trans.py
+++ b/src/aspire/utils/coor_trans.py
@@ -376,7 +376,7 @@ def crop_pad_2d(im, size, fill_value=0):
 
     :param im: A >=2-dimensional numpy array
     :param size: Integer size of cropped/padded output
-    :return: A numpy array of shape (..., size, size)
+    :return: Array of shape (..., size, size)
     """
 
     im_y, im_x = im.shape[-2:]
@@ -393,7 +393,7 @@ def crop_pad_2d(im, size, fill_value=0):
         shape = list(im.shape[:-2])
         shape.extend([size, size])
 
-        # Ensure that we return in the same dtype as the input
+        # Ensure that we return the same dtype as the input
         _full = np.full  # Default to numpy array
         if isinstance(im, xp.ndarray):
             # Use cupy when `im` _and_ xp are cupy ndarray
@@ -405,34 +405,61 @@ def crop_pad_2d(im, size, fill_value=0):
         # when padding, start_x and start_y are negative since size is larger
         # than im_x and im_y; the below line calculates where the original image
         # is placed in relation to the (now-larger) box size
-        to_return[-start_y : im_y - start_y, -start_x : im_x - start_x] = im
+        to_return[..., -start_y : im_y - start_y, -start_x : im_x - start_x] = im
         return to_return
     else:
         # target size is between mat_x and mat_y
         raise ValueError("Cannot crop and pad an image at the same time.")
 
 
-def crop_pad_3d(im, size, fill_value=0):
-    im_y, im_x, im_z = im.shape
+def crop_pad_3d(vol, size, fill_value=0):
+    """
+    Crop/pads `vol` according to `size`.
+
+    Padding will use `fill_value`.
+    Return's host/gpu array based on `vol`.
+
+    :param vol: A >=3-dimensional numpy array
+    :param size: Integer size of cropped/padded output
+    :return: Array of shape (..., size, size, size)
+    """
+
+    vol_z, vol_y, vol_x = vol.shape[-3:]
     # shift terms
-    start_x = math.floor(im_x / 2) - math.floor(size / 2)
-    start_y = math.floor(im_y / 2) - math.floor(size / 2)
-    start_z = math.floor(im_z / 2) - math.floor(size / 2)
+    start_z = math.floor(vol_z / 2) - math.floor(size / 2)
+    start_y = math.floor(vol_y / 2) - math.floor(size / 2)
+    start_x = math.floor(vol_x / 2) - math.floor(size / 2)
 
     # cropping
-    if size <= min(im_y, im_x, im_z):
-        return im[
-            start_y : start_y + size, start_x : start_x + size, start_z : start_z + size
+    if size <= min(vol_z, vol_y, vol_x):
+        return vol[
+            ...,
+            start_z : start_z + size,
+            start_y : start_y + size,
+            start_x : start_x + size,
         ]
     # padding
-    elif size >= max(im_y, im_x, im_z):
-        to_return = fill_value * np.ones((size, size, size), dtype=im.dtype)
+    elif size >= max(vol_z, vol_y, vol_x):
+        # Determine shape
+        shape = list(vol.shape[:-3])
+        shape.extend([size, size, size])
+
+        # Ensure that we return the same dtype as the input
+        _full = np.full  # Default to numpy array
+        if isinstance(vol, xp.ndarray):
+            # Use cupy when `vol` _and_ xp are cupy ndarray
+            # Avoids having to handle when cupy is not installed
+            _full = xp.full
+
+        to_return = _full(shape, fill_value, dtype=vol.dtype)
+
         to_return[
-            -start_y : im_y - start_y,
-            -start_x : im_x - start_x,
-            -start_z : im_z - start_z,
-        ] = im
+            ...,
+            -start_z : vol_z - start_z,
+            -start_y : vol_y - start_y,
+            -start_x : vol_x - start_x,
+        ] = vol
         return to_return
     else:
-        # target size is between min and max of (im_y, im_x, im_z)
+        # target size is between min and max of (vol_y, vol_x, vol_z)
         raise ValueError("Cannot crop and pad a volume at the same time.")

From 11986f32bae996af151507897c2545fe37cc3c44 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 1 Jul 2024 11:13:04 -0400
Subject: [PATCH 49/65] ffb2d self review cleanup

---
 src/aspire/basis/ffb_2d.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/aspire/basis/ffb_2d.py b/src/aspire/basis/ffb_2d.py
index 116e009757..8d46e8419c 100644
--- a/src/aspire/basis/ffb_2d.py
+++ b/src/aspire/basis/ffb_2d.py
@@ -131,16 +131,16 @@ def _evaluate(self, v):
 
         ind = 0
 
-        idx = ind + xp.arange(self.k_max[0], dtype=int)
+        idx = ind + np.arange(self.k_max[0], dtype=int)
 
-        pf[:, 0, :] = v[:, xp.asarray(self._zero_angular_inds)] @ self.radial_norm[idx]
+        pf[:, 0, :] = v[:, self._zero_angular_inds] @ self.radial_norm[idx]
         ind = ind + idx.size
 
         ind_pos = ind
 
         for ell in range(1, self.ell_max + 1):
             idx = ind + xp.arange(self.k_max[ell], dtype=int)
-            idx_pos = ind_pos + xp.arange(self.k_max[ell], dtype=int)
+            idx_pos = ind_pos + np.arange(self.k_max[ell], dtype=int)
             idx_neg = idx_pos + self.k_max[ell]
 
             v_ell = (v[:, idx_pos] - 1j * v[:, idx_neg]) / 2.0
@@ -171,9 +171,7 @@ def _evaluate(self, v):
         # perform inverse non-uniformly FFT transform back to 2D coordinate basis
         freqs = m_reshape(self._precomp["freqs"], (2, n_r * n_theta))
 
-        x = 2 * anufft(
-            pf.astype(complex_type(self.dtype)), 2 * pi * freqs, self.sz, real=True
-        )
+        x = 2 * anufft(pf, 2 * pi * freqs, self.sz, real=True)
 
         # Return X as Image instance with the last two dimensions as *self.sz
         x = x.reshape((*sz_roll, *self.sz))
@@ -206,7 +204,7 @@ def _evaluate_t(self, x):
         pf = xp.concatenate((pf, pf.conjugate()), axis=2)
 
         # evaluate radial integral using the Gauss-Legendre quadrature rule
-        pf *= self.gl_weighted_nodes[None, :, None]
+        pf = pf * self.gl_weighted_nodes[None, :, None]
 
         #  1D FFT on the angular dimension for each concentric circle
         pf = 2 * xp.pi / (2 * n_theta) * fft.fft(pf)

From 1b89886f8a6e7f4f19e095b9208f17649ca657ff Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 1 Jul 2024 14:43:57 -0400
Subject: [PATCH 50/65] ffb3d move more grid precomp to gpu

---
 src/aspire/basis/ffb_3d.py | 76 +++++++++++++++++++-------------------
 1 file changed, 39 insertions(+), 37 deletions(-)

diff --git a/src/aspire/basis/ffb_3d.py b/src/aspire/basis/ffb_3d.py
index 5740900a34..7a2509382f 100644
--- a/src/aspire/basis/ffb_3d.py
+++ b/src/aspire/basis/ffb_3d.py
@@ -1,7 +1,6 @@
 import logging
 
 import numpy as np
-from numpy import pi
 
 from aspire.basis import FBBasis3D
 from aspire.basis.basis_utils import lgwt, norm_assoc_legendre, sph_bessel
@@ -61,26 +60,29 @@ def _precomp(self):
 
         r, wt_r = lgwt(n_r, 0.0, self.kcut, dtype=self.dtype)
         z, wt_z = lgwt(n_phi, -1, 1, dtype=self.dtype)
-        r = m_reshape(r, (n_r, 1))
-        wt_r = m_reshape(wt_r, (n_r, 1))
-        z = m_reshape(z, (n_phi, 1))
-        wt_z = m_reshape(wt_z, (n_phi, 1))
-        phi = np.arccos(z)
+        r = m_reshape(xp.asarray(r), (n_r, 1))
+        rh = xp.asnumpy(r)
+        wt_r = m_reshape(xp.asarray(wt_r), (n_r, 1))
+        z = m_reshape(xp.asarray(z), (n_phi, 1))
+        wt_z = m_reshape(xp.asarray(wt_z), (n_phi, 1))
+        phi = xp.arccos(z)
         wt_phi = wt_z
-        theta = 2 * pi * np.arange(n_theta, dtype=self.dtype).T / (2 * n_theta)
+        theta = 2 * xp.pi * xp.arange(n_theta, dtype=self.dtype).T / (2 * n_theta)
         theta = m_reshape(theta, (n_theta, 1))
 
         # evaluate basis function in the radial dimension
-        radial_wtd = np.zeros(
+        radial_wtd = xp.zeros(
             shape=(n_r, np.max(self.k_max), self.ell_max + 1), dtype=self.dtype
         )
         for ell in range(0, self.ell_max + 1):
             k_max_ell = self.k_max[ell]
-            rmat = r * self.r0[ell][0:k_max_ell].T / self.kcut
-            radial_ell = np.zeros_like(rmat)
+            rmat = rh * self.r0[ell][0:k_max_ell].T / self.kcut  # host
+            radial_ell = xp.zeros_like(rmat)
             for ik in range(0, k_max_ell):
-                radial_ell[:, ik] = sph_bessel(ell, rmat[:, ik])
-            nrm = np.abs(sph_bessel(ell + 1, self.r0[ell][0:k_max_ell].T) / 4)
+                radial_ell[:, ik] = xp.asarray(sph_bessel(ell, rmat[:, ik]))
+            nrm = xp.abs(
+                xp.asarray(sph_bessel(ell + 1, self.r0[ell][0:k_max_ell].T)) / 4
+            )
             radial_ell = radial_ell / nrm
             radial_ell_wtd = r**2 * wt_r * radial_ell
             radial_wtd[:, 0:k_max_ell, ell] = radial_ell_wtd
@@ -95,14 +97,14 @@ def _precomp(self):
                 - np.mod(self.ell_max, 2) * np.mod(m, 2)
             )
             n_odd_ell = int(self.ell_max - m + 1 - n_even_ell)
-            phi_wtd_m_even = np.zeros((n_phi, n_even_ell), dtype=phi.dtype)
-            phi_wtd_m_odd = np.zeros((n_phi, n_odd_ell), dtype=phi.dtype)
+            phi_wtd_m_even = xp.zeros((n_phi, n_even_ell), dtype=phi.dtype)
+            phi_wtd_m_odd = xp.zeros((n_phi, n_odd_ell), dtype=phi.dtype)
 
             ind_even = 0
             ind_odd = 0
             for ell in range(m, self.ell_max + 1):
-                phi_m_ell = norm_assoc_legendre(ell, m, z)
-                nrm_inv = np.sqrt(0.5 / pi)
+                phi_m_ell = xp.asarray(norm_assoc_legendre(ell, m, z))
+                nrm_inv = np.sqrt(0.5 / np.pi)
                 phi_m_ell = nrm_inv * phi_m_ell
                 phi_wtd_m_ell = wt_phi * phi_m_ell
                 if np.mod(ell, 2) == 0:
@@ -116,41 +118,41 @@ def _precomp(self):
             ang_phi_wtd_odd.append(phi_wtd_m_odd)
 
         # evaluate basis function in the theta dimension
-        ang_theta = np.zeros((n_theta, 2 * self.ell_max + 1), dtype=theta.dtype)
+        ang_theta = xp.zeros((n_theta, 2 * self.ell_max + 1), dtype=theta.dtype)
 
-        ang_theta[:, 0 : self.ell_max] = np.sqrt(2) * np.sin(
-            theta @ m_reshape(np.arange(self.ell_max, 0, -1), (1, self.ell_max))
+        ang_theta[:, 0 : self.ell_max] = np.sqrt(2) * xp.sin(
+            theta @ m_reshape(xp.arange(self.ell_max, 0, -1), (1, self.ell_max))
         )
-        ang_theta[:, self.ell_max] = np.ones(n_theta, dtype=theta.dtype)
-        ang_theta[:, self.ell_max + 1 : 2 * self.ell_max + 1] = np.sqrt(2) * np.cos(
-            theta @ m_reshape(np.arange(1, self.ell_max + 1), (1, self.ell_max))
+        ang_theta[:, self.ell_max] = xp.ones(n_theta, dtype=theta.dtype)
+        ang_theta[:, self.ell_max + 1 : 2 * self.ell_max + 1] = np.sqrt(2) * xp.cos(
+            theta @ m_reshape(xp.arange(1, self.ell_max + 1), (1, self.ell_max))
         )
 
-        ang_theta_wtd = (2 * pi / n_theta) * ang_theta
+        ang_theta_wtd = (2 * np.pi / n_theta) * ang_theta
 
-        theta_grid, phi_grid, r_grid = np.meshgrid(
-            theta, phi, r, sparse=False, indexing="ij"
+        theta_grid, phi_grid, r_grid = xp.meshgrid(
+            theta.flatten(), phi.flatten(), r.flatten(), sparse=False, indexing="ij"
         )
-        fourier_x = m_flatten(r_grid * np.cos(theta_grid) * np.sin(phi_grid))
-        fourier_y = m_flatten(r_grid * np.sin(theta_grid) * np.sin(phi_grid))
-        fourier_z = m_flatten(r_grid * np.cos(phi_grid))
+        fourier_x = m_flatten(r_grid * xp.cos(theta_grid) * xp.sin(phi_grid))
+        fourier_y = m_flatten(r_grid * xp.sin(theta_grid) * xp.sin(phi_grid))
+        fourier_z = m_flatten(r_grid * xp.cos(phi_grid))
         fourier_pts = (
             2
-            * pi
-            * np.vstack(
+            * xp.pi
+            * xp.vstack(
                 (
-                    fourier_z[np.newaxis, ...],
-                    fourier_y[np.newaxis, ...],
-                    fourier_x[np.newaxis, ...],
+                    fourier_z[xp.newaxis, ...],
+                    fourier_y[xp.newaxis, ...],
+                    fourier_x[xp.newaxis, ...],
                 )
             )
         )
 
         return {
-            "radial_wtd": xp.asarray(radial_wtd),
-            "ang_phi_wtd_even": [xp.asarray(x) for x in ang_phi_wtd_even],
-            "ang_phi_wtd_odd": [xp.asarray(x) for x in ang_phi_wtd_odd],
-            "ang_theta_wtd": xp.asarray(ang_theta_wtd),
+            "radial_wtd": radial_wtd,
+            "ang_phi_wtd_even": ang_phi_wtd_even,
+            "ang_phi_wtd_odd": ang_phi_wtd_odd,
+            "ang_theta_wtd": ang_theta_wtd,
             "fourier_pts": fourier_pts,
         }
 

From d9d313b8132eb7b63024f1242bc0854a91c9b747 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 1 Jul 2024 14:58:08 -0400
Subject: [PATCH 51/65] Move more FLE2D grid precomp to GPU

---
 src/aspire/basis/fle_2d.py       | 23 +++++++++++++----------
 src/aspire/basis/fle_2d_utils.py |  1 -
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/aspire/basis/fle_2d.py b/src/aspire/basis/fle_2d.py
index ba7d4636e9..631161d0fe 100644
--- a/src/aspire/basis/fle_2d.py
+++ b/src/aspire/basis/fle_2d.py
@@ -21,6 +21,9 @@
 def _cleanup():
     """
     Utility for informing cupy to cleanup memory held by old vars.
+
+    This method is designed to be safely called even when `CuPy` is
+    not installed, in which case it is a no-op.
     """
     try:
         import cupy
@@ -288,10 +291,10 @@ def _compute_nufft_points(self):
         self.num_angular_nodes = num_angular_nodes
 
         # create gridpoints
-        nodes = 1 - (2 * np.arange(self.num_radial_nodes, dtype=self.dtype) + 1) / (
+        nodes = 1 - (2 * xp.arange(self.num_radial_nodes, dtype=self.dtype) + 1) / (
             2 * self.num_radial_nodes
         )
-        nodes = (np.cos(np.pi * nodes) + 1) / 2
+        nodes = (xp.cos(np.pi * nodes) + 1) / 2
         nodes = (
             self.greatest_lambda - self.smallest_lambda
         ) * nodes + self.smallest_lambda
@@ -302,17 +305,17 @@ def _compute_nufft_points(self):
 
         phi = (
             2
-            * np.pi
-            * np.arange(self.num_angular_nodes // 2, dtype=self.dtype)
+            * xp.pi
+            * xp.arange(self.num_angular_nodes // 2, dtype=self.dtype)
             / self.num_angular_nodes
         )
-        grid_xy = np.empty(
+        grid_xy = xp.empty(
             (2, self.num_radial_nodes, self.num_angular_nodes // 2), dtype=self.dtype
         )
-        grid_xy[0] = np.cos(phi)  # x
-        grid_xy[1] = np.sin(phi)  # y
-        grid_xy *= nodes * h
-        self.grid_xy = xp.asarray(grid_xy.reshape(2, -1))
+        grid_xy[0] = xp.cos(phi)  # x
+        grid_xy[1] = xp.sin(phi)  # y
+        grid_xy = grid_xy * nodes * h
+        self.grid_xy = grid_xy.reshape(2, -1)
 
     def _build_interpolation_matrix(self):
         """
@@ -530,7 +533,7 @@ def _evaluate_t(self, imgs):
     def _step1_t(self, im):
         """
         Step 1 of the adjoint transformation (images to coefficients).
-        Calculates the NUFFT of the image on gridpoints `grid_x` and `grid_y`.
+        Calculates the NUFFT of the image on gridpoints `grid_xy`.
         """
         im = im.reshape(-1, self.nres, self.nres).astype(complex_type(self.dtype))
         num_img = im.shape[0]
diff --git a/src/aspire/basis/fle_2d_utils.py b/src/aspire/basis/fle_2d_utils.py
index 33f237165e..ea459988b0 100644
--- a/src/aspire/basis/fle_2d_utils.py
+++ b/src/aspire/basis/fle_2d_utils.py
@@ -195,7 +195,6 @@ def barycentric_interp_sparse(target_points, known_points, numsparse):
     # note that const cancels in numerator and denominator
     vals = vals / denom.reshape(-1, 1)
 
-    # TODO, migrate more of this method towards `xp`
     vals = xp.array(vals.flatten())
     idx = xp.array(idx.flatten())
     jdx = xp.array(jdx.flatten())

From 4f9922224b7d4109ecd564c5b42678024f53b68c Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 1 Jul 2024 15:05:58 -0400
Subject: [PATCH 52/65] image self review cleanup

---
 src/aspire/image/image.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/aspire/image/image.py b/src/aspire/image/image.py
index c8cc9cbe24..ba383f49e9 100644
--- a/src/aspire/image/image.py
+++ b/src/aspire/image/image.py
@@ -356,6 +356,10 @@ def downsample(self, ds_res):
         original_stack_shape = self.stack_shape
         im = self.stack_reshape(-1)
 
+        # Note image data is intentionally migrated via `xp.asarray`
+        # because all of the subsequent calls until `asnumpy` are GPU
+        # when xp and fft in `cupy` mode.
+
         # compute FT with centered 0-frequency
         fx = fft.centered_fft2(xp.asarray(im._data))
         # crop 2D Fourier transform for each image
@@ -377,17 +381,16 @@ def filter(self, filter):
 
         im = self.stack_reshape(-1)
 
+        # Note image and filter data is intentionally migrated via
+        # `xp.asarray` because all of the subsequent calls until
+        # `asnumpy` are GPU when xp and fft in `cupy` mode.
         filter_values = xp.asarray(filter.evaluate_grid(self.resolution))
 
+        # Convolve
         im_f = fft.centered_fft2(xp.asarray(im._data))
-
-        # TODO: why are these different? Doesn't the broadcast work?
-        if im_f.ndim > filter_values.ndim:
-            im_f *= filter_values
-        else:
-            im_f = filter_values * im_f
-
+        im_f = filter_values * im_f
         im = fft.centered_ifft2(im_f)
+
         im = xp.asnumpy(im.real)
 
         return self.__class__(im).stack_reshape(original_stack_shape)

From 19cb24f5fc9c1982bc25b880a5ddc36a00f96f77 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 1 Jul 2024 15:10:56 -0400
Subject: [PATCH 53/65] var name improvement

---
 src/aspire/nufft/__init__.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/aspire/nufft/__init__.py b/src/aspire/nufft/__init__.py
index 953f55ce3e..fcfe182918 100644
--- a/src/aspire/nufft/__init__.py
+++ b/src/aspire/nufft/__init__.py
@@ -172,9 +172,9 @@ def anufft(sig_f, fourier_pts, sz, real=False, epsilon=1e-8):
 
     """
 
-    _on_gpu = False
+    _keep_on_gpu = False
     if cp and isinstance(sig_f, cp.ndarray):
-        _on_gpu = True
+        _keep_on_gpu = True
 
     if fourier_pts.dtype != real_type(sig_f.dtype):
         raise RuntimeError(
@@ -198,7 +198,7 @@ def anufft(sig_f, fourier_pts, sz, real=False, epsilon=1e-8):
 
     adjoint = adjoint.real if real else adjoint
 
-    if cp and not _on_gpu:
+    if cp and not _keep_on_gpu:
         adjoint = adjoint.get()
 
     return adjoint
@@ -223,9 +223,9 @@ def nufft(sig_f, fourier_pts, real=False, epsilon=1e-8):
 
     """
 
-    _on_gpu = False
+    _keep_on_gpu = False
     if cp and isinstance(sig_f, cp.ndarray):
-        _on_gpu = True
+        _keep_on_gpu = True
 
     if fourier_pts.dtype != real_type(sig_f.dtype):
         raise RuntimeError(
@@ -259,7 +259,7 @@ def nufft(sig_f, fourier_pts, real=False, epsilon=1e-8):
 
     transform = transform.real if real else transform
 
-    if cp and not _on_gpu:
+    if cp and not _keep_on_gpu:
         transform = transform.get()
 
     return transform

From d1f41a21895610a7239a2f4d9faa531cdb4c6db7 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 1 Jul 2024 15:16:42 -0400
Subject: [PATCH 54/65] minor crop pad string improvements

---
 src/aspire/utils/coor_trans.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/aspire/utils/coor_trans.py b/src/aspire/utils/coor_trans.py
index 457e29f9f8..53f86714c8 100644
--- a/src/aspire/utils/coor_trans.py
+++ b/src/aspire/utils/coor_trans.py
@@ -409,7 +409,11 @@ def crop_pad_2d(im, size, fill_value=0):
         return to_return
     else:
         # target size is between mat_x and mat_y
-        raise ValueError("Cannot crop and pad an image at the same time.")
+        raise ValueError(
+            "Cannot crop and pad Image at the same time."
+            "If this is really what you intended,"
+            " make two seperate calls for cropping and padding."
+        )
 
 
 def crop_pad_3d(vol, size, fill_value=0):
@@ -461,5 +465,9 @@ def crop_pad_3d(vol, size, fill_value=0):
         ] = vol
         return to_return
     else:
-        # target size is between min and max of (vol_y, vol_x, vol_z)
-        raise ValueError("Cannot crop and pad a volume at the same time.")
+        # target size is between min and max of (vol_x, vol_y, vol_z)
+        raise ValueError(
+            "Cannot crop and pad Volume at the same time."
+            "If this is really what you intended,"
+            " make two seperate calls for cropping and padding."
+        )

From 0bde3ceedf2c04e218c8b6105aa69416f61a65bd Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 1 Jul 2024 15:23:54 -0400
Subject: [PATCH 55/65] Update volume downsample with crop_pad_3d improvements

---
 src/aspire/volume/volume.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/aspire/volume/volume.py b/src/aspire/volume/volume.py
index b7e4245ede..0f01ef5e61 100644
--- a/src/aspire/volume/volume.py
+++ b/src/aspire/volume/volume.py
@@ -468,27 +468,27 @@ def downsample(self, ds_res, mask=None):
         :param ds_res: Desired resolution.
         :param mask: Optional NumPy array mask to multiply in Fourier space.
         """
-        if mask is None:
-            mask = 1.0
 
         original_stack_shape = self.stack_shape
         v = self.stack_reshape(-1)
 
         # take 3D Fourier transform of each volume in the stack
-        fx = xp.asnumpy(fft.fftshift(fft.fftn(xp.asarray(v._data), axes=(1, 2, 3))))
+        fx = fft.fftshift(fft.fftn(xp.asarray(v._data), axes=(1, 2, 3)))
+
         # crop each volume to the desired resolution in frequency space
-        crop_fx = (
-            np.array([crop_pad_3d(fx[i, :, :, :], ds_res) for i in range(self.n_vols)])
-            * mask
-        )
+        fx = crop_pad_3d(fx, ds_res)
+
+        # Optionally apply mask
+        if mask is not None:
+            fx = fx * xp.asarray(mask)
+
         # inverse Fourier transform of each volume
-        out = xp.asnumpy(
-            fft.ifftn(fft.ifftshift(xp.asarray(crop_fx)), axes=(1, 2, 3))
-            * (ds_res**3 / self.resolution**3)
-        )
+        out = fft.ifftn(fft.ifftshift(fx), axes=(1, 2, 3)).real
+        out = out.real * (ds_res**3 / self.resolution**3)
+
         # returns a new Volume object
         return self.__class__(
-            np.real(out), symmetry_group=self.symmetry_group
+            xp.asnumpy(out), symmetry_group=self.symmetry_group
         ).stack_reshape(original_stack_shape)
 
     def shift(self):

From fe3ab8d2e8f8aaa575ff7aa3cc3fd282099dbe22 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 1 Jul 2024 15:25:59 -0400
Subject: [PATCH 56/65] add docstring

---
 tests/test_numeric_sparse.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/test_numeric_sparse.py b/tests/test_numeric_sparse.py
index 5a8227fe47..e58aa02e6a 100644
--- a/tests/test_numeric_sparse.py
+++ b/tests/test_numeric_sparse.py
@@ -1,3 +1,7 @@
+"""
+Tests basic numpy/cupy functionality of sparse numeric wrappers.
+"""
+
 import numpy as np
 import pytest
 

From f80e191d7c5102c3f13881a89d7840598badfc3a Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 1 Jul 2024 16:02:58 -0400
Subject: [PATCH 57/65] enforce filter dtype

---
 src/aspire/image/image.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/aspire/image/image.py b/src/aspire/image/image.py
index ba383f49e9..0fb3b0dc0f 100644
--- a/src/aspire/image/image.py
+++ b/src/aspire/image/image.py
@@ -384,7 +384,9 @@ def filter(self, filter):
         # Note image and filter data is intentionally migrated via
         # `xp.asarray` because all of the subsequent calls until
         # `asnumpy` are GPU when xp and fft in `cupy` mode.
-        filter_values = xp.asarray(filter.evaluate_grid(self.resolution))
+        filter_values = xp.asarray(
+            filter.evaluate_grid(self.resolution), dtype=self.dtype
+        )
 
         # Convolve
         im_f = fft.centered_fft2(xp.asarray(im._data))

From a71ba260f2b09094856d881c0d8bbc08f7792640 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 1 Jul 2024 16:18:53 -0400
Subject: [PATCH 58/65] explicitly force C order before cufinufft call

---
 src/aspire/nufft/cufinufft.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/aspire/nufft/cufinufft.py b/src/aspire/nufft/cufinufft.py
index c1d15ff686..218fbd5fb7 100644
--- a/src/aspire/nufft/cufinufft.py
+++ b/src/aspire/nufft/cufinufft.py
@@ -165,7 +165,8 @@ def adjoint(self, signal):
                 " In the future this will be an error."
             )
 
-        signal = cp.asarray(signal, dtype=self.complex_dtype)
+        # Note, if not C order, cuFINUFFT will copy-cast anyway.
+        signal = cp.asarray(signal, order="C", dtype=self.complex_dtype)
 
         res_shape = self.sz
         # Note, there is a corner case for ntransforms == 1.

From 333ce7608ccb0bd41ee6443b65b0b439f2ea32d7 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 2 Jul 2024 10:56:38 -0400
Subject: [PATCH 59/65] Add dtype note and utest tolerance for singles

---
 src/aspire/image/image.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/aspire/image/image.py b/src/aspire/image/image.py
index 0fb3b0dc0f..b04f2b4b3d 100644
--- a/src/aspire/image/image.py
+++ b/src/aspire/image/image.py
@@ -384,6 +384,8 @@ def filter(self, filter):
         # Note image and filter data is intentionally migrated via
         # `xp.asarray` because all of the subsequent calls until
         # `asnumpy` are GPU when xp and fft in `cupy` mode.
+        #
+        # Second note, filter dtype may not match image dtype.
         filter_values = xp.asarray(
             filter.evaluate_grid(self.resolution), dtype=self.dtype
         )

From 9fd86f96e5114c636d7de27e7b087929cb7ffdf1 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 15 Jul 2024 10:35:56 -0400
Subject: [PATCH 60/65] configuration doc wording (strings)

---
 gallery/tutorials/configuration.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gallery/tutorials/configuration.py b/gallery/tutorials/configuration.py
index 75354bc422..372d97df06 100644
--- a/gallery/tutorials/configuration.py
+++ b/gallery/tutorials/configuration.py
@@ -110,7 +110,7 @@
 # software is most easily accomplished by installing ASPIRE with one
 # of the published GPU extensions, for example ``pip install
 # "aspire[dev,gpu_12x]"``.  Once the packages are installed users
-# should automatically find that the NUFFT calls are running on the
+# should find that the NUFFT calls are automatically running on the
 # GPU.  Additional acceleration is achieved by enabling `cupy` for
 # `numeric` and `fft` components.
 #

From 80780db57a505e5a1fd504717ab7e25c8c4ec1bf Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 15 Jul 2024 11:04:40 -0400
Subject: [PATCH 61/65] keep a few more vars as cupy

---
 src/aspire/basis/fle_2d.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/aspire/basis/fle_2d.py b/src/aspire/basis/fle_2d.py
index 631161d0fe..76330e6fba 100644
--- a/src/aspire/basis/fle_2d.py
+++ b/src/aspire/basis/fle_2d.py
@@ -544,7 +544,7 @@ def _step1_t(self, im):
         _z = nufft(im, self.grid_xy, epsilon=self.epsilon) * self.h**2
         _z = _z.reshape(num_img, self.num_radial_nodes, self.num_angular_nodes // 2)
         z[:, :, : self.num_angular_nodes // 2] = _z
-        z[:, :, self.num_angular_nodes // 2 :] = np.conj(_z)
+        z[:, :, self.num_angular_nodes // 2 :] = _z.conj()
         return z
 
     def _step2_t(self, z):
@@ -643,13 +643,13 @@ def _step1(self, z):
         num_img = z.shape[0]
         z = z[:, :, : self.num_angular_nodes // 2].reshape(num_img, -1)
         im = anufft(
-            z.astype(complex_type(self.dtype)),
+            z.astype(complex_type(self.dtype), copy=False),
             self.grid_xy,
             (self.nres, self.nres),
             epsilon=self.epsilon,
         )
-        im = im + np.conj(im)
-        im = np.real(im)
+        im = im + im.conj()
+        im = im.real
         im = im.reshape(num_img, self.nres, self.nres)
         im[:, self.radial_mask] = 0
 

From bf9e2b467c1cbaaec87a793251e79f7c7a8d7cb0 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 15 Jul 2024 15:09:41 -0400
Subject: [PATCH 62/65] replace xp.newaxis with None

---
 src/aspire/basis/ffb_3d.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/aspire/basis/ffb_3d.py b/src/aspire/basis/ffb_3d.py
index 7a2509382f..4137e572a9 100644
--- a/src/aspire/basis/ffb_3d.py
+++ b/src/aspire/basis/ffb_3d.py
@@ -141,9 +141,9 @@ def _precomp(self):
             * xp.pi
             * xp.vstack(
                 (
-                    fourier_z[xp.newaxis, ...],
-                    fourier_y[xp.newaxis, ...],
-                    fourier_x[xp.newaxis, ...],
+                    fourier_z[None, ...],
+                    fourier_y[None, ...],
+                    fourier_x[None, ...],
                 )
             )
         )

From b2834dd97b38ad21434e39176b812a883e16ed79 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 23 Jul 2024 08:40:19 -0400
Subject: [PATCH 63/65] put cache dir on new line

---
 .github/workflows/workflow.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml
index fce5a7f6d4..c41b221ec4 100644
--- a/.github/workflows/workflow.yml
+++ b/.github/workflows/workflow.yml
@@ -147,7 +147,8 @@ jobs:
         echo "Stash the WORK_DIR to GitHub env so we can clean it up later."
         echo "WORK_DIR=${WORK_DIR}" >> $GITHUB_ENV
         echo -e "ray:\n    temp_dir: ${WORK_DIR}\n" > ${WORK_DIR}/config.yaml
-        echo -e "common:\n    cache_dir: ${CI_CACHE_DIR}" >> ${WORK_DIR}/config.yaml
+        echo -e "common:" >> ${WORK_DIR}/config.yaml
+        echo -e "    cache_dir: ${CI_CACHE_DIR}" >> ${WORK_DIR}/config.yaml
         echo -e "    numeric: cupy" >> ${WORK_DIR}/config.yaml
         echo -e "    fft: cupy\n" >> ${WORK_DIR}/config.yaml
         echo "Log the config: ${WORK_DIR}/config.yaml"

From bb696e297cf90d1b666b3648890fa2ac988b6aa4 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 23 Jul 2024 08:52:57 -0400
Subject: [PATCH 64/65] rename tmp to ang_theta_wtd_trans

---
 src/aspire/basis/ffb_3d.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/aspire/basis/ffb_3d.py b/src/aspire/basis/ffb_3d.py
index 4137e572a9..7f0821b99a 100644
--- a/src/aspire/basis/ffb_3d.py
+++ b/src/aspire/basis/ffb_3d.py
@@ -308,9 +308,9 @@ def _evaluate_t(self, x):
         pf = m_reshape(pf.T, (n_theta, n_phi * n_r * n_data))
 
         # evaluate the theta parts
-        tmp = self._precomp["ang_theta_wtd"].T
-        u_even = tmp @ pf.real
-        u_odd = tmp @ pf.imag
+        ang_theta_wtd_trans = self._precomp["ang_theta_wtd"].T
+        u_even = ang_theta_wtd_trans @ pf.real
+        u_odd = ang_theta_wtd_trans @ pf.imag
 
         u_even = m_reshape(u_even, (2 * self.ell_max + 1, n_phi, n_r, n_data))
         u_odd = m_reshape(u_odd, (2 * self.ell_max + 1, n_phi, n_r, n_data))

From e22d562685ea797db3bc4557d987c854ca1e5689 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 23 Jul 2024 08:57:42 -0400
Subject: [PATCH 65/65] gpu to GPU and rm dev comment

---
 src/aspire/nufft/__init__.py   | 4 ++--
 src/aspire/numeric/cupy_fft.py | 3 ---
 src/aspire/utils/coor_trans.py | 4 ++--
 tests/test_orient_sdp.py       | 2 +-
 4 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/aspire/nufft/__init__.py b/src/aspire/nufft/__init__.py
index fcfe182918..07d92c736c 100644
--- a/src/aspire/nufft/__init__.py
+++ b/src/aspire/nufft/__init__.py
@@ -159,7 +159,7 @@ def anufft(sig_f, fourier_pts, sz, real=False, epsilon=1e-8):
 
     Selects best available package from `nfft` `backends` configuration list.
 
-    When sig_f is provided as a CuPy gpu array with a cufinufft
+    When sig_f is provided as a CuPy GPU array with a cufinufft
     backend, result is maintained on GPU.
 
     :param sig_f: Array representing the signal(s) in Fourier space to be transformed. \
@@ -211,7 +211,7 @@ def nufft(sig_f, fourier_pts, real=False, epsilon=1e-8):
 
     Selects best available package from `nfft` `backends` configuration list.
 
-    When sig_f is provided as a CuPy gpu array with a cufinufft
+    When sig_f is provided as a CuPy GPU array with a cufinufft
     backend, result is maintained on GPU.
 
     :param sig_f: Array representing the signal(s) in real space to be transformed. \
diff --git a/src/aspire/numeric/cupy_fft.py b/src/aspire/numeric/cupy_fft.py
index f67937813f..6ad6a4e9da 100644
--- a/src/aspire/numeric/cupy_fft.py
+++ b/src/aspire/numeric/cupy_fft.py
@@ -25,9 +25,6 @@ def wrapper(self, x, *args, **kwargs):
         # CuPy's single precision FFT appears to be too inaccurate for
         # many of our unit tests, so the signal is upcast and recast
         # on return.
-        # Todo, discuss with Joakim whether we want this upcasting
-        # business configurable or keep singles, both in conjunction
-        # with xfailing the tests.
         _singles = False
         if x.dtype == np.float32:
             _singles = True
diff --git a/src/aspire/utils/coor_trans.py b/src/aspire/utils/coor_trans.py
index 53f86714c8..cad8fb0295 100644
--- a/src/aspire/utils/coor_trans.py
+++ b/src/aspire/utils/coor_trans.py
@@ -372,7 +372,7 @@ def crop_pad_2d(im, size, fill_value=0):
     Crop/pads `im` according to `size`.
 
     Padding will use `fill_value`.
-    Return's host/gpu array based on `im`.
+    Return's host/GPU array based on `im`.
 
     :param im: A >=2-dimensional numpy array
     :param size: Integer size of cropped/padded output
@@ -421,7 +421,7 @@ def crop_pad_3d(vol, size, fill_value=0):
     Crop/pads `vol` according to `size`.
 
     Padding will use `fill_value`.
-    Return's host/gpu array based on `vol`.
+    Return's host/GPU array based on `vol`.
 
     :param vol: A >=3-dimensional numpy array
     :param size: Integer size of cropped/padded output
diff --git a/tests/test_orient_sdp.py b/tests/test_orient_sdp.py
index a161d2fdd7..22658ee06a 100644
--- a/tests/test_orient_sdp.py
+++ b/tests/test_orient_sdp.py
@@ -77,7 +77,7 @@ def test_estimate_rotations(src_orient_est_fixture):
     src, orient_est = src_orient_est_fixture
 
     if backend_available("cufinufft") and src.dtype == np.float32:
-        pytest.skip("CI on gpu fails for singles.")
+        pytest.skip("CI on GPU fails for singles.")
 
     orient_est.estimate_rotations()