From ca2d15d35f40d8fcb910d097efa84aee144e7e92 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 14 Aug 2024 10:53:00 -0400
Subject: [PATCH 01/45] quick and dirty cupy build_clmatrix

---
 src/aspire/abinitio/commonline_base.py | 144 +++++++++++++++++++++++--
 1 file changed, 135 insertions(+), 9 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py
index c0c3718803..6c222eafa8 100644
--- a/src/aspire/abinitio/commonline_base.py
+++ b/src/aspire/abinitio/commonline_base.py
@@ -1,10 +1,12 @@
 import logging
 import math
+from time import perf_counter
 
 import numpy as np
 import scipy.sparse as sparse
 
 from aspire.image import Image
+from aspire.numeric import fft, xp
 from aspire.operators import PolarFT
 from aspire.utils import common_line_from_rots, fuzzy_mask, tqdm
 from aspire.utils.random import choice
@@ -129,6 +131,22 @@ def estimate_rotations(self):
         raise NotImplementedError("subclasses should implement this")
 
     def build_clmatrix(self):
+        """dispatch, compare, dispair"""
+
+        tic1 = perf_counter()
+        clmatrix_orig = self.build_clmatrix_orig()
+        tic2 = perf_counter()
+        clmatrix_cupy = self.build_clmatrix_cupy()
+        tic3 = perf_counter()
+        print(f"Orig CL build: {tic2-tic1}")
+        print(f"Cupy CL build: {tic3-tic2}")
+
+        np.testing.assert_allclose(clmatrix_orig[0], clmatrix_cupy[0])
+        np.testing.assert_allclose(clmatrix_orig[1], clmatrix_cupy[1])
+
+        self.shifts_1d, self.clmatrix = clmatrix_cupy
+
+    def build_clmatrix_orig(self):
         """
         Build common-lines matrix from Fourier stack of 2D images
         """
@@ -177,10 +195,11 @@ def build_clmatrix(self):
         shifts, shift_phases, h = self._generate_shift_phase_and_filter(
             r_max, max_shift, shift_step
         )
+        shifts, shift_phases = xp.asnumpy(shifts), xp.asnumpy(shift_phases)
 
         # Apply bandpass filter, normalize each ray of each image
         # Note that only use half of each ray
-        pf = self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h)
+        pf = xp.asnumpy(self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h))
 
         # Setup a progress bar
         _total_pairs_to_test = self.n_img * (self.n_check - 1) // 2
@@ -233,8 +252,115 @@ def build_clmatrix(self):
                 pbar.update()
         pbar.close()
 
-        self.clmatrix = clmatrix
-        self.shifts_1d = shifts_1d
+        return shifts_1d, clmatrix
+
+    def build_clmatrix_cupy(self):
+        """
+        Build common-lines matrix from Fourier stack of 2D images
+        """
+
+        n_img = self.n_img
+        n_check = self.n_check
+
+        if self.n_theta % 2 == 1:
+            msg = "n_theta must be even"
+            logger.error(msg)
+            raise NotImplementedError(msg)
+
+        n_theta_half = self.n_theta // 2
+
+        # need to do a copy to prevent modifying self.pf for other functions
+        # also places on GPU when xp is in cupy mode.
+        pf = xp.array(self.pf)
+
+        # Allocate local variables for return
+        # clmatrix represents the common lines matrix.
+        # Namely, clmatrix[i,j] contains the index in image i of
+        # the common line with image j. Note the common line index
+        # starts from 0 instead of 1 as Matlab version. -1 means
+        # there is no common line such as clmatrix[i,i].
+        clmatrix = -xp.ones((n_img, n_img), dtype=self.dtype)
+        # When cl_dist[i, j] is not -1, it stores the maximum value
+        # of correlation between image i and j for all possible 1D shifts.
+        # We will use cl_dist[i, j] = -1 (including j<=i) to
+        # represent that there is no need to check common line
+        # between i and j. Since it is symmetric,
+        # only above the diagonal entries are necessary.
+        cl_dist = -xp.ones((n_img, n_img), dtype=self.dtype)
+
+        # Allocate variables used for shift estimation
+
+        # set maximum value of 1D shift (in pixels) to search
+        # between common-lines.
+        max_shift = self.max_shift
+        # Set resolution of shift estimation in pixels. Note that
+        # shift_step can be any positive real number.
+        shift_step = self.shift_step
+        # 1D shift between common-lines
+        shifts_1d = xp.zeros((n_img, n_img))
+
+        # Prepare the shift phases to try and generate filter for common-line detection
+        r_max = pf.shape[2]
+        shifts, shift_phases, h = self._generate_shift_phase_and_filter(
+            r_max, max_shift, shift_step
+        )
+
+        # Apply bandpass filter, normalize each ray of each image
+        # Note that only use half of each ray
+        pf = self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h)
+
+        # Setup a progress bar
+        _total_pairs_to_test = self.n_img * (self.n_check - 1) // 2
+        pbar = tqdm(desc="Searching over common line pairs", total=_total_pairs_to_test)
+
+        # Search for common lines between [i, j] pairs of images.
+        # Creating pf and building common lines are different to the Matlab version.
+        # The random selection is implemented.
+        for i in range(n_img - 1):
+            p1 = pf[i]
+            p1_real = xp.real(p1)
+            p1_imag = xp.imag(p1)
+
+            # build the subset of j images if n_check < n_img
+            n_remaining = n_img - i - 1
+            n_j = min(n_remaining, n_check)
+            subset_j = np.sort(choice(n_remaining, n_j, replace=False) + i + 1)
+
+            for j in subset_j:
+                p2_flipped = xp.conj(pf[j])
+
+                for shift in range(len(shifts)):
+                    shift_phase = shift_phases[shift]
+                    p2_shifted_flipped = (shift_phase * p2_flipped).T
+                    # Compute correlations in the positive r direction
+                    part1 = p1_real.dot(p2_shifted_flipped.real)
+                    # Compute correlations in the negative r direction
+                    part2 = p1_imag.dot(p2_shifted_flipped.imag)
+
+                    c1 = part1 - part2
+                    sidx = c1.argmax()
+                    cl1, cl2 = np.unravel_index(sidx, c1.shape)
+                    sval = c1[cl1, cl2]
+
+                    c2 = part1 + part2
+                    sidx = c2.argmax()
+                    cl1_2, cl2_2 = np.unravel_index(sidx, c2.shape)
+                    sval2 = c2[cl1_2, cl2_2]
+
+                    if sval2 > sval:
+                        cl1 = cl1_2
+                        cl2 = cl2_2 + n_theta_half
+                        sval = sval2
+                    sval = 2 * sval
+                    if sval > cl_dist[i, j]:
+                        clmatrix[i, j] = cl1
+                        clmatrix[j, i] = cl2
+                        cl_dist[i, j] = sval
+                        shifts_1d[i, j] = shifts[shift]
+                pbar.update()
+        pbar.close()
+
+        return xp.asnumpy(shifts_1d), xp.asnumpy(clmatrix)
 
     def estimate_shifts(self, equations_factor=1, max_memory=4000):
         """
@@ -488,13 +614,13 @@ def _generate_shift_phase_and_filter(self, r_max, max_shift, shift_step):
         n_shifts = int(np.ceil(2 * max_shift / shift_step + 1))
 
         # only half of ray, excluding the DC component.
-        rk = np.arange(1, r_max + 1)
+        rk = xp.arange(1, r_max + 1)
 
         # Generate all shift phases
-        shifts = -max_shift + shift_step * np.arange(n_shifts)
-        shift_phases = np.exp(np.outer(shifts, -2 * np.pi * 1j * rk / (2 * r_max + 1)))
+        shifts = -max_shift + shift_step * xp.arange(n_shifts)
+        shift_phases = xp.exp(xp.outer(shifts, -2 * xp.pi * 1j * rk / (2 * r_max + 1)))
         # Set filter for common-line detection
-        h = np.sqrt(np.abs(rk)) * np.exp(-np.square(rk) / (2 * (r_max / 4) ** 2))
+        h = xp.sqrt(xp.abs(rk)) * xp.exp(-xp.square(rk) / (2 * (r_max / 4) ** 2))
 
         return shifts, shift_phases, h
 
@@ -556,11 +682,11 @@ def _apply_filter_and_norm(self, subscripts, pf, r_max, h):
 
         # Note if we'd rather not have the dtype and casting args,
         #   we can control h.dtype instead.
-        np.einsum(subscripts, pf, h, out=pf, dtype=pf.dtype, casting="same_kind")
+        xp.einsum(subscripts, pf, h, out=pf, dtype=pf.dtype, casting="same_kind")
 
         # This is a high pass filter, cutting out the lowest frequency
         # (DC has already been removed).
         pf[..., 0] = 0
-        pf /= np.linalg.norm(pf, axis=-1)[..., np.newaxis]
+        pf /= xp.linalg.norm(pf, axis=-1)[..., xp.newaxis]
 
         return pf

From 99e0bf281fadb442a1338caebfd6b2b26552b9a7 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 15 Aug 2024 14:04:28 -0400
Subject: [PATCH 02/45] stashing

---
 src/aspire/abinitio/commonline_base.cu |  87 ++++++++++++++
 src/aspire/abinitio/commonline_base.py | 159 +++++++++++++++----------
 src/aspire/config_default.yaml         |   4 +-
 3 files changed, 182 insertions(+), 68 deletions(-)
 create mode 100644 src/aspire/abinitio/commonline_base.cu

diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu
new file mode 100644
index 0000000000..78d39586fb
--- /dev/null
+++ b/src/aspire/abinitio/commonline_base.cu
@@ -0,0 +1,87 @@
+
+extern "C" __global__
+void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, double* cl_dist, double* shifts_1d, int n_shifts, int* shifts,  double* shift_phases)
+{
+  /* n n_img */
+  /* m,r st (n, m, r) = pf.shape, ie len(pf[i])  */
+     
+  /* thread index (1d), represents "i" index */
+  unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
+
+  /* no-op when out of bounds */
+  if(i >= n) return;
+  if(j >= n) return;
+  /* no-op lower triangle */
+  if(j <= i) return;
+
+  int ind;
+  int k;
+  int s;
+  int cl1, cl2;
+  int best_cl1, best_cl2, best_s;
+  double dist, best_cl_dist;
+  double p1,p2;
+
+  double p1_realk, p1_imagk;
+  double p2conj_realk, p2conj_imagk;
+  double p2_realk, p2_imagk;  
+  double shift_phases_real, shift_phases_imag;
+
+  
+  best_s = -1;
+  best_cl1 = -1;
+  best_cl2 = -1;
+  best_cl_dist = -1/0;
+
+  for(s=0; s<n_shifts; s++){
+    for(cl1=0; cl1<m; cl1++){
+      for(cl2=0; cl2<m; cl2++){
+        p1 = 0;
+        p2 = 0;
+        /* inner most dim of dot (matmul) */
+        for(k=0; k<r; k++){
+          /* factor */
+          p1_realk = pf[2*(i*m*r + cl1*r + k)];
+          p1_imagk = pf[2*(i*m*r + cl1*r + k) + 1];
+          p2conj_realk =  pf[2*(j*m*r + cl2*r + k)];
+          p2conj_imagk = -pf[2*(j*m*r + cl2*r + k) + 1];
+          shift_phases_real = shift_phases[2*(s*r + k)];
+          shift_phases_imag = shift_phases[2*(s*r + k) +1];
+          p2_realk = (p2conj_realk * shift_phases_real) - (p2conj_imagk*shift_phases_imag);
+          p2_imagk = (p2conj_imagk * shift_phases_real) + (p2conj_realk*shift_phases_imag);
+          p1 += p1_realk * p2_realk;
+          p2 += p1_imagk * p2_imagk;
+
+          // p1 += pf[2*(i*m*r + cl1*r + k)]     *  pf[2*(j*m*r + cl2*r + k)]     * shift_phases[2*(s*r + k)];       
+          // p2 += pf[2*(i*m*r + cl1*r + k) + 1] * -pf[2*(j*m*r + cl2*r + k) + 1] * shift_phases[2*(s*r + k) +1];
+        }
+
+        dist = p1 - p2;
+        if(dist > best_cl_dist){
+          best_cl_dist = dist;
+          best_cl1 = cl1;
+          best_cl2 = cl2;
+          best_s = s;
+        }
+
+        dist = p1 + p2;
+        if(dist > best_cl_dist){
+          best_cl_dist = dist;
+          best_cl1 = cl1;
+          best_cl2 = cl2 + m; // m is pf.shape[1], which should be n_theta//2...
+          best_s = s;
+        }
+
+      } /* cl2 */
+    }/* cl1 */
+  } /* s */
+
+  /* update global best for i, j*/
+  ind = i*n + j;
+  clmatrix[ind] = best_cl1;
+  clmatrix[j*n+i] = best_cl2;  /* [j,i] */
+  cl_dist[ind] = 2*best_cl_dist;  // 2 of mystery
+  shifts_1d[ind] = shifts[best_s];
+  
+}  
diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py
index 6c222eafa8..5da7bbf04a 100644
--- a/src/aspire/abinitio/commonline_base.py
+++ b/src/aspire/abinitio/commonline_base.py
@@ -1,5 +1,6 @@
 import logging
 import math
+import os
 from time import perf_counter
 
 import numpy as np
@@ -70,6 +71,23 @@ def __init__(
         self.rotations = None
         self._pf = None
 
+        # Auto configure GPU
+        self.__gpu_module = None
+        try:
+            import cupy as cp
+
+            if cp.cuda.runtime.getDeviceCount() >= 1:
+                gpu_id = cp.cuda.runtime.getDevice()
+                logger.info(
+                    f"cupy and GPU {gpu_id} found by cuda runtime; enabling cupy."
+                )
+                self.__gpu_module = self.__init_cupy_module()
+            else:
+                logger.info("GPU not found, defaulting to numpy.")
+
+        except ModuleNotFoundError:
+            logger.info("cupy not found, defaulting to numpy.")
+
         self._build()
 
     def _build(self):
@@ -133,18 +151,22 @@ def estimate_rotations(self):
     def build_clmatrix(self):
         """dispatch, compare, dispair"""
 
+        logger.info("Begin building Common Lines Matrix")
+
         tic1 = perf_counter()
-        clmatrix_orig = self.build_clmatrix_orig()
+        clmatrix_cu = self.build_clmatrix_cu()
         tic2 = perf_counter()
-        clmatrix_cupy = self.build_clmatrix_cupy()
+        print(f"Cuda CL build: {tic2-tic1}")
+
         tic3 = perf_counter()
-        print(f"Orig CL build: {tic2-tic1}")
-        print(f"Cupy CL build: {tic3-tic2}")
+        clmatrix_orig = self.build_clmatrix_orig()
+        tic4 = perf_counter()
+        print(f"Orig CL build: {tic4-tic3}")
 
-        np.testing.assert_allclose(clmatrix_orig[0], clmatrix_cupy[0])
-        np.testing.assert_allclose(clmatrix_orig[1], clmatrix_cupy[1])
+        # np.testing.assert_allclose(clmatrix_cu[1], clmatrix_orig[1])
+        # np.testing.assert_allclose( clmatrix_cu[0], clmatrix_orig[0])
 
-        self.shifts_1d, self.clmatrix = clmatrix_cupy
+        self.shifts_1d, self.clmatrix = clmatrix_cu
 
     def build_clmatrix_orig(self):
         """
@@ -203,7 +225,9 @@ def build_clmatrix_orig(self):
 
         # Setup a progress bar
         _total_pairs_to_test = self.n_img * (self.n_check - 1) // 2
-        pbar = tqdm(desc="Searching over common line pairs", total=_total_pairs_to_test)
+        pbar = tqdm(
+            desc="Searching over common line pairs (orig)", total=_total_pairs_to_test
+        )
 
         # Search for common lines between [i, j] pairs of images.
         # Creating pf and building common lines are different to the Matlab version.
@@ -219,6 +243,7 @@ def build_clmatrix_orig(self):
             subset_j = np.sort(choice(n_remaining, n_j, replace=False) + i + 1)
 
             for j in subset_j:
+                # for j in range(i+1,n_img):
                 p2_flipped = np.conj(pf[j])
 
                 for shift in range(len(shifts)):
@@ -254,11 +279,13 @@ def build_clmatrix_orig(self):
 
         return shifts_1d, clmatrix
 
-    def build_clmatrix_cupy(self):
+    def build_clmatrix_cu(self):
         """
         Build common-lines matrix from Fourier stack of 2D images
         """
 
+        import cupy as cp
+
         n_img = self.n_img
         n_check = self.n_check
 
@@ -267,11 +294,12 @@ def build_clmatrix_cupy(self):
             logger.error(msg)
             raise NotImplementedError(msg)
 
-        n_theta_half = self.n_theta // 2
+        n_theta_half = self.n_theta // 2  # rm
 
         # need to do a copy to prevent modifying self.pf for other functions
-        # also places on GPU when xp is in cupy mode.
-        pf = xp.array(self.pf)
+        # also places on GPU
+        pf = cp.array(self.pf)
+        assert pf.shape[1] == n_theta_half
 
         # Allocate local variables for return
         # clmatrix represents the common lines matrix.
@@ -279,14 +307,14 @@ def build_clmatrix_cupy(self):
         # the common line with image j. Note the common line index
         # starts from 0 instead of 1 as Matlab version. -1 means
         # there is no common line such as clmatrix[i,i].
-        clmatrix = -xp.ones((n_img, n_img), dtype=self.dtype)
+        clmatrix = -cp.ones((n_img, n_img), dtype=np.float64)
         # When cl_dist[i, j] is not -1, it stores the maximum value
         # of correlation between image i and j for all possible 1D shifts.
         # We will use cl_dist[i, j] = -1 (including j<=i) to
         # represent that there is no need to check common line
         # between i and j. Since it is symmetric,
         # only above the diagonal entries are necessary.
-        cl_dist = -xp.ones((n_img, n_img), dtype=self.dtype)
+        cl_dist = -cp.ones((n_img, n_img), dtype=np.float64)
 
         # Allocate variables used for shift estimation
 
@@ -297,7 +325,7 @@ def build_clmatrix_cupy(self):
         # shift_step can be any positive real number.
         shift_step = self.shift_step
         # 1D shift between common-lines
-        shifts_1d = xp.zeros((n_img, n_img))
+        shifts_1d = cp.zeros((n_img, n_img), dtype=np.float64)  # check dtype
 
         # Prepare the shift phases to try and generate filter for common-line detection
         r_max = pf.shape[2]
@@ -309,58 +337,40 @@ def build_clmatrix_cupy(self):
         # Note that only use half of each ray
         pf = self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h)
 
-        # Setup a progress bar
-        _total_pairs_to_test = self.n_img * (self.n_check - 1) // 2
-        pbar = tqdm(desc="Searching over common line pairs", total=_total_pairs_to_test)
-
-        # Search for common lines between [i, j] pairs of images.
-        # Creating pf and building common lines are different to the Matlab version.
-        # The random selection is implemented.
-        for i in range(n_img - 1):
-            p1 = pf[i]
-            p1_real = xp.real(p1)
-            p1_imag = xp.imag(p1)
-
-            # build the subset of j images if n_check < n_img
-            n_remaining = n_img - i - 1
-            n_j = min(n_remaining, n_check)
-            subset_j = np.sort(choice(n_remaining, n_j, replace=False) + i + 1)
-
-            for j in subset_j:
-                p2_flipped = xp.conj(pf[j])
-
-                for shift in range(len(shifts)):
-                    shift_phase = shift_phases[shift]
-                    p2_shifted_flipped = (shift_phase * p2_flipped).T
-                    # Compute correlations in the positive r direction
-                    part1 = p1_real.dot(p2_shifted_flipped.real)
-                    # Compute correlations in the negative r direction
-                    part2 = p1_imag.dot(p2_shifted_flipped.imag)
-
-                    c1 = part1 - part2
-                    sidx = c1.argmax()
-                    cl1, cl2 = np.unravel_index(sidx, c1.shape)
-                    sval = c1[cl1, cl2]
-
-                    c2 = part1 + part2
-                    sidx = c2.argmax()
-                    cl1_2, cl2_2 = np.unravel_index(sidx, c2.shape)
-                    sval2 = c2[cl1_2, cl2_2]
+        # Get kernel
+        build_clmatrix_kernel = self.__gpu_module.get_function("build_clmatrix_kernel")
+
+        # Configure grid of blocks
+        blkszx = 32
+        nblkx = (self.n_img + blkszx - 2) // blkszx
+        blkszy = 32
+        nblky = (self.n_img + blkszy - 1) // blkszy
+
+        # Launch
+        logger.info("Launching `build_clmatrix_kernel`.")
+        build_clmatrix_kernel(
+            (nblkx, nblky),
+            (blkszx, blkszy),
+            (
+                pf.shape[0],
+                pf.shape[1],
+                pf.shape[2],
+                pf.astype(np.complex128, copy=False).view(np.float64),
+                clmatrix,
+                cl_dist,
+                shifts_1d,
+                len(shifts),
+                shifts,
+                shift_phases.astype(np.complex128, copy=False).view(np.float64),
+            ),
+        )
 
-                    if sval2 > sval:
-                        cl1 = cl1_2
-                        cl2 = cl2_2 + n_theta_half
-                        sval = sval2
-                    sval = 2 * sval
-                    if sval > cl_dist[i, j]:
-                        clmatrix[i, j] = cl1
-                        clmatrix[j, i] = cl2
-                        cl_dist[i, j] = sval
-                        shifts_1d[i, j] = shifts[shift]
-                pbar.update()
-        pbar.close()
+        # Copy result device arrays to host
+        shifts_1d = shifts_1d.get().astype(self.dtype, copy=False)
+        clmatrix = clmatrix.get().astype(self.dtype, copy=False)
+        # cl_dist = cl_dist.get().astype(self.dtype, copy=False)
 
-        return xp.asnumpy(shifts_1d), xp.asnumpy(clmatrix)
+        return shifts_1d, clmatrix
 
     def estimate_shifts(self, equations_factor=1, max_memory=4000):
         """
@@ -682,7 +692,7 @@ def _apply_filter_and_norm(self, subscripts, pf, r_max, h):
 
         # Note if we'd rather not have the dtype and casting args,
         #   we can control h.dtype instead.
-        xp.einsum(subscripts, pf, h, out=pf, dtype=pf.dtype, casting="same_kind")
+        pf = xp.einsum(subscripts, pf, h, dtype=pf.dtype)  # casting="same_kind"
 
         # This is a high pass filter, cutting out the lowest frequency
         # (DC has already been removed).
@@ -690,3 +700,20 @@ def _apply_filter_and_norm(self, subscripts, pf, r_max, h):
         pf /= xp.linalg.norm(pf, axis=-1)[..., xp.newaxis]
 
         return pf
+
+    @staticmethod
+    def __init_cupy_module():
+        """
+        Private utility method to read in CUDA source and return as
+        compiled CUPY module.
+        """
+
+        import cupy as cp
+
+        # Read in contents of file
+        fp = os.path.join(os.path.dirname(__file__), "commonline_base.cu")
+        with open(fp, "r") as fh:
+            module_code = fh.read()
+
+        # CUPY compile the CUDA code
+        return cp.RawModule(code=module_code)
diff --git a/src/aspire/config_default.yaml b/src/aspire/config_default.yaml
index 9af8732dd1..6e32ead3d2 100644
--- a/src/aspire/config_default.yaml
+++ b/src/aspire/config_default.yaml
@@ -1,9 +1,9 @@
 version: 0.13.0
 common:
     # numeric module to use - one of numpy/cupy
-    numeric: numpy
+    numeric: cupy
     # fft backend to use - one of pyfftw/scipy/cupy/mkl
-    fft: scipy
+    fft: cupy
 
     # Set cache directory for ASPIRE example data.
     # By default the cache location will be set by pooch.os_cache(),

From 80d9acdd86c5fd47ccd236f19020f0dae4e5ebe6 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 15 Aug 2024 16:13:36 -0400
Subject: [PATCH 03/45] stashing 2, clmatrix match, good speedup

[skip ci]
---
 src/aspire/abinitio/commonline_base.cu | 12 +++++-------
 src/aspire/abinitio/commonline_base.py |  9 ++++-----
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu
index 78d39586fb..8dabbb4533 100644
--- a/src/aspire/abinitio/commonline_base.cu
+++ b/src/aspire/abinitio/commonline_base.cu
@@ -21,7 +21,7 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do
   int cl1, cl2;
   int best_cl1, best_cl2, best_s;
   double dist, best_cl_dist;
-  double p1,p2;
+  double p1, p2;
 
   double p1_realk, p1_imagk;
   double p2conj_realk, p2conj_imagk;
@@ -29,14 +29,14 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do
   double shift_phases_real, shift_phases_imag;
 
   
-  best_s = -1;
+  best_s = -99999;
   best_cl1 = -1;
   best_cl2 = -1;
   best_cl_dist = -1/0;
 
-  for(s=0; s<n_shifts; s++){
-    for(cl1=0; cl1<m; cl1++){
-      for(cl2=0; cl2<m; cl2++){
+  for(cl1=0; cl1<m; cl1++){
+    for(cl2=0; cl2<m; cl2++){
+      for(s=0; s<n_shifts; s++){
         p1 = 0;
         p2 = 0;
         /* inner most dim of dot (matmul) */
@@ -53,8 +53,6 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do
           p1 += p1_realk * p2_realk;
           p2 += p1_imagk * p2_imagk;
 
-          // p1 += pf[2*(i*m*r + cl1*r + k)]     *  pf[2*(j*m*r + cl2*r + k)]     * shift_phases[2*(s*r + k)];       
-          // p2 += pf[2*(i*m*r + cl1*r + k) + 1] * -pf[2*(j*m*r + cl2*r + k) + 1] * shift_phases[2*(s*r + k) +1];
         }
 
         dist = p1 - p2;
diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py
index 5da7bbf04a..9e65db096e 100644
--- a/src/aspire/abinitio/commonline_base.py
+++ b/src/aspire/abinitio/commonline_base.py
@@ -163,7 +163,7 @@ def build_clmatrix(self):
         tic4 = perf_counter()
         print(f"Orig CL build: {tic4-tic3}")
 
-        # np.testing.assert_allclose(clmatrix_cu[1], clmatrix_orig[1])
+        np.testing.assert_allclose(clmatrix_cu[1], clmatrix_orig[1])
         # np.testing.assert_allclose( clmatrix_cu[0], clmatrix_orig[0])
 
         self.shifts_1d, self.clmatrix = clmatrix_cu
@@ -243,7 +243,7 @@ def build_clmatrix_orig(self):
             subset_j = np.sort(choice(n_remaining, n_j, replace=False) + i + 1)
 
             for j in subset_j:
-                # for j in range(i+1,n_img):
+                # for j in range(i+1,n_img):  # for testing, rm later
                 p2_flipped = np.conj(pf[j])
 
                 for shift in range(len(shifts)):
@@ -299,7 +299,6 @@ def build_clmatrix_cu(self):
         # need to do a copy to prevent modifying self.pf for other functions
         # also places on GPU
         pf = cp.array(self.pf)
-        assert pf.shape[1] == n_theta_half
 
         # Allocate local variables for return
         # clmatrix represents the common lines matrix.
@@ -342,7 +341,7 @@ def build_clmatrix_cu(self):
 
         # Configure grid of blocks
         blkszx = 32
-        nblkx = (self.n_img + blkszx - 2) // blkszx
+        nblkx = (self.n_img + blkszx - 2) // blkszx  # n_img-1
         blkszy = 32
         nblky = (self.n_img + blkszy - 1) // blkszy
 
@@ -368,7 +367,7 @@ def build_clmatrix_cu(self):
         # Copy result device arrays to host
         shifts_1d = shifts_1d.get().astype(self.dtype, copy=False)
         clmatrix = clmatrix.get().astype(self.dtype, copy=False)
-        # cl_dist = cl_dist.get().astype(self.dtype, copy=False)
+        cl_dist = cl_dist.get().astype(self.dtype, copy=False)  # diagnostic
 
         return shifts_1d, clmatrix
 

From 0ec15b4d436c8345c981b4a98b20ab84a451d76d Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 19 Aug 2024 10:04:42 -0400
Subject: [PATCH 04/45] cleanup

---
 src/aspire/abinitio/commonline_base.cu | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu
index 8dabbb4533..7673b68af0 100644
--- a/src/aspire/abinitio/commonline_base.cu
+++ b/src/aspire/abinitio/commonline_base.cu
@@ -4,7 +4,7 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do
 {
   /* n n_img */
   /* m,r st (n, m, r) = pf.shape, ie len(pf[i])  */
-     
+
   /* thread index (1d), represents "i" index */
   unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
   unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
@@ -25,10 +25,10 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do
 
   double p1_realk, p1_imagk;
   double p2conj_realk, p2conj_imagk;
-  double p2_realk, p2_imagk;  
+  double p2_realk, p2_imagk;
   double shift_phases_real, shift_phases_imag;
 
-  
+
   best_s = -99999;
   best_cl1 = -1;
   best_cl2 = -1;
@@ -71,9 +71,10 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do
           best_s = s;
         }
 
-      } /* cl2 */
-    }/* cl1 */
-  } /* s */
+      } /* s */
+    } /* cl2 */
+  }/* cl1 */
+
 
   /* update global best for i, j*/
   ind = i*n + j;
@@ -81,5 +82,5 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do
   clmatrix[j*n+i] = best_cl2;  /* [j,i] */
   cl_dist[ind] = 2*best_cl_dist;  // 2 of mystery
   shifts_1d[ind] = shifts[best_s];
-  
-}  
+
+}

From 6cb90d1205e5617708ef1634fe54e049e5f202bb Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 19 Aug 2024 10:52:58 -0400
Subject: [PATCH 05/45] implement transpose PF for better mem patterns

---
 src/aspire/abinitio/commonline_base.cu | 8 ++++----
 src/aspire/abinitio/commonline_base.py | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu
index 7673b68af0..670eb5fa39 100644
--- a/src/aspire/abinitio/commonline_base.cu
+++ b/src/aspire/abinitio/commonline_base.cu
@@ -42,10 +42,10 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do
         /* inner most dim of dot (matmul) */
         for(k=0; k<r; k++){
           /* factor */
-          p1_realk = pf[2*(i*m*r + cl1*r + k)];
-          p1_imagk = pf[2*(i*m*r + cl1*r + k) + 1];
-          p2conj_realk =  pf[2*(j*m*r + cl2*r + k)];
-          p2conj_imagk = -pf[2*(j*m*r + cl2*r + k) + 1];
+          p1_realk = pf[2*(k*m*n + cl1*n + i)];
+          p1_imagk = pf[2*(k*m*n + cl1*n + i) + 1];
+          p2conj_realk =  pf[2*(k*m*n + cl2*n + j)];
+          p2conj_imagk = -pf[2*(k*m*n + cl2*n + j) + 1];
           shift_phases_real = shift_phases[2*(s*r + k)];
           shift_phases_imag = shift_phases[2*(s*r + k) +1];
           p2_realk = (p2conj_realk * shift_phases_real) - (p2conj_imagk*shift_phases_imag);
diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py
index 9e65db096e..1775d6d9d4 100644
--- a/src/aspire/abinitio/commonline_base.py
+++ b/src/aspire/abinitio/commonline_base.py
@@ -354,7 +354,7 @@ def build_clmatrix_cu(self):
                 pf.shape[0],
                 pf.shape[1],
                 pf.shape[2],
-                pf.astype(np.complex128, copy=False).view(np.float64),
+                cp.ascontiguousarray(pf.T, dtype=np.complex128).view(np.float64),
                 clmatrix,
                 cl_dist,
                 shifts_1d,

From 9137b6c92362e1657adf81615c707ea091088541 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 19 Aug 2024 15:06:36 -0400
Subject: [PATCH 06/45] use cu complex for CL kernel

---
 src/aspire/abinitio/commonline_base.cu | 26 +++++++-------------------
 src/aspire/abinitio/commonline_base.py | 12 ++++++------
 2 files changed, 13 insertions(+), 25 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu
index 670eb5fa39..2d159b87a6 100644
--- a/src/aspire/abinitio/commonline_base.cu
+++ b/src/aspire/abinitio/commonline_base.cu
@@ -1,6 +1,7 @@
+#include <cupy/complex.cuh>
 
 extern "C" __global__
-void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, double* cl_dist, double* shifts_1d, int n_shifts, int* shifts,  double* shift_phases)
+void build_clmatrix_kernel(int n, int m, int r, const complex<double>* __restrict__ pf, double* __restrict__ clmatrix, double* __restrict__  cl_dist, double* __restrict__ shifts_1d, int n_shifts, int* __restrict__ shifts,  const complex<double>* __restrict__ shift_phases)
 {
   /* n n_img */
   /* m,r st (n, m, r) = pf.shape, ie len(pf[i])  */
@@ -22,12 +23,7 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do
   int best_cl1, best_cl2, best_s;
   double dist, best_cl_dist;
   double p1, p2;
-
-  double p1_realk, p1_imagk;
-  double p2conj_realk, p2conj_imagk;
-  double p2_realk, p2_imagk;
-  double shift_phases_real, shift_phases_imag;
-
+  complex<double> pfik, pfjk;
 
   best_s = -99999;
   best_cl1 = -1;
@@ -41,18 +37,10 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do
         p2 = 0;
         /* inner most dim of dot (matmul) */
         for(k=0; k<r; k++){
-          /* factor */
-          p1_realk = pf[2*(k*m*n + cl1*n + i)];
-          p1_imagk = pf[2*(k*m*n + cl1*n + i) + 1];
-          p2conj_realk =  pf[2*(k*m*n + cl2*n + j)];
-          p2conj_imagk = -pf[2*(k*m*n + cl2*n + j) + 1];
-          shift_phases_real = shift_phases[2*(s*r + k)];
-          shift_phases_imag = shift_phases[2*(s*r + k) +1];
-          p2_realk = (p2conj_realk * shift_phases_real) - (p2conj_imagk*shift_phases_imag);
-          p2_imagk = (p2conj_imagk * shift_phases_real) + (p2conj_realk*shift_phases_imag);
-          p1 += p1_realk * p2_realk;
-          p2 += p1_imagk * p2_imagk;
-
+          pfik = pf[k*m*n + cl1*n + i];
+          pfjk = conj(pf[k*m*n + cl2*n + j]) * shift_phases[s*r + k];
+          p1 += real(pfik) * real(pfjk);
+          p2 += imag(pfik) * imag(pfjk);
         }
 
         dist = p1 - p2;
diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py
index 1775d6d9d4..cf749e0f1f 100644
--- a/src/aspire/abinitio/commonline_base.py
+++ b/src/aspire/abinitio/commonline_base.py
@@ -294,8 +294,6 @@ def build_clmatrix_cu(self):
             logger.error(msg)
             raise NotImplementedError(msg)
 
-        n_theta_half = self.n_theta // 2  # rm
-
         # need to do a copy to prevent modifying self.pf for other functions
         # also places on GPU
         pf = cp.array(self.pf)
@@ -335,6 +333,8 @@ def build_clmatrix_cu(self):
         # Apply bandpass filter, normalize each ray of each image
         # Note that only use half of each ray
         pf = self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h)
+        # Tranpose `pf` for better (CUDA) memory access pattern, and cast as needed.
+        pf = cp.ascontiguousarray(pf.T, dtype=np.complex128)
 
         # Get kernel
         build_clmatrix_kernel = self.__gpu_module.get_function("build_clmatrix_kernel")
@@ -351,16 +351,16 @@ def build_clmatrix_cu(self):
             (nblkx, nblky),
             (blkszx, blkszy),
             (
-                pf.shape[0],
+                n_img,
                 pf.shape[1],
-                pf.shape[2],
-                cp.ascontiguousarray(pf.T, dtype=np.complex128).view(np.float64),
+                r_max,
+                pf,
                 clmatrix,
                 cl_dist,
                 shifts_1d,
                 len(shifts),
                 shifts,
-                shift_phases.astype(np.complex128, copy=False).view(np.float64),
+                shift_phases.astype(np.complex128, copy=False),
             ),
         )
 

From 6a4689a174201a1360ebead1085e6baca0cfbac1 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 20 Aug 2024 12:49:44 -0400
Subject: [PATCH 07/45] fixed shifts, dtype

---
 src/aspire/abinitio/commonline_base.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu
index 2d159b87a6..a6bad78aea 100644
--- a/src/aspire/abinitio/commonline_base.cu
+++ b/src/aspire/abinitio/commonline_base.cu
@@ -1,7 +1,7 @@
 #include <cupy/complex.cuh>
 
 extern "C" __global__
-void build_clmatrix_kernel(int n, int m, int r, const complex<double>* __restrict__ pf, double* __restrict__ clmatrix, double* __restrict__  cl_dist, double* __restrict__ shifts_1d, int n_shifts, int* __restrict__ shifts,  const complex<double>* __restrict__ shift_phases)
+void build_clmatrix_kernel(int n, int m, int r, const complex<double>* __restrict__ pf, double* __restrict__ clmatrix, double* __restrict__  cl_dist, double* __restrict__ shifts_1d, int n_shifts, double* __restrict__ shifts,  const complex<double>* __restrict__ shift_phases)
 {
   /* n n_img */
   /* m,r st (n, m, r) = pf.shape, ie len(pf[i])  */

From 6a0e2769a867d8ac0bfc295fb04541dc9a18ebf6 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 20 Aug 2024 13:57:26 -0400
Subject: [PATCH 08/45] revert some unused optimizations, fix minor
 casting/dtype issue, begin cleanup

---
 src/aspire/abinitio/commonline_base.py | 42 ++++++++++++--------------
 1 file changed, 19 insertions(+), 23 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py
index cf749e0f1f..2fbf12d8ea 100644
--- a/src/aspire/abinitio/commonline_base.py
+++ b/src/aspire/abinitio/commonline_base.py
@@ -153,22 +153,16 @@ def build_clmatrix(self):
 
         logger.info("Begin building Common Lines Matrix")
 
-        tic1 = perf_counter()
-        clmatrix_cu = self.build_clmatrix_cu()
-        tic2 = perf_counter()
-        print(f"Cuda CL build: {tic2-tic1}")
-
-        tic3 = perf_counter()
-        clmatrix_orig = self.build_clmatrix_orig()
-        tic4 = perf_counter()
-        print(f"Orig CL build: {tic4-tic3}")
-
-        np.testing.assert_allclose(clmatrix_cu[1], clmatrix_orig[1])
-        # np.testing.assert_allclose( clmatrix_cu[0], clmatrix_orig[0])
+        # host/gpu dispatch
+        if self.__gpu_module:
+            res = self.build_clmatrix_cu()
+        else:
+            res = self.build_clmatrix_host()
 
-        self.shifts_1d, self.clmatrix = clmatrix_cu
+        # Unpack result
+        self.shifts_1d, self.clmatrix = res
 
-    def build_clmatrix_orig(self):
+    def build_clmatrix_host(self):
         """
         Build common-lines matrix from Fourier stack of 2D images
         """
@@ -217,11 +211,10 @@ def build_clmatrix_orig(self):
         shifts, shift_phases, h = self._generate_shift_phase_and_filter(
             r_max, max_shift, shift_step
         )
-        shifts, shift_phases = xp.asnumpy(shifts), xp.asnumpy(shift_phases)
 
         # Apply bandpass filter, normalize each ray of each image
         # Note that only use half of each ray
-        pf = xp.asnumpy(self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h))
+        pf = self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h)
 
         # Setup a progress bar
         _total_pairs_to_test = self.n_img * (self.n_check - 1) // 2
@@ -329,6 +322,9 @@ def build_clmatrix_cu(self):
         shifts, shift_phases, h = self._generate_shift_phase_and_filter(
             r_max, max_shift, shift_step
         )
+        # Transfer to device, dtypes must match kernel header.
+        shifts = cp.asarray(shifts, dtype=np.float64)
+        shift_phases = cp.asarray(shift_phases, dtype=np.complex128)
 
         # Apply bandpass filter, normalize each ray of each image
         # Note that only use half of each ray
@@ -360,7 +356,7 @@ def build_clmatrix_cu(self):
                 shifts_1d,
                 len(shifts),
                 shifts,
-                shift_phases.astype(np.complex128, copy=False),
+                shift_phases,
             ),
         )
 
@@ -623,13 +619,13 @@ def _generate_shift_phase_and_filter(self, r_max, max_shift, shift_step):
         n_shifts = int(np.ceil(2 * max_shift / shift_step + 1))
 
         # only half of ray, excluding the DC component.
-        rk = xp.arange(1, r_max + 1)
+        rk = np.arange(1, r_max + 1, dtype=self.dtype)
 
         # Generate all shift phases
-        shifts = -max_shift + shift_step * xp.arange(n_shifts)
-        shift_phases = xp.exp(xp.outer(shifts, -2 * xp.pi * 1j * rk / (2 * r_max + 1)))
+        shifts = -max_shift + shift_step * np.arange(n_shifts, dtype=self.dtype)
+        shift_phases = np.exp(np.outer(shifts, -2 * np.pi * 1j * rk / (2 * r_max + 1)))
         # Set filter for common-line detection
-        h = xp.sqrt(xp.abs(rk)) * xp.exp(-xp.square(rk) / (2 * (r_max / 4) ** 2))
+        h = np.sqrt(np.abs(rk)) * np.exp(-np.square(rk) / (2 * (r_max / 4) ** 2))
 
         return shifts, shift_phases, h
 
@@ -691,12 +687,12 @@ def _apply_filter_and_norm(self, subscripts, pf, r_max, h):
 
         # Note if we'd rather not have the dtype and casting args,
         #   we can control h.dtype instead.
-        pf = xp.einsum(subscripts, pf, h, dtype=pf.dtype)  # casting="same_kind"
+        pf = np.einsum(subscripts, pf, h, dtype=pf.dtype)
 
         # This is a high pass filter, cutting out the lowest frequency
         # (DC has already been removed).
         pf[..., 0] = 0
-        pf /= xp.linalg.norm(pf, axis=-1)[..., xp.newaxis]
+        pf /= np.linalg.norm(pf, axis=-1)[..., np.newaxis]
 
         return pf
 

From 0930e69790ff377dc8c46d96c895f665b6c4fa9e Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 20 Aug 2024 14:26:18 -0400
Subject: [PATCH 09/45] tox check cleanup

---
 src/aspire/abinitio/commonline_base.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py
index 2fbf12d8ea..54ee26b290 100644
--- a/src/aspire/abinitio/commonline_base.py
+++ b/src/aspire/abinitio/commonline_base.py
@@ -1,7 +1,6 @@
 import logging
 import math
 import os
-from time import perf_counter
 
 import numpy as np
 import scipy.sparse as sparse
@@ -280,7 +279,6 @@ def build_clmatrix_cu(self):
         import cupy as cp
 
         n_img = self.n_img
-        n_check = self.n_check
 
         if self.n_theta % 2 == 1:
             msg = "n_theta must be even"

From 997da4517eb09f585d61304d765b1924a19e5650 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 26 Aug 2024 13:59:45 -0400
Subject: [PATCH 10/45] stashing stub

---
 src/aspire/abinitio/commonline_sync3n.cu | 17 +++++++++++++++++
 src/aspire/abinitio/commonline_sync3n.py | 22 ++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index eeaee723b9..7ec1cbfd05 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -413,3 +413,20 @@ void triangle_scores_inner(int n, double* Rijs, int n_intervals, unsigned int* s
 
   return;
 };
+
+extern "C" __global__
+void estimate_all_Rijs(int n,double* __restrict__ clmatrix)
+{
+  /* n n_img */
+
+  /* thread index (2d), represents "i" index, "j" index */
+  unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
+
+  /* no-op when out of bounds */
+  if(i >= n) return;
+  if(j >= n) return;
+  /* no-op lower triangle */
+  if(j <= i) return;
+
+}
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 6a7e5390d3..d6b12ef1d1 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -823,6 +823,28 @@ def _estimate_all_Rijs(self, clmatrix):
         :param clmatrix: Common lines matrix
         :return: Estimated rotations
         """
+        # host/gpu dispatch
+        if self.__gpu_module:
+            res = self._estimate_all_Rijs_cupy(Rijs)
+        else:
+            res = self._estimate_all_Rijs_host(Rijs)
+        
+        return res
+        
+    def _estimate_all_Rijs_cupy(self, clmatrix):
+        import cupy as cp
+
+        estimate_all_Rijs = self.__gpu_module.get_function("estimate_all_Rijs")
+        
+        return
+
+    def _estimate_all_Rijs_host(self, clmatrix):
+        """
+        Estimate Rijs using the voting method.
+
+        :param clmatrix: Common lines matrix
+        :return: Estimated rotations
+        """        
         n_img = self.n_img
         n_theta = self.n_theta
         Rijs = np.zeros((len(self._pairs), 3, 3))

From 425d3ee187f86eb8a8b3beb58967c5493caad70d Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 4 Sep 2024 09:56:15 -0400
Subject: [PATCH 11/45] stashing stubs

---
 src/aspire/abinitio/commonline_sync3n.cu | 74 ++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index 7ec1cbfd05..2adfe80084 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -422,6 +422,14 @@ void estimate_all_Rijs(int n,double* __restrict__ clmatrix)
   /* thread index (2d), represents "i" index, "j" index */
   unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
   unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
+  int cl_diff1, cl_diff2, cl_diff3;
+  double theta1, theta2, theta3;
+  double c1, c2, c3;
+  double cond;
+  double cos_phi2;
+  double alpha;
+  double angles[3];
+  double r[3][3];
 
   /* no-op when out of bounds */
   if(i >= n) return;
@@ -429,4 +437,70 @@ void estimate_all_Rijs(int n,double* __restrict__ clmatrix)
   /* no-op lower triangle */
   if(j <= i) return;
 
+
+  // vote_ij creates good_k list per (i,j)
+
+  // We shouldn't need this... confirm and rm later.
+  if(clmatrix[i*n + j] == -1) return;
+
+  int cl_idx12 = clmatrix[i*n + j];
+  int cl_idx21 = clmatrix[j*n + i];
+
+  /* Assume that k_list starts as [0,n] */
+  for(k=0; k<n; k++){
+
+    cl_idx13 = clmatrix[i*n + k];
+    cl_idx31 = clmatrix[k*n + i];
+    cl_idx23 = clmatrix[j*n + k];
+    cl_idx32 = clmatrix[k*n + j];
+
+    // test `k` values
+    if(k==i) return;
+    if(cl_idx13 == -1) return;  // i, k
+    if(cl_idx31 == -1) return;  // k, i
+    if(cl_idx23 == -1) return;  // j, k
+    // if(cl_idx32 == -1) return;  // k, j
+
+    // self._get_cos_phis(cl_diff1, cl_diff2, cl_diff3, n_theta)
+    cl_diff1 = cl_idx13 - cl_idx12;
+    cl_diff2 = cl_idx21 - cl_idx23;
+    cl_diff3 = cl_idx32 - cl_idx31;
+
+    theta1 = cl_diff1 * 2 * np.pi / n_theta;
+    theta2 = cl_diff2 * 2 * np.pi / n_theta;
+    theta3 = cl_diff3 * 2 * np.pi / n_theta;
+
+    c1 = cos(theta1);
+    c2 = cos(theta2);
+    c3 = cos(theta3);
+
+    cond = 1 + 2 * c1 * c2 * c3 - (c1*c1 + c2*c2 + c3*c3);
+
+    // test if we have a good_idx
+    if(cond < 1e-5) return;
+
+    cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2));
+    
+    //end _get_cos_phis
+
+    // clip [-1,1]
+    if(cos_phi2 >1){
+      cos_phi2 = 1;
+    }
+    if(cos_phi2 <-1){
+      cos_phi2 = -1;
+    }
+    
+    // // _rotratio_eulerangle_vec loops over good_k list per (i,j)
+    // alpha = arccos(cos_phi2);    
+    // // convert Euler angle to rotation matrix
+    // angles[0] = cl_idx12 * 2 * pi / n + pi / 2;
+    // angles[1] = alpha;
+    // angles[2] = -pi / 2 - cl_idx21 * 2 * pi / n;
+    // //r = from_euler(angles);
+  
+
+
+  
+  
 }

From 5b0d7ccb9fec1351638db28672c0680b9971f2e1 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 4 Sep 2024 16:02:11 -0400
Subject: [PATCH 12/45] stashing init kernel port

---
 src/aspire/abinitio/commonline_sync3n.cu | 109 +++++++++++++++++++----
 1 file changed, 91 insertions(+), 18 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index 2adfe80084..8e83aee9aa 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -1,7 +1,28 @@
+# define M_PI           3.14159265358979323846  /* pi */
 
 /* from i,j indices to the common index in the N-choose-2 sized array */
 #define PAIR_IDX(N,I,J) ((2*N-I-1)*I/2 + J-I-1)
 
+/* convert euler angles (a,b,c) in ZYZ to rotation matrix r */
+inline void ang2orth(double* r, double a, double b, double c){
+  double sa = sin(a);
+  double sb = sin(b);
+  double sc = sin(c);
+  double ca = cos(a);
+  double cb = cos(b);
+  double cc = cos(c);
+
+  r[0] = ca*cb*cc - sa*sc;
+  r[1] = -ca*cb*cc - sa*sc;
+  r[2] = ca*sb;
+  r[3] = sa*cb*cc + ca*sc;
+  r[4] = -sa*cb*cc + ca*sc;
+  r[5] = sa*sb;
+  r[6] = -sb*cc;
+  r[7] = sb*sc;
+  r[8] = cb;  
+}
+  
 
 inline void mult_3x3(double *out, double *R1, double *R2) {
   /* 3X3 matrices multiplication: out = R1*R2
@@ -415,21 +436,24 @@ void triangle_scores_inner(int n, double* Rijs, int n_intervals, unsigned int* s
 };
 
 extern "C" __global__
-void estimate_all_Rijs(int n,double* __restrict__ clmatrix)
+void estimate_all_Rijs(int n, int n_theta, double* __restrict__ clmatrix, double* __restrict__ hist, int* __restrict__ kmap, double* rotations)
 {
+  // try toget kmap as uint16_t
   /* n n_img */
 
   /* thread index (2d), represents "i" index, "j" index */
   unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
   unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
+  int k;
+  int kk;
   int cl_diff1, cl_diff2, cl_diff3;
   double theta1, theta2, theta3;
   double c1, c2, c3;
   double cond;
   double cos_phi2;
-  double alpha;
   double angles[3];
-  double r[3][3];
+  double r[9];
+  int cnt;
 
   /* no-op when out of bounds */
   if(i >= n) return;
@@ -445,7 +469,16 @@ void estimate_all_Rijs(int n,double* __restrict__ clmatrix)
 
   int cl_idx12 = clmatrix[i*n + j];
   int cl_idx21 = clmatrix[j*n + i];
-
+  int cl_idx13, cl_idx31;
+  int cl_idx23, cl_idx32;
+  const int ntics = 60;
+  const double sigma = 3.0;
+  double ga;
+  double angle;
+  int b;
+  int peak_idx;
+  double peak;
+  
   /* Assume that k_list starts as [0,n] */
   for(k=0; k<n; k++){
 
@@ -466,9 +499,9 @@ void estimate_all_Rijs(int n,double* __restrict__ clmatrix)
     cl_diff2 = cl_idx21 - cl_idx23;
     cl_diff3 = cl_idx32 - cl_idx31;
 
-    theta1 = cl_diff1 * 2 * np.pi / n_theta;
-    theta2 = cl_diff2 * 2 * np.pi / n_theta;
-    theta3 = cl_diff3 * 2 * np.pi / n_theta;
+    theta1 = cl_diff1 * 2 * M_PI / n_theta;
+    theta2 = cl_diff2 * 2 * M_PI / n_theta;
+    theta3 = cl_diff3 * 2 * M_PI / n_theta;
 
     c1 = cos(theta1);
     c2 = cos(theta2);
@@ -490,17 +523,57 @@ void estimate_all_Rijs(int n,double* __restrict__ clmatrix)
     if(cos_phi2 <-1){
       cos_phi2 = -1;
     }
-    
-    // // _rotratio_eulerangle_vec loops over good_k list per (i,j)
-    // alpha = arccos(cos_phi2);    
-    // // convert Euler angle to rotation matrix
-    // angles[0] = cl_idx12 * 2 * pi / n + pi / 2;
-    // angles[1] = alpha;
-    // angles[2] = -pi / 2 - cl_idx21 * 2 * pi / n;
-    // //r = from_euler(angles);
+                   
+    /* compute histogram contribution and index map */
+    angle = acos(cos_phi2) * 180. / M_PI;
+    // angle's bin
+    kmap[PAIR_IDX(n,i,j)*n + k] = angle / ntics;
+    for(b=0; b<ntics; b++){
+      ga = b*(180/ntics);  // grid angle // todo, just compute in radians to avoid extra arithmetic
+      // histogram contribution
+      hist[PAIR_IDX(n,i,j)*ntics + b ] += exp(
+          (2*angle*ga - (angle*angle + ga*ga))/(2*sigma*sigma));
+    } /* bins */
+  } /* k*/
+  
+  /* find peak of histogram */
+  peak = -1;
+  peak_idx = -1;
+  for(b=0; b<ntics; b++){
+    if(hist[PAIR_IDX(n,i,j)*ntics + b] > peak){
+      peak = hist[PAIR_IDX(n,i,j)*ntics + b];
+      peak_idx = b;
+    }
+  }
   
+  /* find mean of rotations */
+  // initialize
+  cnt = 0;
 
+  // _rotratio_eulerangle_vec loops over good_k list per (i,j)
+  // find satisfying indices  
+  for(k=0; k<n; k++){
+    // image k in peak bin
+    if(abs(kmap[PAIR_IDX(n,i,j)*n + k] - peak_idx) < 2){
+      cnt += 1;
+      // convert to euler angles  // check
+      angles[0] = cl_idx12 * 2 * M_PI / n + M_PI / 2;
+      angles[1] = angle;
+      angles[2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n;
+      
+      // convert euler to rotation
+      ang2orth(r, angles[0], angles[1], angles[2]);
+      // add rotation matrix contribution to mean
+      for(kk=0; kk<9;kk++){
+        rotations[PAIR_IDX(n,i,j)*9 + kk] += r[kk];
+      } /* kk */
+    } /* if */   
+  } /* k */
+    
 
-  
-  
-}
+  // divide  (todo, better handle 0/0?)
+  for(kk=0; kk<9;kk++){
+    rotations[PAIR_IDX(n,i,j)*9 + kk] /= cnt;
+  } /* kk */
+    
+} /* kernel */

From 0b779a1f72a139ab11bfaa408840dfeed6712caf Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 17 Sep 2024 11:58:39 -0400
Subject: [PATCH 13/45] update kernel with some of the angs work

---
 src/aspire/abinitio/commonline_sync3n.cu | 123 +++++++++++++++--------
 1 file changed, 82 insertions(+), 41 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index 8e83aee9aa..71b2cdc365 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -20,9 +20,9 @@ inline void ang2orth(double* r, double a, double b, double c){
   r[5] = sa*sb;
   r[6] = -sb*cc;
   r[7] = sb*sc;
-  r[8] = cb;  
+  r[8] = cb;
 }
-  
+
 
 inline void mult_3x3(double *out, double *R1, double *R2) {
   /* 3X3 matrices multiplication: out = R1*R2
@@ -436,7 +436,16 @@ void triangle_scores_inner(int n, double* Rijs, int n_intervals, unsigned int* s
 };
 
 extern "C" __global__
-void estimate_all_Rijs(int n, int n_theta, double* __restrict__ clmatrix, double* __restrict__ hist, int* __restrict__ kmap, double* rotations)
+void estimate_all_Rijs(int n,
+                       int n_theta,
+                       double hist_bin_width,
+                       int full_width,
+                       double sigma,
+                       int sync,
+                       double* __restrict__ clmatrix,
+                       double* __restrict__ hist,
+                       int* __restrict__ kmap,
+                       double* rotations)
 {
   // try toget kmap as uint16_t
   /* n n_img */
@@ -445,14 +454,12 @@ void estimate_all_Rijs(int n, int n_theta, double* __restrict__ clmatrix, double
   unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
   unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
   int k;
-  int kk;
   int cl_diff1, cl_diff2, cl_diff3;
   double theta1, theta2, theta3;
   double c1, c2, c3;
   double cond;
   double cos_phi2;
   double angles[3];
-  double r[9];
   int cnt;
 
   /* no-op when out of bounds */
@@ -471,14 +478,15 @@ void estimate_all_Rijs(int n, int n_theta, double* __restrict__ clmatrix, double
   int cl_idx21 = clmatrix[j*n + i];
   int cl_idx13, cl_idx31;
   int cl_idx23, cl_idx32;
-  const int ntics = 60;
-  const double sigma = 3.0;
+  int ntics = 180 / hist_bin_width;
+  const double TOL_idx = 1e-12;
+  bool ind1, ind2;
   double ga;
   double angle;
   int b;
   int peak_idx;
   double peak;
-  
+
   /* Assume that k_list starts as [0,n] */
   for(k=0; k<n; k++){
 
@@ -496,7 +504,7 @@ void estimate_all_Rijs(int n, int n_theta, double* __restrict__ clmatrix, double
 
     // self._get_cos_phis(cl_diff1, cl_diff2, cl_diff3, n_theta)
     cl_diff1 = cl_idx13 - cl_idx12;
-    cl_diff2 = cl_idx21 - cl_idx23;
+    cl_diff2 = cl_idx23 - cl_idx21;
     cl_diff3 = cl_idx32 - cl_idx31;
 
     theta1 = cl_diff1 * 2 * M_PI / n_theta;
@@ -507,13 +515,38 @@ void estimate_all_Rijs(int n, int n_theta, double* __restrict__ clmatrix, double
     c2 = cos(theta2);
     c3 = cos(theta3);
 
-    cond = 1 + 2 * c1 * c2 * c3 - (c1*c1 + c2*c2 + c3*c3);
-
     // test if we have a good_idx
+    cond = 1 + 2 * c1 * c2 * c3 - (c1*c1 + c2*c2 + c3*c3);
     if(cond < 1e-5) return;
 
-    cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2));
-    
+    /* Calculated cos values of angle between i and j images */
+    if( sync == 1){
+
+      cos_phi2 = (c3 - c1 * c2) / (sqrt(1 - c1*c1) * sqrt(1 - c2 *c2));
+
+      /* Some synchronization must be applied when common line is out by 180 degrees.
+         Here fix the angles between c_ij(c_ji) and c_ik(c_jk) to be smaller than pi/2,
+         otherwise there will be an ambiguity between alpha and pi-alpha.
+      */
+
+      /* Check sync conditions */
+      ind1 = (theta1 > (M_PI + TOL_idx)) | (
+          (theta1 < -TOL_idx) & (theta1 > -M_PI)
+                                            );
+      ind2 = (theta2 > (M_PI + TOL_idx)) | (
+          (theta2 < -TOL_idx) & (theta2 > -M_PI)
+                                            );
+      if( (ind1 & ~ind2) | (~ind1 & ind2)){
+        /* Apply sync */
+        cos_phi2 = -cos_phi2;
+      }
+
+    }
+    else{
+
+      cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2));
+    }
+
     //end _get_cos_phis
 
     // clip [-1,1]
@@ -523,19 +556,29 @@ void estimate_all_Rijs(int n, int n_theta, double* __restrict__ clmatrix, double
     if(cos_phi2 <-1){
       cos_phi2 = -1;
     }
-                   
+
     /* compute histogram contribution and index map */
     angle = acos(cos_phi2) * 180. / M_PI;
     // angle's bin
     kmap[PAIR_IDX(n,i,j)*n + k] = angle / ntics;
     for(b=0; b<ntics; b++){
       ga = b*(180/ntics);  // grid angle // todo, just compute in radians to avoid extra arithmetic
+      ga = (ga - angle);
       // histogram contribution
       hist[PAIR_IDX(n,i,j)*ntics + b ] += exp(
-          (2*angle*ga - (angle*angle + ga*ga))/(2*sigma*sigma));
+          ga*ga / ( 2*sigma*sigma));
     } /* bins */
   } /* k*/
-  
+
+  /*
+    At this point, the kernel should have accumulated all "good"
+    k contributions into the histogram.
+    The next section solves the histogram.
+
+    Potentially this could be a distinct kernel,
+    but would require looking up cl_idx again or assigning them as below.
+  */
+
   /* find peak of histogram */
   peak = -1;
   peak_idx = -1;
@@ -545,35 +588,33 @@ void estimate_all_Rijs(int n, int n_theta, double* __restrict__ clmatrix, double
       peak_idx = b;
     }
   }
-  
+
   /* find mean of rotations */
   // initialize
+  angles[0] = cl_idx12 * 2 * M_PI / n + M_PI / 2;
+  angles[1] = 0;
+  angles[2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n;
   cnt = 0;
 
-  // _rotratio_eulerangle_vec loops over good_k list per (i,j)
-  // find satisfying indices  
-  for(k=0; k<n; k++){
-    // image k in peak bin
-    if(abs(kmap[PAIR_IDX(n,i,j)*n + k] - peak_idx) < 2){
-      cnt += 1;
-      // convert to euler angles  // check
-      angles[0] = cl_idx12 * 2 * M_PI / n + M_PI / 2;
-      angles[1] = angle;
-      angles[2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n;
-      
-      // convert euler to rotation
-      ang2orth(r, angles[0], angles[1], angles[2]);
-      // add rotation matrix contribution to mean
-      for(kk=0; kk<9;kk++){
-        rotations[PAIR_IDX(n,i,j)*9 + kk] += r[kk];
-      } /* kk */
-    } /* if */   
-  } /* k */
-    
+  if(full_width!=-1){
+    /* fixed width */
+    // find satisfying indices
+    for(k=0; k<n; k++){
+      // image k in peak bin
+      if(abs(kmap[PAIR_IDX(n,i,j)*n + k] - peak_idx) < 2){
+        cnt += 1;
+        angles[1] += angle;
+      } /* if */
+    } /* k */
+  }
+  else {
+    /* adaptive width*/
+  }
 
   // divide  (todo, better handle 0/0?)
-  for(kk=0; kk<9;kk++){
-    rotations[PAIR_IDX(n,i,j)*9 + kk] /= cnt;
-  } /* kk */
-    
+  angles[1] /= cnt;
+
+  // convert euler to rotation, probably break into it's own kernel.
+  ang2orth(&(rotations[PAIR_IDX(n,i,j)]), angles[0], angles[1], angles[2]);
+
 } /* kernel */

From 6235b1f6158fe17a4c19dd3a16b41c59a3ad7f34 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 17 Sep 2024 12:22:28 -0400
Subject: [PATCH 14/45] start populating eastimate_Rijs kernel call (stash)

---
 src/aspire/abinitio/commonline_sync3n.py | 69 ++++++++++++++++++++----
 1 file changed, 60 insertions(+), 9 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index d6b12ef1d1..f9755257da 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -824,19 +824,70 @@ def _estimate_all_Rijs(self, clmatrix):
         :return: Estimated rotations
         """
         # host/gpu dispatch
-        if self.__gpu_module:
-            res = self._estimate_all_Rijs_cupy(Rijs)
-        else:
-            res = self._estimate_all_Rijs_host(Rijs)
-        
+        # if self.__gpu_module:
+        #     res = self._estimate_all_Rijs_cupy(Rijs)
+        # else:
+        #     res = self._estimate_all_Rijs_host(Rijs)
+        res = self._estimate_all_Rijs_cupy(clmatrix)
+        res_host = self._estimate_all_Rijs_host(clmatrix)
+        if not np.allclose(res, res_host):
+            breakpoint()
+
         return res
-        
+
     def _estimate_all_Rijs_cupy(self, clmatrix):
         import cupy as cp
 
         estimate_all_Rijs = self.__gpu_module.get_function("estimate_all_Rijs")
-        
-        return
+
+        full_width = self.full_width
+        if str(full_width).lower() == "adaptive":
+            full_width = -1
+
+        sigma = 3.0
+        sync = 1
+
+        # transfer input to device
+        clmatrix = cp.asarray(self.clmatrix, dtype=int)
+
+        # workspace arrays
+        ntics = int(180 / self.hist_bin_width)
+        n_pairs = (self.n_img * (self.n_img - 1)) // 2
+        hist = cp.zeros((n_pairs, ntics), dtype=np.float64)
+        # kmap stores the mapping of k indices to histogram bins
+        kmap = cp.zeros(
+            (n_pairs, self.n_img), dtype=int
+        )  # note, high memory , probably need to change...
+        rotations = cp.empty((n_pairs, 3, 3), dtype=np.float64)
+
+        # Configure grid of blocks
+        blkszx = 32
+        nblkx = (self.n_img + blkszx - 1) // blkszx
+        blkszy = 32
+        nblky = (self.n_img + blkszy - 1) // blkszy
+
+        logger.info("Launching `estimate_all_Rijs` kernel.")
+        estimate_all_Rijs(
+            (nblkx, nblky),
+            (blkszx, blkszy),
+            (
+                self.n_img,
+                self.n_theta,
+                self.hist_bin_width,  # converted to double
+                full_width,
+                sigma,
+                sync,
+                clmatrix,  # input
+                hist,  # tmp
+                kmap,  # tmp
+                rotations,  # output
+            ),
+        )
+
+        # transfer results device to host
+        rotations = rotations.get()
+
+        return rotations
 
     def _estimate_all_Rijs_host(self, clmatrix):
         """
@@ -844,7 +895,7 @@ def _estimate_all_Rijs_host(self, clmatrix):
 
         :param clmatrix: Common lines matrix
         :return: Estimated rotations
-        """        
+        """
         n_img = self.n_img
         n_theta = self.n_theta
         Rijs = np.zeros((len(self._pairs), 3, 3))

From 41665b7670de18ce58e8fec4920fad94fe2a4ad8 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 17 Sep 2024 15:06:56 -0400
Subject: [PATCH 15/45] breakout angles and add angles map (stash)

---
 src/aspire/abinitio/commonline_sync3n.cu | 71 +++++++++++++++---------
 src/aspire/abinitio/commonline_sync3n.py | 55 +++++++++++++-----
 2 files changed, 87 insertions(+), 39 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index 71b2cdc365..fa8b9261e3 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -436,18 +436,19 @@ void triangle_scores_inner(int n, double* Rijs, int n_intervals, unsigned int* s
 };
 
 extern "C" __global__
-void estimate_all_Rijs(int n,
-                       int n_theta,
-                       double hist_bin_width,
-                       int full_width,
-                       double sigma,
-                       int sync,
-                       double* __restrict__ clmatrix,
-                       double* __restrict__ hist,
-                       int* __restrict__ kmap,
-                       double* rotations)
+void estimate_all_angles(int n,
+                         int n_theta,
+                         double hist_bin_width,
+                         int full_width,
+                         double sigma,
+                         int sync,
+                         int* __restrict__ clmatrix,
+                         double* __restrict__ hist,
+                         int* __restrict__ k_map,
+                         double* __restrict__ angles_map,
+                         double* __restrict__ angles)
 {
-  // try toget kmap as uint16_t
+  // try toget k_map as uint16_t
   /* n n_img */
 
   /* thread index (2d), represents "i" index, "j" index */
@@ -459,7 +460,6 @@ void estimate_all_Rijs(int n,
   double c1, c2, c3;
   double cond;
   double cos_phi2;
-  double angles[3];
   int cnt;
 
   /* no-op when out of bounds */
@@ -470,6 +470,8 @@ void estimate_all_Rijs(int n,
 
 
   // vote_ij creates good_k list per (i,j)
+  const int pair_idx = PAIR_IDX(n,i,j);
+  int map_idx; /* tmp index var */
 
   // We shouldn't need this... confirm and rm later.
   if(clmatrix[i*n + j] == -1) return;
@@ -560,12 +562,14 @@ void estimate_all_Rijs(int n,
     /* compute histogram contribution and index map */
     angle = acos(cos_phi2) * 180. / M_PI;
     // angle's bin
-    kmap[PAIR_IDX(n,i,j)*n + k] = angle / ntics;
+    map_idx = pair_idx*n + k;
+    k_map[map_idx] = angle / ntics;
+    angles_map[map_idx] = angle;
     for(b=0; b<ntics; b++){
       ga = b*(180/ntics);  // grid angle // todo, just compute in radians to avoid extra arithmetic
       ga = (ga - angle);
       // histogram contribution
-      hist[PAIR_IDX(n,i,j)*ntics + b ] += exp(
+      hist[pair_idx*ntics + b ] += exp(
           ga*ga / ( 2*sigma*sigma));
     } /* bins */
   } /* k*/
@@ -583,17 +587,19 @@ void estimate_all_Rijs(int n,
   peak = -1;
   peak_idx = -1;
   for(b=0; b<ntics; b++){
-    if(hist[PAIR_IDX(n,i,j)*ntics + b] > peak){
-      peak = hist[PAIR_IDX(n,i,j)*ntics + b];
+    map_idx = pair_idx*ntics + b;
+    if(hist[map_idx] > peak){
+      peak = hist[map_idx];
       peak_idx = b;
     }
   }
 
   /* find mean of rotations */
   // initialize
-  angles[0] = cl_idx12 * 2 * M_PI / n + M_PI / 2;
-  angles[1] = 0;
-  angles[2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n;
+  map_idx = pair_idx*3;
+  angles[map_idx    ] = cl_idx12 * 2 * M_PI / n + M_PI / 2;
+  angles[map_idx + 1] = 0;
+  angles[map_idx + 2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n;
   cnt = 0;
 
   if(full_width!=-1){
@@ -601,9 +607,10 @@ void estimate_all_Rijs(int n,
     // find satisfying indices
     for(k=0; k<n; k++){
       // image k in peak bin
-      if(abs(kmap[PAIR_IDX(n,i,j)*n + k] - peak_idx) < 2){
+      map_idx = pair_idx*n + k;
+      if(abs(k_map[map_idx] - peak_idx) < 2){
         cnt += 1;
-        angles[1] += angle;
+        angles[pair_idx*3 + 1] += angles_map[map_idx];
       } /* if */
     } /* k */
   }
@@ -612,9 +619,23 @@ void estimate_all_Rijs(int n,
   }
 
   // divide  (todo, better handle 0/0?)
-  angles[1] /= cnt;
-
-  // convert euler to rotation, probably break into it's own kernel.
-  ang2orth(&(rotations[PAIR_IDX(n,i,j)]), angles[0], angles[1], angles[2]);
+  angles[pair_idx*3 + 1] /= cnt;
 
 } /* kernel */
+
+extern "C" __global__
+void angles_to_rots(int n,
+                    double* __restrict__ angles,
+                    double* __restrict__ rotations)
+{
+  /* Convert stack of ZYZ Euler angles to stack of rotation matrices */
+
+  /* thread index (1d), represents "i" index */
+  unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+  /* no-op when out of bounds */
+  if(i >= n) return;
+
+  ang2orth(&(rotations[i*9]), angles[i*3],angles[i*3+1],angles[i*3+2]);
+
+}
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index f9755257da..835788bc5a 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -830,15 +830,16 @@ def _estimate_all_Rijs(self, clmatrix):
         #     res = self._estimate_all_Rijs_host(Rijs)
         res = self._estimate_all_Rijs_cupy(clmatrix)
         res_host = self._estimate_all_Rijs_host(clmatrix)
-        if not np.allclose(res, res_host):
-            breakpoint()
+        # if not np.allclose(res, res_host):
+        #     breakpoint()
 
         return res
 
     def _estimate_all_Rijs_cupy(self, clmatrix):
         import cupy as cp
 
-        estimate_all_Rijs = self.__gpu_module.get_function("estimate_all_Rijs")
+        estimate_all_angles = self.__gpu_module.get_function("estimate_all_angles")
+        angles_to_rots = self.__gpu_module.get_function("angles_to_rots")
 
         full_width = self.full_width
         if str(full_width).lower() == "adaptive":
@@ -854,11 +855,14 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
         ntics = int(180 / self.hist_bin_width)
         n_pairs = (self.n_img * (self.n_img - 1)) // 2
         hist = cp.zeros((n_pairs, ntics), dtype=np.float64)
-        # kmap stores the mapping of k indices to histogram bins
-        kmap = cp.zeros(
+        # k_map stores the mapping of k indices to histogram bins
+        k_map = cp.empty(
             (n_pairs, self.n_img), dtype=int
         )  # note, high memory , probably need to change...
-        rotations = cp.empty((n_pairs, 3, 3), dtype=np.float64)
+        angles_map = cp.empty(
+            (n_pairs, self.n_img), dtype=int
+        )  # note, high memory , probably need to change...
+        angles = cp.empty((n_pairs, 3), dtype=np.float64)
 
         # Configure grid of blocks
         blkszx = 32
@@ -866,21 +870,44 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
         blkszy = 32
         nblky = (self.n_img + blkszy - 1) // blkszy
 
-        logger.info("Launching `estimate_all_Rijs` kernel.")
-        estimate_all_Rijs(
+        logger.info("Launching `estimate_all_angles` kernel.")
+        estimate_all_angles(
             (nblkx, nblky),
             (blkszx, blkszy),
             (
                 self.n_img,
                 self.n_theta,
-                self.hist_bin_width,  # converted to double
-                full_width,
-                sigma,
-                sync,
+                np.float64(self.hist_bin_width),
+                int(full_width),
+                np.float64(sigma),
+                int(sync),
                 clmatrix,  # input
                 hist,  # tmp
-                kmap,  # tmp
-                rotations,  # output
+                k_map,  # tmp
+                angles_map,  # tmp
+                angles,  # output
+            ),
+        )
+
+        # no longer need workspace vars
+        del hist
+        del k_map
+        del angles_map
+
+        # convert angles to rots
+        rotations = cp.empty((n_pairs, 3, 3), dtype=np.float64)
+
+        blkszx = 1024
+        nblkx = (n_pairs + blkszx - 1) // blkszx
+
+        logger.info("Launching `angles_to_rots` kernel.")
+        angles_to_rots(
+            (nblkx,),
+            (blkszx,),
+            (
+                n_pairs,
+                angles,
+                rotations,
             ),
         )
 

From be329599961503fa9f54cf5e0d6b620e237f4a58 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 17 Sep 2024 17:05:08 -0400
Subject: [PATCH 16/45] pair_idx doesn't include diag

---
 src/aspire/abinitio/commonline_sync3n.cu | 59 +++++++++++++-----------
 src/aspire/abinitio/commonline_sync3n.py | 21 ++++++---
 tests/test_commonline_sync3n.py          |  4 +-
 3 files changed, 51 insertions(+), 33 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index fa8b9261e3..bc84ce71eb 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -1,8 +1,10 @@
 # define M_PI           3.14159265358979323846  /* pi */
 
 /* from i,j indices to the common index in the N-choose-2 sized array */
+// careful, this is stricly the upper triangle!
 #define PAIR_IDX(N,I,J) ((2*N-I-1)*I/2 + J-I-1)
 
+
 /* convert euler angles (a,b,c) in ZYZ to rotation matrix r */
 inline void ang2orth(double* r, double a, double b, double c){
   double sa = sin(a);
@@ -452,8 +454,8 @@ void estimate_all_angles(int n,
   /* n n_img */
 
   /* thread index (2d), represents "i" index, "j" index */
-  unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
-  unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
+  const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+  const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
   int k;
   int cl_diff1, cl_diff2, cl_diff3;
   double theta1, theta2, theta3;
@@ -470,14 +472,15 @@ void estimate_all_angles(int n,
 
 
   // vote_ij creates good_k list per (i,j)
-  const int pair_idx = PAIR_IDX(n,i,j);
+  const int pair_idx = PAIR_IDXD(n,i,j);
   int map_idx; /* tmp index var */
 
   // We shouldn't need this... confirm and rm later.
+  //xxx
   if(clmatrix[i*n + j] == -1) return;
 
-  int cl_idx12 = clmatrix[i*n + j];
-  int cl_idx21 = clmatrix[j*n + i];
+  const int cl_idx12 = clmatrix[i*n + j];
+  const int cl_idx21 = clmatrix[j*n + i];
   int cl_idx13, cl_idx31;
   int cl_idx23, cl_idx32;
   int ntics = 180 / hist_bin_width;
@@ -498,10 +501,10 @@ void estimate_all_angles(int n,
     cl_idx32 = clmatrix[k*n + j];
 
     // test `k` values
-    if(k==i) return;
-    if(cl_idx13 == -1) return;  // i, k
-    if(cl_idx31 == -1) return;  // k, i
-    if(cl_idx23 == -1) return;  // j, k
+    if(k==i) continue;
+    if(cl_idx13 == -1) continue;  // i, k
+    if(cl_idx31 == -1) continue;  // k, i
+    if(cl_idx23 == -1) continue;  // j, k
     // if(cl_idx32 == -1) return;  // k, j
 
     // self._get_cos_phis(cl_diff1, cl_diff2, cl_diff3, n_theta)
@@ -519,12 +522,12 @@ void estimate_all_angles(int n,
 
     // test if we have a good_idx
     cond = 1 + 2 * c1 * c2 * c3 - (c1*c1 + c2*c2 + c3*c3);
-    if(cond < 1e-5) return;
+    if(cond <= 1e-5) continue;  // this value of k is not good
 
     /* Calculated cos values of angle between i and j images */
     if( sync == 1){
 
-      cos_phi2 = (c3 - c1 * c2) / (sqrt(1 - c1*c1) * sqrt(1 - c2 *c2));
+      cos_phi2 = (c3 - c1*c2) / (sqrt(1 - c1*c1) * sqrt(1 - c2*c2));
 
       /* Some synchronization must be applied when common line is out by 180 degrees.
          Here fix the angles between c_ij(c_ji) and c_ik(c_jk) to be smaller than pi/2,
@@ -535,21 +538,19 @@ void estimate_all_angles(int n,
       ind1 = (theta1 > (M_PI + TOL_idx)) | (
           (theta1 < -TOL_idx) & (theta1 > -M_PI)
                                             );
-      ind2 = (theta2 > (M_PI + TOL_idx)) | (
-          (theta2 < -TOL_idx) & (theta2 > -M_PI)
+      ind2 = (theta2 > (M_PI + TOL_idx)) || (
+          (theta2 < -TOL_idx) && (theta2 > -M_PI)
                                             );
-      if( (ind1 & ~ind2) | (~ind1 & ind2)){
+      if( (ind1 && !ind2) || (!ind1 && ind2)){
         /* Apply sync */
         cos_phi2 = -cos_phi2;
       }
 
-    }
+    }  // end sync
     else{
 
       cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2));
-    }
-
-    //end _get_cos_phis
+    } // end not sync
 
     // clip [-1,1]
     if(cos_phi2 >1){
@@ -564,7 +565,7 @@ void estimate_all_angles(int n,
     // angle's bin
     map_idx = pair_idx*n + k;
     k_map[map_idx] = angle / ntics;
-    angles_map[map_idx] = angle;
+    angles_map[map_idx] = angle;  // degree
     for(b=0; b<ntics; b++){
       ga = b*(180/ntics);  // grid angle // todo, just compute in radians to avoid extra arithmetic
       ga = (ga - angle);
@@ -597,9 +598,9 @@ void estimate_all_angles(int n,
   /* find mean of rotations */
   // initialize
   map_idx = pair_idx*3;
-  angles[map_idx    ] = cl_idx12 * 2 * M_PI / n + M_PI / 2;
+  angles[map_idx    ] = cl_idx12 * 2 * M_PI / n_theta + M_PI / 2;
   angles[map_idx + 1] = 0;
-  angles[map_idx + 2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n;
+  angles[map_idx + 2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n_theta;
   cnt = 0;
 
   if(full_width!=-1){
@@ -608,9 +609,9 @@ void estimate_all_angles(int n,
     for(k=0; k<n; k++){
       // image k in peak bin
       map_idx = pair_idx*n + k;
-      if(abs(k_map[map_idx] - peak_idx) < 2){
-        cnt += 1;
-        angles[pair_idx*3 + 1] += angles_map[map_idx];
+      if(abs(k_map[map_idx] - peak_idx) < full_width){
+        cnt += 1;  // this image was in the peak
+        angles[pair_idx*3 + 1] += angles_map[map_idx];  // accumulate angle
       } /* if */
     } /* k */
   }
@@ -618,9 +619,15 @@ void estimate_all_angles(int n,
     /* adaptive width*/
   }
 
-  // divide  (todo, better handle 0/0?)
-  angles[pair_idx*3 + 1] /= cnt;
+  /* Divide accumulated angles (resulting in the mean alpha angle) */   //(todo, better handle 0/0?)
+  /* convert degree radian */
+  angles[pair_idx*3 + 1] *= M_PI / (180*cnt);
 
+  //debug
+  // angles[pair_idx*3 + 0] = 0;
+  // angles[pair_idx*3 + 1] = 1;
+  // angles[pair_idx*3 + 2] = 2;
+  
 } /* kernel */
 
 extern "C" __global__
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 835788bc5a..c753426bdc 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -828,6 +828,7 @@ def _estimate_all_Rijs(self, clmatrix):
         #     res = self._estimate_all_Rijs_cupy(Rijs)
         # else:
         #     res = self._estimate_all_Rijs_host(Rijs)
+        print(f"clmatrix {clmatrix}")
         res = self._estimate_all_Rijs_cupy(clmatrix)
         res_host = self._estimate_all_Rijs_host(clmatrix)
         # if not np.allclose(res, res_host):
@@ -844,6 +845,9 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
         full_width = self.full_width
         if str(full_width).lower() == "adaptive":
             full_width = -1
+            raise
+        #XXX debug (check rest of kernel works before porting adaptive)
+
 
         sigma = 3.0
         sync = 1
@@ -856,13 +860,13 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
         n_pairs = (self.n_img * (self.n_img - 1)) // 2
         hist = cp.zeros((n_pairs, ntics), dtype=np.float64)
         # k_map stores the mapping of k indices to histogram bins
-        k_map = cp.empty(
+        k_map = cp.zeros(
             (n_pairs, self.n_img), dtype=int
         )  # note, high memory , probably need to change...
-        angles_map = cp.empty(
-            (n_pairs, self.n_img), dtype=int
+        angles_map = cp.zeros(
+            (n_pairs, self.n_img), dtype=np.float64
         )  # note, high memory , probably need to change...
-        angles = cp.empty((n_pairs, 3), dtype=np.float64)
+        angles = cp.zeros((n_pairs, 3), dtype=np.float64)
 
         # Configure grid of blocks
         blkszx = 32
@@ -875,8 +879,8 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
             (nblkx, nblky),
             (blkszx, blkszy),
             (
-                self.n_img,
-                self.n_theta,
+                int(self.n_img),
+                int(self.n_theta),
                 np.float64(self.hist_bin_width),
                 int(full_width),
                 np.float64(sigma),
@@ -894,6 +898,8 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
         del k_map
         del angles_map
 
+        print(f"First three angles\n{angles.get()[:3]}")
+
         # convert angles to rots
         rotations = cp.empty((n_pairs, 3, 3), dtype=np.float64)
 
@@ -958,6 +964,9 @@ def _syncmatrix_ij_vote_3n(self, clmatrix, i, j, k_list, n_theta):
             angles[2] = -np.pi / 2 - clmatrix[j, i] * 2 * np.pi / n_theta
             rot = Rotation.from_euler(angles).matrices
 
+            if i == 0 and j<=3:
+                print(f'host angles {i} {j} {angles}')
+
         else:
             # This is for the case that images i and j correspond to the same
             # viewing direction and differ only by in-plane rotation.
diff --git a/tests/test_commonline_sync3n.py b/tests/test_commonline_sync3n.py
index 600f883c2d..53fa544ea6 100644
--- a/tests/test_commonline_sync3n.py
+++ b/tests/test_commonline_sync3n.py
@@ -62,7 +62,9 @@ def source_orientation_objs(resolution, offsets, dtype):
         max_shift = 0.20
         shift_step = 0.25  # Reduce shift steps for non-integer offsets of Simulation.
 
-    orient_est = CLSync3N(src, max_shift=max_shift, shift_step=shift_step, seed=789)
+    orient_est = CLSync3N(
+        src, max_shift=max_shift, shift_step=shift_step, seed=789, full_width=2
+    )
 
     # Estimate rotations once for all tests.
     orient_est.estimate_rotations()

From 62d6d1a767298fa132866e4d04259cda220796e6 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 17 Sep 2024 23:39:45 -0400
Subject: [PATCH 17/45] angles matching (Stash), bug in angles to rot func

---
 src/aspire/abinitio/commonline_sync3n.cu | 50 +++++++++++-------------
 src/aspire/abinitio/commonline_sync3n.py | 25 ++++++------
 2 files changed, 35 insertions(+), 40 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index bc84ce71eb..82cf859222 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -7,21 +7,22 @@
 
 /* convert euler angles (a,b,c) in ZYZ to rotation matrix r */
 inline void ang2orth(double* r, double a, double b, double c){
-  double sa = sin(a);
+  double sa = sin(c);
   double sb = sin(b);
   double sc = sin(c);
   double ca = cos(a);
   double cb = cos(b);
   double cc = cos(c);
 
+  // bugs here
   r[0] = ca*cb*cc - sa*sc;
-  r[1] = -ca*cb*cc - sa*sc;
-  r[2] = ca*sb;
-  r[3] = sa*cb*cc + ca*sc;
-  r[4] = -sa*cb*cc + ca*sc;
-  r[5] = sa*sb;
-  r[6] = -sb*cc;
-  r[7] = sb*sc;
+  r[1] = sa*cb*cc + ca*sc;
+  r[2] = -sb*cc;
+  r[3] = -ca*cb*sc - sa*cc;
+  r[4] = -sa*cb*sc + ca*cc;
+  r[5] = sb*sc;
+  r[6] = ca*sb;
+  r[7] = sa*sb;
   r[8] = cb;
 }
 
@@ -472,18 +473,18 @@ void estimate_all_angles(int n,
 
 
   // vote_ij creates good_k list per (i,j)
-  const int pair_idx = PAIR_IDXD(n,i,j);
+  const int pair_idx = PAIR_IDX(n,i,j);
   int map_idx; /* tmp index var */
 
   // We shouldn't need this... confirm and rm later.
   //xxx
-  if(clmatrix[i*n + j] == -1) return;
+  //  if(clmatrix[i*n + j] == -1) return;
 
-  const int cl_idx12 = clmatrix[i*n + j];
-  const int cl_idx21 = clmatrix[j*n + i];
+  int cl_idx12 = clmatrix[i*n + j];
+  int cl_idx21 = clmatrix[j*n + i];
   int cl_idx13, cl_idx31;
   int cl_idx23, cl_idx32;
-  int ntics = 180 / hist_bin_width;
+  int ntics = 180. / hist_bin_width;
   const double TOL_idx = 1e-12;
   bool ind1, ind2;
   double ga;
@@ -503,7 +504,7 @@ void estimate_all_angles(int n,
     // test `k` values
     if(k==i) continue;
     if(cl_idx13 == -1) continue;  // i, k
-    if(cl_idx31 == -1) continue;  // k, i
+    //if(cl_idx31 == -1) continue;  // k, i
     if(cl_idx23 == -1) continue;  // j, k
     // if(cl_idx32 == -1) return;  // k, j
 
@@ -522,7 +523,7 @@ void estimate_all_angles(int n,
 
     // test if we have a good_idx
     cond = 1 + 2 * c1 * c2 * c3 - (c1*c1 + c2*c2 + c3*c3);
-    if(cond <= 1e-5) continue;  // this value of k is not good
+    if(cond <= 1e-5) continue;  // that value of k is not good, skip
 
     /* Calculated cos values of angle between i and j images */
     if( sync == 1){
@@ -540,7 +541,7 @@ void estimate_all_angles(int n,
                                             );
       ind2 = (theta2 > (M_PI + TOL_idx)) || (
           (theta2 < -TOL_idx) && (theta2 > -M_PI)
-                                            );
+                                             );
       if( (ind1 && !ind2) || (!ind1 && ind2)){
         /* Apply sync */
         cos_phi2 = -cos_phi2;
@@ -548,7 +549,6 @@ void estimate_all_angles(int n,
 
     }  // end sync
     else{
-
       cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2));
     } // end not sync
 
@@ -564,14 +564,14 @@ void estimate_all_angles(int n,
     angle = acos(cos_phi2) * 180. / M_PI;
     // angle's bin
     map_idx = pair_idx*n + k;
-    k_map[map_idx] = angle / ntics;
+    k_map[map_idx] = angle / hist_bin_width;  //check
     angles_map[map_idx] = angle;  // degree
     for(b=0; b<ntics; b++){
-      ga = b*(180/ntics);  // grid angle // todo, just compute in radians to avoid extra arithmetic
+      ga = b*(180./ntics);  // grid angle // todo, just compute in radians to avoid extra arithmetic
       ga = (ga - angle);
       // histogram contribution
       hist[pair_idx*ntics + b ] += exp(
-          ga*ga / ( 2*sigma*sigma));
+          -(ga*ga) / ( 2*sigma*sigma));
     } /* bins */
   } /* k*/
 
@@ -585,7 +585,7 @@ void estimate_all_angles(int n,
   */
 
   /* find peak of histogram */
-  peak = -1;
+  peak = -99999;
   peak_idx = -1;
   for(b=0; b<ntics; b++){
     map_idx = pair_idx*ntics + b;
@@ -601,6 +601,7 @@ void estimate_all_angles(int n,
   angles[map_idx    ] = cl_idx12 * 2 * M_PI / n_theta + M_PI / 2;
   angles[map_idx + 1] = 0;
   angles[map_idx + 2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n_theta;
+
   cnt = 0;
 
   if(full_width!=-1){
@@ -623,11 +624,6 @@ void estimate_all_angles(int n,
   /* convert degree radian */
   angles[pair_idx*3 + 1] *= M_PI / (180*cnt);
 
-  //debug
-  // angles[pair_idx*3 + 0] = 0;
-  // angles[pair_idx*3 + 1] = 1;
-  // angles[pair_idx*3 + 2] = 2;
-  
 } /* kernel */
 
 extern "C" __global__
@@ -643,6 +639,6 @@ void angles_to_rots(int n,
   /* no-op when out of bounds */
   if(i >= n) return;
 
-  ang2orth(&(rotations[i*9]), angles[i*3],angles[i*3+1],angles[i*3+2]);
+  ang2orth(&(rotations[i*9]), angles[i*3], angles[i*3+1], angles[i*3+2]);
 
 }
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index c753426bdc..d28315ea71 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -831,8 +831,7 @@ def _estimate_all_Rijs(self, clmatrix):
         print(f"clmatrix {clmatrix}")
         res = self._estimate_all_Rijs_cupy(clmatrix)
         res_host = self._estimate_all_Rijs_host(clmatrix)
-        # if not np.allclose(res, res_host):
-        #     breakpoint()
+        breakpoint()
 
         return res
 
@@ -846,22 +845,21 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
         if str(full_width).lower() == "adaptive":
             full_width = -1
             raise
-        #XXX debug (check rest of kernel works before porting adaptive)
-
+        # XXX debug (check rest of kernel works before porting adaptive)
 
         sigma = 3.0
         sync = 1
 
         # transfer input to device
-        clmatrix = cp.asarray(self.clmatrix, dtype=int)
+        clmatrix = cp.asarray(clmatrix, dtype=np.int32)
 
         # workspace arrays
         ntics = int(180 / self.hist_bin_width)
-        n_pairs = (self.n_img * (self.n_img - 1)) // 2
+        n_pairs = self.n_img * (self.n_img - 1) // 2
         hist = cp.zeros((n_pairs, ntics), dtype=np.float64)
         # k_map stores the mapping of k indices to histogram bins
         k_map = cp.zeros(
-            (n_pairs, self.n_img), dtype=int
+            (n_pairs, self.n_img), dtype=np.int32
         )  # note, high memory , probably need to change...
         angles_map = cp.zeros(
             (n_pairs, self.n_img), dtype=np.float64
@@ -879,12 +877,12 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
             (nblkx, nblky),
             (blkszx, blkszy),
             (
-                int(self.n_img),
-                int(self.n_theta),
+                self.n_img,
+                self.n_theta,
                 np.float64(self.hist_bin_width),
-                int(full_width),
+                full_width,
                 np.float64(sigma),
-                int(sync),
+                sync,
                 clmatrix,  # input
                 hist,  # tmp
                 k_map,  # tmp
@@ -899,6 +897,7 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
         del angles_map
 
         print(f"First three angles\n{angles.get()[:3]}")
+        np.save("cupy_angles.npy", angles.get())
 
         # convert angles to rots
         rotations = cp.empty((n_pairs, 3, 3), dtype=np.float64)
@@ -964,8 +963,8 @@ def _syncmatrix_ij_vote_3n(self, clmatrix, i, j, k_list, n_theta):
             angles[2] = -np.pi / 2 - clmatrix[j, i] * 2 * np.pi / n_theta
             rot = Rotation.from_euler(angles).matrices
 
-            if i == 0 and j<=3:
-                print(f'host angles {i} {j} {angles}')
+            if i == 0 and j <= 3:
+                print(f"host angles {i} {j} {angles}")
 
         else:
             # This is for the case that images i and j correspond to the same

From 558fddc3aa7418380aa421740b900b65dc4c4fed Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 18 Sep 2024 10:00:04 -0400
Subject: [PATCH 18/45] fixed zyz angles conversion

---
 src/aspire/abinitio/commonline_sync3n.cu | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index 82cf859222..297b5dbb6d 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -7,22 +7,23 @@
 
 /* convert euler angles (a,b,c) in ZYZ to rotation matrix r */
 inline void ang2orth(double* r, double a, double b, double c){
-  double sa = sin(c);
+  double sa = sin(a);
   double sb = sin(b);
   double sc = sin(c);
   double ca = cos(a);
   double cb = cos(b);
   double cc = cos(c);
 
-  // bugs here
+  // ZYZ Proper Euler angles
+  // https://en.wikipedia.org/wiki/Euler_angles#Rotation_matrix
   r[0] = ca*cb*cc - sa*sc;
-  r[1] = sa*cb*cc + ca*sc;
-  r[2] = -sb*cc;
-  r[3] = -ca*cb*sc - sa*cc;
-  r[4] = -sa*cb*sc + ca*cc;
-  r[5] = sb*sc;
-  r[6] = ca*sb;
-  r[7] = sa*sb;
+  r[1] = -cc*sa -ca*cb*sc;
+  r[2] = ca*sb;
+  r[3] = ca*sc + cb*cc*sa;
+  r[4] = ca*cc - cb*sa*sc;
+  r[5] = sa*sb;
+  r[6] = -cc*sb;
+  r[7] = sb*sc;
   r[8] = cb;
 }
 

From cd26b1a944cc0ead45d4fe7af428f11724bbf655 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 18 Sep 2024 10:01:46 -0400
Subject: [PATCH 19/45] remove dbg prints

---
 src/aspire/abinitio/commonline_sync3n.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index d28315ea71..e8736533fc 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -828,10 +828,8 @@ def _estimate_all_Rijs(self, clmatrix):
         #     res = self._estimate_all_Rijs_cupy(Rijs)
         # else:
         #     res = self._estimate_all_Rijs_host(Rijs)
-        print(f"clmatrix {clmatrix}")
         res = self._estimate_all_Rijs_cupy(clmatrix)
         res_host = self._estimate_all_Rijs_host(clmatrix)
-        breakpoint()
 
         return res
 
@@ -896,9 +894,6 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
         del k_map
         del angles_map
 
-        print(f"First three angles\n{angles.get()[:3]}")
-        np.save("cupy_angles.npy", angles.get())
-
         # convert angles to rots
         rotations = cp.empty((n_pairs, 3, 3), dtype=np.float64)
 
@@ -963,9 +958,6 @@ def _syncmatrix_ij_vote_3n(self, clmatrix, i, j, k_list, n_theta):
             angles[2] = -np.pi / 2 - clmatrix[j, i] * 2 * np.pi / n_theta
             rot = Rotation.from_euler(angles).matrices
 
-            if i == 0 and j <= 3:
-                print(f"host angles {i} {j} {angles}")
-
         else:
             # This is for the case that images i and j correspond to the same
             # viewing direction and differ only by in-plane rotation.

From 9e7a3b460b4d220a3c25123383b969b95c8e96cf Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 18 Sep 2024 10:10:36 -0400
Subject: [PATCH 20/45] add adaptive width to kernel

---
 src/aspire/abinitio/commonline_base.py   |  4 +++-
 src/aspire/abinitio/commonline_sync3n.cu | 28 +++++++++++++++++++-----
 src/aspire/abinitio/commonline_sync3n.py |  8 +------
 3 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py
index 54ee26b290..c5d99e96b7 100644
--- a/src/aspire/abinitio/commonline_base.py
+++ b/src/aspire/abinitio/commonline_base.py
@@ -62,7 +62,9 @@ def __init__(
         self.n_theta = n_theta
         self.n_check = n_check
         self.hist_bin_width = hist_bin_width
-        self.full_width = full_width
+        if str(full_width).lower() == "adaptive":
+            full_width = -1
+        self.full_width = int(full_width)
         self.clmatrix = None
         self.max_shift = math.ceil(max_shift * self.n_res)
         self.shift_step = shift_step
diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index 297b5dbb6d..c40dd32c84 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -465,6 +465,8 @@ void estimate_all_angles(int n,
   double cond;
   double cos_phi2;
   int cnt;
+  double w_theta_need;
+  
 
   /* no-op when out of bounds */
   if(i >= n) return;
@@ -605,7 +607,23 @@ void estimate_all_angles(int n,
 
   cnt = 0;
 
-  if(full_width!=-1){
+  if(full_width==-1){
+    /* adaptive width*/
+    w_theta_need = 0;
+    cnt = 0;
+    while(cnt == 0){
+      w_theta_need += hist_bin_width;
+      for(k=0; k<n; k++){
+        // image k in peak bin
+        map_idx = pair_idx*n + k;
+        if(abs(k_map[map_idx] - peak_idx) < w_theta_need){
+          cnt += 1;  // this image was in the peak
+          angles[pair_idx*3 + 1] += angles_map[map_idx];  // accumulate angle
+        } /* fi < w_theta_need*/
+      } /* k */
+    } /* while */
+  } /* fi adaptive */
+  else {
     /* fixed width */
     // find satisfying indices
     for(k=0; k<n; k++){
@@ -614,12 +632,10 @@ void estimate_all_angles(int n,
       if(abs(k_map[map_idx] - peak_idx) < full_width){
         cnt += 1;  // this image was in the peak
         angles[pair_idx*3 + 1] += angles_map[map_idx];  // accumulate angle
-      } /* if */
+      } /* fi full_width */
     } /* k */
-  }
-  else {
-    /* adaptive width*/
-  }
+  } /* fi fixed width */
+
 
   /* Divide accumulated angles (resulting in the mean alpha angle) */   //(todo, better handle 0/0?)
   /* convert degree radian */
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index e8736533fc..b97d33fdc1 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -839,12 +839,6 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
         estimate_all_angles = self.__gpu_module.get_function("estimate_all_angles")
         angles_to_rots = self.__gpu_module.get_function("angles_to_rots")
 
-        full_width = self.full_width
-        if str(full_width).lower() == "adaptive":
-            full_width = -1
-            raise
-        # XXX debug (check rest of kernel works before porting adaptive)
-
         sigma = 3.0
         sync = 1
 
@@ -878,7 +872,7 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
                 self.n_img,
                 self.n_theta,
                 np.float64(self.hist_bin_width),
-                full_width,
+                self.full_width,
                 np.float64(sigma),
                 sync,
                 clmatrix,  # input

From 9d1d6e2ad68e9c2d29dd9a6546a80df4daf0e6ad Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 19 Sep 2024 07:57:18 -0400
Subject: [PATCH 21/45] 1d rij kernel

---
 src/aspire/abinitio/commonline_sync3n.cu | 291 ++++++++++++-----------
 src/aspire/abinitio/commonline_sync3n.py |  26 +-
 2 files changed, 162 insertions(+), 155 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index c40dd32c84..ec24101546 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -455,10 +455,10 @@ void estimate_all_angles(int n,
   // try toget k_map as uint16_t
   /* n n_img */
 
-  /* thread index (2d), represents "i" index, "j" index */
+  /* thread index (1d), represents "i" index */
   const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
-  const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
-  int k;
+
+  int j, k;
   int cl_diff1, cl_diff2, cl_diff3;
   double theta1, theta2, theta3;
   double c1, c2, c3;
@@ -466,25 +466,15 @@ void estimate_all_angles(int n,
   double cos_phi2;
   int cnt;
   double w_theta_need;
-  
 
   /* no-op when out of bounds */
   if(i >= n) return;
-  if(j >= n) return;
-  /* no-op lower triangle */
-  if(j <= i) return;
-
 
   // vote_ij creates good_k list per (i,j)
-  const int pair_idx = PAIR_IDX(n,i,j);
+  int pair_idx;
   int map_idx; /* tmp index var */
 
-  // We shouldn't need this... confirm and rm later.
-  //xxx
-  //  if(clmatrix[i*n + j] == -1) return;
-
-  int cl_idx12 = clmatrix[i*n + j];
-  int cl_idx21 = clmatrix[j*n + i];
+  int cl_idx12, cl_idx21;
   int cl_idx13, cl_idx31;
   int cl_idx23, cl_idx32;
   int ntics = 180. / hist_bin_width;
@@ -496,151 +486,166 @@ void estimate_all_angles(int n,
   int peak_idx;
   double peak;
 
-  /* Assume that k_list starts as [0,n] */
-  for(k=0; k<n; k++){
-
-    cl_idx13 = clmatrix[i*n + k];
-    cl_idx31 = clmatrix[k*n + i];
-    cl_idx23 = clmatrix[j*n + k];
-    cl_idx32 = clmatrix[k*n + j];
-
-    // test `k` values
-    if(k==i) continue;
-    if(cl_idx13 == -1) continue;  // i, k
-    //if(cl_idx31 == -1) continue;  // k, i
-    if(cl_idx23 == -1) continue;  // j, k
-    // if(cl_idx32 == -1) return;  // k, j
-
-    // self._get_cos_phis(cl_diff1, cl_diff2, cl_diff3, n_theta)
-    cl_diff1 = cl_idx13 - cl_idx12;
-    cl_diff2 = cl_idx23 - cl_idx21;
-    cl_diff3 = cl_idx32 - cl_idx31;
-
-    theta1 = cl_diff1 * 2 * M_PI / n_theta;
-    theta2 = cl_diff2 * 2 * M_PI / n_theta;
-    theta3 = cl_diff3 * 2 * M_PI / n_theta;
-
-    c1 = cos(theta1);
-    c2 = cos(theta2);
-    c3 = cos(theta3);
-
-    // test if we have a good_idx
-    cond = 1 + 2 * c1 * c2 * c3 - (c1*c1 + c2*c2 + c3*c3);
-    if(cond <= 1e-5) continue;  // that value of k is not good, skip
-
-    /* Calculated cos values of angle between i and j images */
-    if( sync == 1){
-
-      cos_phi2 = (c3 - c1*c2) / (sqrt(1 - c1*c1) * sqrt(1 - c2*c2));
-
-      /* Some synchronization must be applied when common line is out by 180 degrees.
-         Here fix the angles between c_ij(c_ji) and c_ik(c_jk) to be smaller than pi/2,
-         otherwise there will be an ambiguity between alpha and pi-alpha.
-      */
-
-      /* Check sync conditions */
-      ind1 = (theta1 > (M_PI + TOL_idx)) | (
-          (theta1 < -TOL_idx) & (theta1 > -M_PI)
-                                            );
-      ind2 = (theta2 > (M_PI + TOL_idx)) || (
-          (theta2 < -TOL_idx) && (theta2 > -M_PI)
-                                             );
-      if( (ind1 && !ind2) || (!ind1 && ind2)){
-        /* Apply sync */
-        cos_phi2 = -cos_phi2;
-      }
-
-    }  // end sync
-    else{
-      cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2));
-    } // end not sync
+  for(j=i+1; j<n; j++){
+    pair_idx = PAIR_IDX(n,i,j);
 
-    // clip [-1,1]
-    if(cos_phi2 >1){
-      cos_phi2 = 1;
+    /* initialize */
+    for(k=0; k<ntics; k++){
+      hist[i*ntics+k] = 0;
     }
-    if(cos_phi2 <-1){
-      cos_phi2 = -1;
+    for(k=0; k<3; k++){
+      angles[pair_idx*3+k] = 0;
     }
 
-    /* compute histogram contribution and index map */
-    angle = acos(cos_phi2) * 180. / M_PI;
-    // angle's bin
-    map_idx = pair_idx*n + k;
-    k_map[map_idx] = angle / hist_bin_width;  //check
-    angles_map[map_idx] = angle;  // degree
+    cl_idx12 = clmatrix[i*n + j];
+    cl_idx21 = clmatrix[j*n + i];
+
+    /* Assume that k_list starts as [0,n] */
+    for(k=0; k<n; k++){
+
+      cl_idx13 = clmatrix[i*n + k];
+      cl_idx31 = clmatrix[k*n + i];
+      cl_idx23 = clmatrix[j*n + k];
+      cl_idx32 = clmatrix[k*n + j];
+
+      // test `k` values
+      if(k==i) continue;
+      if(cl_idx13 == -1) continue;  // i, k
+      //if(cl_idx31 == -1) continue;  // k, i
+      if(cl_idx23 == -1) continue;  // j, k
+      // if(cl_idx32 == -1) return;  // k, j
+
+      // self._get_cos_phis(cl_diff1, cl_diff2, cl_diff3, n_theta)
+      cl_diff1 = cl_idx13 - cl_idx12;
+      cl_diff2 = cl_idx23 - cl_idx21;
+      cl_diff3 = cl_idx32 - cl_idx31;
+
+      theta1 = cl_diff1 * 2 * M_PI / n_theta;
+      theta2 = cl_diff2 * 2 * M_PI / n_theta;
+      theta3 = cl_diff3 * 2 * M_PI / n_theta;
+
+      c1 = cos(theta1);
+      c2 = cos(theta2);
+      c3 = cos(theta3);
+
+      // test if we have a good_idx
+      cond = 1 + 2 * c1 * c2 * c3 - (c1*c1 + c2*c2 + c3*c3);
+      if(cond <= 1e-5) continue;  // that value of k is not good, skip
+
+      /* Calculated cos values of angle between i and j images */
+      if( sync == 1){
+
+        cos_phi2 = (c3 - c1*c2) / (sqrt(1 - c1*c1) * sqrt(1 - c2*c2));
+
+        /* Some synchronization must be applied when common line is out by 180 degrees.
+           Here fix the angles between c_ij(c_ji) and c_ik(c_jk) to be smaller than pi/2,
+           otherwise there will be an ambiguity between alpha and pi-alpha.
+        */
+
+        /* Check sync conditions */
+        ind1 = (theta1 > (M_PI + TOL_idx)) | (
+            (theta1 < -TOL_idx) & (theta1 > -M_PI)
+                                              );
+        ind2 = (theta2 > (M_PI + TOL_idx)) || (
+            (theta2 < -TOL_idx) && (theta2 > -M_PI)
+                                               );
+        if( (ind1 && !ind2) || (!ind1 && ind2)){
+          /* Apply sync */
+          cos_phi2 = -cos_phi2;
+        }
+
+      }  // end sync
+      else{
+        cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2));
+      } // end not sync
+
+      // clip [-1,1]
+      if(cos_phi2 >1){
+        cos_phi2 = 1;
+      }
+      if(cos_phi2 <-1){
+        cos_phi2 = -1;
+      }
+
+      /* compute histogram contribution and index map */
+      angle = acos(cos_phi2) * 180. / M_PI;
+      // angle's bin
+      map_idx = i*n + k;
+      k_map[map_idx] = angle / hist_bin_width;  //check
+      angles_map[map_idx] = angle;  // degree
+      for(b=0; b<ntics; b++){
+        ga = b*(180./ntics);  // grid angle // todo, just compute in radians to avoid extra arithmetic
+        ga = (ga - angle);
+        // histogram contribution
+        hist[i*ntics + b ] += exp(
+            -(ga*ga) / ( 2*sigma*sigma));
+      } /* bins */
+    } /* k*/
+
+    /*
+      At this point, the kernel should have accumulated all "good"
+      k contributions into the histogram.
+      The next section solves the histogram.
+
+      Potentially this could be a distinct kernel,
+      but would require looking up cl_idx again or assigning them as below.
+    */
+
+    /* find peak of histogram */
+    peak = -99999;
+    peak_idx = -1;
     for(b=0; b<ntics; b++){
-      ga = b*(180./ntics);  // grid angle // todo, just compute in radians to avoid extra arithmetic
-      ga = (ga - angle);
-      // histogram contribution
-      hist[pair_idx*ntics + b ] += exp(
-          -(ga*ga) / ( 2*sigma*sigma));
-    } /* bins */
-  } /* k*/
-
-  /*
-    At this point, the kernel should have accumulated all "good"
-    k contributions into the histogram.
-    The next section solves the histogram.
-
-    Potentially this could be a distinct kernel,
-    but would require looking up cl_idx again or assigning them as below.
-  */
-
-  /* find peak of histogram */
-  peak = -99999;
-  peak_idx = -1;
-  for(b=0; b<ntics; b++){
-    map_idx = pair_idx*ntics + b;
-    if(hist[map_idx] > peak){
-      peak = hist[map_idx];
-      peak_idx = b;
+      map_idx = i*ntics + b;
+      if(hist[map_idx] > peak){
+        peak = hist[map_idx];
+        peak_idx = b;
+      }
     }
-  }
 
-  /* find mean of rotations */
-  // initialize
-  map_idx = pair_idx*3;
-  angles[map_idx    ] = cl_idx12 * 2 * M_PI / n_theta + M_PI / 2;
-  angles[map_idx + 1] = 0;
-  angles[map_idx + 2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n_theta;
+    /* find mean of rotations */
+    // initialize
+    map_idx = pair_idx*3;
+    angles[map_idx    ] = cl_idx12 * 2 * M_PI / n_theta + M_PI / 2;
+    angles[map_idx + 1] = 0;
+    angles[map_idx + 2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n_theta;
 
-  cnt = 0;
-
-  if(full_width==-1){
-    /* adaptive width*/
-    w_theta_need = 0;
     cnt = 0;
-    while(cnt == 0){
-      w_theta_need += hist_bin_width;
+
+    if(full_width==-1){
+      /* adaptive width*/
+      w_theta_need = 0;
+      cnt = 0;
+      while(cnt == 0){
+        w_theta_need += hist_bin_width;
+        for(k=0; k<n; k++){
+          // image k in peak bin
+          map_idx = i*n + k;
+          if(abs(k_map[map_idx] - peak_idx) < w_theta_need){
+            cnt += 1;  // this image was in the peak
+            angles[pair_idx*3 + 1] += angles_map[map_idx];  // accumulate angle
+          } /* fi < w_theta_need*/
+        } /* k */
+      } /* while */
+    } /* fi adaptive */
+    else {
+      /* fixed width */
+      // find satisfying indices
       for(k=0; k<n; k++){
         // image k in peak bin
-        map_idx = pair_idx*n + k;
-        if(abs(k_map[map_idx] - peak_idx) < w_theta_need){
+        map_idx = i*n + k;
+        if(abs(k_map[map_idx] - peak_idx) < full_width){
           cnt += 1;  // this image was in the peak
           angles[pair_idx*3 + 1] += angles_map[map_idx];  // accumulate angle
-        } /* fi < w_theta_need*/
+        } /* fi full_width */
       } /* k */
-    } /* while */
-  } /* fi adaptive */
-  else {
-    /* fixed width */
-    // find satisfying indices
-    for(k=0; k<n; k++){
-      // image k in peak bin
-      map_idx = pair_idx*n + k;
-      if(abs(k_map[map_idx] - peak_idx) < full_width){
-        cnt += 1;  // this image was in the peak
-        angles[pair_idx*3 + 1] += angles_map[map_idx];  // accumulate angle
-      } /* fi full_width */
-    } /* k */
-  } /* fi fixed width */
+    } /* fi fixed width */
 
 
-  /* Divide accumulated angles (resulting in the mean alpha angle) */   //(todo, better handle 0/0?)
-  /* convert degree radian */
-  angles[pair_idx*3 + 1] *= M_PI / (180*cnt);
+    /* Divide accumulated angles (resulting in the mean alpha angle) */   //(todo, better handle 0/0?)
+    /* convert degree radian */
+    angles[pair_idx*3 + 1] *= M_PI / (180*cnt);
 
+  } /* j */
 } /* kernel */
 
 extern "C" __global__
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index b97d33fdc1..e753960095 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -828,8 +828,16 @@ def _estimate_all_Rijs(self, clmatrix):
         #     res = self._estimate_all_Rijs_cupy(Rijs)
         # else:
         #     res = self._estimate_all_Rijs_host(Rijs)
+        from time import perf_counter
+
+        tic = perf_counter()
         res = self._estimate_all_Rijs_cupy(clmatrix)
+        toc1 = perf_counter()
+        print("_estimate_all_Rijs_cupy", toc1 - tic)
+
         res_host = self._estimate_all_Rijs_host(clmatrix)
+        toc2 = perf_counter()
+        print("_estimate_all_Rijs_host", toc2 - toc1)
 
         return res
 
@@ -848,26 +856,20 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
         # workspace arrays
         ntics = int(180 / self.hist_bin_width)
         n_pairs = self.n_img * (self.n_img - 1) // 2
-        hist = cp.zeros((n_pairs, ntics), dtype=np.float64)
+        hist = cp.zeros((self.n_img, ntics), dtype=np.float64)
         # k_map stores the mapping of k indices to histogram bins
-        k_map = cp.zeros(
-            (n_pairs, self.n_img), dtype=np.int32
-        )  # note, high memory , probably need to change...
-        angles_map = cp.zeros(
-            (n_pairs, self.n_img), dtype=np.float64
-        )  # note, high memory , probably need to change...
+        k_map = cp.zeros((self.n_img, self.n_img), dtype=np.int32)
+        angles_map = cp.zeros((self.n_img, self.n_img), dtype=np.float64)
         angles = cp.zeros((n_pairs, 3), dtype=np.float64)
 
         # Configure grid of blocks
-        blkszx = 32
+        blkszx = 1024
         nblkx = (self.n_img + blkszx - 1) // blkszx
-        blkszy = 32
-        nblky = (self.n_img + blkszy - 1) // blkszy
 
         logger.info("Launching `estimate_all_angles` kernel.")
         estimate_all_angles(
-            (nblkx, nblky),
-            (blkszx, blkszy),
+            (nblkx,),
+            (blkszx,),
             (
                 self.n_img,
                 self.n_theta,

From 1882d6c09c999e030c0393b03a46d08edee2c1b8 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 19 Sep 2024 08:19:12 -0400
Subject: [PATCH 22/45] implement nvcc backend and int16_t

---
 src/aspire/abinitio/commonline_base.cu   |  3 ++-
 src/aspire/abinitio/commonline_base.py   | 10 ++++++++--
 src/aspire/abinitio/commonline_sync3n.cu | 10 ++++++++--
 src/aspire/abinitio/commonline_sync3n.py |  6 +++---
 4 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu
index a6bad78aea..eb493acf44 100644
--- a/src/aspire/abinitio/commonline_base.cu
+++ b/src/aspire/abinitio/commonline_base.cu
@@ -1,7 +1,8 @@
+#include <stdint.h>
 #include <cupy/complex.cuh>
 
 extern "C" __global__
-void build_clmatrix_kernel(int n, int m, int r, const complex<double>* __restrict__ pf, double* __restrict__ clmatrix, double* __restrict__  cl_dist, double* __restrict__ shifts_1d, int n_shifts, double* __restrict__ shifts,  const complex<double>* __restrict__ shift_phases)
+void build_clmatrix_kernel(int n, int m, int r, const complex<double>* __restrict__ pf, int16_t* __restrict__ clmatrix, double* __restrict__  cl_dist, double* __restrict__ shifts_1d, int n_shifts, double* __restrict__ shifts,  const complex<double>* __restrict__ shift_phases)
 {
   /* n n_img */
   /* m,r st (n, m, r) = pf.shape, ie len(pf[i])  */
diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py
index c5d99e96b7..efb33280e2 100644
--- a/src/aspire/abinitio/commonline_base.py
+++ b/src/aspire/abinitio/commonline_base.py
@@ -72,6 +72,12 @@ def __init__(
         self.rotations = None
         self._pf = None
 
+        # Sanity limit to match potential clmatrix dtype of int16.
+        if self.n_img > (2**15 - 1):
+            raise NotImplementedError(
+                "Commonlines implementation limited to <2**15 images."
+            )
+
         # Auto configure GPU
         self.__gpu_module = None
         try:
@@ -297,7 +303,7 @@ def build_clmatrix_cu(self):
         # the common line with image j. Note the common line index
         # starts from 0 instead of 1 as Matlab version. -1 means
         # there is no common line such as clmatrix[i,i].
-        clmatrix = -cp.ones((n_img, n_img), dtype=np.float64)
+        clmatrix = -cp.ones((n_img, n_img), dtype=np.int16)
         # When cl_dist[i, j] is not -1, it stores the maximum value
         # of correlation between image i and j for all possible 1D shifts.
         # We will use cl_dist[i, j] = -1 (including j<=i) to
@@ -711,4 +717,4 @@ def __init_cupy_module():
             module_code = fh.read()
 
         # CUPY compile the CUDA code
-        return cp.RawModule(code=module_code)
+        return cp.RawModule(code=module_code, backend="nvcc")
diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index ec24101546..a36744bef5 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -1,3 +1,5 @@
+#include "stdint.h"
+
 # define M_PI           3.14159265358979323846  /* pi */
 
 /* from i,j indices to the common index in the N-choose-2 sized array */
@@ -6,6 +8,7 @@
 
 
 /* convert euler angles (a,b,c) in ZYZ to rotation matrix r */
+__host__ __device__
 inline void ang2orth(double* r, double a, double b, double c){
   double sa = sin(a);
   double sb = sin(b);
@@ -28,6 +31,7 @@ inline void ang2orth(double* r, double a, double b, double c){
 }
 
 
+__host__ __device__
 inline void mult_3x3(double *out, double *R1, double *R2) {
   /* 3X3 matrices multiplication: out = R1*R2
    * Note, this differs from the MATLAB mult_3x3.
@@ -45,6 +49,7 @@ inline void mult_3x3(double *out, double *R1, double *R2) {
   }
 }
 
+__host__ __device__
 inline void JRJ(double *R, double *A) {
   /* multiple 3X3 matrix by J from both sizes: A = JRJ */
   A[0]=R[0];
@@ -58,6 +63,7 @@ inline void JRJ(double *R, double *A) {
   A[8]=R[8];
 }
 
+__host__ __device__
 inline double diff_norm_3x3(const double *R1, const double *R2) {
   /* difference 2 matrices and return squared norm: ||R1-R2||^2 */
   int i;
@@ -446,9 +452,9 @@ void estimate_all_angles(int n,
                          int full_width,
                          double sigma,
                          int sync,
-                         int* __restrict__ clmatrix,
+                         int16_t* __restrict__ clmatrix,
                          double* __restrict__ hist,
-                         int* __restrict__ k_map,
+                         uint16_t* __restrict__ k_map,
                          double* __restrict__ angles_map,
                          double* __restrict__ angles)
 {
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index e753960095..7c98ac69f7 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -851,14 +851,14 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
         sync = 1
 
         # transfer input to device
-        clmatrix = cp.asarray(clmatrix, dtype=np.int32)
+        clmatrix = cp.asarray(clmatrix, dtype=np.int16)
 
         # workspace arrays
         ntics = int(180 / self.hist_bin_width)
         n_pairs = self.n_img * (self.n_img - 1) // 2
         hist = cp.zeros((self.n_img, ntics), dtype=np.float64)
         # k_map stores the mapping of k indices to histogram bins
-        k_map = cp.zeros((self.n_img, self.n_img), dtype=np.int32)
+        k_map = cp.zeros((self.n_img, self.n_img), dtype=np.uint16)
         angles_map = cp.zeros((self.n_img, self.n_img), dtype=np.float64)
         angles = cp.zeros((n_pairs, 3), dtype=np.float64)
 
@@ -1157,4 +1157,4 @@ def __init_cupy_module():
             module_code = fh.read()
 
         # CUPY compile the CUDA code
-        return cp.RawModule(code=module_code)
+        return cp.RawModule(code=module_code, backend="nvcc")

From 306e35efbe1ade24d879f28300aa9bb58bce98b6 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 19 Sep 2024 14:10:09 -0400
Subject: [PATCH 23/45] split kernels

---
 src/aspire/abinitio/commonline_sync3n.cu | 349 +++++++++++++----------
 src/aspire/abinitio/commonline_sync3n.py |  55 ++--
 2 files changed, 234 insertions(+), 170 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index a36744bef5..09d4aa80be 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -446,17 +446,18 @@ void triangle_scores_inner(int n, double* Rijs, int n_intervals, unsigned int* s
 };
 
 extern "C" __global__
-void estimate_all_angles(int n,
-                         int n_theta,
-                         double hist_bin_width,
-                         int full_width,
-                         double sigma,
-                         int sync,
-                         int16_t* __restrict__ clmatrix,
-                         double* __restrict__ hist,
-                         uint16_t* __restrict__ k_map,
-                         double* __restrict__ angles_map,
-                         double* __restrict__ angles)
+void estimate_all_angles1(int j,
+                          int n,
+                          int n_theta,
+                          double hist_bin_width,
+                          int full_width,
+                          double sigma,
+                          int sync,
+                          int16_t* __restrict__ clmatrix,
+                          double* __restrict__ hist,
+                          uint16_t* __restrict__ k_map,
+                          double* __restrict__ angles_map,
+                          double* __restrict__ angles)
 {
   // try toget k_map as uint16_t
   /* n n_img */
@@ -464,7 +465,7 @@ void estimate_all_angles(int n,
   /* thread index (1d), represents "i" index */
   const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
 
-  int j, k;
+  int k;
   int cl_diff1, cl_diff2, cl_diff3;
   double theta1, theta2, theta3;
   double c1, c2, c3;
@@ -489,169 +490,213 @@ void estimate_all_angles(int n,
   double ga;
   double angle;
   int b;
-  int peak_idx;
-  double peak;
 
-  for(j=i+1; j<n; j++){
-    pair_idx = PAIR_IDX(n,i,j);
 
-    /* initialize */
-    for(k=0; k<ntics; k++){
-      hist[i*ntics+k] = 0;
+  //  for(j=i+1; j<n; j++){
+  pair_idx = PAIR_IDX(n,i,j);
+
+  /* initialize */
+  for(k=0; k<ntics; k++){
+    hist[i*ntics+k] = 0;
+  }
+
+  // delete? cupy...
+  for(k=0; k<3; k++){
+    angles[pair_idx*3+k] = 0;
+  }
+
+  cl_idx12 = clmatrix[i*n + j];
+  cl_idx21 = clmatrix[j*n + i];
+
+  /* Assume that k_list starts as [0,n] */
+  for(k=0; k<n; k++){
+
+    cl_idx13 = clmatrix[i*n + k];
+    cl_idx31 = clmatrix[k*n + i];
+    cl_idx23 = clmatrix[j*n + k];
+    cl_idx32 = clmatrix[k*n + j];
+
+    // test `k` values
+    if(k==i) continue;
+    if(cl_idx13 == -1) continue;  // i, k
+    //if(cl_idx31 == -1) continue;  // k, i
+    if(cl_idx23 == -1) continue;  // j, k
+    // if(cl_idx32 == -1) return;  // k, j
+
+    // self._get_cos_phis(cl_diff1, cl_diff2, cl_diff3, n_theta)
+    cl_diff1 = cl_idx13 - cl_idx12;
+    cl_diff2 = cl_idx23 - cl_idx21;
+    cl_diff3 = cl_idx32 - cl_idx31;
+
+    theta1 = cl_diff1 * 2 * M_PI / n_theta;
+    theta2 = cl_diff2 * 2 * M_PI / n_theta;
+    theta3 = cl_diff3 * 2 * M_PI / n_theta;
+
+    c1 = cos(theta1);
+    c2 = cos(theta2);
+    c3 = cos(theta3);
+
+    // test if we have a good_idx
+    cond = 1 + 2 * c1 * c2 * c3 - (c1*c1 + c2*c2 + c3*c3);
+    if(cond <= 1e-5) continue;  // that value of k is not good, skip
+
+    /* Calculated cos values of angle between i and j images */
+    if( sync == 1){
+
+      cos_phi2 = (c3 - c1*c2) / (sqrt(1 - c1*c1) * sqrt(1 - c2*c2));
+
+      /* Some synchronization must be applied when common line is out by 180 degrees.
+         Here fix the angles between c_ij(c_ji) and c_ik(c_jk) to be smaller than pi/2,
+         otherwise there will be an ambiguity between alpha and pi-alpha.
+      */
+
+      /* Check sync conditions */
+      ind1 = (theta1 > (M_PI + TOL_idx)) | (
+          (theta1 < -TOL_idx) & (theta1 > -M_PI)
+                                            );
+      ind2 = (theta2 > (M_PI + TOL_idx)) || (
+          (theta2 < -TOL_idx) && (theta2 > -M_PI)
+                                             );
+      if( (ind1 && !ind2) || (!ind1 && ind2)){
+        /* Apply sync */
+        cos_phi2 = -cos_phi2;
+      }
+
+    }  // end sync
+    else{
+      cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2));
+    } // end not sync
+
+    // clip [-1,1]
+    if(cos_phi2 >1){
+      cos_phi2 = 1;
     }
-    for(k=0; k<3; k++){
-      angles[pair_idx*3+k] = 0;
+    if(cos_phi2 <-1){
+      cos_phi2 = -1;
     }
 
-    cl_idx12 = clmatrix[i*n + j];
-    cl_idx21 = clmatrix[j*n + i];
+    /* compute histogram contribution and index map */
+    angle = acos(cos_phi2) * 180. / M_PI;
+    // angle's bin
+    map_idx = i*n + k;
+    k_map[map_idx] = angle / hist_bin_width;  //check
+    angles_map[map_idx] = angle;  // degree
+    for(b=0; b<ntics; b++){
+      ga = b*(180./ntics);  // grid angle // todo, just compute in radians to avoid extra arithmetic
+      ga = (ga - angle);
+      // histogram contribution
+      hist[i*ntics + b ] += exp(
+          -(ga*ga) / ( 2*sigma*sigma));
+    } /* bins */
+  } /* k*/
+
+  /*
+    At this point, the kernel should have accumulated all "good"
+    k contributions into the histogram for I j.
+    The next section solves the histogram.
+
+    The angles are also initialized while we have the cl_idx values.
+  */
+
+  // initialize
+  map_idx = pair_idx*3;
+  angles[map_idx    ] = cl_idx12 * 2 * M_PI / n_theta + M_PI / 2;
+  angles[map_idx + 1] = 0;
+  angles[map_idx + 2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n_theta;
+
+
+  //  } /* j */
+} /* kernel */
 
-    /* Assume that k_list starts as [0,n] */
-    for(k=0; k<n; k++){
 
-      cl_idx13 = clmatrix[i*n + k];
-      cl_idx31 = clmatrix[k*n + i];
-      cl_idx23 = clmatrix[j*n + k];
-      cl_idx32 = clmatrix[k*n + j];
-
-      // test `k` values
-      if(k==i) continue;
-      if(cl_idx13 == -1) continue;  // i, k
-      //if(cl_idx31 == -1) continue;  // k, i
-      if(cl_idx23 == -1) continue;  // j, k
-      // if(cl_idx32 == -1) return;  // k, j
-
-      // self._get_cos_phis(cl_diff1, cl_diff2, cl_diff3, n_theta)
-      cl_diff1 = cl_idx13 - cl_idx12;
-      cl_diff2 = cl_idx23 - cl_idx21;
-      cl_diff3 = cl_idx32 - cl_idx31;
-
-      theta1 = cl_diff1 * 2 * M_PI / n_theta;
-      theta2 = cl_diff2 * 2 * M_PI / n_theta;
-      theta3 = cl_diff3 * 2 * M_PI / n_theta;
-
-      c1 = cos(theta1);
-      c2 = cos(theta2);
-      c3 = cos(theta3);
-
-      // test if we have a good_idx
-      cond = 1 + 2 * c1 * c2 * c3 - (c1*c1 + c2*c2 + c3*c3);
-      if(cond <= 1e-5) continue;  // that value of k is not good, skip
-
-      /* Calculated cos values of angle between i and j images */
-      if( sync == 1){
-
-        cos_phi2 = (c3 - c1*c2) / (sqrt(1 - c1*c1) * sqrt(1 - c2*c2));
-
-        /* Some synchronization must be applied when common line is out by 180 degrees.
-           Here fix the angles between c_ij(c_ji) and c_ik(c_jk) to be smaller than pi/2,
-           otherwise there will be an ambiguity between alpha and pi-alpha.
-        */
-
-        /* Check sync conditions */
-        ind1 = (theta1 > (M_PI + TOL_idx)) | (
-            (theta1 < -TOL_idx) & (theta1 > -M_PI)
-                                              );
-        ind2 = (theta2 > (M_PI + TOL_idx)) || (
-            (theta2 < -TOL_idx) && (theta2 > -M_PI)
-                                               );
-        if( (ind1 && !ind2) || (!ind1 && ind2)){
-          /* Apply sync */
-          cos_phi2 = -cos_phi2;
-        }
 
-      }  // end sync
-      else{
-        cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2));
-      } // end not sync
 
-      // clip [-1,1]
-      if(cos_phi2 >1){
-        cos_phi2 = 1;
-      }
-      if(cos_phi2 <-1){
-        cos_phi2 = -1;
-      }
+extern "C" __global__
+void estimate_all_angles2(int j,
+                          int n,
+                          int n_theta,
+                          double hist_bin_width,
+                          int full_width,
+                          double* __restrict__ hist,
+                          uint16_t* __restrict__ k_map,
+                          double* __restrict__ angles_map,
+                          double* __restrict__ angles)
+{
+  // try toget k_map as uint16_t
+  /* n n_img */
 
-      /* compute histogram contribution and index map */
-      angle = acos(cos_phi2) * 180. / M_PI;
-      // angle's bin
-      map_idx = i*n + k;
-      k_map[map_idx] = angle / hist_bin_width;  //check
-      angles_map[map_idx] = angle;  // degree
-      for(b=0; b<ntics; b++){
-        ga = b*(180./ntics);  // grid angle // todo, just compute in radians to avoid extra arithmetic
-        ga = (ga - angle);
-        // histogram contribution
-        hist[i*ntics + b ] += exp(
-            -(ga*ga) / ( 2*sigma*sigma));
-      } /* bins */
-    } /* k*/
-
-    /*
-      At this point, the kernel should have accumulated all "good"
-      k contributions into the histogram.
-      The next section solves the histogram.
-
-      Potentially this could be a distinct kernel,
-      but would require looking up cl_idx again or assigning them as below.
-    */
-
-    /* find peak of histogram */
-    peak = -99999;
-    peak_idx = -1;
-    for(b=0; b<ntics; b++){
-      map_idx = i*ntics + b;
-      if(hist[map_idx] > peak){
-        peak = hist[map_idx];
-        peak_idx = b;
-      }
+  /* thread index (1d), represents "i" index */
+  const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+  int k;
+  int cnt;
+  double w_theta_need;
+
+  /* no-op when out of bounds */
+  if(i >= n) return;
+
+  // vote_ij creates good_k list per (i,j)
+  int pair_idx;
+  int map_idx; /* tmp index var */
+
+  int ntics = 180. / hist_bin_width;
+  int b;
+  int peak_idx;
+  double peak;
+
+  //for(j=i+1; j<n; j++){
+  pair_idx = PAIR_IDX(n,i,j);
+
+  /* find peak of histogram */
+  peak = -99999;
+  peak_idx = -1;
+  for(b=0; b<ntics; b++){
+    map_idx = i*ntics + b;
+    if(hist[map_idx] > peak){
+      peak = hist[map_idx];
+      peak_idx = b;
     }
+  }
 
-    /* find mean of rotations */
-    // initialize
-    map_idx = pair_idx*3;
-    angles[map_idx    ] = cl_idx12 * 2 * M_PI / n_theta + M_PI / 2;
-    angles[map_idx + 1] = 0;
-    angles[map_idx + 2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n_theta;
+  /* find mean of rotations */
 
-    cnt = 0;
+  cnt = 0;
 
-    if(full_width==-1){
-      /* adaptive width*/
-      w_theta_need = 0;
-      cnt = 0;
-      while(cnt == 0){
-        w_theta_need += hist_bin_width;
-        for(k=0; k<n; k++){
-          // image k in peak bin
-          map_idx = i*n + k;
-          if(abs(k_map[map_idx] - peak_idx) < w_theta_need){
-            cnt += 1;  // this image was in the peak
-            angles[pair_idx*3 + 1] += angles_map[map_idx];  // accumulate angle
-          } /* fi < w_theta_need*/
-        } /* k */
-      } /* while */
-    } /* fi adaptive */
-    else {
-      /* fixed width */
-      // find satisfying indices
+  if(full_width==-1){
+    /* adaptive width*/
+    w_theta_need = 0;
+    cnt = 0;
+    while(cnt == 0){
+      w_theta_need += hist_bin_width;
       for(k=0; k<n; k++){
         // image k in peak bin
         map_idx = i*n + k;
-        if(abs(k_map[map_idx] - peak_idx) < full_width){
+        if(abs(k_map[map_idx] - peak_idx) < w_theta_need){
           cnt += 1;  // this image was in the peak
           angles[pair_idx*3 + 1] += angles_map[map_idx];  // accumulate angle
-        } /* fi full_width */
+        } /* fi < w_theta_need*/
       } /* k */
-    } /* fi fixed width */
+    } /* while */
+  } /* fi adaptive */
+  else {
+    /* fixed width */
+    // find satisfying indices
+    for(k=0; k<n; k++){
+      // image k in peak bin
+      map_idx = i*n + k;
+      if(abs(k_map[map_idx] - peak_idx) < full_width){
+        cnt += 1;  // this image was in the peak
+        angles[pair_idx*3 + 1] += angles_map[map_idx];  // accumulate angle
+      } /* fi full_width */
+    } /* k */
+  } /* fi fixed width */
 
 
-    /* Divide accumulated angles (resulting in the mean alpha angle) */   //(todo, better handle 0/0?)
-    /* convert degree radian */
-    angles[pair_idx*3 + 1] *= M_PI / (180*cnt);
+  /* Divide accumulated angles (resulting in the mean alpha angle) */   //(todo, better handle 0/0?)
+  /* convert degree radian */
+  angles[pair_idx*3 + 1] *= M_PI / (180*cnt);
 
-  } /* j */
+  //  } /* j */
 } /* kernel */
 
 extern "C" __global__
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 7c98ac69f7..24beb99898 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -844,7 +844,8 @@ def _estimate_all_Rijs(self, clmatrix):
     def _estimate_all_Rijs_cupy(self, clmatrix):
         import cupy as cp
 
-        estimate_all_angles = self.__gpu_module.get_function("estimate_all_angles")
+        estimate_all_angles1 = self.__gpu_module.get_function("estimate_all_angles1")
+        estimate_all_angles2 = self.__gpu_module.get_function("estimate_all_angles2")
         angles_to_rots = self.__gpu_module.get_function("angles_to_rots")
 
         sigma = 3.0
@@ -867,23 +868,41 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
         nblkx = (self.n_img + blkszx - 1) // blkszx
 
         logger.info("Launching `estimate_all_angles` kernel.")
-        estimate_all_angles(
-            (nblkx,),
-            (blkszx,),
-            (
-                self.n_img,
-                self.n_theta,
-                np.float64(self.hist_bin_width),
-                self.full_width,
-                np.float64(sigma),
-                sync,
-                clmatrix,  # input
-                hist,  # tmp
-                k_map,  # tmp
-                angles_map,  # tmp
-                angles,  # output
-            ),
-        )
+        for j in range(0, self.n_img):
+            estimate_all_angles1(
+                (nblkx,),
+                (blkszx,),
+                (
+                    j,
+                    self.n_img,
+                    self.n_theta,
+                    np.float64(self.hist_bin_width),
+                    self.full_width,
+                    np.float64(sigma),
+                    sync,
+                    clmatrix,  # input
+                    hist,  # tmp
+                    k_map,  # tmp
+                    angles_map,  # tmp
+                    angles,  # output
+                ),
+            )
+
+            estimate_all_angles2(
+                (nblkx,),
+                (blkszx,),
+                (
+                    j,
+                    self.n_img,
+                    self.n_theta,
+                    np.float64(self.hist_bin_width),
+                    self.full_width,
+                    hist,  # tmp
+                    k_map,  # tmp
+                    angles_map,  # tmp
+                    angles,  # output
+                ),
+            )
 
         # no longer need workspace vars
         del hist

From 18b0548562a8030d27083fa4e491c34a8361045b Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 19 Sep 2024 15:32:09 -0400
Subject: [PATCH 24/45] general cleanup

---
 src/aspire/abinitio/commonline_sync3n.cu | 126 ++++++++++-------------
 src/aspire/abinitio/commonline_sync3n.py |   1 -
 2 files changed, 57 insertions(+), 70 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index 09d4aa80be..5d3d0340d1 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -17,8 +17,8 @@ inline void ang2orth(double* r, double a, double b, double c){
   double cb = cos(b);
   double cc = cos(c);
 
-  // ZYZ Proper Euler angles
-  // https://en.wikipedia.org/wiki/Euler_angles#Rotation_matrix
+  /* ZYZ Proper Euler angles */
+  /* https://en.wikipedia.org/wiki/Euler_angles#Rotation_matrix */
   r[0] = ca*cb*cc - sa*sc;
   r[1] = -cc*sa -ca*cb*sc;
   r[2] = ca*sb;
@@ -459,11 +459,11 @@ void estimate_all_angles1(int j,
                           double* __restrict__ angles_map,
                           double* __restrict__ angles)
 {
-  // try toget k_map as uint16_t
   /* n n_img */
 
   /* thread index (1d), represents "i" index */
   const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+  /* j is image j index */
 
   int k;
   int cl_diff1, cl_diff2, cl_diff3;
@@ -477,8 +477,6 @@ void estimate_all_angles1(int j,
   /* no-op when out of bounds */
   if(i >= n) return;
 
-  // vote_ij creates good_k list per (i,j)
-  int pair_idx;
   int map_idx; /* tmp index var */
 
   int cl_idx12, cl_idx21;
@@ -487,28 +485,22 @@ void estimate_all_angles1(int j,
   int ntics = 180. / hist_bin_width;
   const double TOL_idx = 1e-12;
   bool ind1, ind2;
-  double ga;
-  double angle;
+  double grid_angle, angle_diff, angle;
   int b;
+  const double two_sigma_sq = 2*sigma*sigma;
 
 
-  //  for(j=i+1; j<n; j++){
-  pair_idx = PAIR_IDX(n,i,j);
+  const int pair_idx = PAIR_IDX(n,i,j);
 
   /* initialize */
-  for(k=0; k<ntics; k++){
-    hist[i*ntics+k] = 0;
-  }
-
-  // delete? cupy...
-  for(k=0; k<3; k++){
-    angles[pair_idx*3+k] = 0;
+  for(b=0; b<ntics; b++){
+    hist[i*ntics + b] = 0;
   }
 
   cl_idx12 = clmatrix[i*n + j];
   cl_idx21 = clmatrix[j*n + i];
 
-  /* Assume that k_list starts as [0,n] */
+  /* Assume that k_list starts as all n images */
   for(k=0; k<n; k++){
 
     cl_idx13 = clmatrix[i*n + k];
@@ -516,14 +508,14 @@ void estimate_all_angles1(int j,
     cl_idx23 = clmatrix[j*n + k];
     cl_idx32 = clmatrix[k*n + j];
 
-    // test `k` values
+    /* test `k` values */
     if(k==i) continue;
-    if(cl_idx13 == -1) continue;  // i, k
-    //if(cl_idx31 == -1) continue;  // k, i
-    if(cl_idx23 == -1) continue;  // j, k
-    // if(cl_idx32 == -1) return;  // k, j
+    if(cl_idx13 == -1) continue;  /* i, k */
+    //if(cl_idx31 == -1) continue;  // k, i ?
+    if(cl_idx23 == -1) continue;  /* j, k */
+    // if(cl_idx32 == -1) return;  // k, j ?
 
-    // self._get_cos_phis(cl_diff1, cl_diff2, cl_diff3, n_theta)
+    /* get cosine angles */
     cl_diff1 = cl_idx13 - cl_idx12;
     cl_diff2 = cl_idx23 - cl_idx21;
     cl_diff3 = cl_idx32 - cl_idx31;
@@ -536,18 +528,19 @@ void estimate_all_angles1(int j,
     c2 = cos(theta2);
     c3 = cos(theta3);
 
-    // test if we have a good_idx
+    /* test if we have a good index */
     cond = 1 + 2 * c1 * c2 * c3 - (c1*c1 + c2*c2 + c3*c3);
-    if(cond <= 1e-5) continue;  // that value of k is not good, skip
+    if(cond <= 1e-5) continue;  /* current value of k is not good, skip */
 
     /* Calculated cos values of angle between i and j images */
     if( sync == 1){
 
       cos_phi2 = (c3 - c1*c2) / (sqrt(1 - c1*c1) * sqrt(1 - c2*c2));
 
-      /* Some synchronization must be applied when common line is out by 180 degrees.
-         Here fix the angles between c_ij(c_ji) and c_ik(c_jk) to be smaller than pi/2,
-         otherwise there will be an ambiguity between alpha and pi-alpha.
+      /*
+        Some synchronization must be applied when common line is out by 180 degrees.
+        Here fix the angles between c_ij(c_ji) and c_ik(c_jk) to be smaller than pi/2,
+        otherwise there will be an ambiguity between alpha and pi-alpha.
       */
 
       /* Check sync conditions */
@@ -562,12 +555,12 @@ void estimate_all_angles1(int j,
         cos_phi2 = -cos_phi2;
       }
 
-    }  // end sync
+    }  /* end sync */
     else{
       cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2));
-    } // end not sync
+    } /* end not sync */
 
-    // clip [-1,1]
+    /* clip cosine phi between [-1,1] */
     if(cos_phi2 >1){
       cos_phi2 = 1;
     }
@@ -575,18 +568,19 @@ void estimate_all_angles1(int j,
       cos_phi2 = -1;
     }
 
-    /* compute histogram contribution and index map */
+    /* compute histogram contribution, angle mapping, and index mappings. */
     angle = acos(cos_phi2) * 180. / M_PI;
-    // angle's bin
+    /*  angle's bin */
     map_idx = i*n + k;
-    k_map[map_idx] = angle / hist_bin_width;  //check
-    angles_map[map_idx] = angle;  // degree
+    k_map[map_idx] = angle / hist_bin_width;  // check
+    angles_map[map_idx] = angle;  /* degrees */
     for(b=0; b<ntics; b++){
-      ga = b*(180./ntics);  // grid angle // todo, just compute in radians to avoid extra arithmetic
-      ga = (ga - angle);
-      // histogram contribution
+      // todo, just compute in radians to avoid extra arithmetic
+      grid_angle = b * (180./ntics);  /* grid angle */  // check, I think off by one-ish (grid vs bins)
+      angle_diff = (grid_angle - angle);
+      /* accumulate histogram contribution */
       hist[i*ntics + b ] += exp(
-          -(ga*ga) / ( 2*sigma*sigma));
+          -(angle_diff*angle_diff) / ( two_sigma_sq));
     } /* bins */
   } /* k*/
 
@@ -595,18 +589,17 @@ void estimate_all_angles1(int j,
     k contributions into the histogram for I j.
     The next section solves the histogram.
 
-    The angles are also initialized while we have the cl_idx values.
+    Two known euler angles are also initialized while we have the cl_idx values.
   */
 
-  // initialize
+  /* Initialize euler angles */
   map_idx = pair_idx*3;
   angles[map_idx    ] = cl_idx12 * 2 * M_PI / n_theta + M_PI / 2;
   angles[map_idx + 1] = 0;
   angles[map_idx + 2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n_theta;
 
 
-  //  } /* j */
-} /* kernel */
+} /* estimate_all_angles1 kernel */
 
 
 
@@ -614,7 +607,6 @@ void estimate_all_angles1(int j,
 extern "C" __global__
 void estimate_all_angles2(int j,
                           int n,
-                          int n_theta,
                           double hist_bin_width,
                           int full_width,
                           double* __restrict__ hist,
@@ -622,21 +614,19 @@ void estimate_all_angles2(int j,
                           double* __restrict__ angles_map,
                           double* __restrict__ angles)
 {
-  // try toget k_map as uint16_t
   /* n n_img */
 
-  /* thread index (1d), represents "i" index */
+  /* thread index (1d), represents image i index */
   const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+  /* j is image j index */
 
   int k;
   int cnt;
-  double w_theta_need;
+  double w_theta_needed;
 
   /* no-op when out of bounds */
   if(i >= n) return;
 
-  // vote_ij creates good_k list per (i,j)
-  int pair_idx;
   int map_idx; /* tmp index var */
 
   int ntics = 180. / hist_bin_width;
@@ -644,10 +634,9 @@ void estimate_all_angles2(int j,
   int peak_idx;
   double peak;
 
-  //for(j=i+1; j<n; j++){
-  pair_idx = PAIR_IDX(n,i,j);
+  const int pair_idx = PAIR_IDX(n,i,j);
 
-  /* find peak of histogram */
+  /* Find peak and peak index in histogram */
   peak = -99999;
   peak_idx = -1;
   for(b=0; b<ntics; b++){
@@ -660,44 +649,43 @@ void estimate_all_angles2(int j,
 
   /* find mean of rotations */
 
-  cnt = 0;
-
   if(full_width==-1){
     /* adaptive width*/
-    w_theta_need = 0;
+    w_theta_needed = 0;
     cnt = 0;
     while(cnt == 0){
-      w_theta_need += hist_bin_width;
+      /* broaden search width */
+      w_theta_needed += hist_bin_width;
+      /* find satisfying indices */
       for(k=0; k<n; k++){
-        // image k in peak bin
+        /* determine if image k in peak bin(s) */
         map_idx = i*n + k;
-        if(abs(k_map[map_idx] - peak_idx) < w_theta_need){
-          cnt += 1;  // this image was in the peak
-          angles[pair_idx*3 + 1] += angles_map[map_idx];  // accumulate angle
-        } /* fi < w_theta_need*/
+        if(abs(k_map[map_idx] - peak_idx) < w_theta_needed){
+          cnt += 1;  /* count this image */
+          angles[pair_idx*3 + 1] += angles_map[map_idx];  /* accumulate angle */
+        } /* fi < w_theta_needed */
       } /* k */
     } /* while */
   } /* fi adaptive */
   else {
     /* fixed width */
-    // find satisfying indices
+    cnt = 0;
+    /* determine if image k in peak bin(s) */
     for(k=0; k<n; k++){
-      // image k in peak bin
       map_idx = i*n + k;
       if(abs(k_map[map_idx] - peak_idx) < full_width){
-        cnt += 1;  // this image was in the peak
-        angles[pair_idx*3 + 1] += angles_map[map_idx];  // accumulate angle
+        cnt += 1;  /* count this image */
+        angles[pair_idx*3 + 1] += angles_map[map_idx];  /* accumulate angle */
       } /* fi full_width */
     } /* k */
   } /* fi fixed width */
 
-
-  /* Divide accumulated angles (resulting in the mean alpha angle) */   //(todo, better handle 0/0?)
+  /* Divide accumulated angles (resulting in the mean alpha angle) */
+  // (todo, better handle 0/0?)
   /* convert degree radian */
   angles[pair_idx*3 + 1] *= M_PI / (180*cnt);
 
-  //  } /* j */
-} /* kernel */
+} /* estimate_all_angles2 kernel */
 
 extern "C" __global__
 void angles_to_rots(int n,
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 24beb99898..b11f27e13e 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -894,7 +894,6 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
                 (
                     j,
                     self.n_img,
-                    self.n_theta,
                     np.float64(self.hist_bin_width),
                     self.full_width,
                     hist,  # tmp

From 2f543301a32799dc1b96722f54e9f74467c8bab4 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 19 Sep 2024 16:14:20 -0400
Subject: [PATCH 25/45] threads over k

---
 src/aspire/abinitio/commonline_sync3n.cu | 181 ++++++++++++-----------
 src/aspire/abinitio/commonline_sync3n.py |  14 +-
 2 files changed, 101 insertions(+), 94 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index 5d3d0340d1..6841ad2d5d 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -464,18 +464,19 @@ void estimate_all_angles1(int j,
   /* thread index (1d), represents "i" index */
   const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
   /* j is image j index */
+  const unsigned int k = blockDim.y * blockIdx.y + threadIdx.y;
 
-  int k;
   int cl_diff1, cl_diff2, cl_diff3;
   double theta1, theta2, theta3;
   double c1, c2, c3;
   double cond;
   double cos_phi2;
-  int cnt;
   double w_theta_need;
 
   /* no-op when out of bounds */
   if(i >= n) return;
+  if(k >= n) return;
+
 
   int map_idx; /* tmp index var */
 
@@ -493,96 +494,96 @@ void estimate_all_angles1(int j,
   const int pair_idx = PAIR_IDX(n,i,j);
 
   /* initialize */
-  for(b=0; b<ntics; b++){
-    hist[i*ntics + b] = 0;
+  if((k==0) ^ (k==n-2)){  /* unique per ijk grid */
+    for(b=0; b<ntics; b++){
+      hist[i*ntics + b] = 0;
+    }
   }
 
   cl_idx12 = clmatrix[i*n + j];
   cl_idx21 = clmatrix[j*n + i];
 
   /* Assume that k_list starts as all n images */
-  for(k=0; k<n; k++){
-
-    cl_idx13 = clmatrix[i*n + k];
-    cl_idx31 = clmatrix[k*n + i];
-    cl_idx23 = clmatrix[j*n + k];
-    cl_idx32 = clmatrix[k*n + j];
-
-    /* test `k` values */
-    if(k==i) continue;
-    if(cl_idx13 == -1) continue;  /* i, k */
-    //if(cl_idx31 == -1) continue;  // k, i ?
-    if(cl_idx23 == -1) continue;  /* j, k */
-    // if(cl_idx32 == -1) return;  // k, j ?
-
-    /* get cosine angles */
-    cl_diff1 = cl_idx13 - cl_idx12;
-    cl_diff2 = cl_idx23 - cl_idx21;
-    cl_diff3 = cl_idx32 - cl_idx31;
-
-    theta1 = cl_diff1 * 2 * M_PI / n_theta;
-    theta2 = cl_diff2 * 2 * M_PI / n_theta;
-    theta3 = cl_diff3 * 2 * M_PI / n_theta;
-
-    c1 = cos(theta1);
-    c2 = cos(theta2);
-    c3 = cos(theta3);
-
-    /* test if we have a good index */
-    cond = 1 + 2 * c1 * c2 * c3 - (c1*c1 + c2*c2 + c3*c3);
-    if(cond <= 1e-5) continue;  /* current value of k is not good, skip */
-
-    /* Calculated cos values of angle between i and j images */
-    if( sync == 1){
-
-      cos_phi2 = (c3 - c1*c2) / (sqrt(1 - c1*c1) * sqrt(1 - c2*c2));
-
-      /*
-        Some synchronization must be applied when common line is out by 180 degrees.
-        Here fix the angles between c_ij(c_ji) and c_ik(c_jk) to be smaller than pi/2,
-        otherwise there will be an ambiguity between alpha and pi-alpha.
-      */
-
-      /* Check sync conditions */
-      ind1 = (theta1 > (M_PI + TOL_idx)) | (
-          (theta1 < -TOL_idx) & (theta1 > -M_PI)
-                                            );
-      ind2 = (theta2 > (M_PI + TOL_idx)) || (
-          (theta2 < -TOL_idx) && (theta2 > -M_PI)
-                                             );
-      if( (ind1 && !ind2) || (!ind1 && ind2)){
-        /* Apply sync */
-        cos_phi2 = -cos_phi2;
-      }
 
-    }  /* end sync */
-    else{
-      cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2));
-    } /* end not sync */
-
-    /* clip cosine phi between [-1,1] */
-    if(cos_phi2 >1){
-      cos_phi2 = 1;
-    }
-    if(cos_phi2 <-1){
-      cos_phi2 = -1;
+  cl_idx13 = clmatrix[i*n + k];
+  cl_idx31 = clmatrix[k*n + i];
+  cl_idx23 = clmatrix[j*n + k];
+  cl_idx32 = clmatrix[k*n + j];
+
+  /* test `k` values */
+  if(k==i) return;
+  if(cl_idx13 == -1) return;  /* i, k */
+  //if(cl_idx31 == -1) return;  // k, i ?
+  if(cl_idx23 == -1) return;  /* j, k */
+  // if(cl_idx32 == -1) return;  // k, j ?
+
+  /* get cosine angles */
+  cl_diff1 = cl_idx13 - cl_idx12;
+  cl_diff2 = cl_idx23 - cl_idx21;
+  cl_diff3 = cl_idx32 - cl_idx31;
+
+  theta1 = cl_diff1 * 2 * M_PI / n_theta;
+  theta2 = cl_diff2 * 2 * M_PI / n_theta;
+  theta3 = cl_diff3 * 2 * M_PI / n_theta;
+
+  c1 = cos(theta1);
+  c2 = cos(theta2);
+  c3 = cos(theta3);
+
+  /* test if we have a good index */
+  cond = 1 + 2 * c1 * c2 * c3 - (c1*c1 + c2*c2 + c3*c3);
+  if(cond <= 1e-5) return;  /* current value of k is not good, skip */
+
+  /* Calculated cos values of angle between i and j images */
+  if( sync == 1){
+
+    cos_phi2 = (c3 - c1*c2) / (sqrt(1 - c1*c1) * sqrt(1 - c2*c2));
+
+    /*
+      Some synchronization must be applied when common line is out by 180 degrees.
+      Here fix the angles between c_ij(c_ji) and c_ik(c_jk) to be smaller than pi/2,
+      otherwise there will be an ambiguity between alpha and pi-alpha.
+    */
+
+    /* Check sync conditions */
+    ind1 = (theta1 > (M_PI + TOL_idx)) | (
+        (theta1 < -TOL_idx) & (theta1 > -M_PI)
+                                          );
+    ind2 = (theta2 > (M_PI + TOL_idx)) || (
+        (theta2 < -TOL_idx) && (theta2 > -M_PI)
+                                           );
+    if( (ind1 && !ind2) || (!ind1 && ind2)){
+      /* Apply sync */
+      cos_phi2 = -cos_phi2;
     }
 
-    /* compute histogram contribution, angle mapping, and index mappings. */
-    angle = acos(cos_phi2) * 180. / M_PI;
-    /*  angle's bin */
-    map_idx = i*n + k;
-    k_map[map_idx] = angle / hist_bin_width;  // check
-    angles_map[map_idx] = angle;  /* degrees */
-    for(b=0; b<ntics; b++){
-      // todo, just compute in radians to avoid extra arithmetic
-      grid_angle = b * (180./ntics);  /* grid angle */  // check, I think off by one-ish (grid vs bins)
-      angle_diff = (grid_angle - angle);
-      /* accumulate histogram contribution */
-      hist[i*ntics + b ] += exp(
-          -(angle_diff*angle_diff) / ( two_sigma_sq));
-    } /* bins */
-  } /* k*/
+  }  /* end sync */
+  else{
+    cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2));
+  } /* end not sync */
+
+  /* clip cosine phi between [-1,1] */
+  if(cos_phi2 >1){
+    cos_phi2 = 1;
+  }
+  if(cos_phi2 <-1){
+    cos_phi2 = -1;
+  }
+
+  /* compute histogram contribution, angle mapping, and index mappings. */
+  angle = acos(cos_phi2) * 180. / M_PI;
+  /*  angle's bin */
+  map_idx = i*n + k;
+  k_map[map_idx] = angle / hist_bin_width;  // check
+  angles_map[map_idx] = angle;  /* degrees */
+  for(b=0; b<ntics; b++){
+    // todo, just compute in radians to avoid extra arithmetic
+    grid_angle = b * (180./ntics);  /* grid angle */  // check, I think off by one-ish (grid vs bins)
+    angle_diff = (grid_angle - angle);
+    /* accumulate histogram contribution */
+    hist[i*ntics + b ] += exp(
+        -(angle_diff*angle_diff) / ( two_sigma_sq));
+  } /* bins */
 
   /*
     At this point, the kernel should have accumulated all "good"
@@ -593,12 +594,13 @@ void estimate_all_angles1(int j,
   */
 
   /* Initialize euler angles */
-  map_idx = pair_idx*3;
-  angles[map_idx    ] = cl_idx12 * 2 * M_PI / n_theta + M_PI / 2;
-  angles[map_idx + 1] = 0;
-  angles[map_idx + 2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n_theta;
-
-
+  if((k==0) ^ (k==n-2))  /* unique per ijk grid */
+  {
+    map_idx = pair_idx*3;
+    angles[map_idx    ] = cl_idx12 * 2 * M_PI / n_theta + M_PI / 2;
+    angles[map_idx + 1] = 0;
+    angles[map_idx + 2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n_theta;
+  }
 } /* estimate_all_angles1 kernel */
 
 
@@ -659,6 +661,7 @@ void estimate_all_angles2(int j,
       /* find satisfying indices */
       for(k=0; k<n; k++){
         /* determine if image k in peak bin(s) */
+        // Perhaps transpose the maps so thread i in fast
         map_idx = i*n + k;
         if(abs(k_map[map_idx] - peak_idx) < w_theta_needed){
           cnt += 1;  /* count this image */
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index b11f27e13e..9b87a8f220 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -864,14 +864,18 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
         angles = cp.zeros((n_pairs, 3), dtype=np.float64)
 
         # Configure grid of blocks
-        blkszx = 1024
+        blkszx = 32
         nblkx = (self.n_img + blkszx - 1) // blkszx
+        blkszy = 32
+        nblky = (self.n_img + blkszy - 1) // blkszy
+        blksz = 1024
+        nblk = (self.n_img + blksz - 1) // blksz
 
         logger.info("Launching `estimate_all_angles` kernel.")
         for j in range(0, self.n_img):
             estimate_all_angles1(
-                (nblkx,),
-                (blkszx,),
+                (nblkx, nblky),
+                (blkszx, blkszy),
                 (
                     j,
                     self.n_img,
@@ -889,8 +893,8 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
             )
 
             estimate_all_angles2(
-                (nblkx,),
-                (blkszx,),
+                (nblk,),
+                (blksz,),
                 (
                     j,
                     self.n_img,

From 4b663fd67a90738b823516ac3e095733dc31c235 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 19 Sep 2024 17:06:07 -0400
Subject: [PATCH 26/45] continue cleanup threads over k

---
 src/aspire/abinitio/commonline_base.py   |  1 -
 src/aspire/abinitio/commonline_sync3n.py | 55 ++++++++++++++----------
 2 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py
index efb33280e2..66c960dd2d 100644
--- a/src/aspire/abinitio/commonline_base.py
+++ b/src/aspire/abinitio/commonline_base.py
@@ -6,7 +6,6 @@
 import scipy.sparse as sparse
 
 from aspire.image import Image
-from aspire.numeric import fft, xp
 from aspire.operators import PolarFT
 from aspire.utils import common_line_from_rots, fuzzy_mask, tqdm
 from aspire.utils.random import choice
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 9b87a8f220..36a2e1356a 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -824,24 +824,14 @@ def _estimate_all_Rijs(self, clmatrix):
         :return: Estimated rotations
         """
         # host/gpu dispatch
-        # if self.__gpu_module:
-        #     res = self._estimate_all_Rijs_cupy(Rijs)
-        # else:
-        #     res = self._estimate_all_Rijs_host(Rijs)
-        from time import perf_counter
-
-        tic = perf_counter()
-        res = self._estimate_all_Rijs_cupy(clmatrix)
-        toc1 = perf_counter()
-        print("_estimate_all_Rijs_cupy", toc1 - tic)
-
-        res_host = self._estimate_all_Rijs_host(clmatrix)
-        toc2 = perf_counter()
-        print("_estimate_all_Rijs_host", toc2 - toc1)
+        if self.__gpu_module:
+            res = self._estimate_all_Rijs_cu(clmatrix)
+        else:
+            res = self._estimate_all_Rijs_host(clmatrix)
 
         return res
 
-    def _estimate_all_Rijs_cupy(self, clmatrix):
+    def _estimate_all_Rijs_cu(self, clmatrix):
         import cupy as cp
 
         estimate_all_angles1 = self.__gpu_module.get_function("estimate_all_angles1")
@@ -863,16 +853,24 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
         angles_map = cp.zeros((self.n_img, self.n_img), dtype=np.float64)
         angles = cp.zeros((n_pairs, 3), dtype=np.float64)
 
-        # Configure grid of blocks
+        # Configure 2d grid of blocks (kernel part 1)
         blkszx = 32
         nblkx = (self.n_img + blkszx - 1) // blkszx
         blkszy = 32
         nblky = (self.n_img + blkszy - 1) // blkszy
+
+        # Configure 1d grid of blocks (kernel part 2)
         blksz = 1024
         nblk = (self.n_img + blksz - 1) // blksz
 
+        from time import perf_counter
+
         logger.info("Launching `estimate_all_angles` kernel.")
+        toc0 = perf_counter()
         for j in range(0, self.n_img):
+
+            # -------------------
+            # Vote into histogram
             estimate_all_angles1(
                 (nblkx, nblky),
                 (blkszx, blkszy),
@@ -892,6 +890,8 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
                 ),
             )
 
+            # -------------------------
+            # Solve hist for mean angle
             estimate_all_angles2(
                 (nblk,),
                 (blksz,),
@@ -907,21 +907,28 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
                 ),
             )
 
-        # no longer need workspace vars
+        # Force all kernels to complete
+        cp.cuda.runtime.deviceSynchronize()
+        toc1 = perf_counter()
+        logger.info(f"estimate_all_angles: {toc1-toc0}s")
+
+        # Inform CuPy we longer need these workspace vars
         del hist
         del k_map
         del angles_map
 
-        # convert angles to rots
+        # ---------------------------
+        # Convert angles to rotations
         rotations = cp.empty((n_pairs, 3, 3), dtype=np.float64)
 
-        blkszx = 1024
-        nblkx = (n_pairs + blkszx - 1) // blkszx
+        # Configure 1d grid of blocks
+        blksz = 1024
+        nblk = (n_pairs + blksz - 1) // blksz
 
         logger.info("Launching `angles_to_rots` kernel.")
         angles_to_rots(
-            (nblkx,),
-            (blkszx,),
+            (nblk,),
+            (blksz,),
             (
                 n_pairs,
                 angles,
@@ -929,6 +936,10 @@ def _estimate_all_Rijs_cupy(self, clmatrix):
             ),
         )
 
+        cp.cuda.runtime.deviceSynchronize()
+        toc2 = perf_counter()
+        logger.info(f"angles_to_rots: {toc2-toc1}s")
+
         # transfer results device to host
         rotations = rotations.get()
 

From 0e219cc153664083d1925991aacb25b415b7c43b Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 23 Sep 2024 06:41:49 -0400
Subject: [PATCH 27/45] fix j<i bound bug

---
 src/aspire/abinitio/commonline_sync3n.cu | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index 6841ad2d5d..725f1b0758 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -476,7 +476,7 @@ void estimate_all_angles1(int j,
   /* no-op when out of bounds */
   if(i >= n) return;
   if(k >= n) return;
-
+  if(i >= j) return;
 
   int map_idx; /* tmp index var */
 
@@ -502,6 +502,11 @@ void estimate_all_angles1(int j,
 
   cl_idx12 = clmatrix[i*n + j];
   cl_idx21 = clmatrix[j*n + i];
+  /*
+    MATLAB code indicated this condition might occur outside i==j;
+    Ask Yoel what other reasons this would occur.
+  */
+  if(cl_idx12 == -1) return;
 
   /* Assume that k_list starts as all n images */
 
@@ -628,6 +633,7 @@ void estimate_all_angles2(int j,
 
   /* no-op when out of bounds */
   if(i >= n) return;
+  if(i >= j) return;
 
   int map_idx; /* tmp index var */
 

From 3eaef56d02c15a3f9e65111c9aa816e942e2aca4 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 23 Sep 2024 11:09:51 -0400
Subject: [PATCH 28/45] fix adative param oversight bug

---
 src/aspire/abinitio/sync_voting.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/aspire/abinitio/sync_voting.py b/src/aspire/abinitio/sync_voting.py
index fb626a8a91..82bfcf8870 100644
--- a/src/aspire/abinitio/sync_voting.py
+++ b/src/aspire/abinitio/sync_voting.py
@@ -156,7 +156,7 @@ def _vote_ij(self, clmatrix, n_theta, i, j, k_list, sync=False):
         # that accidentally fall near the peak.
         peak_idx = angles_hist.argmax()
 
-        if str(self.full_width).lower() == "adaptive":
+        if self.full_width==-1:
             # Adaptive width  (MATLAB)
             # Look for the estimations in the peak of the histogram
             w_theta_needed = 0

From 1748a691f8cd167c7a7585b33bdbc0fe0a319286 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 23 Sep 2024 11:34:41 -0400
Subject: [PATCH 29/45] parallel case bug

---
 src/aspire/abinitio/commonline_sync3n.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index 725f1b0758..4b61767bbd 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -517,6 +517,7 @@ void estimate_all_angles1(int j,
 
   /* test `k` values */
   if(k==i) return;
+  if(k==j) return;
   if(cl_idx13 == -1) return;  /* i, k */
   //if(cl_idx31 == -1) return;  // k, i ?
   if(cl_idx23 == -1) return;  /* j, k */

From 90ff5b3d49899bf0f4dc83581c0c7ad24441d2ac Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 24 Sep 2024 14:31:01 -0400
Subject: [PATCH 30/45] C order, sigh

---
 src/aspire/abinitio/commonline_sync3n.cu | 35 +++++++++++-------------
 src/aspire/abinitio/commonline_sync3n.py |  7 ++++-
 src/aspire/abinitio/sync_voting.py       |  2 +-
 3 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index 4b61767bbd..c89064a4e2 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -1,6 +1,6 @@
 #include "stdint.h"
 
-# define M_PI           3.14159265358979323846  /* pi */
+#define M_PI 3.14159265358979323846  /* pi */
 
 /* from i,j indices to the common index in the N-choose-2 sized array */
 // careful, this is stricly the upper triangle!
@@ -477,13 +477,19 @@ void estimate_all_angles1(int j,
   if(i >= n) return;
   if(k >= n) return;
   if(i >= j) return;
+  /*
+    These are also tested later via the clmatrix values,
+    testing now avoids extra reads.
+  */
+  if(k==i) return;
+  if(k==j) return;
 
   int map_idx; /* tmp index var */
 
   int cl_idx12, cl_idx21;
   int cl_idx13, cl_idx31;
   int cl_idx23, cl_idx32;
-  int ntics = 180. / hist_bin_width;
+  const int ntics = 180. / hist_bin_width;
   const double TOL_idx = 1e-12;
   bool ind1, ind2;
   double grid_angle, angle_diff, angle;
@@ -493,13 +499,6 @@ void estimate_all_angles1(int j,
 
   const int pair_idx = PAIR_IDX(n,i,j);
 
-  /* initialize */
-  if((k==0) ^ (k==n-2)){  /* unique per ijk grid */
-    for(b=0; b<ntics; b++){
-      hist[i*ntics + b] = 0;
-    }
-  }
-
   cl_idx12 = clmatrix[i*n + j];
   cl_idx21 = clmatrix[j*n + i];
   /*
@@ -516,8 +515,6 @@ void estimate_all_angles1(int j,
   cl_idx32 = clmatrix[k*n + j];
 
   /* test `k` values */
-  if(k==i) return;
-  if(k==j) return;
   if(cl_idx13 == -1) return;  /* i, k */
   //if(cl_idx31 == -1) return;  // k, i ?
   if(cl_idx23 == -1) return;  /* j, k */
@@ -552,9 +549,9 @@ void estimate_all_angles1(int j,
     */
 
     /* Check sync conditions */
-    ind1 = (theta1 > (M_PI + TOL_idx)) | (
-        (theta1 < -TOL_idx) & (theta1 > -M_PI)
-                                          );
+    ind1 = (theta1 > (M_PI + TOL_idx)) || (
+        (theta1 < -TOL_idx) && (theta1 > -M_PI)
+                                           );
     ind2 = (theta2 > (M_PI + TOL_idx)) || (
         (theta2 < -TOL_idx) && (theta2 > -M_PI)
                                            );
@@ -586,9 +583,9 @@ void estimate_all_angles1(int j,
     // todo, just compute in radians to avoid extra arithmetic
     grid_angle = b * (180./ntics);  /* grid angle */  // check, I think off by one-ish (grid vs bins)
     angle_diff = (grid_angle - angle);
-    /* accumulate histogram contribution */
-    hist[i*ntics + b ] += exp(
-        -(angle_diff*angle_diff) / ( two_sigma_sq));
+    /* accumulate histogram contribution, atomic due to `k` */
+    atomicAdd(&(hist[i*ntics + b ]),
+              exp(-(angle_diff*angle_diff) / ( two_sigma_sq)));
   } /* bins */
 
   /*
@@ -600,7 +597,7 @@ void estimate_all_angles1(int j,
   */
 
   /* Initialize euler angles */
-  if((k==0) ^ (k==n-2))  /* unique per ijk grid */
+  /* can be done once? */
   {
     map_idx = pair_idx*3;
     angles[map_idx    ] = cl_idx12 * 2 * M_PI / n_theta + M_PI / 2;
@@ -638,7 +635,7 @@ void estimate_all_angles2(int j,
 
   int map_idx; /* tmp index var */
 
-  int ntics = 180. / hist_bin_width;
+  const int ntics = 180. / hist_bin_width;
   int b;
   int peak_idx;
   double peak;
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 36a2e1356a..3110e179c3 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -842,7 +842,7 @@ def _estimate_all_Rijs_cu(self, clmatrix):
         sync = 1
 
         # transfer input to device
-        clmatrix = cp.asarray(clmatrix, dtype=np.int16)
+        clmatrix = cp.asarray(clmatrix, order="C", dtype=np.int16)
 
         # workspace arrays
         ntics = int(180 / self.hist_bin_width)
@@ -869,6 +869,11 @@ def _estimate_all_Rijs_cu(self, clmatrix):
         toc0 = perf_counter()
         for j in range(0, self.n_img):
 
+            # ------------------------------------------
+            # Zero histogram and k mapping for each `j`.
+            hist[:] = 0
+            k_map[:] = 0
+
             # -------------------
             # Vote into histogram
             estimate_all_angles1(
diff --git a/src/aspire/abinitio/sync_voting.py b/src/aspire/abinitio/sync_voting.py
index 82bfcf8870..63cb6a923f 100644
--- a/src/aspire/abinitio/sync_voting.py
+++ b/src/aspire/abinitio/sync_voting.py
@@ -156,7 +156,7 @@ def _vote_ij(self, clmatrix, n_theta, i, j, k_list, sync=False):
         # that accidentally fall near the peak.
         peak_idx = angles_hist.argmax()
 
-        if self.full_width==-1:
+        if self.full_width == -1:
             # Adaptive width  (MATLAB)
             # Look for the estimations in the peak of the histogram
             w_theta_needed = 0

From 627d4c5c2d3dac6602ff4270cd447542f2d91da3 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 4 Oct 2024 11:43:02 -0400
Subject: [PATCH 31/45] remove unused vars from build cl kernel

---
 src/aspire/abinitio/commonline_base.cu | 12 +++++++++---
 src/aspire/abinitio/commonline_base.py | 15 +--------------
 2 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu
index eb493acf44..acce2fbba2 100644
--- a/src/aspire/abinitio/commonline_base.cu
+++ b/src/aspire/abinitio/commonline_base.cu
@@ -2,7 +2,15 @@
 #include <cupy/complex.cuh>
 
 extern "C" __global__
-void build_clmatrix_kernel(int n, int m, int r, const complex<double>* __restrict__ pf, int16_t* __restrict__ clmatrix, double* __restrict__  cl_dist, double* __restrict__ shifts_1d, int n_shifts, double* __restrict__ shifts,  const complex<double>* __restrict__ shift_phases)
+void build_clmatrix_kernel(
+    int n,
+    int m,
+    int r,
+    const complex<double>* __restrict__ pf,
+    int16_t* const __restrict__ clmatrix,
+    const int n_shifts,
+    double* __restrict__ shifts,
+    const complex<double>* const __restrict__ shift_phases)
 {
   /* n n_img */
   /* m,r st (n, m, r) = pf.shape, ie len(pf[i])  */
@@ -69,7 +77,5 @@ void build_clmatrix_kernel(int n, int m, int r, const complex<double>* __restric
   ind = i*n + j;
   clmatrix[ind] = best_cl1;
   clmatrix[j*n+i] = best_cl2;  /* [j,i] */
-  cl_dist[ind] = 2*best_cl_dist;  // 2 of mystery
-  shifts_1d[ind] = shifts[best_s];
 
 }
diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py
index 66c960dd2d..d70618cee1 100644
--- a/src/aspire/abinitio/commonline_base.py
+++ b/src/aspire/abinitio/commonline_base.py
@@ -303,13 +303,6 @@ def build_clmatrix_cu(self):
         # starts from 0 instead of 1 as Matlab version. -1 means
         # there is no common line such as clmatrix[i,i].
         clmatrix = -cp.ones((n_img, n_img), dtype=np.int16)
-        # When cl_dist[i, j] is not -1, it stores the maximum value
-        # of correlation between image i and j for all possible 1D shifts.
-        # We will use cl_dist[i, j] = -1 (including j<=i) to
-        # represent that there is no need to check common line
-        # between i and j. Since it is symmetric,
-        # only above the diagonal entries are necessary.
-        cl_dist = -cp.ones((n_img, n_img), dtype=np.float64)
 
         # Allocate variables used for shift estimation
 
@@ -319,8 +312,6 @@ def build_clmatrix_cu(self):
         # Set resolution of shift estimation in pixels. Note that
         # shift_step can be any positive real number.
         shift_step = self.shift_step
-        # 1D shift between common-lines
-        shifts_1d = cp.zeros((n_img, n_img), dtype=np.float64)  # check dtype
 
         # Prepare the shift phases to try and generate filter for common-line detection
         r_max = pf.shape[2]
@@ -357,8 +348,6 @@ def build_clmatrix_cu(self):
                 r_max,
                 pf,
                 clmatrix,
-                cl_dist,
-                shifts_1d,
                 len(shifts),
                 shifts,
                 shift_phases,
@@ -366,11 +355,9 @@ def build_clmatrix_cu(self):
         )
 
         # Copy result device arrays to host
-        shifts_1d = shifts_1d.get().astype(self.dtype, copy=False)
         clmatrix = clmatrix.get().astype(self.dtype, copy=False)
-        cl_dist = cl_dist.get().astype(self.dtype, copy=False)  # diagnostic
 
-        return shifts_1d, clmatrix
+        return None, clmatrix
 
     def estimate_shifts(self, equations_factor=1, max_memory=4000):
         """

From 230fd0f31b333e9ed4b1df1807bd8974fce6d6f4 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 4 Oct 2024 12:19:57 -0400
Subject: [PATCH 32/45] remove unused vars from build cl kernel

---
 src/aspire/abinitio/commonline_base.cu | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu
index acce2fbba2..7ec2fad014 100644
--- a/src/aspire/abinitio/commonline_base.cu
+++ b/src/aspire/abinitio/commonline_base.cu
@@ -29,12 +29,11 @@ void build_clmatrix_kernel(
   int k;
   int s;
   int cl1, cl2;
-  int best_cl1, best_cl2, best_s;
+  int best_cl1, best_cl2;
   double dist, best_cl_dist;
   double p1, p2;
   complex<double> pfik, pfjk;
 
-  best_s = -99999;
   best_cl1 = -1;
   best_cl2 = -1;
   best_cl_dist = -1/0;
@@ -57,7 +56,6 @@ void build_clmatrix_kernel(
           best_cl_dist = dist;
           best_cl1 = cl1;
           best_cl2 = cl2;
-          best_s = s;
         }
 
         dist = p1 + p2;
@@ -65,7 +63,6 @@ void build_clmatrix_kernel(
           best_cl_dist = dist;
           best_cl1 = cl1;
           best_cl2 = cl2 + m; // m is pf.shape[1], which should be n_theta//2...
-          best_s = s;
         }
 
       } /* s */

From b64164e0c3cceb2de993734531e402b4ec27a2ad Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 10 Oct 2024 09:45:23 -0400
Subject: [PATCH 33/45] continue removing unused vars

---
 src/aspire/abinitio/commonline_base.cu | 4 ++--
 src/aspire/abinitio/commonline_base.py | 6 ++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu
index 7ec2fad014..3fbc5306e4 100644
--- a/src/aspire/abinitio/commonline_base.cu
+++ b/src/aspire/abinitio/commonline_base.cu
@@ -1,4 +1,5 @@
 #include <stdint.h>
+#include <math.h>
 #include <cupy/complex.cuh>
 
 extern "C" __global__
@@ -9,7 +10,6 @@ void build_clmatrix_kernel(
     const complex<double>* __restrict__ pf,
     int16_t* const __restrict__ clmatrix,
     const int n_shifts,
-    double* __restrict__ shifts,
     const complex<double>* const __restrict__ shift_phases)
 {
   /* n n_img */
@@ -36,7 +36,7 @@ void build_clmatrix_kernel(
 
   best_cl1 = -1;
   best_cl2 = -1;
-  best_cl_dist = -1/0;
+  best_cl_dist = -INFINITY;
 
   for(cl1=0; cl1<m; cl1++){
     for(cl2=0; cl2<m; cl2++){
diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py
index d70618cee1..6ac23f7c4e 100644
--- a/src/aspire/abinitio/commonline_base.py
+++ b/src/aspire/abinitio/commonline_base.py
@@ -315,11 +315,10 @@ def build_clmatrix_cu(self):
 
         # Prepare the shift phases to try and generate filter for common-line detection
         r_max = pf.shape[2]
-        shifts, shift_phases, h = self._generate_shift_phase_and_filter(
+        _, shift_phases, h = self._generate_shift_phase_and_filter(
             r_max, max_shift, shift_step
         )
         # Transfer to device, dtypes must match kernel header.
-        shifts = cp.asarray(shifts, dtype=np.float64)
         shift_phases = cp.asarray(shift_phases, dtype=np.complex128)
 
         # Apply bandpass filter, normalize each ray of each image
@@ -348,8 +347,7 @@ def build_clmatrix_cu(self):
                 r_max,
                 pf,
                 clmatrix,
-                len(shifts),
-                shifts,
+                len(shift_phases),
                 shift_phases,
             ),
         )

From 58fdb60b368cdf859d622254ec62e436dab2120d Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 10 Oct 2024 09:49:50 -0400
Subject: [PATCH 34/45] update constants

---
 src/aspire/abinitio/commonline_base.cu   | 10 +++++-----
 src/aspire/abinitio/commonline_sync3n.cu |  3 +--
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu
index 3fbc5306e4..37e112e4b8 100644
--- a/src/aspire/abinitio/commonline_base.cu
+++ b/src/aspire/abinitio/commonline_base.cu
@@ -4,9 +4,9 @@
 
 extern "C" __global__
 void build_clmatrix_kernel(
-    int n,
-    int m,
-    int r,
+    const int n,
+    const int m,
+    const int r,
     const complex<double>* __restrict__ pf,
     int16_t* const __restrict__ clmatrix,
     const int n_shifts,
@@ -16,8 +16,8 @@ void build_clmatrix_kernel(
   /* m,r st (n, m, r) = pf.shape, ie len(pf[i])  */
 
   /* thread index (1d), represents "i" index */
-  unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
-  unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
+  const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+  const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
 
   /* no-op when out of bounds */
   if(i >= n) return;
diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index c89064a4e2..14e27669b9 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -1,6 +1,5 @@
 #include "stdint.h"
-
-#define M_PI 3.14159265358979323846  /* pi */
+#include "math.h"
 
 /* from i,j indices to the common index in the N-choose-2 sized array */
 // careful, this is stricly the upper triangle!

From 7e6415dc61fbd1e5ef39686eaa1047809756cacd Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 10 Oct 2024 10:26:59 -0400
Subject: [PATCH 35/45] add single precision build CL kernel and launching code

---
 src/aspire/abinitio/commonline_base.cu | 75 ++++++++++++++++++++++++++
 src/aspire/abinitio/commonline_base.py | 21 ++++++--
 2 files changed, 91 insertions(+), 5 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu
index 37e112e4b8..b889cad62d 100644
--- a/src/aspire/abinitio/commonline_base.cu
+++ b/src/aspire/abinitio/commonline_base.cu
@@ -70,6 +70,81 @@ void build_clmatrix_kernel(
   }/* cl1 */
 
 
+  /* update global best for i, j*/
+  ind = i*n + j;
+  clmatrix[ind] = best_cl1;
+  clmatrix[j*n+i] = best_cl2;  /* [j,i] */
+
+}
+
+extern "C" __global__
+void fbuild_clmatrix_kernel(
+    const int n,
+    const int m,
+    const int r,
+    const complex<float>* __restrict__ pf,
+    int16_t* const __restrict__ clmatrix,
+    const int n_shifts,
+    const complex<float>* const __restrict__ shift_phases)
+{
+  /* n n_img */
+  /* m,r st (n, m, r) = pf.shape, ie len(pf[i])  */
+
+  /* thread index (1d), represents "i" index */
+  const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+  const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
+
+  /* no-op when out of bounds */
+  if(i >= n) return;
+  if(j >= n) return;
+  /* no-op lower triangle */
+  if(j <= i) return;
+
+  int ind;
+  int k;
+  int s;
+  int cl1, cl2;
+  int best_cl1, best_cl2;
+  float dist, best_cl_dist;
+  float p1, p2;
+  complex<float> pfik, pfjk;
+
+  best_cl1 = -1;
+  best_cl2 = -1;
+  best_cl_dist = -INFINITY;
+
+  for(cl1=0; cl1<m; cl1++){
+    for(cl2=0; cl2<m; cl2++){
+      for(s=0; s<n_shifts; s++){
+        p1 = 0;
+        p2 = 0;
+        /* inner most dim of dot (matmul) */
+        for(k=0; k<r; k++){
+          pfik = pf[k*m*n + cl1*n + i];
+          pfjk = conj(pf[k*m*n + cl2*n + j]) * shift_phases[s*r + k];
+          p1 += real(pfik) * real(pfjk);
+          p2 += imag(pfik) * imag(pfjk);
+        }
+
+        dist = p1 - p2;
+        if(dist > best_cl_dist){
+          best_cl_dist = dist;
+          best_cl1 = cl1;
+          best_cl2 = cl2;
+        }
+
+        dist = p1 + p2;
+        if(dist > best_cl_dist){
+          best_cl_dist = dist;
+          best_cl1 = cl1;
+          best_cl2 = cl2 + m; // m is pf.shape[1], which should be n_theta//2...
+        }
+
+      } /* s */
+    } /* cl2 */
+  }/* cl1 */
+
+
   /* update global best for i, j*/
   ind = i*n + j;
   clmatrix[ind] = best_cl1;
diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py
index 6ac23f7c4e..df01578abe 100644
--- a/src/aspire/abinitio/commonline_base.py
+++ b/src/aspire/abinitio/commonline_base.py
@@ -7,7 +7,7 @@
 
 from aspire.image import Image
 from aspire.operators import PolarFT
-from aspire.utils import common_line_from_rots, fuzzy_mask, tqdm
+from aspire.utils import common_line_from_rots, complex_type, fuzzy_mask, tqdm
 from aspire.utils.random import choice
 
 logger = logging.getLogger(__name__)
@@ -319,16 +319,23 @@ def build_clmatrix_cu(self):
             r_max, max_shift, shift_step
         )
         # Transfer to device, dtypes must match kernel header.
-        shift_phases = cp.asarray(shift_phases, dtype=np.complex128)
+        shift_phases = cp.asarray(shift_phases, dtype=complex_type(self.dtype))
 
         # Apply bandpass filter, normalize each ray of each image
         # Note that only use half of each ray
         pf = self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h)
         # Tranpose `pf` for better (CUDA) memory access pattern, and cast as needed.
-        pf = cp.ascontiguousarray(pf.T, dtype=np.complex128)
+        pf = cp.ascontiguousarray(pf.T, dtype=complex_type(self.dtype))
 
         # Get kernel
-        build_clmatrix_kernel = self.__gpu_module.get_function("build_clmatrix_kernel")
+        if self.dtype == np.float64:
+            build_clmatrix_kernel = self.__gpu_module.get_function(
+                "build_clmatrix_kernel"
+            )
+        elif self.dtype == np.float32:
+            build_clmatrix_kernel = self.__gpu_module.get_function(
+                "fbuild_clmatrix_kernel"
+            )
 
         # Configure grid of blocks
         blkszx = 32
@@ -701,4 +708,8 @@ def __init_cupy_module():
             module_code = fh.read()
 
         # CUPY compile the CUDA code
-        return cp.RawModule(code=module_code, backend="nvcc")
+        return cp.RawModule(
+            code=module_code,
+            backend="nvcc",
+            options=("-O3", "--use_fast_math", "--extra-device-vectorization"),
+        )

From a5701557ad43bcd3ab0cb41b7ca8596961f2d9c7 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 10 Oct 2024 13:39:53 -0400
Subject: [PATCH 36/45] revert accidental config commit

---
 src/aspire/config_default.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/aspire/config_default.yaml b/src/aspire/config_default.yaml
index 6e32ead3d2..9af8732dd1 100644
--- a/src/aspire/config_default.yaml
+++ b/src/aspire/config_default.yaml
@@ -1,9 +1,9 @@
 version: 0.13.0
 common:
     # numeric module to use - one of numpy/cupy
-    numeric: cupy
+    numeric: numpy
     # fft backend to use - one of pyfftw/scipy/cupy/mkl
-    fft: cupy
+    fft: scipy
 
     # Set cache directory for ASPIRE example data.
     # By default the cache location will be set by pooch.os_cache(),

From bc4c3b8187ab98055b6eb9a7952de52c884452cc Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 10 Oct 2024 13:52:24 -0400
Subject: [PATCH 37/45] cleanup base cuda code a little

---
 src/aspire/abinitio/commonline_base.cu | 38 ++++++++++++--------------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu
index b889cad62d..26529a03cc 100644
--- a/src/aspire/abinitio/commonline_base.cu
+++ b/src/aspire/abinitio/commonline_base.cu
@@ -13,9 +13,11 @@ void build_clmatrix_kernel(
     const complex<double>* const __restrict__ shift_phases)
 {
   /* n n_img */
-  /* m,r st (n, m, r) = pf.shape, ie len(pf[i])  */
+  /* m angular componentns, n_theta//2 */
+  /* r radial componentns */
+  /* (n, m, r) = pf.shape in python (before transpose for CUDA kernel) */
 
-  /* thread index (1d), represents "i" index */
+  /* thread index (2d), represents "i" and "j" indices */
   const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
   const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
 
@@ -25,7 +27,6 @@ void build_clmatrix_kernel(
   /* no-op lower triangle */
   if(j <= i) return;
 
-  int ind;
   int k;
   int s;
   int cl1, cl2;
@@ -49,7 +50,7 @@ void build_clmatrix_kernel(
           pfjk = conj(pf[k*m*n + cl2*n + j]) * shift_phases[s*r + k];
           p1 += real(pfik) * real(pfjk);
           p2 += imag(pfik) * imag(pfjk);
-        }
+        } /* k */
 
         dist = p1 - p2;
         if(dist > best_cl_dist){
@@ -62,20 +63,18 @@ void build_clmatrix_kernel(
         if(dist > best_cl_dist){
           best_cl_dist = dist;
           best_cl1 = cl1;
-          best_cl2 = cl2 + m; // m is pf.shape[1], which should be n_theta//2...
+          best_cl2 = cl2 + m; /* m is pf.shape[1], which should be n_theta//2 */
         }
 
       } /* s */
     } /* cl2 */
   }/* cl1 */
 
-
-  /* update global best for i, j*/
-  ind = i*n + j;
-  clmatrix[ind] = best_cl1;
+  /* update global best for i, j */
+  clmatrix[i*n + j] = best_cl1;
   clmatrix[j*n+i] = best_cl2;  /* [j,i] */
 
-}
+} /* build_clmatrix_kernel */
 
 extern "C" __global__
 void fbuild_clmatrix_kernel(
@@ -88,9 +87,11 @@ void fbuild_clmatrix_kernel(
     const complex<float>* const __restrict__ shift_phases)
 {
   /* n n_img */
-  /* m,r st (n, m, r) = pf.shape, ie len(pf[i])  */
+  /* m angular componentns, n_theta//2 */
+  /* r radial componentns */
+  /* (n, m, r) = pf.shape in python (before transpose for CUDA kernel) */
 
-  /* thread index (1d), represents "i" index */
+  /* thread index (2d), represents "i" and "j" indices */
   const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
   const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
 
@@ -100,7 +101,6 @@ void fbuild_clmatrix_kernel(
   /* no-op lower triangle */
   if(j <= i) return;
 
-  int ind;
   int k;
   int s;
   int cl1, cl2;
@@ -124,7 +124,7 @@ void fbuild_clmatrix_kernel(
           pfjk = conj(pf[k*m*n + cl2*n + j]) * shift_phases[s*r + k];
           p1 += real(pfik) * real(pfjk);
           p2 += imag(pfik) * imag(pfjk);
-        }
+        } /* k */
 
         dist = p1 - p2;
         if(dist > best_cl_dist){
@@ -137,17 +137,15 @@ void fbuild_clmatrix_kernel(
         if(dist > best_cl_dist){
           best_cl_dist = dist;
           best_cl1 = cl1;
-          best_cl2 = cl2 + m; // m is pf.shape[1], which should be n_theta//2...
+          best_cl2 = cl2 + m; /* m is pf.shape[1], which should be n_theta//2 */
         }
 
       } /* s */
     } /* cl2 */
   }/* cl1 */
 
-
-  /* update global best for i, j*/
-  ind = i*n + j;
-  clmatrix[ind] = best_cl1;
+  /* update global best for i, j */
+  clmatrix[i*n + j] = best_cl1;
   clmatrix[j*n+i] = best_cl2;  /* [j,i] */
 
-}
+} /* fbuild_clmatrix_kernel */

From c4ca481f32bcc81e521353b9fe244068dc7bd977 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 10 Oct 2024 15:19:25 -0400
Subject: [PATCH 38/45] self review cleanup

---
 src/aspire/abinitio/commonline_base.py   | 49 +++++++++++++-------
 src/aspire/abinitio/commonline_sync3n.cu | 58 +++++++++++++-----------
 src/aspire/abinitio/commonline_sync3n.py | 26 +++++++----
 src/aspire/abinitio/sync_voting.py       |  2 +-
 4 files changed, 80 insertions(+), 55 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py
index df01578abe..91049890ad 100644
--- a/src/aspire/abinitio/commonline_base.py
+++ b/src/aspire/abinitio/commonline_base.py
@@ -155,7 +155,11 @@ def estimate_rotations(self):
         raise NotImplementedError("subclasses should implement this")
 
     def build_clmatrix(self):
-        """dispatch, compare, dispair"""
+        """
+        Build common-lines matrix from Fourier stack of 2D images
+
+        Wrapper for cpu/gpu dispatch.
+        """
 
         logger.info("Begin building Common Lines Matrix")
 
@@ -242,7 +246,6 @@ def build_clmatrix_host(self):
             subset_j = np.sort(choice(n_remaining, n_j, replace=False) + i + 1)
 
             for j in subset_j:
-                # for j in range(i+1,n_img):  # for testing, rm later
                 p2_flipped = np.conj(pf[j])
 
                 for shift in range(len(shifts)):
@@ -286,14 +289,15 @@ def build_clmatrix_cu(self):
         import cupy as cp
 
         n_img = self.n_img
+        r = pf.shape[2]
 
         if self.n_theta % 2 == 1:
             msg = "n_theta must be even"
             logger.error(msg)
             raise NotImplementedError(msg)
 
-        # need to do a copy to prevent modifying self.pf for other functions
-        # also places on GPU
+        # Copy to prevent modifying self.pf for other functions
+        # Simultaneously place on GPU
         pf = cp.array(self.pf)
 
         # Allocate local variables for return
@@ -305,25 +309,26 @@ def build_clmatrix_cu(self):
         clmatrix = -cp.ones((n_img, n_img), dtype=np.int16)
 
         # Allocate variables used for shift estimation
-
-        # set maximum value of 1D shift (in pixels) to search
+        #
+        # Set maximum value of 1D shift (in pixels) to search
         # between common-lines.
-        max_shift = self.max_shift
         # Set resolution of shift estimation in pixels. Note that
         # shift_step can be any positive real number.
-        shift_step = self.shift_step
-
+        #
         # Prepare the shift phases to try and generate filter for common-line detection
-        r_max = pf.shape[2]
+        #
+        # Note the CUDA implementation has been optimized to not
+        # compute or return diagnostic 1d shifts.
         _, shift_phases, h = self._generate_shift_phase_and_filter(
-            r_max, max_shift, shift_step
+            r, self.max_shift, self.shift_step
         )
         # Transfer to device, dtypes must match kernel header.
         shift_phases = cp.asarray(shift_phases, dtype=complex_type(self.dtype))
 
         # Apply bandpass filter, normalize each ray of each image
-        # Note that only use half of each ray
-        pf = self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h)
+        # Note that this only uses half of each ray
+        pf = self._apply_filter_and_norm("ijk, k -> ijk", pf, r, h)
+
         # Tranpose `pf` for better (CUDA) memory access pattern, and cast as needed.
         pf = cp.ascontiguousarray(pf.T, dtype=complex_type(self.dtype))
 
@@ -336,11 +341,17 @@ def build_clmatrix_cu(self):
             build_clmatrix_kernel = self.__gpu_module.get_function(
                 "fbuild_clmatrix_kernel"
             )
+        else:
+            raise NotImplementedError(
+                "build_clmatrix_kernel only implemented for float32 and float64."
+            )
 
         # Configure grid of blocks
         blkszx = 32
-        nblkx = (self.n_img + blkszx - 2) // blkszx  # n_img-1
+        # Enough blocks to cover n_img-1
+        nblkx = (self.n_img + blkszx - 2) // blkszx
         blkszy = 32
+        # Enough blocks to cover n_img
         nblky = (self.n_img + blkszy - 1) // blkszy
 
         # Launch
@@ -351,7 +362,7 @@ def build_clmatrix_cu(self):
             (
                 n_img,
                 pf.shape[1],
-                r_max,
+                r,
                 pf,
                 clmatrix,
                 len(shift_phases),
@@ -362,6 +373,7 @@ def build_clmatrix_cu(self):
         # Copy result device arrays to host
         clmatrix = clmatrix.get().astype(self.dtype, copy=False)
 
+        # Note diagnostic 1d shifts are not computed in the CUDA implementation.
         return None, clmatrix
 
     def estimate_shifts(self, equations_factor=1, max_memory=4000):
@@ -697,7 +709,7 @@ def _apply_filter_and_norm(self, subscripts, pf, r_max, h):
     def __init_cupy_module():
         """
         Private utility method to read in CUDA source and return as
-        compiled CUPY module.
+        compiled CuPy module.
         """
 
         import cupy as cp
@@ -707,7 +719,10 @@ def __init_cupy_module():
         with open(fp, "r") as fh:
             module_code = fh.read()
 
-        # CUPY compile the CUDA code
+        # CuPy compile the CUDA code
+        # Note these optimizations are to steer aggresive optimization
+        # for single precision code.  Fast math will potentionally
+        # reduce accuracy in single precision.
         return cp.RawModule(
             code=module_code,
             backend="nvcc",
diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index 14e27669b9..65ab91a77d 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -1,8 +1,8 @@
 #include "stdint.h"
 #include "math.h"
 
-/* from i,j indices to the common index in the N-choose-2 sized array */
-// careful, this is stricly the upper triangle!
+/* From i,j indices to the common index in the N-choose-2 sized array */
+/* Careful, this is strictly the upper triangle! */
 #define PAIR_IDX(N,I,J) ((2*N-I-1)*I/2 + J-I-1)
 
 
@@ -459,10 +459,11 @@ void estimate_all_angles1(int j,
                           double* __restrict__ angles)
 {
   /* n n_img */
+  /* j is image j index */
 
-  /* thread index (1d), represents "i" index */
+  /* thread index represents "i" index */
   const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
-  /* j is image j index */
+  /* thread index represents "k" index */
   const unsigned int k = blockDim.y * blockIdx.y + threadIdx.y;
 
   int cl_diff1, cl_diff2, cl_diff3;
@@ -515,9 +516,7 @@ void estimate_all_angles1(int j,
 
   /* test `k` values */
   if(cl_idx13 == -1) return;  /* i, k */
-  //if(cl_idx31 == -1) return;  // k, i ?
   if(cl_idx23 == -1) return;  /* j, k */
-  // if(cl_idx32 == -1) return;  // k, j ?
 
   /* get cosine angles */
   cl_diff1 = cl_idx13 - cl_idx12;
@@ -565,38 +564,43 @@ void estimate_all_angles1(int j,
   } /* end not sync */
 
   /* clip cosine phi between [-1,1] */
-  if(cos_phi2 >1){
+  if(cos_phi2 > 1){
     cos_phi2 = 1;
   }
-  if(cos_phi2 <-1){
+  if(cos_phi2 < -1){
     cos_phi2 = -1;
   }
 
   /* compute histogram contribution, angle mapping, and index mappings. */
   angle = acos(cos_phi2) * 180. / M_PI;
-  /*  angle's bin */
+  /* index of angle's bin */
   map_idx = i*n + k;
-  k_map[map_idx] = angle / hist_bin_width;  // check
+  /*
+    For each k, keep track of bin and angles.
+    Note, this is slightly different than the host
+    which uses slightly different angle/hist grids (likely an oversight).
+  */
+  k_map[map_idx] = angle / hist_bin_width;
   angles_map[map_idx] = angle;  /* degrees */
   for(b=0; b<ntics; b++){
-    // todo, just compute in radians to avoid extra arithmetic
-    grid_angle = b * (180./ntics);  /* grid angle */  // check, I think off by one-ish (grid vs bins)
+    /* Potential optimization, just compute in radians to avoid extra arithmetic. */
+    grid_angle = b * (180./ntics);  /* grid angle */
     angle_diff = (grid_angle - angle);
-    /* accumulate histogram contribution, atomic due to `k` */
+    /* accumulate histogram contribution, atomic due to concurrent `k` */
     atomicAdd(&(hist[i*ntics + b ]),
               exp(-(angle_diff*angle_diff) / ( two_sigma_sq)));
-  } /* bins */
+  } /* b bins */
 
   /*
     At this point, the kernel should have accumulated all "good"
-    k contributions into the histogram for I j.
-    The next section solves the histogram.
+    k contributions into the histogram for I, j.
+    The next kernel solves the histogram.
 
-    Two known euler angles are also initialized while we have the cl_idx values.
+    The two known euler angles are initialized while we have the cl_idx values.
   */
 
   /* Initialize euler angles */
-  /* can be done once? */
+  /* Can be done once, but checking seemed to make the kernel slower... */
   {
     map_idx = pair_idx*3;
     angles[map_idx    ] = cl_idx12 * 2 * M_PI / n_theta + M_PI / 2;
@@ -619,10 +623,10 @@ void estimate_all_angles2(int j,
                           double* __restrict__ angles)
 {
   /* n n_img */
+  /* j is image j index */
 
   /* thread index (1d), represents image i index */
   const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
-  /* j is image j index */
 
   int k;
   int cnt;
@@ -664,15 +668,15 @@ void estimate_all_angles2(int j,
       /* find satisfying indices */
       for(k=0; k<n; k++){
         /* determine if image k in peak bin(s) */
-        // Perhaps transpose the maps so thread i in fast
+        // Perhaps transpose the maps so thread i fast
         map_idx = i*n + k;
         if(abs(k_map[map_idx] - peak_idx) < w_theta_needed){
           cnt += 1;  /* count this image */
           angles[pair_idx*3 + 1] += angles_map[map_idx];  /* accumulate angle */
-        } /* fi < w_theta_needed */
+        } /* < w_theta_needed */
       } /* k */
-    } /* while */
-  } /* fi adaptive */
+    } /* cnt */
+  } /* full_width -1, adaptive */
   else {
     /* fixed width */
     cnt = 0;
@@ -682,13 +686,13 @@ void estimate_all_angles2(int j,
       if(abs(k_map[map_idx] - peak_idx) < full_width){
         cnt += 1;  /* count this image */
         angles[pair_idx*3 + 1] += angles_map[map_idx];  /* accumulate angle */
-      } /* fi full_width */
+      } /* full_width */
     } /* k */
-  } /* fi fixed width */
+  } /* fixed width */
 
   /* Divide accumulated angles (resulting in the mean alpha angle) */
-  // (todo, better handle 0/0?)
-  /* convert degree radian */
+  // (todo, can we have cnt = 0?)
+  /* convert degree to radian */
   angles[pair_idx*3 + 1] *= M_PI / (180*cnt);
 
 } /* estimate_all_angles2 kernel */
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 3110e179c3..eb9364aa77 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -62,6 +62,7 @@ def __init__(
         full_width="adaptive",
         epsilon=1e-2,
         max_iters=1000,
+        sigma=3,
         seed=None,
         mask=True,
         S_weighting=False,
@@ -83,6 +84,7 @@ def __init__(
             `hist_bin_width`s required to find at least one valid image index.
         :param epsilon: Tolerance for the power method.
         :param max_iter: Maximum iterations for the power method.
+        :param sigma: Voting contribution smoothing factor.
         :param seed: Optional seed for RNG.
         :param mask: Option to mask `src.images` with a fuzzy mask (boolean).
             Default, `True`, applies a mask.
@@ -113,6 +115,7 @@ def __init__(
 
         self.epsilon = epsilon
         self.max_iters = max_iters
+        self.sigma = float(sigma)
         self.seed = seed
 
         # Sync3N specific vars
@@ -838,7 +841,8 @@ def _estimate_all_Rijs_cu(self, clmatrix):
         estimate_all_angles2 = self.__gpu_module.get_function("estimate_all_angles2")
         angles_to_rots = self.__gpu_module.get_function("angles_to_rots")
 
-        sigma = 3.0
+        # Use the sync3n MATLAB implementation,
+        # other mode exists to support other CL methods.
         sync = 1
 
         # transfer input to device
@@ -848,18 +852,20 @@ def _estimate_all_Rijs_cu(self, clmatrix):
         ntics = int(180 / self.hist_bin_width)
         n_pairs = self.n_img * (self.n_img - 1) // 2
         hist = cp.zeros((self.n_img, ntics), dtype=np.float64)
-        # k_map stores the mapping of k indices to histogram bins
+        # k_map stores the mapping of i, k indices to histogram bins
         k_map = cp.zeros((self.n_img, self.n_img), dtype=np.uint16)
+        # angles_map stores mapping of i, k indices to angles
         angles_map = cp.zeros((self.n_img, self.n_img), dtype=np.float64)
+        # resulting pairs i,j euler angles
         angles = cp.zeros((n_pairs, 3), dtype=np.float64)
 
-        # Configure 2d grid of blocks (kernel part 1)
+        # Configure 2d grid of blocks (kernel 1)
         blkszx = 32
-        nblkx = (self.n_img + blkszx - 1) // blkszx
+        nblkx = (self.n_img + blkszx - 1) // blkszx  # i
         blkszy = 32
-        nblky = (self.n_img + blkszy - 1) // blkszy
+        nblky = (self.n_img + blkszy - 1) // blkszy  # k
 
-        # Configure 1d grid of blocks (kernel part 2)
+        # Configure 1d grid of blocks (kernel 2)
         blksz = 1024
         nblk = (self.n_img + blksz - 1) // blksz
 
@@ -885,7 +891,7 @@ def _estimate_all_Rijs_cu(self, clmatrix):
                     self.n_theta,
                     np.float64(self.hist_bin_width),
                     self.full_width,
-                    np.float64(sigma),
+                    np.float64(self.sigma),
                     sync,
                     clmatrix,  # input
                     hist,  # tmp
@@ -912,12 +918,12 @@ def _estimate_all_Rijs_cu(self, clmatrix):
                 ),
             )
 
-        # Force all kernels to complete
+        # Force all kernels to complete before timing
         cp.cuda.runtime.deviceSynchronize()
         toc1 = perf_counter()
         logger.info(f"estimate_all_angles: {toc1-toc0}s")
 
-        # Inform CuPy we longer need these workspace vars
+        # Explicitly inform CuPy we longer need these workspace vars
         del hist
         del k_map
         del angles_map
@@ -926,7 +932,7 @@ def _estimate_all_Rijs_cu(self, clmatrix):
         # Convert angles to rotations
         rotations = cp.empty((n_pairs, 3, 3), dtype=np.float64)
 
-        # Configure 1d grid of blocks
+        # Configure another 1d grid of blocks
         blksz = 1024
         nblk = (n_pairs + blksz - 1) // blksz
 
diff --git a/src/aspire/abinitio/sync_voting.py b/src/aspire/abinitio/sync_voting.py
index 63cb6a923f..651ba35c5e 100644
--- a/src/aspire/abinitio/sync_voting.py
+++ b/src/aspire/abinitio/sync_voting.py
@@ -142,7 +142,7 @@ def _vote_ij(self, clmatrix, n_theta, i, j, k_list, sync=False):
         # similar. This sigma ensures that the width of the density
         # estimation kernel is roughly 10 degrees. For 15 degrees, the
         # value of the kernel is negligible.
-        sigma = 3.0
+        sigma = getattr(self, "sigma", 3.0)  # get from class if avail
 
         # Compute the histogram of the angles between images i and j
         angles_distances = angles_grid[None, :] - angles[:, None]

From 1631cff0d3a75f8804692011c2670c392caaef9b Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 10 Oct 2024 15:22:03 -0400
Subject: [PATCH 39/45] use adaptive width mode for sync3n tests

---
 tests/test_commonline_sync3n.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_commonline_sync3n.py b/tests/test_commonline_sync3n.py
index 53fa544ea6..68f1eb84e9 100644
--- a/tests/test_commonline_sync3n.py
+++ b/tests/test_commonline_sync3n.py
@@ -63,7 +63,7 @@ def source_orientation_objs(resolution, offsets, dtype):
         shift_step = 0.25  # Reduce shift steps for non-integer offsets of Simulation.
 
     orient_est = CLSync3N(
-        src, max_shift=max_shift, shift_step=shift_step, seed=789, full_width=2
+        src, max_shift=max_shift, shift_step=shift_step, seed=789,
     )
 
     # Estimate rotations once for all tests.

From 706c2bdfb6893184ce7946da1d351c7ee31dbe4a Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 10 Oct 2024 15:41:17 -0400
Subject: [PATCH 40/45] add additional sync3n code paths

---
 src/aspire/abinitio/commonline_base.py |  2 +-
 tests/test_commonline_sync3n.py        | 33 +++++++++++++++++++++++++-
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py
index 91049890ad..cda8d51d78 100644
--- a/src/aspire/abinitio/commonline_base.py
+++ b/src/aspire/abinitio/commonline_base.py
@@ -289,7 +289,7 @@ def build_clmatrix_cu(self):
         import cupy as cp
 
         n_img = self.n_img
-        r = pf.shape[2]
+        r = self.pf.shape[2]
 
         if self.n_theta % 2 == 1:
             msg = "n_theta must be even"
diff --git a/tests/test_commonline_sync3n.py b/tests/test_commonline_sync3n.py
index 68f1eb84e9..815381f303 100644
--- a/tests/test_commonline_sync3n.py
+++ b/tests/test_commonline_sync3n.py
@@ -63,7 +63,10 @@ def source_orientation_objs(resolution, offsets, dtype):
         shift_step = 0.25  # Reduce shift steps for non-integer offsets of Simulation.
 
     orient_est = CLSync3N(
-        src, max_shift=max_shift, shift_step=shift_step, seed=789,
+        src,
+        max_shift=max_shift,
+        shift_step=shift_step,
+        seed=789,
     )
 
     # Estimate rotations once for all tests.
@@ -138,3 +141,31 @@ def test_estimate_rotations(source_orientation_objs):
     if src.offsets.all() != 0:
         tol = 4
     mean_aligned_angular_distance(orient_est.rotations, src.rotations, degree_tol=tol)
+
+
+@pytest.mark.expensive
+def test_weighted_sync3n(source_orientation_objs):
+    """
+    Test alternative Sync3N configuration code paths.
+    """
+    src, _ = source_orientation_objs
+
+    orient_est = CLSync3N(
+        src, seed=789, S_weighting=True, J_weighting=True, full_width=2, sigma=3.1415
+    )
+    # Estimate rotations
+    orient_est.estimate_rotations()
+
+    gt_clmatrix = rots_to_clmatrix(src.rotations, orient_est.n_theta)
+
+    angle_diffs = abs(orient_est.clmatrix - gt_clmatrix) * 360 / orient_est.n_theta
+
+    # Count number of estimates within 5 degrees of ground truth.
+    within_5 = np.sum((angle_diffs - 360) % 360 < 5)
+
+    # Check that at least 98% of estimates are within 5 degrees.
+    tol = 0.98
+    if src.offsets.all() != 0:
+        # Set tolerance to 75% when using nonzero offsets.
+        tol = 0.75
+    assert within_5 / angle_diffs.size > tol

From f2025c8865f38e0952503f7ef3f50543f7313ccc Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 11 Oct 2024 15:40:17 -0400
Subject: [PATCH 41/45] must use smaller shift step for unit test size problem

---
 tests/test_commonline_sync3n.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/tests/test_commonline_sync3n.py b/tests/test_commonline_sync3n.py
index 815381f303..8305aa85e9 100644
--- a/tests/test_commonline_sync3n.py
+++ b/tests/test_commonline_sync3n.py
@@ -150,8 +150,22 @@ def test_weighted_sync3n(source_orientation_objs):
     """
     src, _ = source_orientation_objs
 
+    # Search for common lines over less shifts for 0 offsets.
+    max_shift = 1 / src.L
+    shift_step = 1
+    if src.offsets.all() != 0:
+        max_shift = 0.20
+        shift_step = 0.25  # Reduce shift steps for non-integer offsets of Simulation.
+
     orient_est = CLSync3N(
-        src, seed=789, S_weighting=True, J_weighting=True, full_width=2, sigma=3.1415
+        src,
+        max_shift=max_shift,
+        shift_step=shift_step,
+        seed=789,
+        S_weighting=True,
+        J_weighting=True,
+        full_width=2,
+        sigma=2.9,
     )
     # Estimate rotations
     orient_est.estimate_rotations()

From 13cc09fe4b1afb29c67bd7a40b12f88c48c37c23 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 16 Oct 2024 10:33:38 -0400
Subject: [PATCH 42/45] Remove missed debug string change

---
 src/aspire/abinitio/commonline_base.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py
index cda8d51d78..3329e117a3 100644
--- a/src/aspire/abinitio/commonline_base.py
+++ b/src/aspire/abinitio/commonline_base.py
@@ -228,9 +228,7 @@ def build_clmatrix_host(self):
 
         # Setup a progress bar
         _total_pairs_to_test = self.n_img * (self.n_check - 1) // 2
-        pbar = tqdm(
-            desc="Searching over common line pairs (orig)", total=_total_pairs_to_test
-        )
+        pbar = tqdm(desc="Searching over common line pairs", total=_total_pairs_to_test)
 
         # Search for common lines between [i, j] pairs of images.
         # Creating pf and building common lines are different to the Matlab version.

From 78030fe1dea3b8cc69e90baafd5e7631c9e4d5b8 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 16 Oct 2024 10:35:01 -0400
Subject: [PATCH 43/45] Remove range(0,...)

---
 src/aspire/abinitio/commonline_sync3n.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index eb9364aa77..9dd394e00b 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -873,7 +873,7 @@ def _estimate_all_Rijs_cu(self, clmatrix):
 
         logger.info("Launching `estimate_all_angles` kernel.")
         toc0 = perf_counter()
-        for j in range(0, self.n_img):
+        for j in range(self.n_img):
 
             # ------------------------------------------
             # Zero histogram and k mapping for each `j`.

From 6a99b0589e2deea3c13486a44acbd9557936c6cc Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 18 Oct 2024 09:22:43 -0400
Subject: [PATCH 44/45] change var name from dist to xcorr

---
 src/aspire/abinitio/commonline_base.cu | 32 +++++++++++++-------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu
index 26529a03cc..a3ee96ceed 100644
--- a/src/aspire/abinitio/commonline_base.cu
+++ b/src/aspire/abinitio/commonline_base.cu
@@ -31,13 +31,13 @@ void build_clmatrix_kernel(
   int s;
   int cl1, cl2;
   int best_cl1, best_cl2;
-  double dist, best_cl_dist;
+  double xcorr, best_cl_xcorr;
   double p1, p2;
   complex<double> pfik, pfjk;
 
   best_cl1 = -1;
   best_cl2 = -1;
-  best_cl_dist = -INFINITY;
+  best_cl_xcorr = -INFINITY;
 
   for(cl1=0; cl1<m; cl1++){
     for(cl2=0; cl2<m; cl2++){
@@ -52,16 +52,16 @@ void build_clmatrix_kernel(
           p2 += imag(pfik) * imag(pfjk);
         } /* k */
 
-        dist = p1 - p2;
-        if(dist > best_cl_dist){
-          best_cl_dist = dist;
+        xcorr = p1 - p2;
+        if(xcorr > best_cl_xcorr){
+          best_cl_xcorr = xcorr;
           best_cl1 = cl1;
           best_cl2 = cl2;
         }
 
-        dist = p1 + p2;
-        if(dist > best_cl_dist){
-          best_cl_dist = dist;
+        xcorr = p1 + p2;
+        if(xcorr > best_cl_xcorr){
+          best_cl_xcorr = xcorr;
           best_cl1 = cl1;
           best_cl2 = cl2 + m; /* m is pf.shape[1], which should be n_theta//2 */
         }
@@ -105,13 +105,13 @@ void fbuild_clmatrix_kernel(
   int s;
   int cl1, cl2;
   int best_cl1, best_cl2;
-  float dist, best_cl_dist;
+  float xcorr, best_cl_xcorr;
   float p1, p2;
   complex<float> pfik, pfjk;
 
   best_cl1 = -1;
   best_cl2 = -1;
-  best_cl_dist = -INFINITY;
+  best_cl_xcorr = -INFINITY;
 
   for(cl1=0; cl1<m; cl1++){
     for(cl2=0; cl2<m; cl2++){
@@ -126,16 +126,16 @@ void fbuild_clmatrix_kernel(
           p2 += imag(pfik) * imag(pfjk);
         } /* k */
 
-        dist = p1 - p2;
-        if(dist > best_cl_dist){
-          best_cl_dist = dist;
+        xcorr = p1 - p2;
+        if(xcorr > best_cl_xcorr){
+          best_cl_xcorr = xcorr;
           best_cl1 = cl1;
           best_cl2 = cl2;
         }
 
-        dist = p1 + p2;
-        if(dist > best_cl_dist){
-          best_cl_dist = dist;
+        xcorr = p1 + p2;
+        if(xcorr > best_cl_xcorr){
+          best_cl_xcorr = xcorr;
           best_cl1 = cl1;
           best_cl2 = cl2 + m; /* m is pf.shape[1], which should be n_theta//2 */
         }

From c47b626c508a5777c4dfb7cc1f57f8c617ed993c Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 18 Oct 2024 09:26:24 -0400
Subject: [PATCH 45/45] Remove kernel timing

---
 src/aspire/abinitio/commonline_sync3n.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 9dd394e00b..879728436d 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -869,10 +869,6 @@ def _estimate_all_Rijs_cu(self, clmatrix):
         blksz = 1024
         nblk = (self.n_img + blksz - 1) // blksz
 
-        from time import perf_counter
-
-        logger.info("Launching `estimate_all_angles` kernel.")
-        toc0 = perf_counter()
         for j in range(self.n_img):
 
             # ------------------------------------------
@@ -918,10 +914,8 @@ def _estimate_all_Rijs_cu(self, clmatrix):
                 ),
             )
 
-        # Force all kernels to complete before timing
+        # Force all kernels to complete
         cp.cuda.runtime.deviceSynchronize()
-        toc1 = perf_counter()
-        logger.info(f"estimate_all_angles: {toc1-toc0}s")
 
         # Explicitly inform CuPy we longer need these workspace vars
         del hist
@@ -947,9 +941,8 @@ def _estimate_all_Rijs_cu(self, clmatrix):
             ),
         )
 
+        # Force all kernels to complete
         cp.cuda.runtime.deviceSynchronize()
-        toc2 = perf_counter()
-        logger.info(f"angles_to_rots: {toc2-toc1}s")
 
         # transfer results device to host
         rotations = rotations.get()