From ab54323799ea0db9b9c6a4b74747e175d6e1a3fc Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 27 Mar 2024 14:48:21 -0400
Subject: [PATCH 01/60] init add

---
 src/aspire/abinitio/commonline_sync3n.py | 269 +++++++++++++++++++++++
 1 file changed, 269 insertions(+)
 create mode 100644 src/aspire/abinitio/commonline_sync3n.py

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
new file mode 100644
index 0000000000..a31ec87032
--- /dev/null
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -0,0 +1,269 @@
+import logging
+
+import numpy as np
+from numpy.linalg import eigh, norm, svd
+
+from aspire.abinitio import CLOrient3D, SyncVotingMixin
+from aspire.operators import PolarFT
+from aspire.utils import (
+    J_conjugate,
+    Rotation,
+    all_pairs,
+    all_triplets,
+    anorm,
+    cyclic_rotations,
+    tqdm,
+    trange,
+)
+from aspire.utils.random import randn
+
+logger = logging.getLogger(__name__)
+
+
+class CLSync3N(CLOrient3D, SyncVotingMixin):
+    """
+    Define a class to estimate 3D orientations using common lines (2017) methods.
+    """
+
+    def __init__(
+        self,
+        src,
+        n_rad=None,
+        n_theta=None,
+        max_shift=0.15,
+        shift_step=1,
+        epsilon=1e-3,
+        max_iters=1000,
+        degree_res=1,
+        seed=None,
+        mask=True,
+    ):
+        """
+        Initialize object for estimating 3D orientations.
+
+        :param src: The source object of 2D denoised or class-averaged images with metadata
+        :param n_rad: The number of points in the radial direction
+        :param n_theta: The number of points in the theta direction
+        :param max_shift: Maximum range for shifts as a proportion of resolution. Default = 0.15.
+        :param shift_step: Resolution of shift estimation in pixels. Default = 1 pixel.
+        :param epsilon: Tolerance for the power method.
+        :param max_iters: Maximum iterations for the power method.
+        :param degree_res: Degree resolution for estimating in-plane rotations.
+        :param seed: Optional seed for RNG.
+        :param mask: Option to mask `src.images` with a fuzzy mask (boolean).
+            Default, `True`, applies a mask.
+        """
+
+        super().__init__(
+            src,
+            n_rad=n_rad,
+            n_theta=n_theta,
+            max_shift=max_shift,
+            shift_step=shift_step,
+            mask=mask,
+        )
+
+        self.epsilon = epsilon
+        self.max_iters = max_iters
+        self.degree_res = degree_res
+        self.seed = seed
+
+    def estimate_rotations(self):
+        """
+        Estimate rotation matrices for molecules with C3 or C4 symmetry.
+
+        :return: Array of rotation matrices, size n_imgx3x3.
+        """
+        Rij0 = self._estimate_relative_viewing_directions()
+
+        logger.info("Performing global handedness synchronization.")
+        Rij = self._global_J_sync(Rij0)
+
+        # sync3n
+        S = cryo_sync3n_syncmatrix(Rij)
+
+        # optionally S weights
+
+        # S to rot
+        # cryo_sync3n_S_to_rot(S)
+
+        self.rotations = Ris
+
+    ###########################################
+    # The hackberries taste like hackberries  #
+    ###########################################
+    def cryo_sync3n_S_to_rot(S):
+        """
+        S is (n_img, n_img, 3,3)
+        """
+
+        # Convert S to stupid shape
+        S = np.transpose(S, (0, 2, 1, 3)).reshape(3 * n_img, 3 * n_img)
+
+        # Extract three eigenvectors corresponding to non-zero eigenvalues.
+        d, v = stable_eigsh(S, 10)
+        sort_idx = np.argsort(-d)
+        logger.info(
+            f"Top 10 eigenvalues from synchronization voting matrix: {d[sort_idx]}"
+        )
+
+        # Only need the top 3 eigen-vectors.
+        v = v[:, sort_idx[:3]]
+
+        v1 = v[: 3 * n_img : 3].T.copy()
+        v2 = v[1 : 3 * n_img : 3].T.copy()
+        v3 = v[2 : 3 * n_img : 3].T.copy()
+
+        rotations = np.empty((n_img, 3, 3), dtype=self.dtype)
+        rotations[:, :, 0] = v1.T
+        rotations[:, :, 1] = v2.T
+        rotations[:, :, 2] = v3.T
+        # Make sure that we got rotations by enforcing R to be
+        # a rotation (in case the error is large)
+        rotations = nearest_rotations(rotations)
+
+        return rotations
+
+    def cryo_sync3n_syncmatrix(Rij):
+
+        S = np.zeros((self.n_img, self.n_img, 3, 3), dtype=self.dtype)
+        I = np.eye(3, dtype=self.dtype)
+
+        idx = 0
+        for i in range(self.n_img):
+            # S( (3*i-2):(3*i) , (3*i-2):(3*i) ) = I; % Rii = I
+            S[i, i] = I
+            for j in range(i + 1, N):
+                idx += 1
+                # S( (3*i-2):(3*i) , (3*j-2):(3*j) ) = Rij(:,:,idx); % Rij
+                S[i, j] = Rij[idx]
+                # S( (3*j-2):(3*j) , (3*i-2):(3*i) ) = Rij(:,:,idx)'; % Rji = Rij'
+                S[j, i] = Rij[idx].T
+
+        return S
+
+    ###########################################
+    # Primary Methods                         #
+    ###########################################
+
+    def _estimate_relative_viewing_directions(self):
+        """
+        Estimate the relative viewing directions vij = vi*vj^T, i<j, and vii = vi*vi^T, where
+        vi is the third row of the i'th rotation matrix Ri.
+        """
+        logger.info(f"Estimating relative viewing directions for {self.n_img} images.")
+        # Step 1: Detect a single pair of common-lines between each pair of images
+        self.build_clmatrix()
+
+        # Step 4: Calculate relative rotations
+        Rijs = self._estimate_all_Rijs_c3_c4(clmatrix)
+
+        return Rijs
+
+    def _global_J_sync(self, vijs):
+        """ """
+        n_img = self.n_img
+
+        # Determine relative handedness of vijs.
+        sign_ij_J = self._J_sync_power_method(vijs)
+
+        # Synchronize vijs
+        for i, sign in enumerate(sign_ij_J):
+            if sign == -1:
+                vijs[i] = J_conjugate(vijs[i])
+
+        return vijs
+
+    def _estimate_all_Rijs_c3_c4(self, clmatrix):
+        """
+        Estimate Rijs using the voting method.
+        """
+        n_img = self.n_img
+        n_theta = self.n_theta
+        pairs = all_pairs(n_img)
+        Rijs = np.zeros((len(pairs), 3, 3))
+
+        for idx, (i, j) in enumerate(pairs):
+            Rijs[idx] = self._syncmatrix_ij_vote_3n(
+                clmatrix, i, j, np.arange(n_img), n_theta
+            )
+
+        return Rijs
+
+    def _syncmatrix_ij_vote_3n(self, clmatrix, i, j, k_list, n_theta):
+        """
+        Compute the (i,j) rotation block of the synchronization matrix using voting method
+
+        Given the common lines matrix `clmatrix`, a list of images specified in k_list
+        and the number of common lines n_theta, find the (i, j) rotation block Rij.
+
+        :param clmatrix: The common lines matrix
+        :param i: The i image
+        :param j: The j image
+        :param k_list: The list of images for the third image for voting algorithm
+        :param n_theta: The number of points in the theta direction (common lines)
+        :return: The (i,j) rotation block of the synchronization matrix
+        """
+        good_k = self._vote_ij(clmatrix, n_theta, i, j, k_list)
+
+        rots = self._rotratio_eulerangle_vec(clmatrix, i, j, good_k, n_theta)
+
+        if rots is not None:
+            rot_mean = np.mean(rots, 0)
+
+        else:
+            # This is for the case that images i and j correspond to the same
+            # viewing direction and differ only by in-plane rotation.
+            # We set to zero as in the Matlab code.
+            rot_mean = np.zeros((3, 3))
+
+        return rot_mean
+
+    #######################################
+    # Secondary Methods for Global J Sync #
+    #######################################
+
+    def _J_sync_power_method(self, vijs):
+        """
+        Calculate the leading eigenvector of the J-synchronization matrix
+        using the power method.
+
+        As the J-synchronization matrix is of size (n-choose-2)x(n-choose-2), we
+        use the power method to compute the eigenvalues and eigenvectors,
+        while constructing the matrix on-the-fly.
+
+        :param vijs: (n-choose-2)x3x3 array of estimates of relative orientation matrices.
+
+        :return: An array of length n-choose-2 consisting of 1 or -1, where the sign of the
+        i'th entry indicates whether the i'th relative orientation matrix will be J-conjugated.
+        """
+
+        # Set power method tolerance and maximum iterations.
+        epsilon = self.epsilon
+        max_iters = self.max_iters
+
+        # Initialize candidate eigenvectors
+        n_vijs = vijs.shape[0]
+        vec = randn(n_vijs, seed=self.seed)
+        vec = vec / norm(vec)
+        residual = 1
+        itr = 0
+
+        # Power method iterations
+        logger.info(
+            "Initiating power method to estimate J-synchronization matrix eigenvector."
+        )
+        while itr < max_iters and residual > epsilon:
+            itr += 1
+            vec_new = self._signs_times_v(vijs, vec)
+            vec_new = vec_new / norm(vec_new)
+            residual = norm(vec_new - vec)
+            vec = vec_new
+            logger.info(
+                f"Iteration {itr}, residual {round(residual, 5)} (target {epsilon})"
+            )
+
+        # We need only the signs of the eigenvector
+        J_sync = np.sign(vec)
+
+        return J_sync

From f8cff85af445d3cdf3162ff952307a17dd750e20 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 27 Mar 2024 16:19:48 -0400
Subject: [PATCH 02/60] fix typos

---
 src/aspire/abinitio/__init__.py          |   1 +
 src/aspire/abinitio/commonline_sync3n.py | 112 ++++++++++++++++++++---
 2 files changed, 100 insertions(+), 13 deletions(-)

diff --git a/src/aspire/abinitio/__init__.py b/src/aspire/abinitio/__init__.py
index ff14cc2d45..9d4b0f483c 100644
--- a/src/aspire/abinitio/__init__.py
+++ b/src/aspire/abinitio/__init__.py
@@ -4,6 +4,7 @@
 
 # isort: off
 from .commonline_sync import CLSyncVoting
+from .commonline_sync3n import CLSync3N
 from .commonline_c3_c4 import CLSymmetryC3C4
 from .commonline_cn import CLSymmetryCn
 from .commonline_c2 import CLSymmetryC2
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index a31ec87032..efe6c8b179 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -16,6 +16,8 @@
     trange,
 )
 from aspire.utils.random import randn
+from aspire.utils.matlab_compat import stable_eigsh
+from aspire.utils import nearest_rotations
 
 logger = logging.getLogger(__name__)
 
@@ -32,7 +34,7 @@ def __init__(
         n_theta=None,
         max_shift=0.15,
         shift_step=1,
-        epsilon=1e-3,
+        epsilon=1e-2,
         max_iters=1000,
         degree_res=1,
         seed=None,
@@ -80,25 +82,25 @@ def estimate_rotations(self):
         Rij = self._global_J_sync(Rij0)
 
         # sync3n
-        S = cryo_sync3n_syncmatrix(Rij)
+        S = self.cryo_sync3n_syncmatrix(Rij)
 
         # optionally S weights
 
         # S to rot
-        # cryo_sync3n_S_to_rot(S)
+        Ris = self.cryo_sync3n_S_to_rot(S)
 
         self.rotations = Ris
 
     ###########################################
     # The hackberries taste like hackberries  #
     ###########################################
-    def cryo_sync3n_S_to_rot(S):
+    def cryo_sync3n_S_to_rot(self, S):
         """
         S is (n_img, n_img, 3,3)
         """
 
         # Convert S to stupid shape
-        S = np.transpose(S, (0, 2, 1, 3)).reshape(3 * n_img, 3 * n_img)
+        S = np.transpose(S, (0, 2, 1, 3)).reshape(3 * self.n_img, 3 * self.n_img)
 
         # Extract three eigenvectors corresponding to non-zero eigenvalues.
         d, v = stable_eigsh(S, 10)
@@ -110,11 +112,11 @@ def cryo_sync3n_S_to_rot(S):
         # Only need the top 3 eigen-vectors.
         v = v[:, sort_idx[:3]]
 
-        v1 = v[: 3 * n_img : 3].T.copy()
-        v2 = v[1 : 3 * n_img : 3].T.copy()
-        v3 = v[2 : 3 * n_img : 3].T.copy()
+        v1 = v[: 3 * self.n_img : 3].T.copy()
+        v2 = v[1 : 3 * self.n_img : 3].T.copy()
+        v3 = v[2 : 3 * self.n_img : 3].T.copy()
 
-        rotations = np.empty((n_img, 3, 3), dtype=self.dtype)
+        rotations = np.empty((self.n_img, 3, 3), dtype=self.dtype)
         rotations[:, :, 0] = v1.T
         rotations[:, :, 1] = v2.T
         rotations[:, :, 2] = v3.T
@@ -124,7 +126,7 @@ def cryo_sync3n_S_to_rot(S):
 
         return rotations
 
-    def cryo_sync3n_syncmatrix(Rij):
+    def cryo_sync3n_syncmatrix(self, Rij):
 
         S = np.zeros((self.n_img, self.n_img, 3, 3), dtype=self.dtype)
         I = np.eye(3, dtype=self.dtype)
@@ -133,12 +135,12 @@ def cryo_sync3n_syncmatrix(Rij):
         for i in range(self.n_img):
             # S( (3*i-2):(3*i) , (3*i-2):(3*i) ) = I; % Rii = I
             S[i, i] = I
-            for j in range(i + 1, N):
-                idx += 1
+            for j in range(i + 1, self.n_img):
                 # S( (3*i-2):(3*i) , (3*j-2):(3*j) ) = Rij(:,:,idx); % Rij
                 S[i, j] = Rij[idx]
                 # S( (3*j-2):(3*j) , (3*i-2):(3*i) ) = Rij(:,:,idx)'; % Rji = Rij'
                 S[j, i] = Rij[idx].T
+                idx += 1
 
         return S
 
@@ -156,7 +158,7 @@ def _estimate_relative_viewing_directions(self):
         self.build_clmatrix()
 
         # Step 4: Calculate relative rotations
-        Rijs = self._estimate_all_Rijs_c3_c4(clmatrix)
+        Rijs = self._estimate_all_Rijs_c3_c4(self.clmatrix)
 
         return Rijs
 
@@ -267,3 +269,87 @@ def _J_sync_power_method(self, vijs):
         J_sync = np.sign(vec)
 
         return J_sync
+    def _signs_times_v(self, vijs, vec):
+        """
+        Multiplication of the J-synchronization matrix by a candidate eigenvector.
+
+        The J-synchronization matrix is a matrix representation of the handedness graph, Gamma, whose set of
+        nodes consists of the estimates vijs and whose set of edges consists of the undirected edges between
+        all triplets of estimates vij, vjk, and vik, where i<j<k. The weight of an edge is set to +1 if its
+        incident nodes agree in handedness and -1 if not.
+
+        The J-synchronization matrix is of size (n-choose-2)x(n-choose-2), where each entry corresponds to
+        the relative handedness of vij and vjk. The entry (ij, jk), where ij and jk are retrieved from the
+        all_pairs indexing, is 1 if vij and vjk are of the same handedness and -1 if not. All other entries
+        (ij, kl) hold a zero.
+
+        Due to the large size of the J-synchronization matrix we construct it on the fly as follows.
+        For each triplet of outer products vij, vjk, and vik, the associated elements of the J-synchronization
+        matrix are populated with +1 or -1 and multiplied by the corresponding elements of
+        the current candidate eigenvector supplied by the power method. The new candidate eigenvector
+        is updated for each triplet.
+
+        :param vijs: (n-choose-2)x3x3 array, where each 3x3 slice holds the outer product of vi and vj.
+
+        :param vec: The current candidate eigenvector of length n-choose-2 from the power method.
+
+        :return: New candidate eigenvector of length n-choose-2. The product of the J-sync matrix and vec.
+        """
+
+        # All pairs (i,j) and triplets (i,j,k) where i<j<k
+        n_img = self.n_img
+        triplets = all_triplets(n_img)
+        pairs, pairs_to_linear = all_pairs(n_img, return_map=True)
+
+        # There are 4 possible configurations of relative handedness for each triplet (vij, vjk, vik).
+        # 'conjugate' expresses which node of the triplet must be conjugated (True) to achieve synchronization.
+        conjugate = np.empty((4, 3), bool)
+        conjugate[0] = [False, False, False]
+        conjugate[1] = [True, False, False]
+        conjugate[2] = [False, True, False]
+        conjugate[3] = [False, False, True]
+
+        # 'edges' corresponds to whether conjugation agrees between the pairs (vij, vjk), (vjk, vik),
+        # and (vik, vij). True if the pairs are in agreement, False otherwise.
+        edges = np.empty((4, 3), bool)
+        edges[:, 0] = conjugate[:, 0] == conjugate[:, 1]
+        edges[:, 1] = conjugate[:, 1] == conjugate[:, 2]
+        edges[:, 2] = conjugate[:, 2] == conjugate[:, 0]
+
+        # The corresponding entries in the J-synchronization matrix are +1 if the pair of nodes agree, -1 if not.
+        edge_signs = np.where(edges, 1, -1)
+
+        # For each triplet of nodes we apply the 4 configurations of conjugation and determine the
+        # relative handedness based on the condition that vij @ vjk - vik = 0 for synchronized nodes.
+        # We then construct the corresponding entries of the J-synchronization matrix with 'edge_signs'
+        # corresponding to the conjugation configuration producing the smallest residual for the above
+        # condition. Finally, we multiply the 'edge_signs' by the corresponding entries of 'vec'.
+        v = vijs
+        new_vec = np.zeros_like(vec)
+        for i, j, k in triplets:
+            ij = pairs_to_linear[i, j]
+            jk = pairs_to_linear[j, k]
+            ik = pairs_to_linear[i, k]
+            vij, vjk, vik = v[ij], v[jk], v[ik]
+            vij_J = J_conjugate(vij)
+            vjk_J = J_conjugate(vjk)
+            vik_J = J_conjugate(vik)
+
+            conjugated_pairs = np.where(
+                conjugate[..., np.newaxis, np.newaxis],
+                [vij_J, vjk_J, vik_J],
+                [vij, vjk, vik],
+            )
+            residual = np.stack([norm(x @ y - z) for x, y, z in conjugated_pairs])
+
+            min_residual = np.argmin(residual)
+
+            # Assign edge weights
+            s_ij_jk, s_ik_jk, s_ij_ik = edge_signs[min_residual]
+
+            # Update multiplication of signs times vec (each edge weight is applied symmetrically)
+            new_vec[ij] += s_ij_jk * vec[jk] + s_ij_ik * vec[ik]
+            new_vec[jk] += s_ij_jk * vec[ij] + s_ik_jk * vec[ik]
+            new_vec[ik] += s_ij_ik * vec[ij] + s_ik_jk * vec[jk]
+
+        return new_vec

From e61fbb2f5d0ae72bf93da5d29dd0738352500d84 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 2 Apr 2024 11:33:49 -0400
Subject: [PATCH 03/60] cleanup S init and usage, func names, etc

---
 src/aspire/abinitio/commonline_sync3n.py | 103 +++++++++++------------
 1 file changed, 51 insertions(+), 52 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index efe6c8b179..ce41d687f0 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -1,30 +1,19 @@
 import logging
 
 import numpy as np
-from numpy.linalg import eigh, norm, svd
+from numpy.linalg import norm
 
 from aspire.abinitio import CLOrient3D, SyncVotingMixin
-from aspire.operators import PolarFT
-from aspire.utils import (
-    J_conjugate,
-    Rotation,
-    all_pairs,
-    all_triplets,
-    anorm,
-    cyclic_rotations,
-    tqdm,
-    trange,
-)
-from aspire.utils.random import randn
+from aspire.utils import J_conjugate, all_pairs, all_triplets, nearest_rotations
 from aspire.utils.matlab_compat import stable_eigsh
-from aspire.utils import nearest_rotations
+from aspire.utils.random import randn
 
 logger = logging.getLogger(__name__)
 
 
 class CLSync3N(CLOrient3D, SyncVotingMixin):
     """
-    Define a class to estimate 3D orientations using common lines (2017) methods.
+    Define a class to estimate 3D orientations using common lines Sync3N methods (2017).
     """
 
     def __init__(
@@ -70,78 +59,87 @@ def __init__(
         self.degree_res = degree_res
         self.seed = seed
 
+    ###########################################
+    # High level algorithm steps              #
+    ###########################################
     def estimate_rotations(self):
         """
-        Estimate rotation matrices for molecules with C3 or C4 symmetry.
+        Estimate rotation matrices.
 
         :return: Array of rotation matrices, size n_imgx3x3.
         """
+
+        # Initial estimate of viewing directions
         Rij0 = self._estimate_relative_viewing_directions()
 
-        logger.info("Performing global handedness synchronization.")
+        # Compute and apply global handedness
         Rij = self._global_J_sync(Rij0)
 
-        # sync3n
-        S = self.cryo_sync3n_syncmatrix(Rij)
+        # Build sync3n matrix
+        S = self._construct_sync3n_matrix(Rij)
 
-        # optionally S weights
+        # Optionally S weights
+        # todo
 
-        # S to rot
-        Ris = self.cryo_sync3n_S_to_rot(S)
+        # Yield rotations from S
+        Ris = self._sync3n_S_to_rot(S)
 
         self.rotations = Ris
 
     ###########################################
     # The hackberries taste like hackberries  #
     ###########################################
-    def cryo_sync3n_S_to_rot(self, S):
+    def _sync3n_S_to_rot(self, S, n_eigs=4):
         """
-        S is (n_img, n_img, 3,3)
+        Use eigen decomposition of S to estimate transforms,
+        then project transforms to nearest rotations.
         """
 
-        # Convert S to stupid shape
-        S = np.transpose(S, (0, 2, 1, 3)).reshape(3 * self.n_img, 3 * self.n_img)
+        if n_eigs < 3:
+            raise ValueError(
+                f"n_eigs must be greater than 3, default is 4. Invoked with {n_eigs}"
+            )
 
         # Extract three eigenvectors corresponding to non-zero eigenvalues.
-        d, v = stable_eigsh(S, 10)
+        d, v = stable_eigsh(S, n_eigs)
         sort_idx = np.argsort(-d)
         logger.info(
-            f"Top 10 eigenvalues from synchronization voting matrix: {d[sort_idx]}"
+            f"Top {n_eigs} eigenvalues from synchronization voting matrix: {d[sort_idx]}"
         )
 
         # Only need the top 3 eigen-vectors.
         v = v[:, sort_idx[:3]]
 
-        v1 = v[: 3 * self.n_img : 3].T.copy()
-        v2 = v[1 : 3 * self.n_img : 3].T.copy()
-        v3 = v[2 : 3 * self.n_img : 3].T.copy()
+        # Yield estimated rotations from the eigen-vectors
+        v = v.reshape(3, self.n_img, 3)
+        rotations = np.transpose(v, (1, 0, 2))  # Check, may be (1, 2 , 0) for T
 
-        rotations = np.empty((self.n_img, 3, 3), dtype=self.dtype)
-        rotations[:, :, 0] = v1.T
-        rotations[:, :, 1] = v2.T
-        rotations[:, :, 2] = v3.T
-        # Make sure that we got rotations by enforcing R to be
-        # a rotation (in case the error is large)
+        # Enforce we are returning actual rotations
         rotations = nearest_rotations(rotations)
 
         return rotations
 
-    def cryo_sync3n_syncmatrix(self, Rij):
+    def _construct_sync3n_matrix(self, Rij):
+        """
+        Construct sync3n matrix from estimated rotations Rij.
+        """
 
-        S = np.zeros((self.n_img, self.n_img, 3, 3), dtype=self.dtype)
-        I = np.eye(3, dtype=self.dtype)
+        # Initialize S with diag identity blocks
+        n = self.n_img
+        S = np.eye(3 * n, dtype=self.dtype).reshape(n, 3, n, 3)
 
         idx = 0
-        for i in range(self.n_img):
-            # S( (3*i-2):(3*i) , (3*i-2):(3*i) ) = I; % Rii = I
-            S[i, i] = I
-            for j in range(i + 1, self.n_img):
+        for i in range(n):
+            for j in range(i + 1, n):
                 # S( (3*i-2):(3*i) , (3*j-2):(3*j) ) = Rij(:,:,idx); % Rij
-                S[i, j] = Rij[idx]
+                S[i, :, j, :] = Rij[idx]
                 # S( (3*j-2):(3*j) , (3*i-2):(3*i) ) = Rij(:,:,idx)'; % Rji = Rij'
-                S[j, i] = Rij[idx].T
+                S[j, :, i, :] = Rij[idx].T
                 idx += 1
 
+        # Convert S shape to 3Nx3N
+        S = S.reshape(3 * n, 3 * n)
+
         return S
 
     ###########################################
@@ -154,22 +152,22 @@ def _estimate_relative_viewing_directions(self):
         vi is the third row of the i'th rotation matrix Ri.
         """
         logger.info(f"Estimating relative viewing directions for {self.n_img} images.")
-        # Step 1: Detect a single pair of common-lines between each pair of images
+        # Detect a single pair of common-lines between each pair of images
         self.build_clmatrix()
 
-        # Step 4: Calculate relative rotations
+        # Calculate relative rotations
         Rijs = self._estimate_all_Rijs_c3_c4(self.clmatrix)
 
         return Rijs
 
     def _global_J_sync(self, vijs):
         """ """
-        n_img = self.n_img
 
         # Determine relative handedness of vijs.
         sign_ij_J = self._J_sync_power_method(vijs)
 
         # Synchronize vijs
+        logger.info("Applying global handedness synchronization.")
         for i, sign in enumerate(sign_ij_J):
             if sign == -1:
                 vijs[i] = J_conjugate(vijs[i])
@@ -240,6 +238,9 @@ def _J_sync_power_method(self, vijs):
         i'th entry indicates whether the i'th relative orientation matrix will be J-conjugated.
         """
 
+        logger.info(
+            "Initiating power method to estimate J-synchronization matrix eigenvector."
+        )
         # Set power method tolerance and maximum iterations.
         epsilon = self.epsilon
         max_iters = self.max_iters
@@ -252,9 +253,6 @@ def _J_sync_power_method(self, vijs):
         itr = 0
 
         # Power method iterations
-        logger.info(
-            "Initiating power method to estimate J-synchronization matrix eigenvector."
-        )
         while itr < max_iters and residual > epsilon:
             itr += 1
             vec_new = self._signs_times_v(vijs, vec)
@@ -269,6 +267,7 @@ def _J_sync_power_method(self, vijs):
         J_sync = np.sign(vec)
 
         return J_sync
+
     def _signs_times_v(self, vijs, vec):
         """
         Multiplication of the J-synchronization matrix by a candidate eigenvector.

From 2f087d16404bfa70adc9ace874f2315ebfbf162f Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 4 Apr 2024 09:47:30 -0400
Subject: [PATCH 04/60] stub in W

---
 src/aspire/abinitio/commonline_sync3n.py | 47 ++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index ce41d687f0..0204886cd9 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -59,6 +59,10 @@ def __init__(
         self.degree_res = degree_res
         self.seed = seed
 
+        # Sync3N specific vars
+        self._W = None
+        self._D_null = 1e-13
+
     ###########################################
     # High level algorithm steps              #
     ###########################################
@@ -100,6 +104,42 @@ def _sync3n_S_to_rot(self, S, n_eigs=4):
                 f"n_eigs must be greater than 3, default is 4. Invoked with {n_eigs}"
             )
 
+        if self._W is not None:
+            W = self._W
+            if not W.shape == (self.n_img, self.n_img):
+                raise RuntimeError(
+                    f"Shape of W should be {(self.n_img, self.n_img)}."
+                    f" Received {W.shape}."
+                )
+            # Initialize D
+            D = np.mean(W, axis=1)  # D, check axis
+
+            Dhalf = D
+            # Compute mask of trouble D values
+            nulls = np.abs(D) < self._D_null
+            # Avoid trouble values when exponentiating
+            Dhalf[~nulls] = Dhalf[~nulls] ** (-0.5)
+            # Flush trouble values to zero
+            Dhalf[nulls] = 0
+            # expand diagonal
+            Dhalf = np.diag(Dhalf)
+
+            # Report W Diagnostic
+            W_normalized = Dhalf**2 @ W
+            nzidx = np.sum(W_normalized, axis=1) != 0
+            err = np.linalg.norm(np.sum(W_normalized[nzidx], axis=1) - self.n_img)
+            if err > 1e-10:
+                logger.warning(f"Large Weights Matrix Normalization Error: {err}")
+
+            # Make W of size 3Nx3N
+            W = np.kron(W, np.ones((3, 3)))
+
+            # Make Dhalf of size 3Nx3N
+            Dhalf = np.diag(np.kron(np.diag(Dhalf), np.ones((1, 3)))[0])
+
+            # Apply weights to S
+            S = Dhalf @ (W * S) @ Dhalf
+
         # Extract three eigenvectors corresponding to non-zero eigenvalues.
         d, v = stable_eigsh(S, n_eigs)
         sort_idx = np.argsort(-d)
@@ -110,6 +150,13 @@ def _sync3n_S_to_rot(self, S, n_eigs=4):
         # Only need the top 3 eigen-vectors.
         v = v[:, sort_idx[:3]]
 
+        # Cancel symmetrization when using weights W
+        if self._W is not None:
+            # Untill now we used a symmetrized variant of the weighted Sync matrix,
+            # thus we didn't get the right eigenvectors. to fix that we just need
+            # to multiply:
+            v = Dhalf @ v
+
         # Yield estimated rotations from the eigen-vectors
         v = v.reshape(3, self.n_img, 3)
         rotations = np.transpose(v, (1, 0, 2))  # Check, may be (1, 2 , 0) for T

From 0f772411a3e2b70376bbf6b4cd9c9dc0291c6caa Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 5 Apr 2024 10:13:12 -0400
Subject: [PATCH 05/60] stub in W

---
 src/aspire/abinitio/commonline_sync3n.py | 31 +++++++++++++++++-------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 0204886cd9..b66bb6a833 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -28,6 +28,7 @@ def __init__(
         degree_res=1,
         seed=None,
         mask=True,
+        S_weighting=False,
     ):
         """
         Initialize object for estimating 3D orientations.
@@ -60,7 +61,7 @@ def __init__(
         self.seed = seed
 
         # Sync3N specific vars
-        self._W = None
+        self.S_weighting = S_weighting
         self._D_null = 1e-13
 
     ###########################################
@@ -82,18 +83,20 @@ def estimate_rotations(self):
         # Build sync3n matrix
         S = self._construct_sync3n_matrix(Rij)
 
-        # Optionally S weights
-        # todo
+        # Optionally compute S weights
+        W = None
+        if self.S_weighting is True:
+            W = self._syncmatrix_weights(Rij)
 
         # Yield rotations from S
-        Ris = self._sync3n_S_to_rot(S)
+        Ris = self._sync3n_S_to_rot(S, W)
 
         self.rotations = Ris
 
     ###########################################
     # The hackberries taste like hackberries  #
     ###########################################
-    def _sync3n_S_to_rot(self, S, n_eigs=4):
+    def _sync3n_S_to_rot(self, S, W=None, n_eigs=4):
         """
         Use eigen decomposition of S to estimate transforms,
         then project transforms to nearest rotations.
@@ -104,8 +107,8 @@ def _sync3n_S_to_rot(self, S, n_eigs=4):
                 f"n_eigs must be greater than 3, default is 4. Invoked with {n_eigs}"
             )
 
-        if self._W is not None:
-            W = self._W
+        if W is not None:
+            logger.info("Applying weights to synchronization matrix.")
             if not W.shape == (self.n_img, self.n_img):
                 raise RuntimeError(
                     f"Shape of W should be {(self.n_img, self.n_img)}."
@@ -151,7 +154,7 @@ def _sync3n_S_to_rot(self, S, n_eigs=4):
         v = v[:, sort_idx[:3]]
 
         # Cancel symmetrization when using weights W
-        if self._W is not None:
+        if W is not None:
             # Untill now we used a symmetrized variant of the weighted Sync matrix,
             # thus we didn't get the right eigenvectors. to fix that we just need
             # to multiply:
@@ -159,7 +162,7 @@ def _sync3n_S_to_rot(self, S, n_eigs=4):
 
         # Yield estimated rotations from the eigen-vectors
         v = v.reshape(3, self.n_img, 3)
-        rotations = np.transpose(v, (1, 0, 2))  # Check, may be (1, 2 , 0) for T
+        rotations = np.transpose(v, (1, 0, 2))
 
         # Enforce we are returning actual rotations
         rotations = nearest_rotations(rotations)
@@ -189,6 +192,16 @@ def _construct_sync3n_matrix(self, Rij):
 
         return S
 
+    def _syncmatrix_weights(self, Rij):
+        """
+        Given relative rotations matrix `Rij`,
+        compute probability weights for S.
+        """
+        logger.info("Computing synchronization matrix weights.")
+        # Test with identity weights,
+        # todo, port cryo_sync3n_syncmatrix_weights
+        return np.ones((self.n_img, self.n_img))
+
     ###########################################
     # Primary Methods                         #
     ###########################################

From 1d3cbeffdbdef6d6fcff781d85e7669df38f6e1b Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 5 Apr 2024 15:48:31 -0400
Subject: [PATCH 06/60] begin stubbing in actual S weight computation

---
 src/aspire/abinitio/commonline_sync3n.py | 77 ++++++++++++++++++++++--
 1 file changed, 72 insertions(+), 5 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index b66bb6a833..8deef9c790 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -192,15 +192,82 @@ def _construct_sync3n_matrix(self, Rij):
 
         return S
 
-    def _syncmatrix_weights(self, Rij):
+    def _syncmatrix_weights(
+        self,
+        Rij,
+        permitted_inconsistency=1.5,
+        p_domain_limit=0.7,
+        max_iterations=12,
+        min_p_permitted=0.04,
+    ):
         """
         Given relative rotations matrix `Rij`,
-        compute probability weights for S.
+        compute and return probability weights for S.
         """
         logger.info("Computing synchronization matrix weights.")
-        # Test with identity weights,
-        # todo, port cryo_sync3n_syncmatrix_weights
-        return np.ones((self.n_img, self.n_img))
+
+        def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
+            # Get inistial estimate for Pij
+            P, sigma, Rsquare, Pij, hist, fit, cum_scores = self._triangle_scores(
+                Rij, hist, Pmin, Pmax
+            )
+
+            # Check if P and Pij are consistent
+            mean_Pij = np.mean(Pij)
+            too_low = P < mean_Pij / permitted_inconsistency
+            too_high = P > mean_Pij * permitted_inconsistency
+            inconsistent = too_low | too_high
+
+            # Check trend
+            if prev_too_low is not None and too_low != prev_too_low:
+                p_domain_limit = np.sqrt(p_domain_limit)
+
+            # define limits for next P estimation
+            if too_high:
+                if P < min_p_permitted:
+                    logger.error(
+                        "Triangles Scores are too bad distributed, whatever small P we force."
+                    )
+
+                Pmax = P
+                if Pmax is not None:
+                    Pmax = Pmax * p_domain_limit
+
+                Pmin = Pmax * p_domain_limit
+            else:
+                Pmin = P
+                if Pmin is not None:
+                    Pmin = Pmin / p_domain_limit
+
+                Pmax = Pmin / p_domain_limit
+
+            return inconsistent, Pij, (too_low, Pmin, Pmax, hist)
+
+        # Repeat iteratively until estimations of P & Pij are consistent
+        i = 0
+        res = (None,) * 4
+        inconsistent = True
+        while inconsistent and i < max_iterations:
+            inconsistent, Pij, res = body(*res)
+
+        # Pack W
+        # N = 0.5 * (1 + np.sqrt(1+8*Rij.shape[2])) #? what
+        W = np.zeros((self.n_img, self.n_img))
+        idx = 0
+        for i in range(self.n_img):
+            for j in range(i, self.n_img):
+                W[i, j] = Pij[idx]
+                W[j, i] = Pij[idx]
+                idx += 1
+
+        return W
+
+    def _triangle_scores(self, Rij, hist, Pmin, Pmax):
+        """
+        Todo
+        """
+        # return P, sigma, Rsquare, Pij, hist, fit, cum_scores
+        pass
 
     ###########################################
     # Primary Methods                         #

From 906aa2d077458224353de114c3d31da335022ddc Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 8 Apr 2024 10:02:35 -0400
Subject: [PATCH 07/60] fix typo bug

---
 src/aspire/abinitio/commonline_sync3n.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 8deef9c790..68b13dd747 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -226,18 +226,20 @@ def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
             if too_high:
                 if P < min_p_permitted:
                     logger.error(
-                        "Triangles Scores are too bad distributed, whatever small P we force."
+                        "Triangles Scores are poorly distributed, whatever small P we force."
                     )
 
-                Pmax = P
                 if Pmax is not None:
                     Pmax = Pmax * p_domain_limit
+                else:
+                    Pmax = P
 
                 Pmin = Pmax * p_domain_limit
-            else:
-                Pmin = P
+            else:  # too low
                 if Pmin is not None:
                     Pmin = Pmin / p_domain_limit
+                else:
+                    Pmin = P
 
                 Pmax = Pmin / p_domain_limit
 
@@ -251,7 +253,6 @@ def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
             inconsistent, Pij, res = body(*res)
 
         # Pack W
-        # N = 0.5 * (1 + np.sqrt(1+8*Rij.shape[2])) #? what
         W = np.zeros((self.n_img, self.n_img))
         idx = 0
         for i in range(self.n_img):

From 8b14db3a401278cf25f0946bc01dfee055162e77 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 8 Apr 2024 16:06:44 -0400
Subject: [PATCH 08/60] fix rot reshape bug and stub in probability_scores

---
 src/aspire/abinitio/commonline_sync3n.py | 190 ++++++++++++++++++++++-
 1 file changed, 183 insertions(+), 7 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 68b13dd747..2f21c7da4b 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 from numpy.linalg import norm
+from scipy.optimize import curve_fit
 
 from aspire.abinitio import CLOrient3D, SyncVotingMixin
 from aspire.utils import J_conjugate, all_pairs, all_triplets, nearest_rotations
@@ -96,7 +97,7 @@ def estimate_rotations(self):
     ###########################################
     # The hackberries taste like hackberries  #
     ###########################################
-    def _sync3n_S_to_rot(self, S, W=None, n_eigs=4):
+    def _sync3n_S_to_rot(self, S, W=None, n_eigs=10):
         """
         Use eigen decomposition of S to estimate transforms,
         then project transforms to nearest rotations.
@@ -104,7 +105,7 @@ def _sync3n_S_to_rot(self, S, W=None, n_eigs=4):
 
         if n_eigs < 3:
             raise ValueError(
-                f"n_eigs must be greater than 3, default is 4. Invoked with {n_eigs}"
+                f"n_eigs must be greater than 3, default is 10. Invoked with {n_eigs}"
             )
 
         if W is not None:
@@ -161,8 +162,7 @@ def _sync3n_S_to_rot(self, S, W=None, n_eigs=4):
             v = Dhalf @ v
 
         # Yield estimated rotations from the eigen-vectors
-        v = v.reshape(3, self.n_img, 3)
-        rotations = np.transpose(v, (1, 0, 2))
+        rotations = v.reshape(self.n_img, 3, 3).transpose(0, 2, 1)
 
         # Enforce we are returning actual rotations
         rotations = nearest_rotations(rotations)
@@ -263,12 +263,188 @@ def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
 
         return W
 
-    def _triangle_scores(self, Rij, hist, Pmin, Pmax):
+    def _triangle_scores_mex(self, Rijs, hist_intervals):
+        pass
+        # return cum_scores, hist_scores
+
+    def _pairs_probabilities(self, Rijs, P2, A, a, B, b, x0):
+        # The following is adopted from Matlab parias_probabilities_mex.c `looper`
+        # The code should be thread/parallel safe over `i` when results are gathered (via sum).
+
+        # Initialize probability result arrays
+        ln_f_ind = np.zeros(len(Rij), dtype=self.dtype)
+        ln_f_arb = np.zeros(len(Rij), dtype=self.dtype)
+
+        c = np.empty((4), dtype=self.dtype)
+        for i in range(self.n_img):
+            for j in range(i, self.n_img):
+                Rij = Rijs[i * self.n_img + j]
+                for k in range(j, self.n_img):
+                    Rik = Rijs[i * self.n_img + k]
+                    Rjk = Rijs[j * self.n_img + k]
+
+                    # Compute conjugated rotats
+                    Rij_J = J_conjugate(Rij)
+                    Rik_J = J_conjugate(Rik)
+                    Rjk_J = J_conjugate(Rjk)
+
+                    # Compute R muls and norms
+                    c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
+                    c[1] = np.sum(((Rij_J @ Rjk) - Rjk) ** 2)
+                    c[3] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
+                    c[4] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
+
+                    # Find best match
+                    best_i = np.argmin(c)
+                    best_val = c[best_i]
+
+                    # For each triangle side, find the best alternative
+
+                    # Compute scores
+                    s_ij_jk = 1 - np.sqrt(best_val / alt_ij_jk)
+                    s_ik_jk = 1 - np.sqrt(best_val / alt_ik_jk)
+                    s_ij_ik = 1 - np.sqrt(best_val / alt_ij_ik)
+
+                    # Update probabilities
+                    # # Probability of pair ij having score given indicicative common line
+                    # P2, B, b, x0, A, a
+                    f_ij_jk = np.log(
+                        P2
+                        * (
+                            B
+                            * np.pow(1 - s_ij_jk, b)
+                            * np.exp(-b / (1 - x0) * (1 - s_ij_jk))
+                        )
+                        + (1 - P2) * A * np.pow((1 - s_ij_jk), a)
+                    )
+                    f_ik_jk = np.log(
+                        P2
+                        * (
+                            B
+                            * np.pow(1 - s_ik_jk, b)
+                            * np.exp(-b / (1 - x0) * (1 - s_ik_jk))
+                        )
+                        + (1 - P2) * A * np.pow((1 - s_ik_jk), a)
+                    )
+                    f_ij_ik = np.log(
+                        P2
+                        * (
+                            B
+                            * np.pow(1 - s_ij_ik, b)
+                            * np.exp(-b / (1 - x0) * (1 - s_ij_ik))
+                        )
+                        + (1 - P2) * A * np.pow((1 - s_ij_ik), a)
+                    )
+                    ln_f_ind[ij] += f_ij_jk + f_ij_ik
+                    ln_f_ind[jk] += f_ij_jk + f_ik_jk
+                    ln_f_ind[ik] += f_ik_jk + f_ij_ik
+
+                    # # Probability of pair ij having score given arbitrary common line
+                    f_ij_jk = np.log(A * np.pow((1 - s_ij_jk), a))
+                    f_ik_jk = np.log(A * np.pow((1 - s_ik_jk), a))
+                    f_ij_ik = np.log(A * np.pow((1 - s_ij_ik), a))
+                    ln_f_arb[ij] += f_ij_jk + f_ij_ik
+                    ln_f_arb[jk] += f_ij_jk + f_ik_jk
+                    ln_f_arb[ik] += f_ik_jk + f_ij_ik
+
+        return ln_f_ind, ln_f_arb
+
+    def _triangle_scores(
+        self,
+        Rijs,
+        hist,
+        Pmin,
+        Pmax,
+        hist_intervals=100,
+        a=2.2,
+        peak2sigma=2.43e-2,
+        P=0.5,
+        b=2.5,
+        x0=0.78,
+    ):
         """
         Todo
+
+        :param a: magic number
+        :param peak2sigma: empirical relation between the location of
+            the peak of the histigram, and the mean error in the
+            common lines estimations.
+            AKA, magic number
+        :param P:
+        :param b:
+        :param x0:
         """
-        # return P, sigma, Rsquare, Pij, hist, fit, cum_scores
-        pass
+
+        Pmin = Pmin or 0
+        Pmin = max(Pmin, 0)  # Clamp probability to [0,1]
+        Pmax = Pmax or 1
+        Pmax = min(Pmax, 1)  # Clamp probability to [0,1]
+
+        if hist is not None:
+            cum_scores, scores_hist = self._triangle_scores_mex(Rijs, hist_intervals)
+
+            # Normalize cumulated scores
+            cum_scores /= len(Rij)
+
+        # Histogram decomposition: P & sigma evaluation
+        h = 1 / hist_intervals
+        hist_x = np.arange(h / 2, 1, h)
+        # normalization factor of one component of the histogram
+        A = (
+            (self.n_img * (self.n_img - 1) * (self.n_img - 2) / 2)
+            / hist_intervals
+            * (a + 1)
+        )
+        # normalization of 2nd component: B = P*N_delta/sum(f), where f is the component formula
+        B0 = P ** (self.n_img * (self.n_img - 1) * (self.n_img - 2) / 2) / np.sum(
+            ((1 - hist_x) ** b) * np.exp(-b / (1 - x0) * (1 - hist_x))
+        )
+        start_values = np.array([B0, P, b, x0], dtype=np.float64)
+        lower_bounds = np.array([0, Pmin**3, 2, 0], dtype=np.float64)
+        upper_bounds = np.array([np.inf, Pmax**3, np.inf, 1], dtype=np.float64)
+
+        # Fit distribution
+        def fun(x, B, P, b, x0, A=A, a=a):
+            """Function to fit. x is data vector."""
+            return (1 - P) @ A * (1 - x) ** a + P * B * (1 - x) ** b * np.exp(
+                -b / (1 - x0) * (1 - x)
+            )
+
+        popt, pcov = curve_fit(
+            fun,
+            hist_x.astype(np.float64, copy=False),
+            scores_hist.astype(np.float64, copy=False),
+            p0=start_values,
+            bounds=(lower_bounds, upper_bounds),
+        )
+        B, P, b, x0 = popt
+
+        # Derive P and sigma
+        P = P ** (1 / 3)
+        peak = x0  # can rm later
+        sigma = (1 - peak) / peak2sigma
+
+        # Initialize probability computations
+        # Local histograms analysis
+        A = a + 1  # distribution 1st component normalization factor
+        # distribution 2nd component normalization factor
+        B = B / (
+            (self.n_img * (self.n_img - 1) * (self.n_img - 2) / 2) / hist_intervals
+        )
+
+        # Calculate probabilities
+        ln_f_ind, ln_f_arb = self._pairs_probabilities(Rij, P**2, A, a, B, b, x0)
+        Pij = 1 / (1 + (1 - P) / P * np.exp(ln_f_arb - ln_f_ind))
+
+        # Fix singular output
+        num_nan = np.sum(np.isnan(Pij))
+        if num_nan > 0:
+            logger.error(
+                f"NaN probabilities occurred {num_nan} times out of {size(Pij)}. Setting NaNs to zero."
+            )
+            Pij = np.nan_to_num(Pij)
+
+        return P, sigma, Rsquare, Pij, scores_hist, fit, cum_scores
 
     ###########################################
     # Primary Methods                         #

From c11b90876b4fc997c36eca1a87d23732fdf04917 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 9 Apr 2024 13:07:48 -0400
Subject: [PATCH 09/60] stub in triangle scores and pair probabilities

---
 src/aspire/abinitio/commonline_sync3n.py | 212 ++++++++++++++++++-----
 1 file changed, 170 insertions(+), 42 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 2f21c7da4b..4aba3102ce 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -5,12 +5,47 @@
 from scipy.optimize import curve_fit
 
 from aspire.abinitio import CLOrient3D, SyncVotingMixin
-from aspire.utils import J_conjugate, all_pairs, all_triplets, nearest_rotations
+from aspire.utils import J_conjugate, all_pairs, all_triplets, nearest_rotations, trange
 from aspire.utils.matlab_compat import stable_eigsh
 from aspire.utils.random import randn
 
 logger = logging.getLogger(__name__)
 
+# Initialize alternatives
+#
+# When we find the best J-configuration, we also compare it to the alternative 2nd best one.
+# this comparison is done for every pair in the triplete independently. to make sure that the
+# alternative is indeed different in relation to the pair, we document the differences between
+# the configurations in advance:
+# ALTS(:,best_conf,pair) = the two configurations in which J-sync differs from best_conf in relation to pair
+
+_ALTS = np.empty((3, 4, 3), dtype=int)
+# Rewrite this later.
+_ALTS[0][0][0] = 1
+_ALTS[0][1][0] = 0
+_ALTS[0][2][0] = 0
+_ALTS[0][3][0] = 1
+_ALTS[1][0][0] = 2
+_ALTS[1][1][0] = 3
+_ALTS[1][2][0] = 3
+_ALTS[1][3][0] = 2
+_ALTS[0][0][1] = 2
+_ALTS[0][1][1] = 2
+_ALTS[0][2][1] = 0
+_ALTS[0][3][1] = 0
+_ALTS[1][0][1] = 3
+_ALTS[1][1][1] = 3
+_ALTS[1][2][1] = 1
+_ALTS[1][3][1] = 1
+_ALTS[0][0][2] = 1
+_ALTS[0][1][2] = 0
+_ALTS[0][2][2] = 1
+_ALTS[0][3][2] = 0
+_ALTS[1][0][2] = 3
+_ALTS[1][1][2] = 2
+_ALTS[1][2][2] = 3
+_ALTS[1][3][2] = 2
+
 
 class CLSync3N(CLOrient3D, SyncVotingMixin):
     """
@@ -56,6 +91,9 @@ def __init__(
             mask=mask,
         )
 
+        # Generate pair mappings
+        self._pairs, self._pairs_to_linear = all_pairs(self.n_img, return_map=True)
+
         self.epsilon = epsilon
         self.max_iters = max_iters
         self.degree_res = degree_res
@@ -208,7 +246,7 @@ def _syncmatrix_weights(
 
         def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
             # Get inistial estimate for Pij
-            P, sigma, Rsquare, Pij, hist, fit, cum_scores = self._triangle_scores(
+            P, sigma, Pij, hist, cum_scores = self._triangle_scores(
                 Rij, hist, Pmin, Pmax
             )
 
@@ -249,14 +287,15 @@ def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
         i = 0
         res = (None,) * 4
         inconsistent = True
-        while inconsistent and i < max_iterations:
+        while inconsistent and i < 1:  # max_iterations:
             inconsistent, Pij, res = body(*res)
+            i += 1
 
         # Pack W
         W = np.zeros((self.n_img, self.n_img))
         idx = 0
         for i in range(self.n_img):
-            for j in range(i, self.n_img):
+            for j in range(i + 1, self.n_img):
                 W[i, j] = Pij[idx]
                 W[j, i] = Pij[idx]
                 idx += 1
@@ -264,24 +303,104 @@ def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
         return W
 
     def _triangle_scores_mex(self, Rijs, hist_intervals):
-        pass
-        # return cum_scores, hist_scores
+        # The following is adopted from Matlab triangle_scores_mex.c
+        # The code should be thread/parallel safe over `i` when results are gathered (via sum).
+
+        # Initialize probability result arrays
+        cum_scores = np.zeros(len(Rijs), dtype=self.dtype)
+        scores_hist = np.zeros(hist_intervals, dtype=self.dtype)
+        h = 1 / hist_intervals
+
+        c = np.empty((4), dtype=self.dtype)
+        for i in trange(self.n_img, desc="Computing triangle scores"):
+            for j in range(
+                i + 1, self.n_img - 1
+            ):  # check bound (taken from MATLAB mex)
+                ij = self._pairs_to_linear[i, j]
+                Rij = Rijs[ij]
+                for k in range(j + 1, self.n_img):
+                    ik = self._pairs_to_linear[i, k]
+                    jk = self._pairs_to_linear[j, k]
+                    Rik = Rijs[ik]
+                    Rjk = Rijs[jk]
+
+                    # Compute conjugated rotats
+                    Rij_J = J_conjugate(Rij)
+                    Rik_J = J_conjugate(Rik)
+                    Rjk_J = J_conjugate(Rjk)
+
+                    # Compute R muls and norms
+                    c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
+                    c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
+                    c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
+                    c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
+
+                    # Find best match
+                    best_i = np.argmin(c)
+                    best_val = c[best_i]
+
+                    # For each triangle side, find the best alternative
+                    alt_ij_jk = c[_ALTS[0][best_i][0]]
+                    if c[_ALTS[1][best_i][0]] < alt_ij_jk:
+                        alt_ij_jk = c[_ALTS[1][best_i][0]]
+                    alt_ik_jk = c[_ALTS[0][best_i][1]]
+                    if c[_ALTS[1][best_i][1]] < alt_ik_jk:
+                        alt_ik_jk = c[_ALTS[1][best_i][1]]
+                    alt_ij_ik = c[_ALTS[0][best_i][2]]
+                    if c[_ALTS[1][best_i][2]] < alt_ij_ik:
+                        alt_ij_ik = c[_ALTS[1][best_i][2]]
+
+                    # Compute scores
+                    s_ij_jk = 1 - np.sqrt(best_val / alt_ij_jk)
+                    s_ik_jk = 1 - np.sqrt(best_val / alt_ik_jk)
+                    s_ij_ik = 1 - np.sqrt(best_val / alt_ij_ik)
+
+                    # Update cumulated scores
+                    cum_scores[ij] += s_ij_jk + s_ij_ik
+                    cum_scores[jk] += s_ij_jk + s_ik_jk
+                    cum_scores[ik] += s_ik_jk + s_ij_ik
+
+                    # Update histogram
+                    threshold = 0
+                    for l1 in range(hist_intervals):
+                        threshold += h
+                        if s_ij_jk < threshold:
+                            break
+
+                    for l2 in range(hist_intervals):
+                        threshold += h
+                        if s_ik_jk < threshold:
+                            break
+
+                    for l3 in range(hist_intervals):
+                        threshold += h
+                        if s_ij_ik < threshold:
+                            break
+
+                    scores_hist[l1] += 1
+                    scores_hist[l2] += 1
+                    scores_hist[l3] += 1
+
+        return cum_scores, scores_hist
 
     def _pairs_probabilities(self, Rijs, P2, A, a, B, b, x0):
-        # The following is adopted from Matlab parias_probabilities_mex.c `looper`
+        # The following is adopted from Matlab pairas_probabilities_mex.c `looper`
         # The code should be thread/parallel safe over `i` when results are gathered (via sum).
 
         # Initialize probability result arrays
-        ln_f_ind = np.zeros(len(Rij), dtype=self.dtype)
-        ln_f_arb = np.zeros(len(Rij), dtype=self.dtype)
+        ln_f_ind = np.zeros(len(Rijs), dtype=self.dtype)
+        ln_f_arb = np.zeros(len(Rijs), dtype=self.dtype)
 
         c = np.empty((4), dtype=self.dtype)
-        for i in range(self.n_img):
-            for j in range(i, self.n_img):
-                Rij = Rijs[i * self.n_img + j]
-                for k in range(j, self.n_img):
-                    Rik = Rijs[i * self.n_img + k]
-                    Rjk = Rijs[j * self.n_img + k]
+        for i in trange(self.n_img, desc="Computing pair probabilities"):
+            for j in range(i + 1, self.n_img - 1):
+                ij = self._pairs_to_linear[i, j]
+                Rij = Rijs[ij]
+                for k in range(j + 1, self.n_img):
+                    ik = self._pairs_to_linear[i, k]
+                    jk = self._pairs_to_linear[j, k]
+                    Rik = Rijs[ik]
+                    Rjk = Rijs[jk]
 
                     # Compute conjugated rotats
                     Rij_J = J_conjugate(Rij)
@@ -290,15 +409,24 @@ def _pairs_probabilities(self, Rijs, P2, A, a, B, b, x0):
 
                     # Compute R muls and norms
                     c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
-                    c[1] = np.sum(((Rij_J @ Rjk) - Rjk) ** 2)
-                    c[3] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
-                    c[4] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
+                    c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
+                    c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
+                    c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
 
                     # Find best match
                     best_i = np.argmin(c)
                     best_val = c[best_i]
 
                     # For each triangle side, find the best alternative
+                    alt_ij_jk = c[_ALTS[0][best_i][0]]
+                    if c[_ALTS[1][best_i][0]] < alt_ij_jk:
+                        alt_ij_jk = c[_ALTS[1][best_i][0]]
+                    alt_ik_jk = c[_ALTS[0][best_i][1]]
+                    if c[_ALTS[1][best_i][1]] < alt_ik_jk:
+                        alt_ik_jk = c[_ALTS[1][best_i][1]]
+                    alt_ij_ik = c[_ALTS[0][best_i][2]]
+                    if c[_ALTS[1][best_i][2]] < alt_ij_ik:
+                        alt_ij_ik = c[_ALTS[1][best_i][2]]
 
                     # Compute scores
                     s_ij_jk = 1 - np.sqrt(best_val / alt_ij_jk)
@@ -312,37 +440,37 @@ def _pairs_probabilities(self, Rijs, P2, A, a, B, b, x0):
                         P2
                         * (
                             B
-                            * np.pow(1 - s_ij_jk, b)
+                            * np.power(1 - s_ij_jk, b)
                             * np.exp(-b / (1 - x0) * (1 - s_ij_jk))
                         )
-                        + (1 - P2) * A * np.pow((1 - s_ij_jk), a)
+                        + (1 - P2) * A * np.power((1 - s_ij_jk), a)
                     )
                     f_ik_jk = np.log(
                         P2
                         * (
                             B
-                            * np.pow(1 - s_ik_jk, b)
+                            * np.power(1 - s_ik_jk, b)
                             * np.exp(-b / (1 - x0) * (1 - s_ik_jk))
                         )
-                        + (1 - P2) * A * np.pow((1 - s_ik_jk), a)
+                        + (1 - P2) * A * np.power((1 - s_ik_jk), a)
                     )
                     f_ij_ik = np.log(
                         P2
                         * (
                             B
-                            * np.pow(1 - s_ij_ik, b)
+                            * np.power(1 - s_ij_ik, b)
                             * np.exp(-b / (1 - x0) * (1 - s_ij_ik))
                         )
-                        + (1 - P2) * A * np.pow((1 - s_ij_ik), a)
+                        + (1 - P2) * A * np.power((1 - s_ij_ik), a)
                     )
                     ln_f_ind[ij] += f_ij_jk + f_ij_ik
                     ln_f_ind[jk] += f_ij_jk + f_ik_jk
                     ln_f_ind[ik] += f_ik_jk + f_ij_ik
 
                     # # Probability of pair ij having score given arbitrary common line
-                    f_ij_jk = np.log(A * np.pow((1 - s_ij_jk), a))
-                    f_ik_jk = np.log(A * np.pow((1 - s_ik_jk), a))
-                    f_ij_ik = np.log(A * np.pow((1 - s_ij_ik), a))
+                    f_ij_jk = np.log(A * np.power((1 - s_ij_jk), a))
+                    f_ik_jk = np.log(A * np.power((1 - s_ik_jk), a))
+                    f_ij_ik = np.log(A * np.power((1 - s_ij_ik), a))
                     ln_f_arb[ij] += f_ij_jk + f_ij_ik
                     ln_f_arb[jk] += f_ij_jk + f_ik_jk
                     ln_f_arb[ik] += f_ik_jk + f_ij_ik
@@ -352,7 +480,7 @@ def _pairs_probabilities(self, Rijs, P2, A, a, B, b, x0):
     def _triangle_scores(
         self,
         Rijs,
-        hist,
+        scores_hist,
         Pmin,
         Pmax,
         hist_intervals=100,
@@ -380,11 +508,12 @@ def _triangle_scores(
         Pmax = Pmax or 1
         Pmax = min(Pmax, 1)  # Clamp probability to [0,1]
 
-        if hist is not None:
+        cum_scores = None  # XXX Why do we even need cum_scores?
+        if scores_hist is None:
             cum_scores, scores_hist = self._triangle_scores_mex(Rijs, hist_intervals)
 
             # Normalize cumulated scores
-            cum_scores /= len(Rij)
+            cum_scores /= len(Rijs)
 
         # Histogram decomposition: P & sigma evaluation
         h = 1 / hist_intervals
@@ -406,10 +535,11 @@ def _triangle_scores(
         # Fit distribution
         def fun(x, B, P, b, x0, A=A, a=a):
             """Function to fit. x is data vector."""
-            return (1 - P) @ A * (1 - x) ** a + P * B * (1 - x) ** b * np.exp(
+            return (1 - P) * A * (1 - x) ** a + P * B * (1 - x) ** b * np.exp(
                 -b / (1 - x0) * (1 - x)
             )
 
+        breakpoint()
         popt, pcov = curve_fit(
             fun,
             hist_x.astype(np.float64, copy=False),
@@ -433,18 +563,18 @@ def fun(x, B, P, b, x0, A=A, a=a):
         )
 
         # Calculate probabilities
-        ln_f_ind, ln_f_arb = self._pairs_probabilities(Rij, P**2, A, a, B, b, x0)
+        ln_f_ind, ln_f_arb = self._pairs_probabilities(Rijs, P**2, A, a, B, b, x0)
         Pij = 1 / (1 + (1 - P) / P * np.exp(ln_f_arb - ln_f_ind))
 
         # Fix singular output
         num_nan = np.sum(np.isnan(Pij))
         if num_nan > 0:
             logger.error(
-                f"NaN probabilities occurred {num_nan} times out of {size(Pij)}. Setting NaNs to zero."
+                f"NaN probabilities occurred {num_nan} times out of {np.size(Pij)}. Setting NaNs to zero."
             )
             Pij = np.nan_to_num(Pij)
 
-        return P, sigma, Rsquare, Pij, scores_hist, fit, cum_scores
+        return P, sigma, Pij, scores_hist, cum_scores
 
     ###########################################
     # Primary Methods                         #
@@ -484,10 +614,9 @@ def _estimate_all_Rijs_c3_c4(self, clmatrix):
         """
         n_img = self.n_img
         n_theta = self.n_theta
-        pairs = all_pairs(n_img)
-        Rijs = np.zeros((len(pairs), 3, 3))
+        Rijs = np.zeros((len(self._pairs), 3, 3))
 
-        for idx, (i, j) in enumerate(pairs):
+        for idx, (i, j) in enumerate(self._pairs):
             Rijs[idx] = self._syncmatrix_ij_vote_3n(
                 clmatrix, i, j, np.arange(n_img), n_theta
             )
@@ -599,10 +728,9 @@ def _signs_times_v(self, vijs, vec):
         :return: New candidate eigenvector of length n-choose-2. The product of the J-sync matrix and vec.
         """
 
-        # All pairs (i,j) and triplets (i,j,k) where i<j<k
+        # All triplets (i,j,k) where i<j<k
         n_img = self.n_img
         triplets = all_triplets(n_img)
-        pairs, pairs_to_linear = all_pairs(n_img, return_map=True)
 
         # There are 4 possible configurations of relative handedness for each triplet (vij, vjk, vik).
         # 'conjugate' expresses which node of the triplet must be conjugated (True) to achieve synchronization.
@@ -630,9 +758,9 @@ def _signs_times_v(self, vijs, vec):
         v = vijs
         new_vec = np.zeros_like(vec)
         for i, j, k in triplets:
-            ij = pairs_to_linear[i, j]
-            jk = pairs_to_linear[j, k]
-            ik = pairs_to_linear[i, k]
+            ij = self._pairs_to_linear[i, j]
+            jk = self._pairs_to_linear[j, k]
+            ik = self._pairs_to_linear[i, k]
             vij, vjk, vik = v[ij], v[jk], v[ik]
             vij_J = J_conjugate(vij)
             vjk_J = J_conjugate(vjk)

From 11e040b7d092d09ef95561f04422442bd817de4a Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 9 Apr 2024 13:10:10 -0400
Subject: [PATCH 10/60] tox checks

[skip ci]
---
 src/aspire/abinitio/commonline_sync3n.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 4aba3102ce..263d104d0f 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -362,24 +362,24 @@ def _triangle_scores_mex(self, Rijs, hist_intervals):
 
                     # Update histogram
                     threshold = 0
-                    for l1 in range(hist_intervals):
+                    for _l1 in range(hist_intervals):
                         threshold += h
                         if s_ij_jk < threshold:
                             break
 
-                    for l2 in range(hist_intervals):
+                    for _l2 in range(hist_intervals):
                         threshold += h
                         if s_ik_jk < threshold:
                             break
 
-                    for l3 in range(hist_intervals):
+                    for _l3 in range(hist_intervals):
                         threshold += h
                         if s_ij_ik < threshold:
                             break
 
-                    scores_hist[l1] += 1
-                    scores_hist[l2] += 1
-                    scores_hist[l3] += 1
+                    scores_hist[_l1] += 1
+                    scores_hist[_l2] += 1
+                    scores_hist[_l3] += 1
 
         return cum_scores, scores_hist
 

From 244613a279baf6270c2452c335c9dca384905131 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 9 Apr 2024 14:09:24 -0400
Subject: [PATCH 11/60] light cleanup

---
 src/aspire/abinitio/commonline_sync3n.py | 66 +++++++++---------------
 1 file changed, 25 insertions(+), 41 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 263d104d0f..f1f3cc0d3a 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -19,32 +19,13 @@
 # the configurations in advance:
 # ALTS(:,best_conf,pair) = the two configurations in which J-sync differs from best_conf in relation to pair
 
-_ALTS = np.empty((3, 4, 3), dtype=int)
-# Rewrite this later.
-_ALTS[0][0][0] = 1
-_ALTS[0][1][0] = 0
-_ALTS[0][2][0] = 0
-_ALTS[0][3][0] = 1
-_ALTS[1][0][0] = 2
-_ALTS[1][1][0] = 3
-_ALTS[1][2][0] = 3
-_ALTS[1][3][0] = 2
-_ALTS[0][0][1] = 2
-_ALTS[0][1][1] = 2
-_ALTS[0][2][1] = 0
-_ALTS[0][3][1] = 0
-_ALTS[1][0][1] = 3
-_ALTS[1][1][1] = 3
-_ALTS[1][2][1] = 1
-_ALTS[1][3][1] = 1
-_ALTS[0][0][2] = 1
-_ALTS[0][1][2] = 0
-_ALTS[0][2][2] = 1
-_ALTS[0][3][2] = 0
-_ALTS[1][0][2] = 3
-_ALTS[1][1][2] = 2
-_ALTS[1][2][2] = 3
-_ALTS[1][3][2] = 2
+_ALTS = np.array(
+    [
+        [[1, 2, 1], [0, 2, 0], [0, 0, 1], [1, 0, 0]],
+        [[2, 3, 3], [3, 3, 2], [3, 1, 3], [2, 1, 2]],
+    ],
+    dtype=int,
+)
 
 
 class CLSync3N(CLOrient3D, SyncVotingMixin):
@@ -114,28 +95,26 @@ def estimate_rotations(self):
         """
 
         # Initial estimate of viewing directions
-        Rij0 = self._estimate_relative_viewing_directions()
+        Rijs0 = self._estimate_relative_viewing_directions()
 
         # Compute and apply global handedness
-        Rij = self._global_J_sync(Rij0)
+        Rijs = self._global_J_sync(Rijs0)
 
         # Build sync3n matrix
-        S = self._construct_sync3n_matrix(Rij)
+        S = self._construct_sync3n_matrix(Rijs)
 
         # Optionally compute S weights
         W = None
         if self.S_weighting is True:
-            W = self._syncmatrix_weights(Rij)
+            W = self._syncmatrix_weights(Rijs)
 
         # Yield rotations from S
-        Ris = self._sync3n_S_to_rot(S, W)
-
-        self.rotations = Ris
+        self.rotations = self._sync3n_S_to_rot(S, W)
 
     ###########################################
     # The hackberries taste like hackberries  #
     ###########################################
-    def _sync3n_S_to_rot(self, S, W=None, n_eigs=10):
+    def _sync3n_S_to_rot(self, S, W=None, n_eigs=4):
         """
         Use eigen decomposition of S to estimate transforms,
         then project transforms to nearest rotations.
@@ -143,7 +122,7 @@ def _sync3n_S_to_rot(self, S, W=None, n_eigs=10):
 
         if n_eigs < 3:
             raise ValueError(
-                f"n_eigs must be greater than 3, default is 10. Invoked with {n_eigs}"
+                f"n_eigs must be greater than 3, default is 4. Invoked with {n_eigs}"
             )
 
         if W is not None:
@@ -232,7 +211,7 @@ def _construct_sync3n_matrix(self, Rij):
 
     def _syncmatrix_weights(
         self,
-        Rij,
+        Rijs,
         permitted_inconsistency=1.5,
         p_domain_limit=0.7,
         max_iterations=12,
@@ -247,7 +226,7 @@ def _syncmatrix_weights(
         def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
             # Get inistial estimate for Pij
             P, sigma, Pij, hist, cum_scores = self._triangle_scores(
-                Rij, hist, Pmin, Pmax
+                Rijs, hist, Pmin, Pmax
             )
 
             # Check if P and Pij are consistent
@@ -287,7 +266,7 @@ def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
         i = 0
         res = (None,) * 4
         inconsistent = True
-        while inconsistent and i < 1:  # max_iterations:
+        while inconsistent and i < max_iterations:
             inconsistent, Pij, res = body(*res)
             i += 1
 
@@ -343,9 +322,11 @@ def _triangle_scores_mex(self, Rijs, hist_intervals):
                     alt_ij_jk = c[_ALTS[0][best_i][0]]
                     if c[_ALTS[1][best_i][0]] < alt_ij_jk:
                         alt_ij_jk = c[_ALTS[1][best_i][0]]
+
                     alt_ik_jk = c[_ALTS[0][best_i][1]]
                     if c[_ALTS[1][best_i][1]] < alt_ik_jk:
                         alt_ik_jk = c[_ALTS[1][best_i][1]]
+
                     alt_ij_ik = c[_ALTS[0][best_i][2]]
                     if c[_ALTS[1][best_i][2]] < alt_ij_ik:
                         alt_ij_ik = c[_ALTS[1][best_i][2]]
@@ -539,7 +520,6 @@ def fun(x, B, P, b, x0, A=A, a=a):
                 -b / (1 - x0) * (1 - x)
             )
 
-        breakpoint()
         popt, pcov = curve_fit(
             fun,
             hist_x.astype(np.float64, copy=False),
@@ -590,7 +570,7 @@ def _estimate_relative_viewing_directions(self):
         self.build_clmatrix()
 
         # Calculate relative rotations
-        Rijs = self._estimate_all_Rijs_c3_c4(self.clmatrix)
+        Rijs = self._estimate_all_Rijs(self.clmatrix)
 
         return Rijs
 
@@ -608,7 +588,7 @@ def _global_J_sync(self, vijs):
 
         return vijs
 
-    def _estimate_all_Rijs_c3_c4(self, clmatrix):
+    def _estimate_all_Rijs(self, clmatrix):
         """
         Estimate Rijs using the voting method.
         """
@@ -685,6 +665,10 @@ def _J_sync_power_method(self, vijs):
         residual = 1
         itr = 0
 
+        # XXX, I don't like that epsilon>1 (residual) returns signs of random vector
+        #      maybe force to run once? or return vec as zeros in that case?
+        #      Seems unintended, but easy to do.
+
         # Power method iterations
         while itr < max_iters and residual > epsilon:
             itr += 1

From 21fb193bd156499ea695208a4d2cc35684b409ab Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 10 Apr 2024 10:40:40 -0400
Subject: [PATCH 12/60] J weighting

---
 src/aspire/abinitio/commonline_sync3n.py | 167 +++++++++++------------
 src/aspire/utils/misc.py                 |   8 +-
 2 files changed, 83 insertions(+), 92 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index f1f3cc0d3a..b382ada6b0 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -5,7 +5,7 @@
 from scipy.optimize import curve_fit
 
 from aspire.abinitio import CLOrient3D, SyncVotingMixin
-from aspire.utils import J_conjugate, all_pairs, all_triplets, nearest_rotations, trange
+from aspire.utils import J_conjugate, all_pairs, nearest_rotations, trange
 from aspire.utils.matlab_compat import stable_eigsh
 from aspire.utils.random import randn
 
@@ -27,6 +27,8 @@
     dtype=int,
 )
 
+_signs_confs = np.array([[1, 1, 1], [-1, 1, -1], [-1, -1, 1], [1, -1, -1]], dtype=int)
+
 
 class CLSync3N(CLOrient3D, SyncVotingMixin):
     """
@@ -46,6 +48,7 @@ def __init__(
         seed=None,
         mask=True,
         S_weighting=False,
+        J_weighting=False,
     ):
         """
         Initialize object for estimating 3D orientations.
@@ -82,6 +85,7 @@ def __init__(
 
         # Sync3N specific vars
         self.S_weighting = S_weighting
+        self.J_weighting = J_weighting
         self._D_null = 1e-13
 
     ###########################################
@@ -574,19 +578,18 @@ def _estimate_relative_viewing_directions(self):
 
         return Rijs
 
-    def _global_J_sync(self, vijs):
+    def _global_J_sync(self, Rijs):
         """ """
 
-        # Determine relative handedness of vijs.
-        sign_ij_J = self._J_sync_power_method(vijs)
+        # Determine relative handedness of Rijs.
+        sign_ij_J = self._J_sync_power_method(Rijs)
 
-        # Synchronize vijs
+        # Synchronize Rijs
         logger.info("Applying global handedness synchronization.")
-        for i, sign in enumerate(sign_ij_J):
-            if sign == -1:
-                vijs[i] = J_conjugate(vijs[i])
+        mask = sign_ij_J == -1
+        Rijs[mask] = J_conjugate(Rijs[mask])
 
-        return vijs
+        return Rijs
 
     def _estimate_all_Rijs(self, clmatrix):
         """
@@ -636,7 +639,7 @@ def _syncmatrix_ij_vote_3n(self, clmatrix, i, j, k_list, n_theta):
     # Secondary Methods for Global J Sync #
     #######################################
 
-    def _J_sync_power_method(self, vijs):
+    def _J_sync_power_method(self, Rijs):
         """
         Calculate the leading eigenvector of the J-synchronization matrix
         using the power method.
@@ -645,7 +648,7 @@ def _J_sync_power_method(self, vijs):
         use the power method to compute the eigenvalues and eigenvectors,
         while constructing the matrix on-the-fly.
 
-        :param vijs: (n-choose-2)x3x3 array of estimates of relative orientation matrices.
+        :param Rijs: (n-choose-2)x3x3 array of estimates of relative orientation matrices.
 
         :return: An array of length n-choose-2 consisting of 1 or -1, where the sign of the
         i'th entry indicates whether the i'th relative orientation matrix will be J-conjugated.
@@ -659,8 +662,8 @@ def _J_sync_power_method(self, vijs):
         max_iters = self.max_iters
 
         # Initialize candidate eigenvectors
-        n_vijs = vijs.shape[0]
-        vec = randn(n_vijs, seed=self.seed)
+        n_Rijs = Rijs.shape[0]
+        vec = randn(n_Rijs, seed=self.seed)
         vec = vec / norm(vec)
         residual = 1
         itr = 0
@@ -672,7 +675,7 @@ def _J_sync_power_method(self, vijs):
         # Power method iterations
         while itr < max_iters and residual > epsilon:
             itr += 1
-            vec_new = self._signs_times_v(vijs, vec)
+            vec_new = self._signs_times_v(Rijs, vec)
             vec_new = vec_new / norm(vec_new)
             residual = norm(vec_new - vec)
             vec = vec_new
@@ -685,86 +688,74 @@ def _J_sync_power_method(self, vijs):
 
         return J_sync
 
-    def _signs_times_v(self, vijs, vec):
+    def _signs_times_v(self, Rijs, vec):
         """
-        Multiplication of the J-synchronization matrix by a candidate eigenvector.
-
-        The J-synchronization matrix is a matrix representation of the handedness graph, Gamma, whose set of
-        nodes consists of the estimates vijs and whose set of edges consists of the undirected edges between
-        all triplets of estimates vij, vjk, and vik, where i<j<k. The weight of an edge is set to +1 if its
-        incident nodes agree in handednes and -1 if not.
-
-        The J-synchronization matrix is of size (n-choose-2)x(n-choose-2), where each entry corresponds to
-        the relative handedness of vij and vjk. The entry (ij, jk), where ij and jk are retrieved from the
-        all_pairs indexing, is 1 if vij and vjk are of the same handedness and -1 if not. All other entries
-        (ij, kl) hold a zero.
-
-        Due to the large size of the J-synchronization matrix we construct it on the fly as follows.
-        For each triplet of outer products vij, vjk, and vik, the associated elements of the J-synchronization
-        matrix are populated with +1 or -1 and multiplied by the corresponding elements of
-        the current candidate eigenvector supplied by the power method. The new candidate eigenvector
-        is updated for each triplet.
-
-        :param vijs: (n-choose-2)x3x3 array, where each 3x3 slice holds the outer product of vi and vj.
-
-        :param vec: The current candidate eigenvector of length n-choose-2 from the power method.
-
-        :return: New candidate eigenvector of length n-choose-2. The product of the J-sync matrix and vec.
+        Ported from _signs_times_v_mex.c
         """
+        # The code should be thread/parallel safe over `i`.
 
-        # All triplets (i,j,k) where i<j<k
-        n_img = self.n_img
-        triplets = all_triplets(n_img)
-
-        # There are 4 possible configurations of relative handedness for each triplet (vij, vjk, vik).
-        # 'conjugate' expresses which node of the triplet must be conjugated (True) to achieve synchronization.
-        conjugate = np.empty((4, 3), bool)
-        conjugate[0] = [False, False, False]
-        conjugate[1] = [True, False, False]
-        conjugate[2] = [False, True, False]
-        conjugate[3] = [False, False, True]
-
-        # 'edges' corresponds to whether conjugation agrees between the pairs (vij, vjk), (vjk, vik),
-        # and (vik, vij). True if the pairs are in agreement, False otherwise.
-        edges = np.empty((4, 3), bool)
-        edges[:, 0] = conjugate[:, 0] == conjugate[:, 1]
-        edges[:, 1] = conjugate[:, 1] == conjugate[:, 2]
-        edges[:, 2] = conjugate[:, 2] == conjugate[:, 0]
-
-        # The corresponding entries in the J-synchronization matrix are +1 if the pair of nodes agree, -1 if not.
-        edge_signs = np.where(edges, 1, -1)
-
-        # For each triplet of nodes we apply the 4 configurations of conjugation and determine the
-        # relative handedness based on the condition that vij @ vjk - vik = 0 for synchronized nodes.
-        # We then construct the corresponding entries of the J-synchronization matrix with 'edge_signs'
-        # corresponding to the conjugation configuration producing the smallest residual for the above
-        # condition. Finally, we the multiply the 'edge_signs' by the cooresponding entries of 'vec'.
-        v = vijs
         new_vec = np.zeros_like(vec)
-        for i, j, k in triplets:
-            ij = self._pairs_to_linear[i, j]
-            jk = self._pairs_to_linear[j, k]
-            ik = self._pairs_to_linear[i, k]
-            vij, vjk, vik = v[ij], v[jk], v[ik]
-            vij_J = J_conjugate(vij)
-            vjk_J = J_conjugate(vjk)
-            vik_J = J_conjugate(vik)
-
-            conjugated_pairs = np.where(
-                conjugate[..., np.newaxis, np.newaxis],
-                [vij_J, vjk_J, vik_J],
-                [vij, vjk, vik],
-            )
-            residual = np.stack([norm(x @ y - z) for x, y, z in conjugated_pairs])
+        c = np.empty((4), dtype=self.dtype)
+        desc = "Computing signs_times_v"
+        if self.J_weighting:
+            desc += " with J_weighting"
+        for i in trange(self.n_img, desc=desc):
+            for j in range(
+                i + 1, self.n_img - 1
+            ):  # check bound (taken from MATLAB mex)
+                ij = self._pairs_to_linear[i, j]
+                Rij = Rijs[ij]
+                for k in range(j + 1, self.n_img):
+                    ik = self._pairs_to_linear[i, k]
+                    jk = self._pairs_to_linear[j, k]
+                    Rik = Rijs[ik]
+                    Rjk = Rijs[jk]
 
-            min_residual = np.argmin(residual)
+                    # Compute conjugated rotats
+                    Rij_J = J_conjugate(Rij)
+                    Rik_J = J_conjugate(Rik)
+                    Rjk_J = J_conjugate(Rjk)
 
-            # Assign edge weights
-            s_ij_jk, s_ik_jk, s_ij_ik = edge_signs[min_residual]
+                    # Compute R muls and norms
+                    c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
+                    c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
+                    c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
+                    c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
+
+                    # Find best match
+                    best_i = np.argmin(c)
+                    best_val = c[best_i]
 
-            # Update multiplication of signs times vec
-            new_vec[ij] += s_ij_jk * vec[jk] + s_ij_ik * vec[ik]
-            new_vec[jk] += s_ij_jk * vec[ij] + s_ik_jk * vec[ik]
-            new_vec[ik] += s_ij_jk * vec[ij] + s_ik_jk * vec[jk]
+                    # MATLAB: scores_as_entries == 0
+                    s_ij_jk = _signs_confs[best_i][0]
+                    s_ik_jk = _signs_confs[best_i][1]
+                    s_ij_ik = _signs_confs[best_i][2]
+
+                    # Note there was a third J_weighting option (2) in MATLAB,
+                    # but it was not exposed at top level.
+                    if self.J_weighting:
+                        # MATLAB: scores_as_entries == 1
+                        # For each triangle side, find the best alternative
+                        alt_ij_jk = c[_ALTS[0][best_i][0]]
+                        if c[_ALTS[1][best_i][0]] < alt_ij_jk:
+                            alt_ij_jk = c[_ALTS[1][best_i][0]]
+
+                        alt_ik_jk = c[_ALTS[0][best_i][1]]
+                        if c[_ALTS[1][best_i][1]] < alt_ik_jk:
+                            alt_ik_jk = c[_ALTS[1][best_i][1]]
+
+                        alt_ij_ik = c[_ALTS[0][best_i][2]]
+                        if c[_ALTS[1][best_i][2]] < alt_ij_ik:
+                            alt_ij_ik = c[_ALTS[1][best_i][2]]
+
+                        # Compute scores
+                        s_ij_jk *= 1 - np.sqrt(best_val / alt_ij_jk)
+                        s_ik_jk *= 1 - np.sqrt(best_val / alt_ik_jk)
+                        s_ij_ik *= 1 - np.sqrt(best_val / alt_ij_ik)
+
+                    # Update vector entries
+                    new_vec[ij] += s_ij_jk * vec[jk] + s_ij_ik * vec[ik]
+                    new_vec[jk] += s_ij_jk * vec[ij] + s_ik_jk * vec[ik]
+                    new_vec[ik] += s_ij_jk * vec[ij] + s_ik_jk * vec[jk]
 
         return new_vec
diff --git a/src/aspire/utils/misc.py b/src/aspire/utils/misc.py
index 7541bbddf1..d0c30b9f90 100644
--- a/src/aspire/utils/misc.py
+++ b/src/aspire/utils/misc.py
@@ -368,12 +368,12 @@ def J_conjugate(A):
     """
     Conjugate the 3x3 matrix A by the diagonal matrix J=diag((-1, -1, 1)).
 
-    :param A: A 3x3 matrix.
-    :return: J*A*J
+    :param A: A 3x3 matrix, or nx3x3 matrix.
+    :return: J@A@J
     """
-    J = np.array([[1, 1, -1], [1, 1, -1], [-1, -1, 1]], dtype=A.dtype)
+    JJop = np.array([[1, 1, -1], [1, 1, -1], [-1, -1, 1]], dtype=A.dtype)
 
-    return A * J
+    return A * JJop
 
 
 def cyclic_rotations(order, dtype=np.float64):

From 9fbe9fff21fd4859e89e1d92a147f65ee70ba494 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 10 Apr 2024 16:04:24 -0400
Subject: [PATCH 13/60] add note about possible ij jk bug

[skip ci]
---
 src/aspire/abinitio/commonline_sync3n.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index b382ada6b0..880e63567a 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -756,6 +756,6 @@ def _signs_times_v(self, Rijs, vec):
                     # Update vector entries
                     new_vec[ij] += s_ij_jk * vec[jk] + s_ij_ik * vec[ik]
                     new_vec[jk] += s_ij_jk * vec[ij] + s_ik_jk * vec[ik]
-                    new_vec[ik] += s_ij_jk * vec[ij] + s_ik_jk * vec[jk]
+                    new_vec[ik] += s_ij_jk * vec[ij] + s_ik_jk * vec[jk]  # jk/ik? was a bug?? worked better with s_ij_jk...
 
         return new_vec

From 2e5e11fa26aad15e2218bb90815318f30503fd58 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 11 Apr 2024 15:00:00 -0400
Subject: [PATCH 14/60] hack in cupy kernel

---
 src/aspire/abinitio/commonline_sync3n.py | 298 ++++++++++++++++++-----
 1 file changed, 234 insertions(+), 64 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 880e63567a..c0e31f39a2 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -1,6 +1,7 @@
 import logging
 
 import numpy as np
+import cupy as cp
 from numpy.linalg import norm
 from scipy.optimize import curve_fit
 
@@ -689,73 +690,242 @@ def _J_sync_power_method(self, Rijs):
         return J_sync
 
     def _signs_times_v(self, Rijs, vec):
-        """
-        Ported from _signs_times_v_mex.c
-        """
-        # The code should be thread/parallel safe over `i`.
 
-        new_vec = np.zeros_like(vec)
-        c = np.empty((4), dtype=self.dtype)
-        desc = "Computing signs_times_v"
-        if self.J_weighting:
-            desc += " with J_weighting"
-        for i in trange(self.n_img, desc=desc):
-            for j in range(
-                i + 1, self.n_img - 1
-            ):  # check bound (taken from MATLAB mex)
-                ij = self._pairs_to_linear[i, j]
-                Rij = Rijs[ij]
-                for k in range(j + 1, self.n_img):
-                    ik = self._pairs_to_linear[i, k]
-                    jk = self._pairs_to_linear[j, k]
-                    Rik = Rijs[ik]
-                    Rjk = Rijs[jk]
+        # host/gpu dispatch
+        new_vec = _signs_times_v_host(self.n_img, Rijs, vec, self.J_weighting, _ALTS, _signs_confs)
 
-                    # Compute conjugated rotats
-                    Rij_J = J_conjugate(Rij)
-                    Rik_J = J_conjugate(Rik)
-                    Rjk_J = J_conjugate(Rjk)
+        return new_vec
 
-                    # Compute R muls and norms
-                    c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
-                    c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
-                    c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
-                    c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
+def PAIR_IDX(N,I,J):
+    return ((2*N-I-1)*I//2+J-I-1)
+    
+def _signs_times_v_host(n, Rijs, vec,J_weighting, _ALTS, _signs_confs):
+    """
+    Ported from _signs_times_v_mex.c
+
+    :param n: Number of images, n_img.
+    :param Rijs: (n-choose-2)x3x3 array of relative rotation estimates.
+    :param vec: Input vector of length n-choose-2.
+    :param J_weighting: If truthy, scale each sign by 1 - sqrt(best / alt).
+    :param _ALTS: 2x4x3 lookup table of alternative configurations.
+    :param _signs_confs: 4x3 lookup table of signs per configuration.
+    :return: New vector with the shape and dtype of `vec`.
+    """
+    # Not independent over `i`: new_vec[jk] is also updated for other `i`.
+
+    new_vec = np.zeros_like(vec)
+
+    c = np.empty((4))
+    desc = "Computing signs_times_v"
+    if J_weighting:
+        desc += " with J_weighting"
+    for i in trange(n, desc=desc):
+        # j stops at n - 2 so that k = j + 1 is still a valid image index
+        # (bound taken from the MATLAB mex).
+        for j in range(i + 1, n - 1):
+            # Linear index of pair (i, j) among the n-choose-2 pairs
+            ij = PAIR_IDX(n, i, j)
+            Rij = Rijs[ij]
+            for k in range(j + 1, n):
+                # Linear indices of pairs (i, k) and (j, k), computed in
+                # closed form instead of the self._pairs_to_linear lookup.
+                ik = PAIR_IDX(n, i, k)
+                jk = PAIR_IDX(n, j, k)
+                Rik = Rijs[ik]
+                Rjk = Rijs[jk]
+
+                # Compute J-conjugated rotations
+                Rij_J = J_conjugate(Rij)
+                Rik_J = J_conjugate(Rik)
+                Rjk_J = J_conjugate(Rjk)
+
+                # Compute R muls and norms
+                c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
+                c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
+                c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
+                c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
+
+                # Find best match
+                best_i = np.argmin(c)
+                best_val = c[best_i]
+
+                # MATLAB: scores_as_entries == 0
+                s_ij_jk = _signs_confs[best_i][0]
+                s_ik_jk = _signs_confs[best_i][1]
+                s_ij_ik = _signs_confs[best_i][2]
+
+                # Note there was a third J_weighting option (2) in MATLAB,
+                # but it was not exposed at top level.
+                if J_weighting:
+                    # MATLAB: scores_as_entries == 1
+                    # For each triangle side, find the best alternative
+                    alt_ij_jk = c[_ALTS[0][best_i][0]]
+                    if c[_ALTS[1][best_i][0]] < alt_ij_jk:
+                        alt_ij_jk = c[_ALTS[1][best_i][0]]
 
-                    # Find best match
-                    best_i = np.argmin(c)
-                    best_val = c[best_i]
+                    alt_ik_jk = c[_ALTS[0][best_i][1]]
+                    if c[_ALTS[1][best_i][1]] < alt_ik_jk:
+                        alt_ik_jk = c[_ALTS[1][best_i][1]]
 
-                    # MATLAB: scores_as_entries == 0
-                    s_ij_jk = _signs_confs[best_i][0]
-                    s_ik_jk = _signs_confs[best_i][1]
-                    s_ij_ik = _signs_confs[best_i][2]
-
-                    # Note there was a third J_weighting option (2) in MATLAB,
-                    # but it was not exposed at top level.
-                    if self.J_weighting:
-                        # MATLAB: scores_as_entries == 1
-                        # For each triangle side, find the best alternative
-                        alt_ij_jk = c[_ALTS[0][best_i][0]]
-                        if c[_ALTS[1][best_i][0]] < alt_ij_jk:
-                            alt_ij_jk = c[_ALTS[1][best_i][0]]
-
-                        alt_ik_jk = c[_ALTS[0][best_i][1]]
-                        if c[_ALTS[1][best_i][1]] < alt_ik_jk:
-                            alt_ik_jk = c[_ALTS[1][best_i][1]]
-
-                        alt_ij_ik = c[_ALTS[0][best_i][2]]
-                        if c[_ALTS[1][best_i][2]] < alt_ij_ik:
-                            alt_ij_ik = c[_ALTS[1][best_i][2]]
-
-                        # Compute scores
-                        s_ij_jk *= 1 - np.sqrt(best_val / alt_ij_jk)
-                        s_ik_jk *= 1 - np.sqrt(best_val / alt_ik_jk)
-                        s_ij_ik *= 1 - np.sqrt(best_val / alt_ij_ik)
-
-                    # Update vector entries
-                    new_vec[ij] += s_ij_jk * vec[jk] + s_ij_ik * vec[ik]
-                    new_vec[jk] += s_ij_jk * vec[ij] + s_ik_jk * vec[ik]
-                    new_vec[ik] += s_ij_jk * vec[ij] + s_ik_jk * vec[jk]  # jk/ik? was a bug?? worked better with s_ij_jk...
+                    alt_ij_ik = c[_ALTS[0][best_i][2]]
+                    if c[_ALTS[1][best_i][2]] < alt_ij_ik:
+                        alt_ij_ik = c[_ALTS[1][best_i][2]]
 
-        return new_vec
+                    # Compute scores
+                    s_ij_jk *= 1 - np.sqrt(best_val / alt_ij_jk)
+                    s_ik_jk *= 1 - np.sqrt(best_val / alt_ik_jk)
+                    s_ij_ik *= 1 - np.sqrt(best_val / alt_ij_ik)
+
+                # Update vector entries; each score couples one pair of sides,
+                # so the (ik, ij) term must use s_ij_ik to keep the operator symmetric.
+                new_vec[ij] += s_ij_jk * vec[jk] + s_ij_ik * vec[ik]
+                new_vec[jk] += s_ij_jk * vec[ij] + s_ik_jk * vec[ik]
+                new_vec[ik] += s_ij_ik * vec[ij] + s_ik_jk * vec[jk]
+
+    return new_vec
+
+def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS, _signs_confs):
+    """
+    GPU (CuPy) port of _signs_times_v_mex.c; unweighted sign scores only.
+
+    :param n: Number of images, n_img.
+    :param Rijs: (n-choose-2)x3x3 array, or its flat C-ordered equivalent.
+    :param vec: Input vector of length n-choose-2.
+    :param J_weighting: Must be falsy; weighting is not ported to the kernel.
+    :param _ALTS, _signs_confs: Unused; the sign table lives in the kernel.
+    :return: NumPy array holding the product, cast to the dtype of `vec`.
+    :raises NotImplementedError: If `J_weighting` is truthy.
+    """
+    # One GPU thread per `i`; entries of new_vec are shared between threads.
+    if J_weighting:
+        raise NotImplementedError("J_weighting is not implemented for cupy.")
+
+    code = r'''
+
+/* from i,j indices to the common index in the N-choose-2 sized array */
+#define PAIR_IDX(N,I,J) ((2LL*(N)-(I)-1)*(I)/2+(J)-(I)-1)
+
+__device__ inline void mult_3x3(double *out, const double *R1, const double *R2) {
+/* 3X3 matrices multiplication: out = R1*R2, row-major (C order) storage */
+    int i,j;
+    for (i=0; i<3; i++) {
+        for (j=0; j<3; j++) {
+            out[3*i+j] = R1[3*i+0]*R2[3*0+j] + R1[3*i+1]*R2[3*1+j] + R1[3*i+2]*R2[3*2+j];
+        }
+    }
+}
+
+__device__ inline void JRJ(const double *R, double *A) {
+/* multiply 3X3 matrix by J from both sides: A = JRJ */
+    A[0]=R[0];
+    A[1]=R[1];
+    A[2]=-R[2];
+    A[3]=R[3];
+    A[4]=R[4];
+    A[5]=-R[5];
+    A[6]=-R[6];
+    A[7]=-R[7];
+    A[8]=R[8];
+}
+
+__device__ inline double diff_norm_3x3(const double *R1, const double *R2) {
+/* difference 2 matrices and return squared norm: ||R1-R2||^2 */
+    int i;
+    double norm = 0;
+    for (i=0; i<9; i++) {norm += (R1[i]-R2[i])*(R1[i]-R2[i]);}
+    return norm;
+}
+
+
+extern "C" __global__
+void signs_times_v(int n, const double* Rijs, const double* vec, double* new_vec)
+{
+    /* thread index (1d), represents "i" index */
+    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    /* no-op when out of bounds */
+    if(i >= n) return;
+
+    double c[4]={0,0,0,0};
+    long long ij, jk, ik;
+    int best_i;
+    double best_val;
+    int s_ij_jk, s_ik_jk, s_ij_ik;
+
+    const double *Rij, *Rjk, *Rik;
+    double JRijJ[9], JRjkJ[9], JRikJ[9];
+    double tmp[9];
+
+    /* signs of (ij,jk), (ik,jk), (ij,ik) for each of the 4 configurations */
+    const int signs_confs[4][3] = {
+        { 1,  1,  1}, {-1,  1, -1}, {-1, -1,  1}, { 1, -1, -1}
+    };
+
+    /* j stops at n-2 so that k=j+1 is still a valid image index */
+    for(int j=i+1; j< n - 1; j++){
+        ij = PAIR_IDX(n, i, j);
+        for(int k=j+1; k< n; k++){
+            ik = PAIR_IDX(n, i, k);
+            jk = PAIR_IDX(n, j, k);
+
+            /* compute configurations matches scores */
+            Rij = Rijs + 9*ij;
+            Rjk = Rijs + 9*jk;
+            Rik = Rijs + 9*ik;
+
+            JRJ(Rij, JRijJ);
+            JRJ(Rjk, JRjkJ);
+            JRJ(Rik, JRikJ);
+
+            mult_3x3(tmp,Rij,Rjk);
+            c[0] = diff_norm_3x3(tmp,Rik);
+
+            mult_3x3(tmp,JRijJ,Rjk);
+            c[1] = diff_norm_3x3(tmp,Rik);
+
+            mult_3x3(tmp,Rij,JRjkJ);
+            c[2] = diff_norm_3x3(tmp,Rik);
+
+            mult_3x3(tmp,Rij,Rjk);
+            c[3] = diff_norm_3x3(tmp,JRikJ);
+
+            /* find best match */
+            best_i=0; best_val=c[0];
+            if (c[1]<best_val) {best_i=1; best_val=c[1];}
+            if (c[2]<best_val) {best_i=2; best_val=c[2];}
+            if (c[3]<best_val) {best_i=3; best_val=c[3];}
+
+            /* set triangles entries to be signs */
+            s_ij_jk = signs_confs[best_i][0];
+            s_ik_jk = signs_confs[best_i][1];
+            s_ij_ik = signs_confs[best_i][2];
+
+            /* update multiplication: pair jk of thread i is pair ij of thread j, */
+            /* so accumulate atomically (atomicAdd on double needs sm_60 or newer) */
+            atomicAdd(&new_vec[ij], s_ij_jk*vec[jk] + s_ij_ik*vec[ik]);
+            atomicAdd(&new_vec[jk], s_ij_jk*vec[ij] + s_ik_jk*vec[ik]);
+            atomicAdd(&new_vec[ik], s_ij_ik*vec[ij] + s_ik_jk*vec[jk]);
+
+        } /* k */
+    } /* j */
+    return;
+}
+'''
+
+    module = cp.RawModule(code=code)
+    signs_times_v = module.get_function('signs_times_v')
+
+    # The kernel indexes raw C-ordered float64 buffers, so enforce both.
+    Rijs_dev = cp.array(Rijs, dtype=cp.float64, order="C")
+    vec_dev = cp.array(vec, dtype=cp.float64)
+    new_vec_dev = cp.zeros_like(vec_dev)
+
+    # call the kernel
+    blkszx = 512
+    nblkx = (n+blkszx-1)//blkszx
+
+    # n is passed as a 32-bit int to match the kernel's `int n` parameter.
+    signs_times_v((nblkx,), (blkszx,), (np.int32(n), Rijs_dev, vec_dev, new_vec_dev))
+
+    new_vec = new_vec_dev.get().astype(np.asarray(vec).dtype, copy=False)
+
+    return new_vec

From 7a6752e0e37ba735dddb6128b3993f5d80bdb121 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 11 Apr 2024 15:00:18 -0400
Subject: [PATCH 15/60] quick test file

---
 x.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 x.py

diff --git a/x.py b/x.py
new file mode 100644
index 0000000000..1c024e8de9
--- /dev/null
+++ b/x.py
@@ -0,0 +1,15 @@
+from aspire.abinitio.commonline_sync3n import _signs_times_v_cupy
+import numpy as np
+import cupy as cp
+
+n = 7
+n_pairs = n*(n-1)//2
+vec = np.ones(n_pairs, dtype=np.float64)
+new_vec = np.zeros(n_pairs, dtype=np.float64)
+#Rijs = np.random.randn(n_pairs*3*3).astype(dtype=np.float64)
+Rijs = np.arange(n_pairs*3*3).astype(dtype=np.float64)
+
+
+new_vec = _signs_times_v_cupy(n, Rijs, vec, J_weighting=None, _ALTS=None, _signs_confs=None)
+
+print(new_vec)

From 1f988a8153e47e2ef88c1109a4183b03ce7d0984 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 11 Apr 2024 15:24:23 -0400
Subject: [PATCH 16/60] stashing

---
 src/aspire/abinitio/commonline_sync3n.py | 23 +++++++++++++++--------
 x.py                                     | 11 +++++++++--
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index c0e31f39a2..b3eb882270 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -28,8 +28,6 @@
     dtype=int,
 )
 
-_signs_confs = np.array([[1, 1, 1], [-1, 1, -1], [-1, -1, 1], [1, -1, -1]], dtype=int)
-
 
 class CLSync3N(CLOrient3D, SyncVotingMixin):
     """
@@ -692,14 +690,17 @@ def _J_sync_power_method(self, Rijs):
     def _signs_times_v(self, Rijs, vec):
 
         # host/gpu dispatch
-        new_vec = _signs_times_v_host(self.n_img, Rijs, vec, self.J_weighting, _ALTS, _signs_confs)
+        #new_vec = _signs_times_v_host(self.n_img, Rijs, vec, self.J_weighting, _ALTS)
+
+        assert self.J_weighting ==False, "not implemented yet"
+        new_vec = _signs_times_v_cupy(self.n_img, Rijs, vec, self.J_weighting, _ALTS)
 
         return new_vec
 
 def PAIR_IDX(N,I,J):
     return ((2*N-I-1)*I//2+J-I-1)
     
-def _signs_times_v_host(n, Rijs, vec,J_weighting, _ALTS, _signs_confs):
+def _signs_times_v_host(n, Rijs, vec,J_weighting, _ALTS):
     """
     Ported from _signs_times_v_mex.c
 
@@ -715,6 +716,7 @@ def _signs_times_v_host(n, Rijs, vec,J_weighting, _ALTS, _signs_confs):
 
     new_vec = np.zeros_like(vec)
 
+    _signs_confs = np.array([[1, 1, 1], [-1, 1, -1], [-1, -1, 1], [1, -1, -1]], dtype=int)
     c = np.empty((4))
     desc = "Computing signs_times_v"
     if J_weighting:
@@ -779,11 +781,12 @@ def _signs_times_v_host(n, Rijs, vec,J_weighting, _ALTS, _signs_confs):
                 # Update vector entries
                 new_vec[ij] += s_ij_jk * vec[jk] + s_ij_ik * vec[ik]
                 new_vec[jk] += s_ij_jk * vec[ij] + s_ik_jk * vec[ik]
-                new_vec[ik] += s_ij_jk * vec[ij] + s_ik_jk * vec[jk]  # jk/ik? was a bug?? worked better with s_ij_jk...
+                #new_vec[ik] += s_ij_jk * vec[ij] + s_ik_jk * vec[jk]  # jk/ik? was a bug?? worked better with s_ij_jk...
+                new_vec[ik] += s_ij_ik * vec[ij] + s_ik_jk * vec[jk]  # jk/ik? was a bug?? worked better with s_ij_jk...
 
     return new_vec
 
-def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS, _signs_confs):
+def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
     """
     Ported from _signs_times_v_mex.c
 
@@ -840,9 +843,12 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS, _signs_confs):
 {
     /* thread index (1d), represents "i" index */
     unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+    //unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
 
     /* no-op when out of bounds */
     if(i >= n) return;
+    //if(j >= n-1) return;
+    //if(j < i+1) return;
 
     unsigned long n_pairs = n*(n-1)/2;
     double c[4]={0,0,0,0};
@@ -919,11 +925,12 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS, _signs_confs):
     new_vec_dev = cp.zeros_like(vec)
 
     # call the kernel
-    blkszx = 512
+    blkszx = 128
     nblkx = (n+blkszx-1)//blkszx
-    # blkszy = 512
+    # blkszy = 2
     # nblky = (n+blkszy-1)//blkszy
 
+    #signs_times_v((nblkx,nblky), (blkszx,blkszy), (n, Rijs_dev, vec_dev, new_vec_dev))
     signs_times_v((nblkx,), (blkszx,), (n, Rijs_dev, vec_dev, new_vec_dev))
 
     new_vec= new_vec_dev.get()
diff --git a/x.py b/x.py
index 1c024e8de9..e8a3a78a70 100644
--- a/x.py
+++ b/x.py
@@ -1,4 +1,5 @@
 from aspire.abinitio.commonline_sync3n import _signs_times_v_cupy
+from aspire.abinitio.commonline_sync3n import _signs_times_v_host
 import numpy as np
 import cupy as cp
 
@@ -7,9 +8,15 @@
 vec = np.ones(n_pairs, dtype=np.float64)
 new_vec = np.zeros(n_pairs, dtype=np.float64)
 #Rijs = np.random.randn(n_pairs*3*3).astype(dtype=np.float64)
-Rijs = np.arange(n_pairs*3*3).astype(dtype=np.float64)
+Rijs = np.arange(n_pairs*3*3).reshape(n_pairs,3,3).astype(dtype=np.float64)
 
 
-new_vec = _signs_times_v_cupy(n, Rijs, vec, J_weighting=None, _ALTS=None, _signs_confs=None)
+new_vec = _signs_times_v_cupy(n, Rijs, vec, J_weighting=None, _ALTS=None)
 
+print("gpu\n")
 print(new_vec)
+
+new_vec_host = _signs_times_v_host(n, Rijs, vec, J_weighting=None, _ALTS=None)
+
+print("host\n",new_vec_host)
+

From 2712c08ec632339797e1da357330fc01f044db7e Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 11 Apr 2024 15:25:01 -0400
Subject: [PATCH 17/60] black

---
 src/aspire/abinitio/commonline_sync3n.py | 50 +++++++++++++-----------
 x.py                                     | 15 ++++---
 2 files changed, 34 insertions(+), 31 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index b3eb882270..6769d54550 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -1,7 +1,7 @@
 import logging
 
-import numpy as np
 import cupy as cp
+import numpy as np
 from numpy.linalg import norm
 from scipy.optimize import curve_fit
 
@@ -690,17 +690,19 @@ def _J_sync_power_method(self, Rijs):
     def _signs_times_v(self, Rijs, vec):
 
         # host/gpu dispatch
-        #new_vec = _signs_times_v_host(self.n_img, Rijs, vec, self.J_weighting, _ALTS)
+        # new_vec = _signs_times_v_host(self.n_img, Rijs, vec, self.J_weighting, _ALTS)
 
-        assert self.J_weighting ==False, "not implemented yet"
+        assert self.J_weighting == False, "not implemented yet"
         new_vec = _signs_times_v_cupy(self.n_img, Rijs, vec, self.J_weighting, _ALTS)
 
         return new_vec
 
-def PAIR_IDX(N,I,J):
-    return ((2*N-I-1)*I//2+J-I-1)
-    
-def _signs_times_v_host(n, Rijs, vec,J_weighting, _ALTS):
+
+def PAIR_IDX(N, I, J):
+    return (2 * N - I - 1) * I // 2 + J - I - 1
+
+
+def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS):
     """
     Ported from _signs_times_v_mex.c
 
@@ -716,21 +718,21 @@ def _signs_times_v_host(n, Rijs, vec,J_weighting, _ALTS):
 
     new_vec = np.zeros_like(vec)
 
-    _signs_confs = np.array([[1, 1, 1], [-1, 1, -1], [-1, -1, 1], [1, -1, -1]], dtype=int)
+    _signs_confs = np.array(
+        [[1, 1, 1], [-1, 1, -1], [-1, -1, 1], [1, -1, -1]], dtype=int
+    )
     c = np.empty((4))
     desc = "Computing signs_times_v"
     if J_weighting:
         desc += " with J_weighting"
     for i in trange(n, desc=desc):
-        for j in range(
-            i + 1, n - 1
-        ):  # check bound (taken from MATLAB mex)
-            #ij = self._pairs_to_linear[i, j]
+        for j in range(i + 1, n - 1):  # check bound (taken from MATLAB mex)
+            # ij = self._pairs_to_linear[i, j]
             ij = PAIR_IDX(n, i, j)
             Rij = Rijs[ij]
             for k in range(j + 1, n):
-                #ik = self._pairs_to_linear[i, k]
-                #jk = self._pairs_to_linear[j, k]
+                # ik = self._pairs_to_linear[i, k]
+                # jk = self._pairs_to_linear[j, k]
                 ik = PAIR_IDX(n, i, k)
                 jk = PAIR_IDX(n, j, k)
                 Rik = Rijs[ik]
@@ -781,11 +783,14 @@ def _signs_times_v_host(n, Rijs, vec,J_weighting, _ALTS):
                 # Update vector entries
                 new_vec[ij] += s_ij_jk * vec[jk] + s_ij_ik * vec[ik]
                 new_vec[jk] += s_ij_jk * vec[ij] + s_ik_jk * vec[ik]
-                #new_vec[ik] += s_ij_jk * vec[ij] + s_ik_jk * vec[jk]  # jk/ik? was a bug?? worked better with s_ij_jk...
-                new_vec[ik] += s_ij_ik * vec[ij] + s_ik_jk * vec[jk]  # jk/ik? was a bug?? worked better with s_ij_jk...
+                # new_vec[ik] += s_ij_jk * vec[ij] + s_ik_jk * vec[jk]  # jk/ik? was a bug?? worked better with s_ij_jk...
+                new_vec[ik] += (
+                    s_ij_ik * vec[ij] + s_ik_jk * vec[jk]
+                )  # jk/ik? was a bug?? worked better with s_ij_jk...
 
     return new_vec
 
+
 def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
     """
     Ported from _signs_times_v_mex.c
@@ -800,8 +805,7 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
     """
     # The code should be thread/parallel safe over `i`.
 
-
-    code = r'''
+    code = r"""
 
 /* from i,j indoces to the common index in the N-choose-2 sized array */
 #define PAIR_IDX(N,I,J) ((2*N-I-1)*I/2+J-I-1)
@@ -915,10 +919,10 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
     } /* j */
     return;
 };
-'''
+"""
 
     module = cp.RawModule(code=code)
-    signs_times_v = module.get_function('signs_times_v')
+    signs_times_v = module.get_function("signs_times_v")
 
     Rijs_dev = cp.array(Rijs)
     vec_dev = cp.array(vec)
@@ -926,13 +930,13 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
 
     # call the kernel
     blkszx = 128
-    nblkx = (n+blkszx-1)//blkszx
+    nblkx = (n + blkszx - 1) // blkszx
     # blkszy = 2
     # nblky = (n+blkszy-1)//blkszy
 
-    #signs_times_v((nblkx,nblky), (blkszx,blkszy), (n, Rijs_dev, vec_dev, new_vec_dev))
+    # signs_times_v((nblkx,nblky), (blkszx,blkszy), (n, Rijs_dev, vec_dev, new_vec_dev))
     signs_times_v((nblkx,), (blkszx,), (n, Rijs_dev, vec_dev, new_vec_dev))
 
-    new_vec= new_vec_dev.get()
+    new_vec = new_vec_dev.get()
 
     return new_vec
diff --git a/x.py b/x.py
index e8a3a78a70..7aab023007 100644
--- a/x.py
+++ b/x.py
@@ -1,14 +1,14 @@
-from aspire.abinitio.commonline_sync3n import _signs_times_v_cupy
-from aspire.abinitio.commonline_sync3n import _signs_times_v_host
-import numpy as np
 import cupy as cp
+import numpy as np
+
+from aspire.abinitio.commonline_sync3n import _signs_times_v_cupy, _signs_times_v_host
 
 n = 7
-n_pairs = n*(n-1)//2
+n_pairs = n * (n - 1) // 2
 vec = np.ones(n_pairs, dtype=np.float64)
 new_vec = np.zeros(n_pairs, dtype=np.float64)
-#Rijs = np.random.randn(n_pairs*3*3).astype(dtype=np.float64)
-Rijs = np.arange(n_pairs*3*3).reshape(n_pairs,3,3).astype(dtype=np.float64)
+# Rijs = np.random.randn(n_pairs*3*3).astype(dtype=np.float64)
+Rijs = np.arange(n_pairs * 3 * 3).reshape(n_pairs, 3, 3).astype(dtype=np.float64)
 
 
 new_vec = _signs_times_v_cupy(n, Rijs, vec, J_weighting=None, _ALTS=None)
@@ -18,5 +18,4 @@
 
 new_vec_host = _signs_times_v_host(n, Rijs, vec, J_weighting=None, _ALTS=None)
 
-print("host\n",new_vec_host)
-
+print("host\n", new_vec_host)

From 1583cec1c2352e846eab0362b8378d80ea42cbbc Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Thu, 11 Apr 2024 16:36:58 -0400
Subject: [PATCH 18/60] debug

---
 src/aspire/abinitio/commonline_sync3n.py | 39 +++++++++++++-----------
 x.py                                     | 12 ++++----
 2 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 6769d54550..ac7a84238a 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -690,7 +690,7 @@ def _J_sync_power_method(self, Rijs):
     def _signs_times_v(self, Rijs, vec):
 
         # host/gpu dispatch
-        # new_vec = _signs_times_v_host(self.n_img, Rijs, vec, self.J_weighting, _ALTS)
+        # new_vec = _signs_times_v_host(self.n_img, Rijs, vec, self.J_weighting, _ALTS, self._pairs_to_linear)
 
         assert self.J_weighting == False, "not implemented yet"
         new_vec = _signs_times_v_cupy(self.n_img, Rijs, vec, self.J_weighting, _ALTS)
@@ -702,7 +702,7 @@ def PAIR_IDX(N, I, J):
     return (2 * N - I - 1) * I // 2 + J - I - 1
 
 
-def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS):
+def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS, _pairs_to_linear):
     """
     Ported from _signs_times_v_mex.c
 
@@ -727,14 +727,14 @@ def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS):
         desc += " with J_weighting"
     for i in trange(n, desc=desc):
         for j in range(i + 1, n - 1):  # check bound (taken from MATLAB mex)
-            # ij = self._pairs_to_linear[i, j]
-            ij = PAIR_IDX(n, i, j)
+            ij = _pairs_to_linear[i, j]
+            #ij = PAIR_IDX(n, i, j)
             Rij = Rijs[ij]
             for k in range(j + 1, n):
-                # ik = self._pairs_to_linear[i, k]
-                # jk = self._pairs_to_linear[j, k]
-                ik = PAIR_IDX(n, i, k)
-                jk = PAIR_IDX(n, j, k)
+                ik = _pairs_to_linear[i, k]
+                jk = _pairs_to_linear[j, k]
+                #ik = PAIR_IDX(n, i, k)
+                #jk = PAIR_IDX(n, j, k)
                 Rik = Rijs[ik]
                 Rjk = Rijs[jk]
 
@@ -784,9 +784,8 @@ def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS):
                 new_vec[ij] += s_ij_jk * vec[jk] + s_ij_ik * vec[ik]
                 new_vec[jk] += s_ij_jk * vec[ij] + s_ik_jk * vec[ik]
                 # new_vec[ik] += s_ij_jk * vec[ij] + s_ik_jk * vec[jk]  # jk/ik? was a bug?? worked better with s_ij_jk...
-                new_vec[ik] += (
-                    s_ij_ik * vec[ij] + s_ik_jk * vec[jk]
-                )  # jk/ik? was a bug?? worked better with s_ij_jk...
+                # jk/ik? was a bug?? worked better with s_ij_jk...
+                new_vec[ik] += s_ij_ik * vec[ij] + s_ik_jk * vec[jk]
 
     return new_vec
 
@@ -808,7 +807,8 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
     code = r"""
 
 /* from i,j indoces to the common index in the N-choose-2 sized array */
-#define PAIR_IDX(N,I,J) ((2*N-I-1)*I/2+J-I-1)
+#define PAIR_IDX(N,I,J) ((2*N-I-1)*I/2 + J-I-1)
+
 
 inline void mult_3x3(double *out, double *R1, double *R2) {
 /* 3X3 matrices multiplication: out = R1*R2 */
@@ -854,10 +854,10 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
     //if(j >= n-1) return;
     //if(j < i+1) return;
 
-    unsigned long n_pairs = n*(n-1)/2;
-    double c[4]={0,0,0,0};
-    unsigned int ij, jk, ik;
-    unsigned int eig;
+    double c[4];
+    int j, k;
+    for(k=0;k<4;k++){c[k]=0;}
+    unsigned long ij, jk, ik;
     int best_i;
     double best_val;
     int s_ij_jk, s_ik_jk, s_ij_ik;
@@ -868,13 +868,15 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
 
     /* le sigh */
     int signs_confs[4][3];
+    for(int a=0; a<4; a++) { for(k=0; k<3; k++) { signs_confs[a][k]=1; } }
     signs_confs[2-1][1-1]=-1; signs_confs[2-1][3-1]=-1;
     signs_confs[3-1][1-1]=-1; signs_confs[3-1][2-1]=-1;
     signs_confs[4-1][2-1]=-1; signs_confs[4-1][3-1]=-1;
     
-    for(int j=i+1; j< n - 1; j++){
+
+    for(j=i+1; j< n - 1; j++){
         ij = PAIR_IDX(n, i, j);
-        for(int k=j+1; k< n; k++){
+        for(k=j+1; k< n; k++){
             ik = PAIR_IDX(n, i, k);
             jk = PAIR_IDX(n, j, k);
 
@@ -917,6 +919,7 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
 
         } /* k */
     } /* j */
+
     return;
 };
 """
diff --git a/x.py b/x.py
index 7aab023007..ea15631e36 100644
--- a/x.py
+++ b/x.py
@@ -2,20 +2,20 @@
 import numpy as np
 
 from aspire.abinitio.commonline_sync3n import _signs_times_v_cupy, _signs_times_v_host
+from aspire.utils import all_pairs
 
-n = 7
+n = 4
 n_pairs = n * (n - 1) // 2
+_, _pairs_to_linear = all_pairs(n, return_map=True)
+
 vec = np.ones(n_pairs, dtype=np.float64)
-new_vec = np.zeros(n_pairs, dtype=np.float64)
 # Rijs = np.random.randn(n_pairs*3*3).astype(dtype=np.float64)
 Rijs = np.arange(n_pairs * 3 * 3).reshape(n_pairs, 3, 3).astype(dtype=np.float64)
 
 
 new_vec = _signs_times_v_cupy(n, Rijs, vec, J_weighting=None, _ALTS=None)
+print("gpu\n", new_vec)
 
-print("gpu\n")
-print(new_vec)
-
-new_vec_host = _signs_times_v_host(n, Rijs, vec, J_weighting=None, _ALTS=None)
 
+new_vec_host = _signs_times_v_host(n, Rijs, vec, J_weighting=None, _ALTS=None, _pairs_to_linear=_pairs_to_linear)
 print("host\n", new_vec_host)

From b44dea69ce0e3ce716f93f056a000863cb3e9c91 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 12 Apr 2024 11:00:11 -0400
Subject: [PATCH 19/60] stashing, cupy code mostly works

---
 src/aspire/abinitio/commonline_sync3n.py | 67 ++++++++++---------
 x.py                                     | 85 +++++++++++++++++++++---
 2 files changed, 111 insertions(+), 41 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index ac7a84238a..bf35c9c73e 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -312,10 +312,10 @@ def _triangle_scores_mex(self, Rijs, hist_intervals):
                     Rjk_J = J_conjugate(Rjk)
 
                     # Compute R muls and norms
-                    c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
-                    c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
-                    c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
-                    c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
+                    c[0] = np.sum(((Rij @ Rjk.T) - Rik) ** 2)
+                    c[1] = np.sum(((Rij_J @ Rjk.T) - Rik) ** 2)
+                    c[2] = np.sum(((Rij @ Rjk_J.T) - Rik) ** 2)
+                    c[3] = np.sum(((Rij @ Rjk.T) - Rik_J) ** 2)
 
                     # Find best match
                     best_i = np.argmin(c)
@@ -392,10 +392,10 @@ def _pairs_probabilities(self, Rijs, P2, A, a, B, b, x0):
                     Rjk_J = J_conjugate(Rjk)
 
                     # Compute R muls and norms
-                    c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
-                    c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
-                    c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
-                    c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
+                    c[0] = np.sum(((Rij @ Rjk.T) - Rik) ** 2)
+                    c[1] = np.sum(((Rij_J @ Rjk.T) - Rik) ** 2)
+                    c[2] = np.sum(((Rij @ Rjk_J.T) - Rik) ** 2)
+                    c[3] = np.sum(((Rij @ Rjk.T) - Rik_J) ** 2)
 
                     # Find best match
                     best_i = np.argmin(c)
@@ -728,13 +728,13 @@ def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS, _pairs_to_linear):
     for i in trange(n, desc=desc):
         for j in range(i + 1, n - 1):  # check bound (taken from MATLAB mex)
             ij = _pairs_to_linear[i, j]
-            #ij = PAIR_IDX(n, i, j)
+            # ij = PAIR_IDX(n, i, j)
             Rij = Rijs[ij]
             for k in range(j + 1, n):
                 ik = _pairs_to_linear[i, k]
                 jk = _pairs_to_linear[j, k]
-                #ik = PAIR_IDX(n, i, k)
-                #jk = PAIR_IDX(n, j, k)
+                # ik = PAIR_IDX(n, i, k)
+                # jk = PAIR_IDX(n, j, k)
                 Rik = Rijs[ik]
                 Rjk = Rijs[jk]
 
@@ -744,10 +744,10 @@ def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS, _pairs_to_linear):
                 Rjk_J = J_conjugate(Rjk)
 
                 # Compute R muls and norms
-                c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
-                c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
-                c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
-                c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
+                c[0] = np.sum(((Rij @ Rjk.T) - Rik) ** 2)
+                c[1] = np.sum(((Rij_J @ Rjk.T) - Rik) ** 2)
+                c[2] = np.sum(((Rij @ Rjk_J.T) - Rik) ** 2)
+                c[3] = np.sum(((Rij @ Rjk.T) - Rik_J) ** 2)
 
                 # Find best match
                 best_i = np.argmin(c)
@@ -810,12 +810,16 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
 #define PAIR_IDX(N,I,J) ((2*N-I-1)*I/2 + J-I-1)
 
 
+// DEBUG TRANS BUGS
 inline void mult_3x3(double *out, double *R1, double *R2) {
-/* 3X3 matrices multiplication: out = R1*R2 */
+// /* 3X3 matrices multiplication: out = R1*R2 */
+// out.T = R1.T @ R2.T ?
 	int i,j;
 	for (i=0; i<3; i++) {
 		for (j=0;j<3;j++) {
-			out[3*j+i] = R1[3*0+i]*R2[3*j+0] + R1[3*1+i]*R2[3*j+1] + R1[3*2+i]*R2[3*j+2];
+//			out[3*j+i] = R1[3*0+i]*R2[3*j+0] + R1[3*1+i]*R2[3*j+1] + R1[3*2+i]*R2[3*j+2];
+			out[3*i+j] = R1[3*0+i]*R2[3*j+0] + R1[3*1+i]*R2[3*j+1] + R1[3*2+i]*R2[3*j+2];
+
 		}
 	}
 }
@@ -847,15 +851,13 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
 {
     /* thread index (1d), represents "i" index */
     unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
-    //unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
 
     /* no-op when out of bounds */
     if(i >= n) return;
-    //if(j >= n-1) return;
-    //if(j < i+1) return;
 
     double c[4];
-    int j, k;
+    int j;
+    int k;
     for(k=0;k<4;k++){c[k]=0;}
     unsigned long ij, jk, ik;
     int best_i;
@@ -874,7 +876,7 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
     signs_confs[4-1][2-1]=-1; signs_confs[4-1][3-1]=-1;
     
 
-    for(j=i+1; j< n - 1; j++){
+    for(j=i+1; j< (n - 1); j++){
         ij = PAIR_IDX(n, i, j);
         for(k=j+1; k< n; k++){
             ik = PAIR_IDX(n, i, k);
@@ -913,9 +915,10 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
             s_ij_ik = signs_confs[best_i][2];
 
             /* update multiplication */
-            new_vec[ij] += s_ij_jk*vec[jk] + s_ij_ik*vec[ik];
-            new_vec[jk] += s_ij_jk*vec[ij] + s_ik_jk*vec[ik];
-            new_vec[ik] += s_ij_ik*vec[ij] + s_ik_jk*vec[jk];  /* ij jk bug? */
+            new_vec[ij*n + i] += s_ij_jk*vec[jk] + s_ij_ik*vec[ik];
+            new_vec[jk*n + i] += s_ij_jk*vec[ij] + s_ik_jk*vec[ik];
+            new_vec[ik*n + i] += s_ij_ik*vec[ij] + s_ik_jk*vec[jk];  /* ij jk bug?, relating to mat mul T? */
+            //new_vec[ik*n + i] += s_ij_jk*vec[ij] + s_ik_jk*vec[jk];  /* ij jk bug? */
 
         } /* k */
     } /* j */
@@ -929,17 +932,19 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
 
     Rijs_dev = cp.array(Rijs)
     vec_dev = cp.array(vec)
-    new_vec_dev = cp.zeros_like(vec)
+    # 2d over i then accum to avoid race on i
+    new_vec_dev = cp.zeros((vec.shape[0], n))
 
     # call the kernel
-    blkszx = 128
+    blkszx = 512
     nblkx = (n + blkszx - 1) // blkszx
-    # blkszy = 2
-    # nblky = (n+blkszy-1)//blkszy
 
-    # signs_times_v((nblkx,nblky), (blkszx,blkszy), (n, Rijs_dev, vec_dev, new_vec_dev))
     signs_times_v((nblkx,), (blkszx,), (n, Rijs_dev, vec_dev, new_vec_dev))
 
-    new_vec = new_vec_dev.get()
+    # accumulate, can reuse the vec_dev array now.
+    cp.sum(new_vec_dev, axis=1, out=vec_dev)
+
+    # dtoh
+    new_vec = vec_dev.get()
 
     return new_vec
diff --git a/x.py b/x.py
index ea15631e36..02d39fe255 100644
--- a/x.py
+++ b/x.py
@@ -1,21 +1,86 @@
+import pickle
+import time
+from collections import defaultdict
+
 import cupy as cp
+import matplotlib.pyplot as plt
 import numpy as np
 
 from aspire.abinitio.commonline_sync3n import _signs_times_v_cupy, _signs_times_v_host
 from aspire.utils import all_pairs
 
-n = 4
-n_pairs = n * (n - 1) // 2
-_, _pairs_to_linear = all_pairs(n, return_map=True)
 
-vec = np.ones(n_pairs, dtype=np.float64)
-# Rijs = np.random.randn(n_pairs*3*3).astype(dtype=np.float64)
-Rijs = np.arange(n_pairs * 3 * 3).reshape(n_pairs, 3, 3).astype(dtype=np.float64)
+def time_test(n):
+    n_pairs = n * (n - 1) // 2
+    _, _pairs_to_linear = all_pairs(n, return_map=True)
+
+    vec = np.ones(n_pairs, dtype=np.float64)
+    # Rijs = np.random.randn(n_pairs*3*3).astype(dtype=np.float64)
+    Rijs = np.arange(n_pairs * 3 * 3).reshape(n_pairs, 3, 3).astype(dtype=np.float64)
+
+    tic0 = time.perf_counter()
+    new_vec = _signs_times_v_cupy(n, Rijs, vec, J_weighting=None, _ALTS=None)
+    tic1 = time.perf_counter()
+    gpu_time = tic1 - tic0
+    print("gpu\n", new_vec)
+
+    tic2 = time.perf_counter()
+    new_vec_host = _signs_times_v_host(
+        n, Rijs, vec, J_weighting=None, _ALTS=None, _pairs_to_linear=_pairs_to_linear
+    )
+    tic3 = time.perf_counter()
+    host_time = tic3 - tic2
+    print("host\n", new_vec_host)
+
+    print(f"\n\n\nSize:\t{n}")
+    print("Allclose? ", np.allclose(new_vec_host, new_vec))
+    print(f"gpu_time: {gpu_time}")
+    print(f"host_time: {host_time}")
+    speedup = host_time / gpu_time
+    print(f"speedup: {speedup}")
+
+    return host_time, gpu_time, speedup
+
+
+def plotit(results):
+    N = np.array(list(results.keys()))
+    H = np.array([v["host"] for v in results.values()])
+    G = np.array([v["gpu"] for v in results.values()])
+    S = np.array([v["speedup"] for v in results.values()])
+
+    plt.plot(N, H, label="host python")
+    plt.plot(N, G, label="cuda")
+    plt.title("Walltimes (s)")
+    plt.legend()
+    plt.show()
+    plt.savefig("walltimes.png")
+    plt.clf()
+
+    plt.plot(N, S)
+    plt.title("Speedup Ratio")
+    plt.show()
+    plt.savefig("speedups.png")
+    plt.clf()
+
+
+def main():
+    results = defaultdict(dict)
+    # too long...! for n in [4,16,64,100,128,200,256,512,1024,2048,3000, 4096, 10000]:
+    # for n in [4,16]: # test
+    for n in [4, 16, 64, 100, 128, 200, 512]:
+        h, g, s = time_test(n)
+        results[n]["host"] = h
+        results[n]["gpu"] = g
+        results[n]["speedup"] = s
 
+        # save in case we cancel
+        with open("saved_results.pkl", "wb") as f:
+            pickle.dump(results, f)
 
-new_vec = _signs_times_v_cupy(n, Rijs, vec, J_weighting=None, _ALTS=None)
-print("gpu\n", new_vec)
+    print()
+    print(results)
+    print()
 
+    plotit(results)
 
-new_vec_host = _signs_times_v_host(n, Rijs, vec, J_weighting=None, _ALTS=None, _pairs_to_linear=_pairs_to_linear)
-print("host\n", new_vec_host)
+time_test(64)

From 6cd68861f720bb58e7d842ae41914d89b7684cb4 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 12 Apr 2024 11:45:37 -0400
Subject: [PATCH 20/60] still debating that bug, but stashing here

---
 src/aspire/abinitio/commonline_sync3n.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index bf35c9c73e..68bc7dce9e 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -744,10 +744,10 @@ def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS, _pairs_to_linear):
                 Rjk_J = J_conjugate(Rjk)
 
                 # Compute R muls and norms
-                c[0] = np.sum(((Rij @ Rjk.T) - Rik) ** 2)
-                c[1] = np.sum(((Rij_J @ Rjk.T) - Rik) ** 2)
-                c[2] = np.sum(((Rij @ Rjk_J.T) - Rik) ** 2)
-                c[3] = np.sum(((Rij @ Rjk.T) - Rik_J) ** 2)
+                c[0] = np.sum(((Rjk @ Rij) - Rik) ** 2)
+                c[1] = np.sum(((Rjk @ Rij_J) - Rik) ** 2)
+                c[2] = np.sum(((Rjk_J @ Rij) - Rik) ** 2)
+                c[3] = np.sum(((Rjk @ Rij ) - Rik_J) ** 2)
 
                 # Find best match
                 best_i = np.argmin(c)
@@ -817,8 +817,8 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
 	int i,j;
 	for (i=0; i<3; i++) {
 		for (j=0;j<3;j++) {
-//			out[3*j+i] = R1[3*0+i]*R2[3*j+0] + R1[3*1+i]*R2[3*j+1] + R1[3*2+i]*R2[3*j+2];
-			out[3*i+j] = R1[3*0+i]*R2[3*j+0] + R1[3*1+i]*R2[3*j+1] + R1[3*2+i]*R2[3*j+2];
+			out[3*j+i] = R1[3*0+i]*R2[3*j+0] + R1[3*1+i]*R2[3*j+1] + R1[3*2+i]*R2[3*j+2];
+//			out[3*i+j] = R1[3*0+i]*R2[3*j+0] + R1[3*1+i]*R2[3*j+1] + R1[3*2+i]*R2[3*j+2];
 
 		}
 	}

From 37c2f34a43aca6839008b772ea41e6495ecb9478 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 16 Apr 2024 08:19:03 -0400
Subject: [PATCH 21/60] autoconf gpu

[skip ci]
---
 src/aspire/abinitio/commonline_sync3n.py | 106 +++++++++++++----------
 x.py                                     |   1 +
 2 files changed, 59 insertions(+), 48 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 68bc7dce9e..fb5722f4e5 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -1,6 +1,5 @@
 import logging
 
-import cupy as cp
 import numpy as np
 from numpy.linalg import norm
 from scipy.optimize import curve_fit
@@ -87,6 +86,22 @@ def __init__(
         self.J_weighting = J_weighting
         self._D_null = 1e-13
 
+        # Auto configure GPU
+        self._use_gpu = False
+        try:
+            import cupy as cp
+
+            if cp.cuda.runtime.getDeviceCount() >= 1:
+                gpu_id = cp.cuda.runtime.getDevice()
+                logger.info(
+                    f"cupy and GPU {gpu_id} found by cuda runtime; enabling cupy."
+                )
+                self._use_gpu = True
+            else:
+                logger.info("GPU not found, defaulting to numpy.")
+        except ModuleNotFoundError:
+            logger.info("cupy not found, defaulting numpy.")
+
     ###########################################
     # High level algorithm steps              #
     ###########################################
@@ -286,7 +301,6 @@ def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
 
     def _triangle_scores_mex(self, Rijs, hist_intervals):
         # The following is adopted from Matlab triangle_scores_mex.c
-        # The code should be thread/parallel safe over `i` when results are gathered (via sum).
 
         # Initialize probability result arrays
         cum_scores = np.zeros(len(Rijs), dtype=self.dtype)
@@ -369,7 +383,6 @@ def _triangle_scores_mex(self, Rijs, hist_intervals):
 
     def _pairs_probabilities(self, Rijs, P2, A, a, B, b, x0):
         # The following is adopted from Matlab pairas_probabilities_mex.c `looper`
-        # The code should be thread/parallel safe over `i` when results are gathered (via sum).
 
         # Initialize probability result arrays
         ln_f_ind = np.zeros(len(Rijs), dtype=self.dtype)
@@ -690,18 +703,19 @@ def _J_sync_power_method(self, Rijs):
     def _signs_times_v(self, Rijs, vec):
 
         # host/gpu dispatch
-        # new_vec = _signs_times_v_host(self.n_img, Rijs, vec, self.J_weighting, _ALTS, self._pairs_to_linear)
-
-        assert self.J_weighting == False, "not implemented yet"
-        new_vec = _signs_times_v_cupy(self.n_img, Rijs, vec, self.J_weighting, _ALTS)
+        if self._use_gpu:
+            assert self.J_weighting is False, "not implemented yet"
+            new_vec = _signs_times_v_cupy(
+                self.n_img, Rijs, vec, self.J_weighting, _ALTS
+            )
+        else:
+            new_vec = _signs_times_v_host(
+                self.n_img, Rijs, vec, self.J_weighting, _ALTS, self._pairs_to_linear
+            )
 
         return new_vec
 
 
-def PAIR_IDX(N, I, J):
-    return (2 * N - I - 1) * I // 2 + J - I - 1
-
-
 def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS, _pairs_to_linear):
     """
     Ported from _signs_times_v_mex.c
@@ -714,7 +728,6 @@ def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS, _pairs_to_linear):
     _ALTS= 2x4x3 const lut array
     _signs_confs = 4x3 const lut array
     """
-    # The code should be thread/parallel safe over `i`.
 
     new_vec = np.zeros_like(vec)
 
@@ -728,13 +741,10 @@ def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS, _pairs_to_linear):
     for i in trange(n, desc=desc):
         for j in range(i + 1, n - 1):  # check bound (taken from MATLAB mex)
             ij = _pairs_to_linear[i, j]
-            # ij = PAIR_IDX(n, i, j)
             Rij = Rijs[ij]
             for k in range(j + 1, n):
                 ik = _pairs_to_linear[i, k]
                 jk = _pairs_to_linear[j, k]
-                # ik = PAIR_IDX(n, i, k)
-                # jk = PAIR_IDX(n, j, k)
                 Rik = Rijs[ik]
                 Rjk = Rijs[jk]
 
@@ -747,7 +757,7 @@ def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS, _pairs_to_linear):
                 c[0] = np.sum(((Rjk @ Rij) - Rik) ** 2)
                 c[1] = np.sum(((Rjk @ Rij_J) - Rik) ** 2)
                 c[2] = np.sum(((Rjk_J @ Rij) - Rik) ** 2)
-                c[3] = np.sum(((Rjk @ Rij ) - Rik_J) ** 2)
+                c[3] = np.sum(((Rjk @ Rij) - Rik_J) ** 2)
 
                 # Find best match
                 best_i = np.argmin(c)
@@ -802,7 +812,7 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
     #todo _ALTS= 2x4x3 const lut array
     #todo _signs_confs = 4x3 const lut array
     """
-    # The code should be thread/parallel safe over `i`.
+    import cupy as cp
 
     code = r"""
 
@@ -814,35 +824,35 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
 inline void mult_3x3(double *out, double *R1, double *R2) {
 // /* 3X3 matrices multiplication: out = R1*R2 */
 // out.T = R1.T @ R2.T ?
-	int i,j;
-	for (i=0; i<3; i++) {
-		for (j=0;j<3;j++) {
-			out[3*j+i] = R1[3*0+i]*R2[3*j+0] + R1[3*1+i]*R2[3*j+1] + R1[3*2+i]*R2[3*j+2];
-//			out[3*i+j] = R1[3*0+i]*R2[3*j+0] + R1[3*1+i]*R2[3*j+1] + R1[3*2+i]*R2[3*j+2];
-
-		}
-	}
+        int i,j;
+        for (i=0; i<3; i++) {
+                for (j=0;j<3;j++) {
+                        out[3*j+i] = R1[3*0+i]*R2[3*j+0] + R1[3*1+i]*R2[3*j+1] + R1[3*2+i]*R2[3*j+2];
+//                      out[3*i+j] = R1[3*0+i]*R2[3*j+0] + R1[3*1+i]*R2[3*j+1] + R1[3*2+i]*R2[3*j+2];
+
+                }
+        }
 }
 
 inline void JRJ(double *R, double *A) {
 /* multiple 3X3 matrix by J from both sizes: A = JRJ */
-	A[0]=R[0];
-	A[1]=R[1];
-	A[2]=-R[2];
-	A[3]=R[3];
-	A[4]=R[4];
-	A[5]=-R[5];
-	A[6]=-R[6];
-	A[7]=-R[7];
-	A[8]=R[8];
+        A[0]=R[0];
+        A[1]=R[1];
+        A[2]=-R[2];
+        A[3]=R[3];
+        A[4]=R[4];
+        A[5]=-R[5];
+        A[6]=-R[6];
+        A[7]=-R[7];
+        A[8]=R[8];
 }
 
 inline double diff_norm_3x3(const double *R1, const double *R2) {
 /* difference 2 matrices and return squared norm: ||R1-R2||^2 */
-	int i;
-	double norm = 0;
-	for (i=0; i<9; i++) {norm += (R1[i]-R2[i])*(R1[i]-R2[i]);}
-	return norm;
+        int i;
+        double norm = 0;
+        for (i=0; i<9; i++) {norm += (R1[i]-R2[i])*(R1[i]-R2[i]);}
+        return norm;
 }
 
 
@@ -874,7 +884,7 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
     signs_confs[2-1][1-1]=-1; signs_confs[2-1][3-1]=-1;
     signs_confs[3-1][1-1]=-1; signs_confs[3-1][2-1]=-1;
     signs_confs[4-1][2-1]=-1; signs_confs[4-1][3-1]=-1;
-    
+
 
     for(j=i+1; j< (n - 1); j++){
         ij = PAIR_IDX(n, i, j);
@@ -883,32 +893,32 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
             jk = PAIR_IDX(n, j, k);
 
             /* compute configurations matches scores */
-	    Rij = Rijs + 9*ij;
-	    Rjk = Rijs + 9*jk;
+            Rij = Rijs + 9*ij;
+            Rjk = Rijs + 9*jk;
             Rik = Rijs + 9*ik;
-                        
+
             JRJ(Rij, JRijJ);
             JRJ(Rjk, JRjkJ);
             JRJ(Rik, JRikJ);
-                        
+
             mult_3x3(tmp,Rij,Rjk);
             c[0] = diff_norm_3x3(tmp,Rik);
-                        
+
             mult_3x3(tmp,JRijJ,Rjk);
             c[1] = diff_norm_3x3(tmp,Rik);
-                        
+
             mult_3x3(tmp,Rij,JRjkJ);
             c[2] = diff_norm_3x3(tmp,Rik);
-                        
+
             mult_3x3(tmp,Rij,Rjk);
             c[3] = diff_norm_3x3(tmp,JRikJ);
-                        
+
             /* find best match */
             best_i=0; best_val=c[0];
             if (c[1]<best_val) {best_i=1; best_val=c[1];}
             if (c[2]<best_val) {best_i=2; best_val=c[2];}
             if (c[3]<best_val) {best_i=3; best_val=c[3];}
-        
+
             /* set triangles entries to be signs */
             s_ij_jk = signs_confs[best_i][0];
             s_ik_jk = signs_confs[best_i][1];
diff --git a/x.py b/x.py
index 02d39fe255..9bdbaf7182 100644
--- a/x.py
+++ b/x.py
@@ -83,4 +83,5 @@ def main():
 
     plotit(results)
 
+
 time_test(64)

From ee8d87b50fe0a282fe31666065ecc0e290408806 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 23 Apr 2024 14:20:18 -0400
Subject: [PATCH 22/60] rm debug comments after checking matmul

---
 src/aspire/abinitio/commonline_sync3n.py | 23 ++++++++---------------
 x.py                                     |  2 +-
 2 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index fb5722f4e5..ee39d732cc 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -793,8 +793,6 @@ def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS, _pairs_to_linear):
                 # Update vector entries
                 new_vec[ij] += s_ij_jk * vec[jk] + s_ij_ik * vec[ik]
                 new_vec[jk] += s_ij_jk * vec[ij] + s_ik_jk * vec[ik]
-                # new_vec[ik] += s_ij_jk * vec[ij] + s_ik_jk * vec[jk]  # jk/ik? was a bug?? worked better with s_ij_jk...
-                # jk/ik? was a bug?? worked better with s_ij_jk...
                 new_vec[ik] += s_ij_ik * vec[ij] + s_ik_jk * vec[jk]
 
     return new_vec
@@ -820,18 +818,14 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
 #define PAIR_IDX(N,I,J) ((2*N-I-1)*I/2 + J-I-1)
 
 
-// DEBUG TRANS BUGS
 inline void mult_3x3(double *out, double *R1, double *R2) {
-// /* 3X3 matrices multiplication: out = R1*R2 */
-// out.T = R1.T @ R2.T ?
-        int i,j;
-        for (i=0; i<3; i++) {
-                for (j=0;j<3;j++) {
-                        out[3*j+i] = R1[3*0+i]*R2[3*j+0] + R1[3*1+i]*R2[3*j+1] + R1[3*2+i]*R2[3*j+2];
-//                      out[3*i+j] = R1[3*0+i]*R2[3*j+0] + R1[3*1+i]*R2[3*j+1] + R1[3*2+i]*R2[3*j+2];
-
-                }
-        }
+  /* 3X3 matrices multiplication: out = R1*R2 */
+  int i,j;
+  for (i=0; i<3; i++) {
+    for (j=0;j<3;j++) {
+      out[3*j+i] = R1[3*0+i]*R2[3*j+0] + R1[3*1+i]*R2[3*j+1] + R1[3*2+i]*R2[3*j+2];
+    }
+  }
 }
 
 inline void JRJ(double *R, double *A) {
@@ -927,8 +921,7 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
             /* update multiplication */
             new_vec[ij*n + i] += s_ij_jk*vec[jk] + s_ij_ik*vec[ik];
             new_vec[jk*n + i] += s_ij_jk*vec[ij] + s_ik_jk*vec[ik];
-            new_vec[ik*n + i] += s_ij_ik*vec[ij] + s_ik_jk*vec[jk];  /* ij jk bug?, relating to mat mul T? */
-            //new_vec[ik*n + i] += s_ij_jk*vec[ij] + s_ik_jk*vec[jk];  /* ij jk bug? */
+            new_vec[ik*n + i] += s_ij_ik*vec[ij] + s_ik_jk*vec[jk];
 
         } /* k */
     } /* j */
diff --git a/x.py b/x.py
index 9bdbaf7182..a82da49ae7 100644
--- a/x.py
+++ b/x.py
@@ -84,4 +84,4 @@ def main():
     plotit(results)
 
 
-time_test(64)
+time_test(128)

From ea391fb0e2c13d1d59b74b8a58b5f46d4492d5eb Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 23 Apr 2024 15:57:19 -0400
Subject: [PATCH 23/60] fixup Rijk_lmnop muls

[skip ci]
---
 src/aspire/abinitio/commonline_sync3n.py | 88 +++++++++++++++++-------
 x.py                                     |  6 +-
 2 files changed, 65 insertions(+), 29 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index ee39d732cc..277b87bfd5 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -705,9 +705,7 @@ def _signs_times_v(self, Rijs, vec):
         # host/gpu dispatch
         if self._use_gpu:
             assert self.J_weighting is False, "not implemented yet"
-            new_vec = _signs_times_v_cupy(
-                self.n_img, Rijs, vec, self.J_weighting, _ALTS
-            )
+            new_vec = _signs_times_v_cupy(self.n_img, Rijs, vec, self.J_weighting)
         else:
             new_vec = _signs_times_v_host(
                 self.n_img, Rijs, vec, self.J_weighting, _ALTS, self._pairs_to_linear
@@ -726,7 +724,6 @@ def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS, _pairs_to_linear):
     new_vec: output array
     J_weighting: bool
     _ALTS= 2x4x3 const lut array
-    _signs_confs = 4x3 const lut array
     """
 
     new_vec = np.zeros_like(vec)
@@ -734,6 +731,7 @@ def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS, _pairs_to_linear):
     _signs_confs = np.array(
         [[1, 1, 1], [-1, 1, -1], [-1, -1, 1], [1, -1, -1]], dtype=int
     )
+
     c = np.empty((4))
     desc = "Computing signs_times_v"
     if J_weighting:
@@ -754,10 +752,10 @@ def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS, _pairs_to_linear):
                 Rjk_J = J_conjugate(Rjk)
 
                 # Compute R muls and norms
-                c[0] = np.sum(((Rjk @ Rij) - Rik) ** 2)
-                c[1] = np.sum(((Rjk @ Rij_J) - Rik) ** 2)
-                c[2] = np.sum(((Rjk_J @ Rij) - Rik) ** 2)
-                c[3] = np.sum(((Rjk @ Rij) - Rik_J) ** 2)
+                c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
+                c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
+                c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
+                c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
 
                 # Find best match
                 best_i = np.argmin(c)
@@ -798,7 +796,7 @@ def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS, _pairs_to_linear):
     return new_vec
 
 
-def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
+def _signs_times_v_cupy(n, Rijs, vec, J_weighting):
     """
     Ported from _signs_times_v_mex.c
 
@@ -806,9 +804,7 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
     Rijs: nchoose2x3x3 array
     vec: input array
     new_vec: output array
-    #todo J_weighting: bool
-    #todo _ALTS= 2x4x3 const lut array
-    #todo _signs_confs = 4x3 const lut array
+    J_weighting: bool
     """
     import cupy as cp
 
@@ -851,7 +847,7 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
 
 
 extern "C" __global__
-void signs_times_v(int n, double* Rijs, const double* vec, double* new_vec)
+void signs_times_v(int n, double* Rijs, const double* vec, double* new_vec, bool J_weighting)
 {
     /* thread index (1d), represents "i" index */
     unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -860,24 +856,40 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
     if(i >= n) return;
 
     double c[4];
-    int j;
-    int k;
+    unsigned int j;
+    unsigned int k;
     for(k=0;k<4;k++){c[k]=0;}
     unsigned long ij, jk, ik;
     int best_i;
     double best_val;
-    int s_ij_jk, s_ik_jk, s_ij_ik;
+    double s_ij_jk, s_ik_jk, s_ij_ik;
+    double alt_ij_jk, alt_ij_ik, alt_ik_jk;
 
     double *Rij, *Rjk, *Rik;
     double JRijJ[9], JRjkJ[9], JRikJ[9];
     double tmp[9];
 
-    /* le sigh */
     int signs_confs[4][3];
     for(int a=0; a<4; a++) { for(k=0; k<3; k++) { signs_confs[a][k]=1; } }
-    signs_confs[2-1][1-1]=-1; signs_confs[2-1][3-1]=-1;
-    signs_confs[3-1][1-1]=-1; signs_confs[3-1][2-1]=-1;
-    signs_confs[4-1][2-1]=-1; signs_confs[4-1][3-1]=-1;
+    signs_confs[1][0]=-1; signs_confs[1][2]=-1;
+    signs_confs[2][0]=-1; signs_confs[2][1]=-1;
+    signs_confs[3][1]=-1; signs_confs[3][2]=-1;
+
+    /* initialize alternatives */
+    /* When we find the best J-configuration, we also compare it to the second-best alternative.
+    * This comparison is done independently for every pair in the triplet. To make sure that the
+    * alternative really differs with respect to that pair, we tabulate the differences between
+    * the configurations in advance:
+    * ALTS[:][best_conf][pair] = the two configurations whose J-sync differs from
+    * best_conf with respect to pair */
+
+    int ALTS[2][4][3];
+    ALTS[0][0][0]=1; ALTS[0][1][0]=0; ALTS[0][2][0]=0; ALTS[0][3][0]=1;
+    ALTS[1][0][0]=2; ALTS[1][1][0]=3; ALTS[1][2][0]=3; ALTS[1][3][0]=2;
+    ALTS[0][0][1]=2; ALTS[0][1][1]=2; ALTS[0][2][1]=0; ALTS[0][3][1]=0;
+    ALTS[1][0][1]=3; ALTS[1][1][1]=3; ALTS[1][2][1]=1; ALTS[1][3][1]=1;
+    ALTS[0][0][2]=1; ALTS[0][1][2]=0; ALTS[0][2][2]=1; ALTS[0][3][2]=0;
+    ALTS[1][0][2]=3; ALTS[1][1][2]=2; ALTS[1][2][2]=3; ALTS[1][3][2]=2;
 
 
     for(j=i+1; j< (n - 1); j++){
@@ -895,16 +907,16 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
             JRJ(Rjk, JRjkJ);
             JRJ(Rik, JRikJ);
 
-            mult_3x3(tmp,Rij,Rjk);
+            mult_3x3(tmp,Rjk,Rij);
             c[0] = diff_norm_3x3(tmp,Rik);
 
-            mult_3x3(tmp,JRijJ,Rjk);
+            mult_3x3(tmp,Rjk,JRijJ);
             c[1] = diff_norm_3x3(tmp,Rik);
 
-            mult_3x3(tmp,Rij,JRjkJ);
+            mult_3x3(tmp,JRjkJ,Rij);
             c[2] = diff_norm_3x3(tmp,Rik);
 
-            mult_3x3(tmp,Rij,Rjk);
+            mult_3x3(tmp,Rjk,Rij);
             c[3] = diff_norm_3x3(tmp,JRikJ);
 
             /* find best match */
@@ -918,6 +930,30 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
             s_ik_jk = signs_confs[best_i][1];
             s_ij_ik = signs_confs[best_i][2];
 
+            /* J weighting */
+            if(J_weighting){
+                /* for each triangle side, find the best alternative */
+                alt_ij_jk = c[ALTS[0][best_i][0]];
+                if (c[ALTS[1][best_i][0]] < alt_ij_jk){
+                     alt_ij_jk = c[ALTS[1][best_i][0]];
+                }
+
+                alt_ik_jk = c[ALTS[0][best_i][1]];
+                if (c[ALTS[1][best_i][1]] < alt_ik_jk){
+                     alt_ik_jk = c[ALTS[1][best_i][1]];
+                }
+                alt_ij_ik = c[ALTS[0][best_i][2]];
+                if (c[ALTS[1][best_i][2]] < alt_ij_ik){
+                     alt_ij_ik = c[ALTS[1][best_i][2]];
+                }
+
+                /* Update scores */
+                s_ij_jk *= 1 - sqrt(best_val / alt_ij_jk);
+                s_ik_jk *= 1 - sqrt(best_val / alt_ik_jk);
+                s_ij_ik *= 1 - sqrt(best_val / alt_ij_ik);
+            }
+
+
             /* update multiplication */
             new_vec[ij*n + i] += s_ij_jk*vec[jk] + s_ij_ik*vec[ik];
             new_vec[jk*n + i] += s_ij_jk*vec[ij] + s_ik_jk*vec[ik];
@@ -941,8 +977,8 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting, _ALTS):
     # call the kernel
     blkszx = 512
     nblkx = (n + blkszx - 1) // blkszx
-
-    signs_times_v((nblkx,), (blkszx,), (n, Rijs_dev, vec_dev, new_vec_dev))
+    assert J_weighting == False
+    signs_times_v((nblkx,), (blkszx,), (n, Rijs_dev, vec_dev, new_vec_dev, J_weighting))
 
     # accumulate, can reuse the vec_dev array now.
     cp.sum(new_vec_dev, axis=1, out=vec_dev)
diff --git a/x.py b/x.py
index a82da49ae7..ebc5e6d768 100644
--- a/x.py
+++ b/x.py
@@ -19,14 +19,14 @@ def time_test(n):
     Rijs = np.arange(n_pairs * 3 * 3).reshape(n_pairs, 3, 3).astype(dtype=np.float64)
 
     tic0 = time.perf_counter()
-    new_vec = _signs_times_v_cupy(n, Rijs, vec, J_weighting=None, _ALTS=None)
+    new_vec = _signs_times_v_cupy(n, Rijs, vec, J_weighting=False)
     tic1 = time.perf_counter()
     gpu_time = tic1 - tic0
     print("gpu\n", new_vec)
 
     tic2 = time.perf_counter()
     new_vec_host = _signs_times_v_host(
-        n, Rijs, vec, J_weighting=None, _ALTS=None, _pairs_to_linear=_pairs_to_linear
+        n, Rijs, vec, J_weighting=False, _ALTS=None, _pairs_to_linear=_pairs_to_linear
     )
     tic3 = time.perf_counter()
     host_time = tic3 - tic2
@@ -84,4 +84,4 @@ def main():
     plotit(results)
 
 
-time_test(128)
+time_test(64)

From 46c72dbde8ef2f984afb980283679e52ac6e1bce Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 24 Apr 2024 09:58:57 -0400
Subject: [PATCH 24/60] re-implement matmul

[skip ci]
---
 src/aspire/abinitio/commonline_sync3n.py | 33 ++++++++++++++----------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 277b87bfd5..510d789ac8 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -815,11 +815,18 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting):
 
 
 inline void mult_3x3(double *out, double *R1, double *R2) {
-  /* 3X3 matrices multiplication: out = R1*R2 */
-  int i,j;
-  for (i=0; i<3; i++) {
-    for (j=0;j<3;j++) {
-      out[3*j+i] = R1[3*0+i]*R2[3*j+0] + R1[3*1+i]*R2[3*j+1] + R1[3*2+i]*R2[3*j+2];
+  /* 3X3 matrices multiplication: out = R1*R2
+   * Note, this differs from the MATLAB mult_3x3: operands are indexed row-major (C order).
+  */
+
+  int i,j,k;
+
+  for(i=0; i<3; i++){
+    for(j=0; j<3; j++){
+      out[i*3 + j] = 0;
+      for (k=0; k<3; k++){
+        out[i*3 + j] += R1[i*3+k] * R2[k*3+j];
+      }
     }
   }
 }
@@ -907,17 +914,17 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting):
             JRJ(Rjk, JRjkJ);
             JRJ(Rik, JRikJ);
 
-            mult_3x3(tmp,Rjk,Rij);
-            c[0] = diff_norm_3x3(tmp,Rik);
+            mult_3x3(tmp, Rij, Rjk);
+            c[0] = diff_norm_3x3(tmp, Rik);
 
-            mult_3x3(tmp,Rjk,JRijJ);
-            c[1] = diff_norm_3x3(tmp,Rik);
+            mult_3x3(tmp, JRijJ, Rjk);
+            c[1] = diff_norm_3x3(tmp, Rik);
 
-            mult_3x3(tmp,JRjkJ,Rij);
-            c[2] = diff_norm_3x3(tmp,Rik);
+            mult_3x3(tmp, Rij, JRjkJ);
+            c[2] = diff_norm_3x3(tmp, Rik);
 
-            mult_3x3(tmp,Rjk,Rij);
-            c[3] = diff_norm_3x3(tmp,JRikJ);
+            mult_3x3(tmp, Rij, Rjk);
+            c[3] = diff_norm_3x3(tmp, JRikJ);
 
             /* find best match */
             best_i=0; best_val=c[0];

From e7e4e293f1683a34ec3416811f9dcfa691e187b5 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 24 Apr 2024 16:43:38 -0400
Subject: [PATCH 25/60] add pairs prob kernel

[skip ci]
---
 src/aspire/abinitio/commonline_sync3n.py | 405 ++++++++++++++++-------
 1 file changed, 294 insertions(+), 111 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 510d789ac8..1f64cf7e12 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -299,7 +299,7 @@ def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
 
         return W
 
-    def _triangle_scores_mex(self, Rijs, hist_intervals):
+    def _triangle_scores_inner(self, Rijs, hist_intervals):
         # The following is adopted from Matlab triangle_scores_mex.c
 
         # Initialize probability result arrays
@@ -326,10 +326,10 @@ def _triangle_scores_mex(self, Rijs, hist_intervals):
                     Rjk_J = J_conjugate(Rjk)
 
                     # Compute R muls and norms
-                    c[0] = np.sum(((Rij @ Rjk.T) - Rik) ** 2)
-                    c[1] = np.sum(((Rij_J @ Rjk.T) - Rik) ** 2)
-                    c[2] = np.sum(((Rij @ Rjk_J.T) - Rik) ** 2)
-                    c[3] = np.sum(((Rij @ Rjk.T) - Rik_J) ** 2)
+                    c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
+                    c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
+                    c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
+                    c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
 
                     # Find best match
                     best_i = np.argmin(c)
@@ -382,95 +382,15 @@ def _triangle_scores_mex(self, Rijs, hist_intervals):
         return cum_scores, scores_hist
 
     def _pairs_probabilities(self, Rijs, P2, A, a, B, b, x0):
-        # The following is adopted from Matlab pairas_probabilities_mex.c `looper`
-
-        # Initialize probability result arrays
-        ln_f_ind = np.zeros(len(Rijs), dtype=self.dtype)
-        ln_f_arb = np.zeros(len(Rijs), dtype=self.dtype)
-
-        c = np.empty((4), dtype=self.dtype)
-        for i in trange(self.n_img, desc="Computing pair probabilities"):
-            for j in range(i + 1, self.n_img - 1):
-                ij = self._pairs_to_linear[i, j]
-                Rij = Rijs[ij]
-                for k in range(j + 1, self.n_img):
-                    ik = self._pairs_to_linear[i, k]
-                    jk = self._pairs_to_linear[j, k]
-                    Rik = Rijs[ik]
-                    Rjk = Rijs[jk]
-
-                    # Compute conjugated rotats
-                    Rij_J = J_conjugate(Rij)
-                    Rik_J = J_conjugate(Rik)
-                    Rjk_J = J_conjugate(Rjk)
-
-                    # Compute R muls and norms
-                    c[0] = np.sum(((Rij @ Rjk.T) - Rik) ** 2)
-                    c[1] = np.sum(((Rij_J @ Rjk.T) - Rik) ** 2)
-                    c[2] = np.sum(((Rij @ Rjk_J.T) - Rik) ** 2)
-                    c[3] = np.sum(((Rij @ Rjk.T) - Rik_J) ** 2)
-
-                    # Find best match
-                    best_i = np.argmin(c)
-                    best_val = c[best_i]
-
-                    # For each triangle side, find the best alternative
-                    alt_ij_jk = c[_ALTS[0][best_i][0]]
-                    if c[_ALTS[1][best_i][0]] < alt_ij_jk:
-                        alt_ij_jk = c[_ALTS[1][best_i][0]]
-                    alt_ik_jk = c[_ALTS[0][best_i][1]]
-                    if c[_ALTS[1][best_i][1]] < alt_ik_jk:
-                        alt_ik_jk = c[_ALTS[1][best_i][1]]
-                    alt_ij_ik = c[_ALTS[0][best_i][2]]
-                    if c[_ALTS[1][best_i][2]] < alt_ij_ik:
-                        alt_ij_ik = c[_ALTS[1][best_i][2]]
-
-                    # Compute scores
-                    s_ij_jk = 1 - np.sqrt(best_val / alt_ij_jk)
-                    s_ik_jk = 1 - np.sqrt(best_val / alt_ik_jk)
-                    s_ij_ik = 1 - np.sqrt(best_val / alt_ij_ik)
-
-                    # Update probabilities
-                    # # Probability of pair ij having score given indicicative common line
-                    # P2, B, b, x0, A, a
-                    f_ij_jk = np.log(
-                        P2
-                        * (
-                            B
-                            * np.power(1 - s_ij_jk, b)
-                            * np.exp(-b / (1 - x0) * (1 - s_ij_jk))
-                        )
-                        + (1 - P2) * A * np.power((1 - s_ij_jk), a)
-                    )
-                    f_ik_jk = np.log(
-                        P2
-                        * (
-                            B
-                            * np.power(1 - s_ik_jk, b)
-                            * np.exp(-b / (1 - x0) * (1 - s_ik_jk))
-                        )
-                        + (1 - P2) * A * np.power((1 - s_ik_jk), a)
-                    )
-                    f_ij_ik = np.log(
-                        P2
-                        * (
-                            B
-                            * np.power(1 - s_ij_ik, b)
-                            * np.exp(-b / (1 - x0) * (1 - s_ij_ik))
-                        )
-                        + (1 - P2) * A * np.power((1 - s_ij_ik), a)
-                    )
-                    ln_f_ind[ij] += f_ij_jk + f_ij_ik
-                    ln_f_ind[jk] += f_ij_jk + f_ik_jk
-                    ln_f_ind[ik] += f_ik_jk + f_ij_ik
-
-                    # # Probability of pair ij having score given arbitrary common line
-                    f_ij_jk = np.log(A * np.power((1 - s_ij_jk), a))
-                    f_ik_jk = np.log(A * np.power((1 - s_ik_jk), a))
-                    f_ij_ik = np.log(A * np.power((1 - s_ij_ik), a))
-                    ln_f_arb[ij] += f_ij_jk + f_ij_ik
-                    ln_f_arb[jk] += f_ij_jk + f_ik_jk
-                    ln_f_arb[ik] += f_ik_jk + f_ij_ik
+        # The CUDA kernel takes these scalars as C doubles, so coerce them to float64 here.
+        params = np.array([P2, A, a, B, b, x0], dtype=np.float64)
+        # host/gpu dispatch
+        if self._use_gpu:
+            ln_f_ind, ln_f_arb = _pairs_probabilities_cupy(self.n_img, Rijs, *params)
+        else:
+            ln_f_ind, ln_f_arb = _pairs_probabilities_host(
+                self.n_img, Rijs, *params, _ALTS, self._pairs_to_linear
+            )
 
         return ln_f_ind, ln_f_arb
 
@@ -507,7 +427,7 @@ def _triangle_scores(
 
         cum_scores = None  # XXX Why do we even need cum_scores?
         if scores_hist is None:
-            cum_scores, scores_hist = self._triangle_scores_mex(Rijs, hist_intervals)
+            cum_scores, scores_hist = self._triangle_scores_inner(Rijs, hist_intervals)
 
             # Normalize cumulated scores
             cum_scores /= len(Rijs)
@@ -704,7 +624,6 @@ def _signs_times_v(self, Rijs, vec):
 
         # host/gpu dispatch
         if self._use_gpu:
-            assert self.J_weighting is False, "not implemented yet"
             new_vec = _signs_times_v_cupy(self.n_img, Rijs, vec, self.J_weighting)
         else:
             new_vec = _signs_times_v_host(
@@ -796,19 +715,8 @@ def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS, _pairs_to_linear):
     return new_vec
 
 
-def _signs_times_v_cupy(n, Rijs, vec, J_weighting):
-    """
-    Ported from _signs_times_v_mex.c
-
-    n: n_img
-    Rijs: nchoose2x3x3 array
-    vec: input array
-    new_vec: output array
-    J_weighting: bool
-    """
-    import cupy as cp
-
-    code = r"""
+def _init_cupy_module():
+    module_code = r"""
 
 /* from i,j indoces to the common index in the N-choose-2 sized array */
 #define PAIR_IDX(N,I,J) ((2*N-I-1)*I/2 + J-I-1)
@@ -971,9 +879,157 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting):
 
     return;
 };
+
+extern "C" __global__
+void pairs_probabilities(int n, double* Rijs, double P2, double A, double a, double B, double b, double x0, double* ln_f_ind, double* ln_f_arb)
+{
+    /* thread index (1d), represents "i" index */
+    unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    /* no-op when out of bounds */
+    if(i >= n) return;
+
+    double c[4];
+    unsigned int j;
+    unsigned int k;
+    for(k=0;k<4;k++){c[k]=0;}
+    unsigned long ij, jk, ik;
+    int best_i;
+    double best_val;
+    double s_ij_jk, s_ik_jk, s_ij_ik;
+    double alt_ij_jk, alt_ij_ik, alt_ik_jk;
+    double f_ij_jk, f_ik_jk, f_ij_ik;
+
+
+    double *Rij, *Rjk, *Rik;
+    double JRijJ[9], JRjkJ[9], JRikJ[9];
+    double tmp[9];
+
+    int signs_confs[4][3];
+    for(int a=0; a<4; a++) { for(k=0; k<3; k++) { signs_confs[a][k]=1; } }
+    signs_confs[1][0]=-1; signs_confs[1][2]=-1;
+    signs_confs[2][0]=-1; signs_confs[2][1]=-1;
+    signs_confs[3][1]=-1; signs_confs[3][2]=-1;
+
+    /* initialize alternatives */
+    /* When we find the best J-configuration, we also compare it to the second-best alternative.
+    * This comparison is done independently for every pair in the triplet. To make sure that the
+    * alternative really differs with respect to that pair, we tabulate the differences between
+    * the configurations in advance:
+    * ALTS[:][best_conf][pair] = the two configurations whose J-sync differs from
+    * best_conf with respect to pair */
+
+    int ALTS[2][4][3];
+    ALTS[0][0][0]=1; ALTS[0][1][0]=0; ALTS[0][2][0]=0; ALTS[0][3][0]=1;
+    ALTS[1][0][0]=2; ALTS[1][1][0]=3; ALTS[1][2][0]=3; ALTS[1][3][0]=2;
+    ALTS[0][0][1]=2; ALTS[0][1][1]=2; ALTS[0][2][1]=0; ALTS[0][3][1]=0;
+    ALTS[1][0][1]=3; ALTS[1][1][1]=3; ALTS[1][2][1]=1; ALTS[1][3][1]=1;
+    ALTS[0][0][2]=1; ALTS[0][1][2]=0; ALTS[0][2][2]=1; ALTS[0][3][2]=0;
+    ALTS[1][0][2]=3; ALTS[1][1][2]=2; ALTS[1][2][2]=3; ALTS[1][3][2]=2;
+
+
+    for(j=i+1; j< (n - 1); j++){
+        ij = PAIR_IDX(n, i, j);
+        for(k=j+1; k< n; k++){
+            ik = PAIR_IDX(n, i, k);
+            jk = PAIR_IDX(n, j, k);
+
+            /* compute configurations matches scores */
+            Rij = Rijs + 9*ij;
+            Rjk = Rijs + 9*jk;
+            Rik = Rijs + 9*ik;
+
+            JRJ(Rij, JRijJ);
+            JRJ(Rjk, JRjkJ);
+            JRJ(Rik, JRikJ);
+
+            mult_3x3(tmp, Rij, Rjk);
+            c[0] = diff_norm_3x3(tmp, Rik);
+
+            mult_3x3(tmp, JRijJ, Rjk);
+            c[1] = diff_norm_3x3(tmp, Rik);
+
+            mult_3x3(tmp, Rij, JRjkJ);
+            c[2] = diff_norm_3x3(tmp, Rik);
+
+            mult_3x3(tmp, Rij, Rjk);
+            c[3] = diff_norm_3x3(tmp, JRikJ);
+
+            /* find best match */
+            best_i=0; best_val=c[0];
+            if (c[1]<best_val) {best_i=1; best_val=c[1];}
+            if (c[2]<best_val) {best_i=2; best_val=c[2];}
+            if (c[3]<best_val) {best_i=3; best_val=c[3];}
+
+             /* for each triangle side, find the best alternative */
+             alt_ij_jk = c[ALTS[0][best_i][0]];
+             if (c[ALTS[1][best_i][0]] < alt_ij_jk){
+                 alt_ij_jk = c[ALTS[1][best_i][0]];
+             }
+
+             alt_ik_jk = c[ALTS[0][best_i][1]];
+             if (c[ALTS[1][best_i][1]] < alt_ik_jk){
+                 alt_ik_jk = c[ALTS[1][best_i][1]];
+             }
+             alt_ij_ik = c[ALTS[0][best_i][2]];
+             if (c[ALTS[1][best_i][2]] < alt_ij_ik){
+                 alt_ij_ik = c[ALTS[1][best_i][2]];
+             }
+
+            /* Assign scores */
+            s_ij_jk = 1 - sqrt(best_val / alt_ij_jk);
+            s_ik_jk = 1 - sqrt(best_val / alt_ik_jk);
+            s_ij_ik = 1 - sqrt(best_val / alt_ij_ik);
+            
+
+            /* the probability of a pair ij to have the observed triangles scores,
+            given it has an indicative common line */
+            f_ij_jk = log( P2*(B*pow(1-s_ij_jk,b)*exp(-b/(1-x0)*(1-s_ij_jk))) + (1-P2)*A*pow((1-s_ij_jk),a) );
+            f_ik_jk = log( P2*(B*pow(1-s_ik_jk,b)*exp(-b/(1-x0)*(1-s_ik_jk))) + (1-P2)*A*pow((1-s_ik_jk),a) );
+            f_ij_ik = log( P2*(B*pow(1-s_ij_ik,b)*exp(-b/(1-x0)*(1-s_ij_ik))) + (1-P2)*A*pow((1-s_ij_ik),a) );
+	    ln_f_ind[ij*n +i] += f_ij_jk + f_ij_ik;
+	    ln_f_ind[jk*n +i] += f_ij_jk + f_ik_jk;
+	    ln_f_ind[ik*n +i] += f_ik_jk + f_ij_ik;
+			
+            /* the probability of a pair ij to have the observed triangles scores,
+             given it has an arbitrary common line */
+            f_ij_jk = log( A*pow((1-s_ij_jk),a) );
+	    f_ik_jk = log( A*pow((1-s_ik_jk),a) );
+	    f_ij_ik = log( A*pow((1-s_ij_ik),a) );
+	    ln_f_arb[ij*n +i] += f_ij_jk + f_ij_ik;
+	    ln_f_arb[jk*n +i] += f_ij_jk + f_ik_jk;
+            ln_f_arb[ik*n +i] += f_ik_jk + f_ij_ik;
+
+
+        } /* k */
+    } /* j */
+
+    return;
+};
+
 """
+    import cupy as cp
+
+    module = cp.RawModule(code=module_code)
+
+    return module
+
+
+def _signs_times_v_cupy(n, Rijs, vec, J_weighting):
+    """
+    Ported from _signs_times_v_mex.c
+
+    n: n_img
+    Rijs: nchoose2x3x3 array
+    vec: input array
+    new_vec: output array
+    J_weighting: bool
+    """
+    import cupy as cp
+
+    # XXX: this builds a fresh RawModule on every call.
+    module = _init_cupy_module()
 
-    module = cp.RawModule(code=code)
     signs_times_v = module.get_function("signs_times_v")
 
     Rijs_dev = cp.array(Rijs)
@@ -984,7 +1040,6 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting):
     # call the kernel
     blkszx = 512
     nblkx = (n + blkszx - 1) // blkszx
-    assert J_weighting == False
     signs_times_v((nblkx,), (blkszx,), (n, Rijs_dev, vec_dev, new_vec_dev, J_weighting))
 
     # accumulate, can reuse the vec_dev array now.
@@ -994,3 +1049,131 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting):
     new_vec = vec_dev.get()
 
     return new_vec
+
+
+# (n, Rijs_dev,                P2, A, a, B, b, x0, ln_f_ind_dev, ln_f_arb_dev)
+def _pairs_probabilities_cupy(n, Rijs, P2, A, a, B, b, x0):
+    """
+    n: n_img
+    Rijs: nchoose2x3x3 array
+
+    """
+    import cupy as cp
+
+    # xxx
+    module = _init_cupy_module()
+
+    pairs_probabilities = module.get_function("pairs_probabilities")
+
+    Rijs_dev = cp.array(Rijs)
+    ln_f_ind_dev = cp.zeros((n * (n - 1) // 2, n))  # n is for thread safety
+    ln_f_arb_dev = cp.zeros((n * (n - 1) // 2, n))  # n is for thread safety
+
+    # call the kernel
+    blkszx = 512
+    nblkx = (n + blkszx - 1) // blkszx
+    pairs_probabilities(
+        (nblkx,),
+        (blkszx,),
+        (n, Rijs_dev, P2, A, a, B, b, x0, ln_f_ind_dev, ln_f_arb_dev),
+    )
+
+    # accumulate over thread results
+    ln_f_arb = cp.sum(ln_f_arb_dev, axis=1).get()
+    ln_f_ind = cp.sum(ln_f_ind_dev, axis=1).get()
+
+    return ln_f_ind, ln_f_arb
+
+
+def _pairs_probabilities_host(n, Rijs, P2, A, a, B, b, x0, _ALTS, _pairs_to_linear):
+    # The following is adopted from Matlab pairs_probabilities_mex.c `looper`
+
+    # Initialize probability result arrays
+    ln_f_ind = np.zeros(len(Rijs), dtype=Rijs.dtype)
+    ln_f_arb = np.zeros(len(Rijs), dtype=Rijs.dtype)
+
+    c = np.empty((4), dtype=Rijs.dtype)
+    for i in trange(n, desc="Computing pair probabilities"):
+        for j in range(i + 1, n - 1):
+            ij = _pairs_to_linear[i, j]
+            Rij = Rijs[ij]
+            for k in range(j + 1, n):
+                ik = _pairs_to_linear[i, k]
+                jk = _pairs_to_linear[j, k]
+                Rik = Rijs[ik]
+                Rjk = Rijs[jk]
+
+                # Compute conjugated rotats
+                Rij_J = J_conjugate(Rij)
+                Rik_J = J_conjugate(Rik)
+                Rjk_J = J_conjugate(Rjk)
+
+                # Compute R muls and norms
+                c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
+                c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
+                c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
+                c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
+
+                # Find best match
+                best_i = np.argmin(c)
+                best_val = c[best_i]
+
+                # For each triangle side, find the best alternative
+                alt_ij_jk = c[_ALTS[0][best_i][0]]
+                if c[_ALTS[1][best_i][0]] < alt_ij_jk:
+                    alt_ij_jk = c[_ALTS[1][best_i][0]]
+                alt_ik_jk = c[_ALTS[0][best_i][1]]
+                if c[_ALTS[1][best_i][1]] < alt_ik_jk:
+                    alt_ik_jk = c[_ALTS[1][best_i][1]]
+                alt_ij_ik = c[_ALTS[0][best_i][2]]
+                if c[_ALTS[1][best_i][2]] < alt_ij_ik:
+                    alt_ij_ik = c[_ALTS[1][best_i][2]]
+
+                # Compute scores
+                s_ij_jk = 1 - np.sqrt(best_val / alt_ij_jk)
+                s_ik_jk = 1 - np.sqrt(best_val / alt_ik_jk)
+                s_ij_ik = 1 - np.sqrt(best_val / alt_ij_ik)
+
+                # Update probabilities
+                # # Probability of pair ij having score given indicicative common line
+                # P2, B, b, x0, A, a
+                f_ij_jk = np.log(
+                    P2
+                    * (
+                        B
+                        * np.power(1 - s_ij_jk, b)
+                        * np.exp(-b / (1 - x0) * (1 - s_ij_jk))
+                    )
+                    + (1 - P2) * A * np.power((1 - s_ij_jk), a)
+                )
+                f_ik_jk = np.log(
+                    P2
+                    * (
+                        B
+                        * np.power(1 - s_ik_jk, b)
+                        * np.exp(-b / (1 - x0) * (1 - s_ik_jk))
+                    )
+                    + (1 - P2) * A * np.power((1 - s_ik_jk), a)
+                )
+                f_ij_ik = np.log(
+                    P2
+                    * (
+                        B
+                        * np.power(1 - s_ij_ik, b)
+                        * np.exp(-b / (1 - x0) * (1 - s_ij_ik))
+                    )
+                    + (1 - P2) * A * np.power((1 - s_ij_ik), a)
+                )
+                ln_f_ind[ij] += f_ij_jk + f_ij_ik
+                ln_f_ind[jk] += f_ij_jk + f_ik_jk
+                ln_f_ind[ik] += f_ik_jk + f_ij_ik
+
+                # # Probability of pair ij having score given arbitrary common line
+                f_ij_jk = np.log(A * np.power((1 - s_ij_jk), a))
+                f_ik_jk = np.log(A * np.power((1 - s_ik_jk), a))
+                f_ij_ik = np.log(A * np.power((1 - s_ij_ik), a))
+                ln_f_arb[ij] += f_ij_jk + f_ij_ik
+                ln_f_arb[jk] += f_ij_jk + f_ik_jk
+                ln_f_arb[ik] += f_ik_jk + f_ij_ik
+
+    return ln_f_ind, ln_f_arb

From fe168d7b460fc67c0597e5807ac080cba558e720 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 26 Apr 2024 08:59:29 -0400
Subject: [PATCH 26/60] add triangle scores cupy kernel

---
 src/aspire/abinitio/commonline_sync3n.py | 358 +++++++++++++++++------
 1 file changed, 271 insertions(+), 87 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 1f64cf7e12..948accdd7f 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -300,84 +300,16 @@ def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
         return W
 
     def _triangle_scores_inner(self, Rijs, hist_intervals):
-        # The following is adopted from Matlab triangle_scores_mex.c
 
-        # Initialize probability result arrays
-        cum_scores = np.zeros(len(Rijs), dtype=self.dtype)
-        scores_hist = np.zeros(hist_intervals, dtype=self.dtype)
-        h = 1 / hist_intervals
-
-        c = np.empty((4), dtype=self.dtype)
-        for i in trange(self.n_img, desc="Computing triangle scores"):
-            for j in range(
-                i + 1, self.n_img - 1
-            ):  # check bound (taken from MATLAB mex)
-                ij = self._pairs_to_linear[i, j]
-                Rij = Rijs[ij]
-                for k in range(j + 1, self.n_img):
-                    ik = self._pairs_to_linear[i, k]
-                    jk = self._pairs_to_linear[j, k]
-                    Rik = Rijs[ik]
-                    Rjk = Rijs[jk]
-
-                    # Compute conjugated rotats
-                    Rij_J = J_conjugate(Rij)
-                    Rik_J = J_conjugate(Rik)
-                    Rjk_J = J_conjugate(Rjk)
-
-                    # Compute R muls and norms
-                    c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
-                    c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
-                    c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
-                    c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
-
-                    # Find best match
-                    best_i = np.argmin(c)
-                    best_val = c[best_i]
-
-                    # For each triangle side, find the best alternative
-                    alt_ij_jk = c[_ALTS[0][best_i][0]]
-                    if c[_ALTS[1][best_i][0]] < alt_ij_jk:
-                        alt_ij_jk = c[_ALTS[1][best_i][0]]
-
-                    alt_ik_jk = c[_ALTS[0][best_i][1]]
-                    if c[_ALTS[1][best_i][1]] < alt_ik_jk:
-                        alt_ik_jk = c[_ALTS[1][best_i][1]]
-
-                    alt_ij_ik = c[_ALTS[0][best_i][2]]
-                    if c[_ALTS[1][best_i][2]] < alt_ij_ik:
-                        alt_ij_ik = c[_ALTS[1][best_i][2]]
-
-                    # Compute scores
-                    s_ij_jk = 1 - np.sqrt(best_val / alt_ij_jk)
-                    s_ik_jk = 1 - np.sqrt(best_val / alt_ik_jk)
-                    s_ij_ik = 1 - np.sqrt(best_val / alt_ij_ik)
-
-                    # Update cumulated scores
-                    cum_scores[ij] += s_ij_jk + s_ij_ik
-                    cum_scores[jk] += s_ij_jk + s_ik_jk
-                    cum_scores[ik] += s_ik_jk + s_ij_ik
-
-                    # Update histogram
-                    threshold = 0
-                    for _l1 in range(hist_intervals):
-                        threshold += h
-                        if s_ij_jk < threshold:
-                            break
-
-                    for _l2 in range(hist_intervals):
-                        threshold += h
-                        if s_ik_jk < threshold:
-                            break
-
-                    for _l3 in range(hist_intervals):
-                        threshold += h
-                        if s_ij_ik < threshold:
-                            break
-
-                    scores_hist[_l1] += 1
-                    scores_hist[_l2] += 1
-                    scores_hist[_l3] += 1
+        # host/gpu dispatch
+        if self._use_gpu:
+            cum_scores, scores_hist = _triangle_scores_inner_cupy(
+                self.n_img, Rijs, hist_intervals
+            )
+        else:
+            cum_scores, scores_hist = _triangle_scores_inner_host(
+                self.n_img, Rijs, hist_intervals, _ALTS, self._pairs_to_linear
+            )
 
         return cum_scores, scores_hist
 
@@ -980,24 +912,24 @@ def _init_cupy_module():
             s_ij_jk = 1 - sqrt(best_val / alt_ij_jk);
             s_ik_jk = 1 - sqrt(best_val / alt_ik_jk);
             s_ij_ik = 1 - sqrt(best_val / alt_ij_ik);
-            
+
 
             /* the probability of a pair ij to have the observed triangles scores,
             given it has an indicative common line */
             f_ij_jk = log( P2*(B*pow(1-s_ij_jk,b)*exp(-b/(1-x0)*(1-s_ij_jk))) + (1-P2)*A*pow((1-s_ij_jk),a) );
             f_ik_jk = log( P2*(B*pow(1-s_ik_jk,b)*exp(-b/(1-x0)*(1-s_ik_jk))) + (1-P2)*A*pow((1-s_ik_jk),a) );
             f_ij_ik = log( P2*(B*pow(1-s_ij_ik,b)*exp(-b/(1-x0)*(1-s_ij_ik))) + (1-P2)*A*pow((1-s_ij_ik),a) );
-	    ln_f_ind[ij*n +i] += f_ij_jk + f_ij_ik;
-	    ln_f_ind[jk*n +i] += f_ij_jk + f_ik_jk;
-	    ln_f_ind[ik*n +i] += f_ik_jk + f_ij_ik;
-			
+            ln_f_ind[ij*n +i] += f_ij_jk + f_ij_ik;
+            ln_f_ind[jk*n +i] += f_ij_jk + f_ik_jk;
+            ln_f_ind[ik*n +i] += f_ik_jk + f_ij_ik;
+
             /* the probability of a pair ij to have the observed triangles scores,
              given it has an arbitrary common line */
             f_ij_jk = log( A*pow((1-s_ij_jk),a) );
-	    f_ik_jk = log( A*pow((1-s_ik_jk),a) );
-	    f_ij_ik = log( A*pow((1-s_ij_ik),a) );
-	    ln_f_arb[ij*n +i] += f_ij_jk + f_ij_ik;
-	    ln_f_arb[jk*n +i] += f_ij_jk + f_ik_jk;
+            f_ik_jk = log( A*pow((1-s_ik_jk),a) );
+            f_ij_ik = log( A*pow((1-s_ij_ik),a) );
+            ln_f_arb[ij*n +i] += f_ij_jk + f_ij_ik;
+            ln_f_arb[jk*n +i] += f_ij_jk + f_ik_jk;
             ln_f_arb[ik*n +i] += f_ik_jk + f_ij_ik;
 
 
@@ -1007,6 +939,138 @@ def _init_cupy_module():
     return;
 };
 
+
+extern "C" __global__
+void triangle_scores_inner(int n, double* Rijs, int n_intervals, double* cum_scores, double* scores_hist)
+{
+    /* thread index (1d), represents "i" index */
+    unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    /* no-op when out of bounds */
+    if(i >= n) return;
+
+    double c[4];
+    unsigned int j;
+    unsigned int k;
+    for(k=0;k<4;k++){c[k]=0;}
+    unsigned long ij, jk, ik;
+    int best_i;
+    double best_val;
+    double s_ij_jk, s_ik_jk, s_ij_ik;
+    double alt_ij_jk, alt_ij_ik, alt_ik_jk;
+    unsigned int l1,l2,l3;
+    double threshold;
+    double h = 1. / n_intervals;
+
+    double *Rij, *Rjk, *Rik;
+    double JRijJ[9], JRjkJ[9], JRikJ[9];
+    double tmp[9];
+
+    /* initialize alternatives */
+    /* when we find the best J-configuration, we also compare it to the alternative 2nd best one.
+    * this comparison is done for every pair in the triplete independently. to make sure that the
+    * alternative is indeed different in relation to the pair, we document the differences between
+    * the configurations in advance:
+    * ALTS(:,best_conf,pair) = the two configurations in which J-sync differs from
+    * best_conf in relation to pair */
+
+    int ALTS[2][4][3];
+    ALTS[0][0][0]=1; ALTS[0][1][0]=0; ALTS[0][2][0]=0; ALTS[0][3][0]=1;
+    ALTS[1][0][0]=2; ALTS[1][1][0]=3; ALTS[1][2][0]=3; ALTS[1][3][0]=2;
+    ALTS[0][0][1]=2; ALTS[0][1][1]=2; ALTS[0][2][1]=0; ALTS[0][3][1]=0;
+    ALTS[1][0][1]=3; ALTS[1][1][1]=3; ALTS[1][2][1]=1; ALTS[1][3][1]=1;
+    ALTS[0][0][2]=1; ALTS[0][1][2]=0; ALTS[0][2][2]=1; ALTS[0][3][2]=0;
+    ALTS[1][0][2]=3; ALTS[1][1][2]=2; ALTS[1][2][2]=3; ALTS[1][3][2]=2;
+
+
+    for(j=i+1; j< (n - 1); j++){
+        ij = PAIR_IDX(n, i, j);
+        for(k=j+1; k< n; k++){
+            ik = PAIR_IDX(n, i, k);
+            jk = PAIR_IDX(n, j, k);
+
+            /* compute configurations matches scores */
+            Rij = Rijs + 9*ij;
+            Rjk = Rijs + 9*jk;
+            Rik = Rijs + 9*ik;
+
+            JRJ(Rij, JRijJ);
+            JRJ(Rjk, JRjkJ);
+            JRJ(Rik, JRikJ);
+
+            mult_3x3(tmp, Rij, Rjk);
+            c[0] = diff_norm_3x3(tmp, Rik);
+
+            mult_3x3(tmp, JRijJ, Rjk);
+            c[1] = diff_norm_3x3(tmp, Rik);
+
+            mult_3x3(tmp, Rij, JRjkJ);
+            c[2] = diff_norm_3x3(tmp, Rik);
+
+            mult_3x3(tmp, Rij, Rjk);
+            c[3] = diff_norm_3x3(tmp, JRikJ);
+
+            /* find best match */
+            best_i=0; best_val=c[0];
+            if (c[1]<best_val) {best_i=1; best_val=c[1];}
+            if (c[2]<best_val) {best_i=2; best_val=c[2];}
+            if (c[3]<best_val) {best_i=3; best_val=c[3];}
+
+             /* for each triangle side, find the best alternative */
+             alt_ij_jk = c[ALTS[0][best_i][0]];
+             if (c[ALTS[1][best_i][0]] < alt_ij_jk){
+                 alt_ij_jk = c[ALTS[1][best_i][0]];
+             }
+
+             alt_ik_jk = c[ALTS[0][best_i][1]];
+             if (c[ALTS[1][best_i][1]] < alt_ik_jk){
+                 alt_ik_jk = c[ALTS[1][best_i][1]];
+             }
+             alt_ij_ik = c[ALTS[0][best_i][2]];
+             if (c[ALTS[1][best_i][2]] < alt_ij_ik){
+                 alt_ij_ik = c[ALTS[1][best_i][2]];
+             }
+
+            /* Assign scores */
+            s_ij_jk = 1 - sqrt(best_val / alt_ij_jk);
+            s_ik_jk = 1 - sqrt(best_val / alt_ik_jk);
+            s_ij_ik = 1 - sqrt(best_val / alt_ij_ik);
+
+
+            /* update cumulated scores */
+            cum_scores[ij*n+i] += s_ij_jk + s_ij_ik;
+            cum_scores[jk*n+i] += s_ij_jk + s_ik_jk;
+            cum_scores[ik*n+i] += s_ik_jk + s_ij_ik;
+
+            /* update scores histogram */
+            threshold = 0;
+            for (l1=0; l1<n_intervals-1; l1++) {
+                threshold += h;
+                if (s_ij_jk < threshold) {break;}
+            }
+
+            threshold = 0;
+            for(l2=0; l2<n_intervals-1; l2++) {
+                threshold += h;
+                if(s_ik_jk < threshold) {break;}
+            }
+
+            threshold = 0;
+            for(l3=0; l3<n_intervals-1; l3++) {
+                threshold += h;
+                if (s_ij_ik < threshold) {break;}
+            }
+
+            scores_hist[l1*n+i] += 1;
+            scores_hist[l2*n+i] += 1;
+            scores_hist[l3*n+i] += 1;
+
+        } /* k */
+    } /* j */
+
+    return;
+};
+
 """
     import cupy as cp
 
@@ -1051,7 +1115,6 @@ def _signs_times_v_cupy(n, Rijs, vec, J_weighting):
     return new_vec
 
 
-# (n, Rijs_dev,                P2, A, a, B, b, x0, ln_f_ind_dev, ln_f_arb_dev)
 def _pairs_probabilities_cupy(n, Rijs, P2, A, a, B, b, x0):
     """
     n: n_img
@@ -1177,3 +1240,124 @@ def _pairs_probabilities_host(n, Rijs, P2, A, a, B, b, x0, _ALTS, _pairs_to_line
                 ln_f_arb[ik] += f_ik_jk + f_ij_ik
 
     return ln_f_ind, ln_f_arb
+
+
+def _triangle_scores_inner_host(n_img, Rijs, hist_intervals, _ALTS, _pairs_to_linear):
+    # The following is adopted from Matlab triangle_scores_mex.c
+
+    # Initialize probability result arrays
+    cum_scores = np.zeros(len(Rijs), dtype=Rijs.dtype)
+    scores_hist = np.zeros(hist_intervals, dtype=Rijs.dtype)
+    h = 1 / hist_intervals
+
+    c = np.empty((4), dtype=Rijs.dtype)
+    for i in trange(n_img, desc="Computing triangle scores"):
+        for j in range(i + 1, n_img - 1):  # check bound (taken from MATLAB mex)
+            ij = _pairs_to_linear[i, j]
+            Rij = Rijs[ij]
+            for k in range(j + 1, n_img):
+                ik = _pairs_to_linear[i, k]
+                jk = _pairs_to_linear[j, k]
+                Rik = Rijs[ik]
+                Rjk = Rijs[jk]
+
+                # Compute conjugated rotats
+                Rij_J = J_conjugate(Rij)
+                Rik_J = J_conjugate(Rik)
+                Rjk_J = J_conjugate(Rjk)
+
+                # Compute R muls and norms
+                c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
+                c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
+                c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
+                c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
+
+                # Find best match
+                best_i = np.argmin(c)
+                best_val = c[best_i]
+
+                # For each triangle side, find the best alternative
+                alt_ij_jk = c[_ALTS[0][best_i][0]]
+                if c[_ALTS[1][best_i][0]] < alt_ij_jk:
+                    alt_ij_jk = c[_ALTS[1][best_i][0]]
+
+                alt_ik_jk = c[_ALTS[0][best_i][1]]
+                if c[_ALTS[1][best_i][1]] < alt_ik_jk:
+                    alt_ik_jk = c[_ALTS[1][best_i][1]]
+
+                alt_ij_ik = c[_ALTS[0][best_i][2]]
+                if c[_ALTS[1][best_i][2]] < alt_ij_ik:
+                    alt_ij_ik = c[_ALTS[1][best_i][2]]
+
+                # Compute scores
+                s_ij_jk = 1 - np.sqrt(best_val / alt_ij_jk)
+                s_ik_jk = 1 - np.sqrt(best_val / alt_ik_jk)
+                s_ij_ik = 1 - np.sqrt(best_val / alt_ij_ik)
+
+                # Update cumulated scores
+                cum_scores[ij] += s_ij_jk + s_ij_ik
+                cum_scores[jk] += s_ij_jk + s_ik_jk
+                cum_scores[ik] += s_ik_jk + s_ij_ik
+
+                # Update histogram
+                threshold = 0
+                for _l1 in range(hist_intervals - 1):
+                    threshold += h
+                    if s_ij_jk < threshold:
+                        break
+
+                threshold = 0
+                for _l2 in range(hist_intervals - 1):
+                    threshold += h
+                    if s_ik_jk < threshold:
+                        break
+
+                threshold = 0
+                for _l3 in range(hist_intervals - 1):
+                    threshold += h
+                    if s_ij_ik < threshold:
+                        break
+
+                scores_hist[_l1] += 1
+                scores_hist[_l2] += 1
+                scores_hist[_l3] += 1
+
+    return cum_scores, scores_hist
+
+
+def _triangle_scores_inner_cupy(n_img, Rijs, hist_intervals):
+    """
+    n: n_img
+    Rijs: nchoose2x3x3 array
+
+    """
+    import cupy as cp
+
+    # xxx
+    module = _init_cupy_module()
+
+    triangle_scores = module.get_function("triangle_scores_inner")
+
+    Rijs_dev = cp.array(Rijs)
+    # xxx I think we can safely remove cum_scores
+    cum_scores_dev = cp.zeros(
+        (n_img * (n_img - 1) // 2, n_img), dtype=np.float64
+    )  # n is for thread safety
+    scores_hist_dev = cp.zeros(
+        (hist_intervals, n_img), dtype=np.float64
+    )  # n is for thread safety
+
+    # call the kernel
+    blkszx = 512
+    nblkx = (n_img + blkszx - 1) // blkszx
+    triangle_scores(
+        (nblkx,),
+        (blkszx,),
+        (n_img, Rijs_dev, hist_intervals, cum_scores_dev, scores_hist_dev),
+    )
+
+    # accumulate over thread results
+    cum_scores = cp.sum(cum_scores_dev, axis=1).get()
+    scores_hist = cp.sum(scores_hist_dev, axis=1).get()
+
+    return cum_scores, scores_hist

From 6d6fe6042b61c2b2484a58f806eebd8ef3f180fa Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 26 Apr 2024 09:32:52 -0400
Subject: [PATCH 27/60] initial bulk refactor

---
 src/aspire/abinitio/commonline_sync3n.py | 1219 +++++++---------------
 1 file changed, 400 insertions(+), 819 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 948accdd7f..7c9d65c81b 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -11,28 +11,28 @@
 
 logger = logging.getLogger(__name__)
 
-# Initialize alternatives
-#
-# When we find the best J-configuration, we also compare it to the alternative 2nd best one.
-# this comparison is done for every pair in the triplete independently. to make sure that the
-# alternative is indeed different in relation to the pair, we document the differences between
-# the configurations in advance:
-# ALTS(:,best_conf,pair) = the two configurations in which J-sync differs from best_conf in relation to pair
-
-_ALTS = np.array(
-    [
-        [[1, 2, 1], [0, 2, 0], [0, 0, 1], [1, 0, 0]],
-        [[2, 3, 3], [3, 3, 2], [3, 1, 3], [2, 1, 2]],
-    ],
-    dtype=int,
-)
-
 
 class CLSync3N(CLOrient3D, SyncVotingMixin):
     """
     Define a class to estimate 3D orientations using common lines Sync3N methods (2017).
     """
 
+    # Initialize alternatives
+    #
+    # When we find the best J-configuration, we also compare it to the alternative 2nd best one.
+    # this comparison is done for every pair in the triplete independently. to make sure that the
+    # alternative is indeed different in relation to the pair, we document the differences between
+    # the configurations in advance:
+    # ALTS(:,best_conf,pair) = the two configurations in which J-sync differs from best_conf in relation to pair
+
+    _ALTS = np.array(
+        [
+            [[1, 2, 1], [0, 2, 0], [0, 0, 1], [1, 0, 0]],
+            [[2, 3, 3], [3, 3, 2], [3, 1, 3], [2, 1, 2]],
+        ],
+        dtype=int,
+    )
+
     def __init__(
         self,
         src,
@@ -47,6 +47,7 @@ def __init__(
         mask=True,
         S_weighting=False,
         J_weighting=False,
+        hist_intervals=100,
     ):
         """
         Initialize object for estimating 3D orientations.
@@ -85,9 +86,10 @@ def __init__(
         self.S_weighting = S_weighting
         self.J_weighting = J_weighting
         self._D_null = 1e-13
+        self.hist_intervals = hist_intervals
 
         # Auto configure GPU
-        self._use_gpu = False
+        self._gpu_module = None
         try:
             import cupy as cp
 
@@ -96,9 +98,10 @@ def __init__(
                 logger.info(
                     f"cupy and GPU {gpu_id} found by cuda runtime; enabling cupy."
                 )
-                self._use_gpu = True
+                self._gpu_module = _init_cupy_module()
             else:
                 logger.info("GPU not found, defaulting to numpy.")
+
         except ModuleNotFoundError:
             logger.info("cupy not found, defaulting numpy.")
 
@@ -299,17 +302,140 @@ def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
 
         return W
 
-    def _triangle_scores_inner(self, Rijs, hist_intervals):
+    def _triangle_scores_inner(self, Rijs):
 
         # host/gpu dispatch
-        if self._use_gpu:
-            cum_scores, scores_hist = _triangle_scores_inner_cupy(
-                self.n_img, Rijs, hist_intervals
-            )
+        if self._gpu_module:
+            cum_scores, scores_hist = self._triangle_scores_inner_cupy(Rijs)
         else:
-            cum_scores, scores_hist = _triangle_scores_inner_host(
-                self.n_img, Rijs, hist_intervals, _ALTS, self._pairs_to_linear
-            )
+            cum_scores, scores_hist = self._triangle_scores_inner_host(Rijs)
+
+        return cum_scores, scores_hist
+
+    def _triangle_scores_inner_host(self, Rijs):
+        """
+        Host triplet loop i<j<k over `Rijs`, adapted from MATLAB triangle_scores_mex.c.
+        :return: Tuple (cum_scores, scores_hist); per-pair sums and score histogram.
+        """
+        cum_scores = np.zeros(len(Rijs), dtype=Rijs.dtype)
+        scores_hist = np.zeros(self.hist_intervals, dtype=Rijs.dtype)
+        h = 1 / self.hist_intervals
+
+        c = np.empty((4), dtype=Rijs.dtype)
+        for i in trange(self.n_img, desc="Computing triangle scores"):
+            for j in range(
+                i + 1, self.n_img - 1
+            ):  # check bound (taken from MATLAB mex)
+                ij = self._pairs_to_linear[i, j]
+                Rij = Rijs[ij]
+                for k in range(j + 1, self.n_img):
+                    ik = self._pairs_to_linear[i, k]
+                    jk = self._pairs_to_linear[j, k]
+                    Rik = Rijs[ik]
+                    Rjk = Rijs[jk]
+
+                    # Compute J-conjugated rotations
+                    Rij_J = J_conjugate(Rij)
+                    Rik_J = J_conjugate(Rik)
+                    Rjk_J = J_conjugate(Rjk)
+
+                    # Squared error of Rij @ Rjk against Rik for each J-configuration
+                    c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
+                    c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
+                    c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
+                    c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
+
+                    # Find best match
+                    best_i = np.argmin(c)
+                    best_val = c[best_i]
+
+                    # For each triangle side, find the best alternative
+                    alt_ij_jk = c[self._ALTS[0][best_i][0]]
+                    if c[self._ALTS[1][best_i][0]] < alt_ij_jk:
+                        alt_ij_jk = c[self._ALTS[1][best_i][0]]
+
+                    alt_ik_jk = c[self._ALTS[0][best_i][1]]
+                    if c[self._ALTS[1][best_i][1]] < alt_ik_jk:
+                        alt_ik_jk = c[self._ALTS[1][best_i][1]]
+
+                    alt_ij_ik = c[self._ALTS[0][best_i][2]]
+                    if c[self._ALTS[1][best_i][2]] < alt_ij_ik:
+                        alt_ij_ik = c[self._ALTS[1][best_i][2]]
+
+                    # Compute scores
+                    s_ij_jk = 1 - np.sqrt(best_val / alt_ij_jk)
+                    s_ik_jk = 1 - np.sqrt(best_val / alt_ik_jk)
+                    s_ij_ik = 1 - np.sqrt(best_val / alt_ij_ik)
+
+                    # Update cumulated scores
+                    cum_scores[ij] += s_ij_jk + s_ij_ik
+                    cum_scores[jk] += s_ij_jk + s_ik_jk
+                    cum_scores[ik] += s_ik_jk + s_ij_ik
+
+                    # Update histogram; range(hist_intervals) lets top scores reach the last bin
+                    threshold = 0
+                    for _l1 in range(self.hist_intervals):
+                        threshold += h
+                        if s_ij_jk < threshold:
+                            break
+
+                    threshold = 0
+                    for _l2 in range(self.hist_intervals):
+                        threshold += h
+                        if s_ik_jk < threshold:
+                            break
+
+                    threshold = 0
+                    for _l3 in range(self.hist_intervals):
+                        threshold += h
+                        if s_ij_ik < threshold:
+                            break
+
+                    scores_hist[_l1] += 1
+                    scores_hist[_l2] += 1
+                    scores_hist[_l3] += 1
+
+        return cum_scores, scores_hist
+
+    def _triangle_scores_inner_cupy(self, Rijs):
+        """
+        Run the `triangle_scores_inner` CUDA kernel, one thread per image index `i`.
+        :param Rijs: Relative rotations, nchoose2 x 3 x 3 array.
+        :return: Tuple (cum_scores, scores_hist) of host arrays summed over threads.
+        """
+        import cupy as cp
+
+        triangle_scores = self._gpu_module.get_function("triangle_scores_inner")
+        n_img, hist_intervals = self.n_img, self.hist_intervals  # shapes and grid size
+        Rijs_dev = cp.array(Rijs, dtype=np.float64)  # kernel reads `double* Rijs`
+
+        # xxx I think we can safely remove cum_scores
+        cum_scores_dev = cp.zeros(
+            (n_img * (n_img - 1) // 2, n_img), dtype=np.float64
+        )  # one column per thread i, so threads never write the same element
+
+        scores_hist_dev = cp.zeros(
+            (hist_intervals, n_img), dtype=np.float64
+        )  # one column per thread i, so threads never write the same element
+
+        # call the kernel
+        blkszx = 512
+        nblkx = (n_img + blkszx - 1) // blkszx
+        triangle_scores(
+            (nblkx,),
+            (blkszx,),
+            (
+                self.n_img,
+                Rijs_dev,
+                self.hist_intervals,
+                cum_scores_dev,
+                scores_hist_dev,
+            ),
+        )
+
+        # accumulate over thread results
+        cum_scores = cp.sum(cum_scores_dev, axis=1).get()
+        scores_hist = cp.sum(scores_hist_dev, axis=1).get()
 
         return cum_scores, scores_hist
 
@@ -317,12 +443,136 @@ def _pairs_probabilities(self, Rijs, P2, A, a, B, b, x0):
         # dtype is critical for passing into C code...
         params = np.arary([P2, A, a, B, b, x0], dtype=np.float64)
         # host/gpu dispatch
-        if self._use_gpu:
-            ln_f_ind, ln_f_arb = _pairs_probabilities_cupy(self.n_img, Rijs, *params)
+        if self._gpu_module:
+            ln_f_ind, ln_f_arb = self._pairs_probabilities_cupy(Rijs, *params)
         else:
-            ln_f_ind, ln_f_arb = _pairs_probabilities_host(
-                self.n_img, Rijs, *params, _ALTS, self._pairs_to_linear
-            )
+            ln_f_ind, ln_f_arb = self._pairs_probabilities_host(Rijs, *params)
+
+        return ln_f_ind, ln_f_arb
+
+    def _pairs_probabilities_host(self, Rijs, P2, A, a, B, b, x0):
+        """Host (CPU) port of `looper` from MATLAB pairs_probabilities_mex.c."""
+
+        # Per-pair accumulators, one entry per element of Rijs; both are returned.
+        ln_f_ind = np.zeros(len(Rijs), dtype=Rijs.dtype)
+        ln_f_arb = np.zeros(len(Rijs), dtype=Rijs.dtype)
+
+        c = np.empty((4), dtype=Rijs.dtype)
+        for i in trange(self.n_img, desc="Computing pair probabilities"):
+            for j in range(i + 1, self.n_img - 1):
+                ij = self._pairs_to_linear[i, j]
+                Rij = Rijs[ij]
+                for k in range(j + 1, self.n_img):
+                    ik = self._pairs_to_linear[i, k]
+                    jk = self._pairs_to_linear[j, k]
+                    Rik = Rijs[ik]
+                    Rjk = Rijs[jk]
+
+                    # Compute J-conjugated rotations
+                    Rij_J = J_conjugate(Rij)
+                    Rik_J = J_conjugate(Rik)
+                    Rjk_J = J_conjugate(Rjk)
+
+                    # Squared error for each of the four J-configurations
+                    c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
+                    c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
+                    c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
+                    c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
+
+                    # Find best match (smallest error)
+                    best_i = np.argmin(c)
+                    best_val = c[best_i]
+
+                    # For each triangle side, take the better of its two alternatives
+                    alt_ij_jk = c[self._ALTS[0][best_i][0]]
+                    if c[self._ALTS[1][best_i][0]] < alt_ij_jk:
+                        alt_ij_jk = c[self._ALTS[1][best_i][0]]
+                    alt_ik_jk = c[self._ALTS[0][best_i][1]]
+                    if c[self._ALTS[1][best_i][1]] < alt_ik_jk:
+                        alt_ik_jk = c[self._ALTS[1][best_i][1]]
+                    alt_ij_ik = c[self._ALTS[0][best_i][2]]
+                    if c[self._ALTS[1][best_i][2]] < alt_ij_ik:
+                        alt_ij_ik = c[self._ALTS[1][best_i][2]]
+
+                    # Scores: 1 - sqrt(best / alternative)
+                    s_ij_jk = 1 - np.sqrt(best_val / alt_ij_jk)
+                    s_ik_jk = 1 - np.sqrt(best_val / alt_ik_jk)
+                    s_ij_ik = 1 - np.sqrt(best_val / alt_ij_ik)
+
+                    # Update probabilities
+                    # Log-probability of each score given an indicative common line,
+                    # log(P2 * B (1-s)^b exp(-b/(1-x0) (1-s)) + (1-P2) * A (1-s)^a)
+                    f_ij_jk = np.log(
+                        P2
+                        * (
+                            B
+                            * np.power(1 - s_ij_jk, b)
+                            * np.exp(-b / (1 - x0) * (1 - s_ij_jk))
+                        )
+                        + (1 - P2) * A * np.power((1 - s_ij_jk), a)
+                    )
+                    f_ik_jk = np.log(
+                        P2
+                        * (
+                            B
+                            * np.power(1 - s_ik_jk, b)
+                            * np.exp(-b / (1 - x0) * (1 - s_ik_jk))
+                        )
+                        + (1 - P2) * A * np.power((1 - s_ik_jk), a)
+                    )
+                    f_ij_ik = np.log(
+                        P2
+                        * (
+                            B
+                            * np.power(1 - s_ij_ik, b)
+                            * np.exp(-b / (1 - x0) * (1 - s_ij_ik))
+                        )
+                        + (1 - P2) * A * np.power((1 - s_ij_ik), a)
+                    )
+                    ln_f_ind[ij] += f_ij_jk + f_ij_ik
+                    ln_f_ind[jk] += f_ij_jk + f_ik_jk
+                    ln_f_ind[ik] += f_ik_jk + f_ij_ik
+
+                    # Log-probability given an arbitrary common line: log(A (1-s)^a)
+                    f_ij_jk = np.log(A * np.power((1 - s_ij_jk), a))
+                    f_ik_jk = np.log(A * np.power((1 - s_ik_jk), a))
+                    f_ij_ik = np.log(A * np.power((1 - s_ij_ik), a))
+                    ln_f_arb[ij] += f_ij_jk + f_ij_ik
+                    ln_f_arb[jk] += f_ij_jk + f_ik_jk
+                    ln_f_arb[ik] += f_ik_jk + f_ij_ik
+
+        return ln_f_ind, ln_f_arb
+
+    def _pairs_probabilities_cupy(self, Rijs, P2, A, a, B, b, x0):
+        """
+        Compute pair probabilities with the `pairs_probabilities` GPU kernel.
+
+        :param Rijs: nchoose2x3x3 array, n being `self.n_img`.
+        :param P2, A, a, B, b, x0: Values forwarded as-is to the kernel.
+        :return: Tuple `(ln_f_ind, ln_f_arb)` of host arrays.
+        """
+        import cupy as cp
+
+        pairs_probabilities = self._gpu_module.get_function("pairs_probabilities")
+
+        Rijs_dev = cp.array(Rijs, dtype=np.float64)  # kernel expects `double*`
+        # Second dim is for thread safety: one slot per thread, summed below.
+        shape = (self.n_img * (self.n_img - 1) // 2, self.n_img)
+        ln_f_ind_dev = cp.zeros(shape, dtype=np.float64)
+        ln_f_arb_dev = cp.zeros(shape, dtype=np.float64)
+
+        # call the kernel
+        blkszx = 512
+        nblkx = (self.n_img + blkszx - 1) // blkszx
+        pairs_probabilities(
+            (nblkx,),
+            (blkszx,),
+            (self.n_img, Rijs_dev, P2, A, a, B, b, x0, ln_f_ind_dev, ln_f_arb_dev),
+        )
+
+        # accumulate over thread results
+        ln_f_arb = cp.sum(ln_f_arb_dev, axis=1).get()
+        ln_f_ind = cp.sum(ln_f_ind_dev, axis=1).get()
 
         return ln_f_ind, ln_f_arb
 
@@ -332,7 +582,6 @@ def _triangle_scores(
         scores_hist,
         Pmin,
         Pmax,
-        hist_intervals=100,
         a=2.2,
         peak2sigma=2.43e-2,
         P=0.5,
@@ -359,7 +608,7 @@ def _triangle_scores(
 
         cum_scores = None  # XXX Why do we even need cum_scores?
         if scores_hist is None:
-            cum_scores, scores_hist = self._triangle_scores_inner(Rijs, hist_intervals)
+            cum_scores, scores_hist = self._triangle_scores_inner(Rijs)
 
             # Normalize cumulated scores
             cum_scores /= len(Rijs)
@@ -555,809 +804,141 @@ def _J_sync_power_method(self, Rijs):
     def _signs_times_v(self, Rijs, vec):
 
         # host/gpu dispatch
-        if self._use_gpu:
-            new_vec = _signs_times_v_cupy(self.n_img, Rijs, vec, self.J_weighting)
+        if self._gpu_module:
+            new_vec = self._signs_times_v_cupy(Rijs, vec)
         else:
-            new_vec = _signs_times_v_host(
-                self.n_img, Rijs, vec, self.J_weighting, _ALTS, self._pairs_to_linear
-            )
+            new_vec = self._signs_times_v_host(Rijs, vec)
 
         return new_vec
 
+    def _signs_times_v_host(self, Rijs, vec):
+        """
+        Ported from _signs_times_v_mex.c
+
+        n: n_img
+        Rijs: nchoose2x3x3 array
+        vec: input array
+        new_vec: output array
+        J_weighting: bool
+        _ALTS= 2x4x3 const lut array
+        """
 
-def _signs_times_v_host(n, Rijs, vec, J_weighting, _ALTS, _pairs_to_linear):
-    """
-    Ported from _signs_times_v_mex.c
-
-    n: n_img
-    Rijs: nchoose2x3x3 array
-    vec: input array
-    new_vec: output array
-    J_weighting: bool
-    _ALTS= 2x4x3 const lut array
-    """
-
-    new_vec = np.zeros_like(vec)
-
-    _signs_confs = np.array(
-        [[1, 1, 1], [-1, 1, -1], [-1, -1, 1], [1, -1, -1]], dtype=int
-    )
-
-    c = np.empty((4))
-    desc = "Computing signs_times_v"
-    if J_weighting:
-        desc += " with J_weighting"
-    for i in trange(n, desc=desc):
-        for j in range(i + 1, n - 1):  # check bound (taken from MATLAB mex)
-            ij = _pairs_to_linear[i, j]
-            Rij = Rijs[ij]
-            for k in range(j + 1, n):
-                ik = _pairs_to_linear[i, k]
-                jk = _pairs_to_linear[j, k]
-                Rik = Rijs[ik]
-                Rjk = Rijs[jk]
-
-                # Compute conjugated rotats
-                Rij_J = J_conjugate(Rij)
-                Rik_J = J_conjugate(Rik)
-                Rjk_J = J_conjugate(Rjk)
-
-                # Compute R muls and norms
-                c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
-                c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
-                c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
-                c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
-
-                # Find best match
-                best_i = np.argmin(c)
-                best_val = c[best_i]
-
-                # MATLAB: scores_as_entries == 0
-                s_ij_jk = _signs_confs[best_i][0]
-                s_ik_jk = _signs_confs[best_i][1]
-                s_ij_ik = _signs_confs[best_i][2]
-
-                # Note there was a third J_weighting option (2) in MATLAB,
-                # but it was not exposed at top level.
-                if J_weighting:
-                    # MATLAB: scores_as_entries == 1
-                    # For each triangle side, find the best alternative
-                    alt_ij_jk = c[_ALTS[0][best_i][0]]
-                    if c[_ALTS[1][best_i][0]] < alt_ij_jk:
-                        alt_ij_jk = c[_ALTS[1][best_i][0]]
-
-                    alt_ik_jk = c[_ALTS[0][best_i][1]]
-                    if c[_ALTS[1][best_i][1]] < alt_ik_jk:
-                        alt_ik_jk = c[_ALTS[1][best_i][1]]
-
-                    alt_ij_ik = c[_ALTS[0][best_i][2]]
-                    if c[_ALTS[1][best_i][2]] < alt_ij_ik:
-                        alt_ij_ik = c[_ALTS[1][best_i][2]]
-
-                    # Compute scores
-                    s_ij_jk *= 1 - np.sqrt(best_val / alt_ij_jk)
-                    s_ik_jk *= 1 - np.sqrt(best_val / alt_ik_jk)
-                    s_ij_ik *= 1 - np.sqrt(best_val / alt_ij_ik)
-
-                # Update vector entries
-                new_vec[ij] += s_ij_jk * vec[jk] + s_ij_ik * vec[ik]
-                new_vec[jk] += s_ij_jk * vec[ij] + s_ik_jk * vec[ik]
-                new_vec[ik] += s_ij_ik * vec[ij] + s_ik_jk * vec[jk]
-
-    return new_vec
-
-
-def _init_cupy_module():
-    module_code = r"""
-
-/* from i,j indoces to the common index in the N-choose-2 sized array */
-#define PAIR_IDX(N,I,J) ((2*N-I-1)*I/2 + J-I-1)
-
-
-inline void mult_3x3(double *out, double *R1, double *R2) {
-  /* 3X3 matrices multiplication: out = R1*R2
-   * Note, this differs from the MATLAB mult_3x3.
-  */
-
-  int i,j,k;
-
-  for(i=0; i<3; i++){
-    for(j=0; j<3; j++){
-      out[i*3 + j] = 0;
-      for (k=0; k<3; k++){
-        out[i*3 + j] += R1[i*3+k] * R2[k*3+j];
-      }
-    }
-  }
-}
-
-inline void JRJ(double *R, double *A) {
-/* multiple 3X3 matrix by J from both sizes: A = JRJ */
-        A[0]=R[0];
-        A[1]=R[1];
-        A[2]=-R[2];
-        A[3]=R[3];
-        A[4]=R[4];
-        A[5]=-R[5];
-        A[6]=-R[6];
-        A[7]=-R[7];
-        A[8]=R[8];
-}
-
-inline double diff_norm_3x3(const double *R1, const double *R2) {
-/* difference 2 matrices and return squared norm: ||R1-R2||^2 */
-        int i;
-        double norm = 0;
-        for (i=0; i<9; i++) {norm += (R1[i]-R2[i])*(R1[i]-R2[i]);}
-        return norm;
-}
-
-
-extern "C" __global__
-void signs_times_v(int n, double* Rijs, const double* vec, double* new_vec, bool J_weighting)
-{
-    /* thread index (1d), represents "i" index */
-    unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    /* no-op when out of bounds */
-    if(i >= n) return;
-
-    double c[4];
-    unsigned int j;
-    unsigned int k;
-    for(k=0;k<4;k++){c[k]=0;}
-    unsigned long ij, jk, ik;
-    int best_i;
-    double best_val;
-    double s_ij_jk, s_ik_jk, s_ij_ik;
-    double alt_ij_jk, alt_ij_ik, alt_ik_jk;
-
-    double *Rij, *Rjk, *Rik;
-    double JRijJ[9], JRjkJ[9], JRikJ[9];
-    double tmp[9];
-
-    int signs_confs[4][3];
-    for(int a=0; a<4; a++) { for(k=0; k<3; k++) { signs_confs[a][k]=1; } }
-    signs_confs[1][0]=-1; signs_confs[1][2]=-1;
-    signs_confs[2][0]=-1; signs_confs[2][1]=-1;
-    signs_confs[3][1]=-1; signs_confs[3][2]=-1;
-
-    /* initialize alternatives */
-    /* when we find the best J-configuration, we also compare it to the alternative 2nd best one.
-    * this comparison is done for every pair in the triplete independently. to make sure that the
-    * alternative is indeed different in relation to the pair, we document the differences between
-    * the configurations in advance:
-    * ALTS(:,best_conf,pair) = the two configurations in which J-sync differs from
-    * best_conf in relation to pair */
-
-    int ALTS[2][4][3];
-    ALTS[0][0][0]=1; ALTS[0][1][0]=0; ALTS[0][2][0]=0; ALTS[0][3][0]=1;
-    ALTS[1][0][0]=2; ALTS[1][1][0]=3; ALTS[1][2][0]=3; ALTS[1][3][0]=2;
-    ALTS[0][0][1]=2; ALTS[0][1][1]=2; ALTS[0][2][1]=0; ALTS[0][3][1]=0;
-    ALTS[1][0][1]=3; ALTS[1][1][1]=3; ALTS[1][2][1]=1; ALTS[1][3][1]=1;
-    ALTS[0][0][2]=1; ALTS[0][1][2]=0; ALTS[0][2][2]=1; ALTS[0][3][2]=0;
-    ALTS[1][0][2]=3; ALTS[1][1][2]=2; ALTS[1][2][2]=3; ALTS[1][3][2]=2;
-
-
-    for(j=i+1; j< (n - 1); j++){
-        ij = PAIR_IDX(n, i, j);
-        for(k=j+1; k< n; k++){
-            ik = PAIR_IDX(n, i, k);
-            jk = PAIR_IDX(n, j, k);
-
-            /* compute configurations matches scores */
-            Rij = Rijs + 9*ij;
-            Rjk = Rijs + 9*jk;
-            Rik = Rijs + 9*ik;
-
-            JRJ(Rij, JRijJ);
-            JRJ(Rjk, JRjkJ);
-            JRJ(Rik, JRikJ);
-
-            mult_3x3(tmp, Rij, Rjk);
-            c[0] = diff_norm_3x3(tmp, Rik);
-
-            mult_3x3(tmp, JRijJ, Rjk);
-            c[1] = diff_norm_3x3(tmp, Rik);
-
-            mult_3x3(tmp, Rij, JRjkJ);
-            c[2] = diff_norm_3x3(tmp, Rik);
-
-            mult_3x3(tmp, Rij, Rjk);
-            c[3] = diff_norm_3x3(tmp, JRikJ);
-
-            /* find best match */
-            best_i=0; best_val=c[0];
-            if (c[1]<best_val) {best_i=1; best_val=c[1];}
-            if (c[2]<best_val) {best_i=2; best_val=c[2];}
-            if (c[3]<best_val) {best_i=3; best_val=c[3];}
-
-            /* set triangles entries to be signs */
-            s_ij_jk = signs_confs[best_i][0];
-            s_ik_jk = signs_confs[best_i][1];
-            s_ij_ik = signs_confs[best_i][2];
-
-            /* J weighting */
-            if(J_weighting){
-                /* for each triangle side, find the best alternative */
-                alt_ij_jk = c[ALTS[0][best_i][0]];
-                if (c[ALTS[1][best_i][0]] < alt_ij_jk){
-                     alt_ij_jk = c[ALTS[1][best_i][0]];
-                }
-
-                alt_ik_jk = c[ALTS[0][best_i][1]];
-                if (c[ALTS[1][best_i][1]] < alt_ik_jk){
-                     alt_ik_jk = c[ALTS[1][best_i][1]];
-                }
-                alt_ij_ik = c[ALTS[0][best_i][2]];
-                if (c[ALTS[1][best_i][2]] < alt_ij_ik){
-                     alt_ij_ik = c[ALTS[1][best_i][2]];
-                }
-
-                /* Update scores */
-                s_ij_jk *= 1 - sqrt(best_val / alt_ij_jk);
-                s_ik_jk *= 1 - sqrt(best_val / alt_ik_jk);
-                s_ij_ik *= 1 - sqrt(best_val / alt_ij_ik);
-            }
-
-
-            /* update multiplication */
-            new_vec[ij*n + i] += s_ij_jk*vec[jk] + s_ij_ik*vec[ik];
-            new_vec[jk*n + i] += s_ij_jk*vec[ij] + s_ik_jk*vec[ik];
-            new_vec[ik*n + i] += s_ij_ik*vec[ij] + s_ik_jk*vec[jk];
-
-        } /* k */
-    } /* j */
-
-    return;
-};
-
-extern "C" __global__
-void pairs_probabilities(int n, double* Rijs, double P2, double A, double a, double B, double b, double x0, double* ln_f_ind, double* ln_f_arb)
-{
-    /* thread index (1d), represents "i" index */
-    unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    /* no-op when out of bounds */
-    if(i >= n) return;
-
-    double c[4];
-    unsigned int j;
-    unsigned int k;
-    for(k=0;k<4;k++){c[k]=0;}
-    unsigned long ij, jk, ik;
-    int best_i;
-    double best_val;
-    double s_ij_jk, s_ik_jk, s_ij_ik;
-    double alt_ij_jk, alt_ij_ik, alt_ik_jk;
-    double f_ij_jk, f_ik_jk, f_ij_ik;
-
-
-    double *Rij, *Rjk, *Rik;
-    double JRijJ[9], JRjkJ[9], JRikJ[9];
-    double tmp[9];
-
-    int signs_confs[4][3];
-    for(int a=0; a<4; a++) { for(k=0; k<3; k++) { signs_confs[a][k]=1; } }
-    signs_confs[1][0]=-1; signs_confs[1][2]=-1;
-    signs_confs[2][0]=-1; signs_confs[2][1]=-1;
-    signs_confs[3][1]=-1; signs_confs[3][2]=-1;
-
-    /* initialize alternatives */
-    /* when we find the best J-configuration, we also compare it to the alternative 2nd best one.
-    * this comparison is done for every pair in the triplete independently. to make sure that the
-    * alternative is indeed different in relation to the pair, we document the differences between
-    * the configurations in advance:
-    * ALTS(:,best_conf,pair) = the two configurations in which J-sync differs from
-    * best_conf in relation to pair */
-
-    int ALTS[2][4][3];
-    ALTS[0][0][0]=1; ALTS[0][1][0]=0; ALTS[0][2][0]=0; ALTS[0][3][0]=1;
-    ALTS[1][0][0]=2; ALTS[1][1][0]=3; ALTS[1][2][0]=3; ALTS[1][3][0]=2;
-    ALTS[0][0][1]=2; ALTS[0][1][1]=2; ALTS[0][2][1]=0; ALTS[0][3][1]=0;
-    ALTS[1][0][1]=3; ALTS[1][1][1]=3; ALTS[1][2][1]=1; ALTS[1][3][1]=1;
-    ALTS[0][0][2]=1; ALTS[0][1][2]=0; ALTS[0][2][2]=1; ALTS[0][3][2]=0;
-    ALTS[1][0][2]=3; ALTS[1][1][2]=2; ALTS[1][2][2]=3; ALTS[1][3][2]=2;
-
-
-    for(j=i+1; j< (n - 1); j++){
-        ij = PAIR_IDX(n, i, j);
-        for(k=j+1; k< n; k++){
-            ik = PAIR_IDX(n, i, k);
-            jk = PAIR_IDX(n, j, k);
-
-            /* compute configurations matches scores */
-            Rij = Rijs + 9*ij;
-            Rjk = Rijs + 9*jk;
-            Rik = Rijs + 9*ik;
-
-            JRJ(Rij, JRijJ);
-            JRJ(Rjk, JRjkJ);
-            JRJ(Rik, JRikJ);
-
-            mult_3x3(tmp, Rij, Rjk);
-            c[0] = diff_norm_3x3(tmp, Rik);
-
-            mult_3x3(tmp, JRijJ, Rjk);
-            c[1] = diff_norm_3x3(tmp, Rik);
-
-            mult_3x3(tmp, Rij, JRjkJ);
-            c[2] = diff_norm_3x3(tmp, Rik);
-
-            mult_3x3(tmp, Rij, Rjk);
-            c[3] = diff_norm_3x3(tmp, JRikJ);
-
-            /* find best match */
-            best_i=0; best_val=c[0];
-            if (c[1]<best_val) {best_i=1; best_val=c[1];}
-            if (c[2]<best_val) {best_i=2; best_val=c[2];}
-            if (c[3]<best_val) {best_i=3; best_val=c[3];}
-
-             /* for each triangle side, find the best alternative */
-             alt_ij_jk = c[ALTS[0][best_i][0]];
-             if (c[ALTS[1][best_i][0]] < alt_ij_jk){
-                 alt_ij_jk = c[ALTS[1][best_i][0]];
-             }
-
-             alt_ik_jk = c[ALTS[0][best_i][1]];
-             if (c[ALTS[1][best_i][1]] < alt_ik_jk){
-                 alt_ik_jk = c[ALTS[1][best_i][1]];
-             }
-             alt_ij_ik = c[ALTS[0][best_i][2]];
-             if (c[ALTS[1][best_i][2]] < alt_ij_ik){
-                 alt_ij_ik = c[ALTS[1][best_i][2]];
-             }
-
-            /* Assign scores */
-            s_ij_jk = 1 - sqrt(best_val / alt_ij_jk);
-            s_ik_jk = 1 - sqrt(best_val / alt_ik_jk);
-            s_ij_ik = 1 - sqrt(best_val / alt_ij_ik);
-
-
-            /* the probability of a pair ij to have the observed triangles scores,
-            given it has an indicative common line */
-            f_ij_jk = log( P2*(B*pow(1-s_ij_jk,b)*exp(-b/(1-x0)*(1-s_ij_jk))) + (1-P2)*A*pow((1-s_ij_jk),a) );
-            f_ik_jk = log( P2*(B*pow(1-s_ik_jk,b)*exp(-b/(1-x0)*(1-s_ik_jk))) + (1-P2)*A*pow((1-s_ik_jk),a) );
-            f_ij_ik = log( P2*(B*pow(1-s_ij_ik,b)*exp(-b/(1-x0)*(1-s_ij_ik))) + (1-P2)*A*pow((1-s_ij_ik),a) );
-            ln_f_ind[ij*n +i] += f_ij_jk + f_ij_ik;
-            ln_f_ind[jk*n +i] += f_ij_jk + f_ik_jk;
-            ln_f_ind[ik*n +i] += f_ik_jk + f_ij_ik;
-
-            /* the probability of a pair ij to have the observed triangles scores,
-             given it has an arbitrary common line */
-            f_ij_jk = log( A*pow((1-s_ij_jk),a) );
-            f_ik_jk = log( A*pow((1-s_ik_jk),a) );
-            f_ij_ik = log( A*pow((1-s_ij_ik),a) );
-            ln_f_arb[ij*n +i] += f_ij_jk + f_ij_ik;
-            ln_f_arb[jk*n +i] += f_ij_jk + f_ik_jk;
-            ln_f_arb[ik*n +i] += f_ik_jk + f_ij_ik;
-
-
-        } /* k */
-    } /* j */
-
-    return;
-};
-
-
-extern "C" __global__
-void triangle_scores_inner(int n, double* Rijs, int n_intervals, double* cum_scores, double* scores_hist)
-{
-    /* thread index (1d), represents "i" index */
-    unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    /* no-op when out of bounds */
-    if(i >= n) return;
-
-    double c[4];
-    unsigned int j;
-    unsigned int k;
-    for(k=0;k<4;k++){c[k]=0;}
-    unsigned long ij, jk, ik;
-    int best_i;
-    double best_val;
-    double s_ij_jk, s_ik_jk, s_ij_ik;
-    double alt_ij_jk, alt_ij_ik, alt_ik_jk;
-    unsigned int l1,l2,l3;
-    double threshold;
-    double h = 1. / n_intervals;
-
-    double *Rij, *Rjk, *Rik;
-    double JRijJ[9], JRjkJ[9], JRikJ[9];
-    double tmp[9];
-
-    /* initialize alternatives */
-    /* when we find the best J-configuration, we also compare it to the alternative 2nd best one.
-    * this comparison is done for every pair in the triplete independently. to make sure that the
-    * alternative is indeed different in relation to the pair, we document the differences between
-    * the configurations in advance:
-    * ALTS(:,best_conf,pair) = the two configurations in which J-sync differs from
-    * best_conf in relation to pair */
-
-    int ALTS[2][4][3];
-    ALTS[0][0][0]=1; ALTS[0][1][0]=0; ALTS[0][2][0]=0; ALTS[0][3][0]=1;
-    ALTS[1][0][0]=2; ALTS[1][1][0]=3; ALTS[1][2][0]=3; ALTS[1][3][0]=2;
-    ALTS[0][0][1]=2; ALTS[0][1][1]=2; ALTS[0][2][1]=0; ALTS[0][3][1]=0;
-    ALTS[1][0][1]=3; ALTS[1][1][1]=3; ALTS[1][2][1]=1; ALTS[1][3][1]=1;
-    ALTS[0][0][2]=1; ALTS[0][1][2]=0; ALTS[0][2][2]=1; ALTS[0][3][2]=0;
-    ALTS[1][0][2]=3; ALTS[1][1][2]=2; ALTS[1][2][2]=3; ALTS[1][3][2]=2;
-
-
-    for(j=i+1; j< (n - 1); j++){
-        ij = PAIR_IDX(n, i, j);
-        for(k=j+1; k< n; k++){
-            ik = PAIR_IDX(n, i, k);
-            jk = PAIR_IDX(n, j, k);
-
-            /* compute configurations matches scores */
-            Rij = Rijs + 9*ij;
-            Rjk = Rijs + 9*jk;
-            Rik = Rijs + 9*ik;
-
-            JRJ(Rij, JRijJ);
-            JRJ(Rjk, JRjkJ);
-            JRJ(Rik, JRikJ);
-
-            mult_3x3(tmp, Rij, Rjk);
-            c[0] = diff_norm_3x3(tmp, Rik);
-
-            mult_3x3(tmp, JRijJ, Rjk);
-            c[1] = diff_norm_3x3(tmp, Rik);
-
-            mult_3x3(tmp, Rij, JRjkJ);
-            c[2] = diff_norm_3x3(tmp, Rik);
-
-            mult_3x3(tmp, Rij, Rjk);
-            c[3] = diff_norm_3x3(tmp, JRikJ);
-
-            /* find best match */
-            best_i=0; best_val=c[0];
-            if (c[1]<best_val) {best_i=1; best_val=c[1];}
-            if (c[2]<best_val) {best_i=2; best_val=c[2];}
-            if (c[3]<best_val) {best_i=3; best_val=c[3];}
-
-             /* for each triangle side, find the best alternative */
-             alt_ij_jk = c[ALTS[0][best_i][0]];
-             if (c[ALTS[1][best_i][0]] < alt_ij_jk){
-                 alt_ij_jk = c[ALTS[1][best_i][0]];
-             }
-
-             alt_ik_jk = c[ALTS[0][best_i][1]];
-             if (c[ALTS[1][best_i][1]] < alt_ik_jk){
-                 alt_ik_jk = c[ALTS[1][best_i][1]];
-             }
-             alt_ij_ik = c[ALTS[0][best_i][2]];
-             if (c[ALTS[1][best_i][2]] < alt_ij_ik){
-                 alt_ij_ik = c[ALTS[1][best_i][2]];
-             }
-
-            /* Assign scores */
-            s_ij_jk = 1 - sqrt(best_val / alt_ij_jk);
-            s_ik_jk = 1 - sqrt(best_val / alt_ik_jk);
-            s_ij_ik = 1 - sqrt(best_val / alt_ij_ik);
-
-
-            /* update cumulated scores */
-            cum_scores[ij*n+i] += s_ij_jk + s_ij_ik;
-            cum_scores[jk*n+i] += s_ij_jk + s_ik_jk;
-            cum_scores[ik*n+i] += s_ik_jk + s_ij_ik;
-
-            /* update scores histogram */
-            threshold = 0;
-            for (l1=0; l1<n_intervals-1; l1++) {
-                threshold += h;
-                if (s_ij_jk < threshold) {break;}
-            }
-
-            threshold = 0;
-            for(l2=0; l2<n_intervals-1; l2++) {
-                threshold += h;
-                if(s_ik_jk < threshold) {break;}
-            }
-
-            threshold = 0;
-            for(l3=0; l3<n_intervals-1; l3++) {
-                threshold += h;
-                if (s_ij_ik < threshold) {break;}
-            }
-
-            scores_hist[l1*n+i] += 1;
-            scores_hist[l2*n+i] += 1;
-            scores_hist[l3*n+i] += 1;
-
-        } /* k */
-    } /* j */
-
-    return;
-};
-
-"""
-    import cupy as cp
-
-    module = cp.RawModule(code=module_code)
-
-    return module
-
-
-def _signs_times_v_cupy(n, Rijs, vec, J_weighting):
-    """
-    Ported from _signs_times_v_mex.c
-
-    n: n_img
-    Rijs: nchoose2x3x3 array
-    vec: input array
-    new_vec: output array
-    J_weighting: bool
-    """
-    import cupy as cp
-
-    # xxx
-    module = _init_cupy_module()
-
-    signs_times_v = module.get_function("signs_times_v")
-
-    Rijs_dev = cp.array(Rijs)
-    vec_dev = cp.array(vec)
-    # 2d over i then accum to avoid race on i
-    new_vec_dev = cp.zeros((vec.shape[0], n))
+        new_vec = np.zeros_like(vec)
 
-    # call the kernel
-    blkszx = 512
-    nblkx = (n + blkszx - 1) // blkszx
-    signs_times_v((nblkx,), (blkszx,), (n, Rijs_dev, vec_dev, new_vec_dev, J_weighting))
+        _signs_confs = np.array(
+            [[1, 1, 1], [-1, 1, -1], [-1, -1, 1], [1, -1, -1]], dtype=int
+        )
 
-    # accumulate, can reuse the vec_dev array now.
-    cp.sum(new_vec_dev, axis=1, out=vec_dev)
+        c = np.empty((4))
+        desc = "Computing signs_times_v"
+        if self.J_weighting:
+            desc += " with J_weighting"
+        for i in trange(self.n_img, desc=desc):
+            for j in range(
+                i + 1, self.n_img - 1
+            ):  # check bound (taken from MATLAB mex)
+                ij = self._pairs_to_linear[i, j]
+                Rij = Rijs[ij]
+                for k in range(j + 1, self.n_img):
+                    ik = self._pairs_to_linear[i, k]
+                    jk = self._pairs_to_linear[j, k]
+                    Rik = Rijs[ik]
+                    Rjk = Rijs[jk]
+
+                    # Compute J-conjugated rotations
+                    Rij_J = J_conjugate(Rij)
+                    Rik_J = J_conjugate(Rik)
+                    Rjk_J = J_conjugate(Rjk)
+
+                    # Squared error for each of the four J-configurations
+                    c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
+                    c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
+                    c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
+                    c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
+
+                    # Find best match (smallest error)
+                    best_i = np.argmin(c)
+                    best_val = c[best_i]
+
+                    # MATLAB: scores_as_entries == 0
+                    s_ij_jk = _signs_confs[best_i][0]
+                    s_ik_jk = _signs_confs[best_i][1]
+                    s_ij_ik = _signs_confs[best_i][2]
+
+                    # Note there was a third J_weighting option (2) in MATLAB,
+                    # but it was not exposed at top level.
+                    if self.J_weighting:
+                        # MATLAB: scores_as_entries == 1
+                        # For each triangle side, find the best alternative
+                        alt_ij_jk = c[self._ALTS[0][best_i][0]]
+                        if c[self._ALTS[1][best_i][0]] < alt_ij_jk:
+                            alt_ij_jk = c[self._ALTS[1][best_i][0]]
+
+                        alt_ik_jk = c[self._ALTS[0][best_i][1]]
+                        if c[self._ALTS[1][best_i][1]] < alt_ik_jk:
+                            alt_ik_jk = c[self._ALTS[1][best_i][1]]
+
+                        alt_ij_ik = c[self._ALTS[0][best_i][2]]
+                        if c[self._ALTS[1][best_i][2]] < alt_ij_ik:
+                            alt_ij_ik = c[self._ALTS[1][best_i][2]]
+
+                        # Weight each sign by 1 - sqrt(best / alternative)
+                        s_ij_jk *= 1 - np.sqrt(best_val / alt_ij_jk)
+                        s_ik_jk *= 1 - np.sqrt(best_val / alt_ik_jk)
+                        s_ij_ik *= 1 - np.sqrt(best_val / alt_ij_ik)
+
+                    # Update vector entries
+                    new_vec[ij] += s_ij_jk * vec[jk] + s_ij_ik * vec[ik]
+                    new_vec[jk] += s_ij_jk * vec[ij] + s_ik_jk * vec[ik]
+                    new_vec[ik] += s_ij_ik * vec[ij] + s_ik_jk * vec[jk]
 
-    # dtoh
-    new_vec = vec_dev.get()
+        return new_vec
 
-    return new_vec
+    def _signs_times_v_cupy(self, Rijs, vec):
+        """
+        Ported from _signs_times_v_mex.c
 
+        n: n_img
+        Rijs: nchoose2x3x3 array
+        vec: input array
+        new_vec: output array
+        J_weighting: bool
+        """
+        import cupy as cp
 
-def _pairs_probabilities_cupy(n, Rijs, P2, A, a, B, b, x0):
-    """
-    n: n_img
-    Rijs: nchoose2x3x3 array
+        signs_times_v = self._gpu_module.get_function("signs_times_v")
 
-    """
-    import cupy as cp
+        Rijs_dev = cp.array(Rijs, dtype=np.float64)  # kernel expects `double*`
+        vec_dev = cp.array(vec, dtype=np.float64)
+        # 2d over i then accum to avoid race on i
+        new_vec_dev = cp.zeros((vec.shape[0], self.n_img), dtype=np.float64)
 
-    # xxx
-    module = _init_cupy_module()
+        # call the kernel; n_img and J_weighting are read from the instance
+        blkszx = 512
+        nblkx = (self.n_img + blkszx - 1) // blkszx
+        # argument order must match the `signs_times_v` kernel signature
+        kernel_args = (self.n_img, Rijs_dev, vec_dev, new_vec_dev, self.J_weighting)
+        signs_times_v((nblkx,), (blkszx,), kernel_args)
 
-    pairs_probabilities = module.get_function("pairs_probabilities")
+        # accumulate, can reuse the vec_dev array now.
+        cp.sum(new_vec_dev, axis=1, out=vec_dev)
 
-    Rijs_dev = cp.array(Rijs)
-    ln_f_ind_dev = cp.zeros((n * (n - 1) // 2, n))  # n is for thread safety
-    ln_f_arb_dev = cp.zeros((n * (n - 1) // 2, n))  # n is for thread safety
+        # dtoh
+        new_vec = vec_dev.get()
 
-    # call the kernel
-    blkszx = 512
-    nblkx = (n + blkszx - 1) // blkszx
-    pairs_probabilities(
-        (nblkx,),
-        (blkszx,),
-        (n, Rijs_dev, P2, A, a, B, b, x0, ln_f_ind_dev, ln_f_arb_dev),
-    )
+        return new_vec
 
-    # accumulate over thread results
-    ln_f_arb = cp.sum(ln_f_arb_dev, axis=1).get()
-    ln_f_ind = cp.sum(ln_f_ind_dev, axis=1).get()
-
-    return ln_f_ind, ln_f_arb
-
-
-def _pairs_probabilities_host(n, Rijs, P2, A, a, B, b, x0, _ALTS, _pairs_to_linear):
-    # The following is adopted from Matlab pairs_probabilities_mex.c `looper`
-
-    # Initialize probability result arrays
-    ln_f_ind = np.zeros(len(Rijs), dtype=Rijs.dtype)
-    ln_f_arb = np.zeros(len(Rijs), dtype=Rijs.dtype)
-
-    c = np.empty((4), dtype=Rijs.dtype)
-    for i in trange(n, desc="Computing pair probabilities"):
-        for j in range(i + 1, n - 1):
-            ij = _pairs_to_linear[i, j]
-            Rij = Rijs[ij]
-            for k in range(j + 1, n):
-                ik = _pairs_to_linear[i, k]
-                jk = _pairs_to_linear[j, k]
-                Rik = Rijs[ik]
-                Rjk = Rijs[jk]
-
-                # Compute conjugated rotats
-                Rij_J = J_conjugate(Rij)
-                Rik_J = J_conjugate(Rik)
-                Rjk_J = J_conjugate(Rjk)
-
-                # Compute R muls and norms
-                c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
-                c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
-                c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
-                c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
-
-                # Find best match
-                best_i = np.argmin(c)
-                best_val = c[best_i]
-
-                # For each triangle side, find the best alternative
-                alt_ij_jk = c[_ALTS[0][best_i][0]]
-                if c[_ALTS[1][best_i][0]] < alt_ij_jk:
-                    alt_ij_jk = c[_ALTS[1][best_i][0]]
-                alt_ik_jk = c[_ALTS[0][best_i][1]]
-                if c[_ALTS[1][best_i][1]] < alt_ik_jk:
-                    alt_ik_jk = c[_ALTS[1][best_i][1]]
-                alt_ij_ik = c[_ALTS[0][best_i][2]]
-                if c[_ALTS[1][best_i][2]] < alt_ij_ik:
-                    alt_ij_ik = c[_ALTS[1][best_i][2]]
-
-                # Compute scores
-                s_ij_jk = 1 - np.sqrt(best_val / alt_ij_jk)
-                s_ik_jk = 1 - np.sqrt(best_val / alt_ik_jk)
-                s_ij_ik = 1 - np.sqrt(best_val / alt_ij_ik)
-
-                # Update probabilities
-                # # Probability of pair ij having score given indicicative common line
-                # P2, B, b, x0, A, a
-                f_ij_jk = np.log(
-                    P2
-                    * (
-                        B
-                        * np.power(1 - s_ij_jk, b)
-                        * np.exp(-b / (1 - x0) * (1 - s_ij_jk))
-                    )
-                    + (1 - P2) * A * np.power((1 - s_ij_jk), a)
-                )
-                f_ik_jk = np.log(
-                    P2
-                    * (
-                        B
-                        * np.power(1 - s_ik_jk, b)
-                        * np.exp(-b / (1 - x0) * (1 - s_ik_jk))
-                    )
-                    + (1 - P2) * A * np.power((1 - s_ik_jk), a)
-                )
-                f_ij_ik = np.log(
-                    P2
-                    * (
-                        B
-                        * np.power(1 - s_ij_ik, b)
-                        * np.exp(-b / (1 - x0) * (1 - s_ij_ik))
-                    )
-                    + (1 - P2) * A * np.power((1 - s_ij_ik), a)
-                )
-                ln_f_ind[ij] += f_ij_jk + f_ij_ik
-                ln_f_ind[jk] += f_ij_jk + f_ik_jk
-                ln_f_ind[ik] += f_ik_jk + f_ij_ik
-
-                # # Probability of pair ij having score given arbitrary common line
-                f_ij_jk = np.log(A * np.power((1 - s_ij_jk), a))
-                f_ik_jk = np.log(A * np.power((1 - s_ik_jk), a))
-                f_ij_ik = np.log(A * np.power((1 - s_ij_ik), a))
-                ln_f_arb[ij] += f_ij_jk + f_ij_ik
-                ln_f_arb[jk] += f_ij_jk + f_ik_jk
-                ln_f_arb[ik] += f_ik_jk + f_ij_ik
-
-    return ln_f_ind, ln_f_arb
-
-
-def _triangle_scores_inner_host(n_img, Rijs, hist_intervals, _ALTS, _pairs_to_linear):
-    # The following is adopted from Matlab triangle_scores_mex.c
-
-    # Initialize probability result arrays
-    cum_scores = np.zeros(len(Rijs), dtype=Rijs.dtype)
-    scores_hist = np.zeros(hist_intervals, dtype=Rijs.dtype)
-    h = 1 / hist_intervals
-
-    c = np.empty((4), dtype=Rijs.dtype)
-    for i in trange(n_img, desc="Computing triangle scores"):
-        for j in range(i + 1, n_img - 1):  # check bound (taken from MATLAB mex)
-            ij = _pairs_to_linear[i, j]
-            Rij = Rijs[ij]
-            for k in range(j + 1, n_img):
-                ik = _pairs_to_linear[i, k]
-                jk = _pairs_to_linear[j, k]
-                Rik = Rijs[ik]
-                Rjk = Rijs[jk]
-
-                # Compute conjugated rotats
-                Rij_J = J_conjugate(Rij)
-                Rik_J = J_conjugate(Rik)
-                Rjk_J = J_conjugate(Rjk)
-
-                # Compute R muls and norms
-                c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
-                c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
-                c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
-                c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
-
-                # Find best match
-                best_i = np.argmin(c)
-                best_val = c[best_i]
-
-                # For each triangle side, find the best alternative
-                alt_ij_jk = c[_ALTS[0][best_i][0]]
-                if c[_ALTS[1][best_i][0]] < alt_ij_jk:
-                    alt_ij_jk = c[_ALTS[1][best_i][0]]
-
-                alt_ik_jk = c[_ALTS[0][best_i][1]]
-                if c[_ALTS[1][best_i][1]] < alt_ik_jk:
-                    alt_ik_jk = c[_ALTS[1][best_i][1]]
-
-                alt_ij_ik = c[_ALTS[0][best_i][2]]
-                if c[_ALTS[1][best_i][2]] < alt_ij_ik:
-                    alt_ij_ik = c[_ALTS[1][best_i][2]]
-
-                # Compute scores
-                s_ij_jk = 1 - np.sqrt(best_val / alt_ij_jk)
-                s_ik_jk = 1 - np.sqrt(best_val / alt_ik_jk)
-                s_ij_ik = 1 - np.sqrt(best_val / alt_ij_ik)
-
-                # Update cumulated scores
-                cum_scores[ij] += s_ij_jk + s_ij_ik
-                cum_scores[jk] += s_ij_jk + s_ik_jk
-                cum_scores[ik] += s_ik_jk + s_ij_ik
-
-                # Update histogram
-                threshold = 0
-                for _l1 in range(hist_intervals - 1):
-                    threshold += h
-                    if s_ij_jk < threshold:
-                        break
-
-                threshold = 0
-                for _l2 in range(hist_intervals - 1):
-                    threshold += h
-                    if s_ik_jk < threshold:
-                        break
-
-                threshold = 0
-                for _l3 in range(hist_intervals - 1):
-                    threshold += h
-                    if s_ij_ik < threshold:
-                        break
-
-                scores_hist[_l1] += 1
-                scores_hist[_l2] += 1
-                scores_hist[_l3] += 1
-
-    return cum_scores, scores_hist
-
-
-def _triangle_scores_inner_cupy(n_img, Rijs, hist_intervals):
-    """
-    n: n_img
-    Rijs: nchoose2x3x3 array
+    @staticmethod
+    def _init_cupy_module():
+        """
+        Private utility method to read in CUDA source and return as compiled CUPY module.
+        """
 
-    """
-    import cupy as cp
-
-    # xxx
-    module = _init_cupy_module()
-
-    triangle_scores = module.get_function("triangle_scores_inner")
-
-    Rijs_dev = cp.array(Rijs)
-    # xxx I think we can safely remove cum_scores
-    cum_scores_dev = cp.zeros(
-        (n_img * (n_img - 1) // 2, n_img), dtype=np.float64
-    )  # n is for thread safety
-    scores_hist_dev = cp.zeros(
-        (hist_intervals, n_img), dtype=np.float64
-    )  # n is for thread safety
-
-    # call the kernel
-    blkszx = 512
-    nblkx = (n_img + blkszx - 1) // blkszx
-    triangle_scores(
-        (nblkx,),
-        (blkszx,),
-        (n_img, Rijs_dev, hist_intervals, cum_scores_dev, scores_hist_dev),
-    )
+        import cupy as cp
 
-    # accumulate over thread results
-    cum_scores = cp.sum(cum_scores_dev, axis=1).get()
-    scores_hist = cp.sum(scores_hist_dev, axis=1).get()
+        # Read in contents of file
+        with open("commonline_sync3n.cu", rb) as f:
+            module_code = f.read()
 
-    return cum_scores, scores_hist
+        # CUPY compile the CUDA code
+        return cp.RawModule(code=module_code)

From a6ad9f1e6c6d8e1ce211b86730db30f6b339e621 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 26 Apr 2024 10:42:49 -0400
Subject: [PATCH 28/60] initial cupy comparison test add

---
 src/aspire/abinitio/commonline_sync3n.cu | 421 +++++++++++++++++++++++
 src/aspire/abinitio/commonline_sync3n.py |  30 +-
 tests/test_commonline_sync3n_cupy.py     | 104 ++++++
 3 files changed, 541 insertions(+), 14 deletions(-)
 create mode 100644 src/aspire/abinitio/commonline_sync3n.cu
 create mode 100644 tests/test_commonline_sync3n_cupy.py

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
new file mode 100644
index 0000000000..3c0b0b9001
--- /dev/null
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -0,0 +1,421 @@
+
+/* from i,j indices (i < j) to the common index in the N-choose-2 sized array; arguments are parenthesized so expressions expand safely */
+#define PAIR_IDX(N,I,J) ((2*(N)-(I)-1)*(I)/2 + (J)-(I)-1)
+
+
+__host__ __device__ inline void mult_3x3(double *out, const double *R1, const double *R2) {
+  /* 3x3 matrix multiplication on row-major flattened arrays: out = R1*R2 (out must not alias R1 or R2).
+   * Note, this differs from the MATLAB mult_3x3.
+   * The __device__ qualifier is required for this helper to be callable from __global__ kernels. */
+
+  int i,j,k;
+
+  for(i=0; i<3; i++){
+    for(j=0; j<3; j++){
+      out[i*3 + j] = 0;
+      for (k=0; k<3; k++){
+        out[i*3 + j] += R1[i*3+k] * R2[k*3+j];
+      }
+    }
+  }
+}
+
+__host__ __device__ inline void JRJ(const double *R, double *A) {
+  /* multiply a flattened 3x3 matrix by J = diag(1,1,-1) from both sides: A = JRJ (negates entries 2,5,6,7) */
+  A[0]=R[0];
+  A[1]=R[1];
+  A[2]=-R[2];
+  A[3]=R[3];
+  A[4]=R[4];
+  A[5]=-R[5];
+  A[6]=-R[6];
+  A[7]=-R[7];
+  A[8]=R[8];
+}
+
+__host__ __device__ inline double diff_norm_3x3(const double *R1, const double *R2) {
+  /* subtract two flattened 3x3 matrices and return the squared Frobenius norm: ||R1-R2||^2 */
+  int i;
+  double norm = 0;
+  for (i=0; i<9; i++) {norm += (R1[i]-R2[i])*(R1[i]-R2[i]);}
+  return norm;
+}
+
+
+extern "C" __global__
+void signs_times_v(int n, double* Rijs, const double* vec, double* new_vec, bool J_weighting)
+{
+  /* global 1-D thread index; this thread handles every triplet i<j<k for its own "i" */
+  unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+  /* surplus threads (index past n-1) have no work */
+  if(i >= n) return;
+
+  double c[4];
+  unsigned int j;
+  unsigned int k;
+  for(k=0;k<4;k++){c[k]=0;}
+  unsigned long ij, jk, ik;
+  int best_i;
+  double best_val;
+  double s_ij_jk, s_ik_jk, s_ij_ik;
+  double alt_ij_jk, alt_ij_ik, alt_ik_jk;
+
+  double *Rij, *Rjk, *Rik;
+  double JRijJ[9], JRjkJ[9], JRikJ[9];
+  double tmp[9];
+  /* signs_confs[conf][0..2]: signs used for s_ij_jk, s_ik_jk, s_ij_ik when configuration conf is the best match */
+  int signs_confs[4][3];
+  for(int a=0; a<4; a++) { for(k=0; k<3; k++) { signs_confs[a][k]=1; } }
+  signs_confs[1][0]=-1; signs_confs[1][2]=-1;
+  signs_confs[2][0]=-1; signs_confs[2][1]=-1;
+  signs_confs[3][1]=-1; signs_confs[3][2]=-1;
+
+  /* initialize alternatives */
+  /* When we find the best J-configuration, we also compare it to the second-best alternative.
+   * This comparison is done independently for every pair in the triplet. To make sure that the
+   * alternative really differs with respect to the pair, we tabulate the differences between
+   * the configurations in advance:
+   * ALTS(:,best_conf,pair) = the two configurations in which J-sync differs from
+   * best_conf in relation to pair */
+
+  int ALTS[2][4][3];
+  ALTS[0][0][0]=1; ALTS[0][1][0]=0; ALTS[0][2][0]=0; ALTS[0][3][0]=1;
+  ALTS[1][0][0]=2; ALTS[1][1][0]=3; ALTS[1][2][0]=3; ALTS[1][3][0]=2;
+  ALTS[0][0][1]=2; ALTS[0][1][1]=2; ALTS[0][2][1]=0; ALTS[0][3][1]=0;
+  ALTS[1][0][1]=3; ALTS[1][1][1]=3; ALTS[1][2][1]=1; ALTS[1][3][1]=1;
+  ALTS[0][0][2]=1; ALTS[0][1][2]=0; ALTS[0][2][2]=1; ALTS[0][3][2]=0;
+  ALTS[1][0][2]=3; ALTS[1][1][2]=2; ALTS[1][2][2]=3; ALTS[1][3][2]=2;
+
+  /* loop over all triplets i<j<k; ij, ik, jk are the linear (PAIR_IDX) indices of the three pairs */
+  for(j=i+1; j< (n - 1); j++){
+    ij = PAIR_IDX(n, i, j);
+    for(k=j+1; k< n; k++){
+      ik = PAIR_IDX(n, i, k);
+      jk = PAIR_IDX(n, j, k);
+
+      /* c[0..3]: squared mismatch ||Rij*Rjk - Rik||^2 with none / Rij / Rjk / Rik J-conjugated */
+      Rij = Rijs + 9*ij;
+      Rjk = Rijs + 9*jk;
+      Rik = Rijs + 9*ik;
+
+      JRJ(Rij, JRijJ);
+      JRJ(Rjk, JRjkJ);
+      JRJ(Rik, JRikJ);
+
+      mult_3x3(tmp, Rij, Rjk);
+      c[0] = diff_norm_3x3(tmp, Rik);
+
+      mult_3x3(tmp, JRijJ, Rjk);
+      c[1] = diff_norm_3x3(tmp, Rik);
+
+      mult_3x3(tmp, Rij, JRjkJ);
+      c[2] = diff_norm_3x3(tmp, Rik);
+
+      mult_3x3(tmp, Rij, Rjk);
+      c[3] = diff_norm_3x3(tmp, JRikJ);
+
+      /* find best match (smallest mismatch; ties keep the lowest index) */
+      best_i=0; best_val=c[0];
+      if (c[1]<best_val) {best_i=1; best_val=c[1];}
+      if (c[2]<best_val) {best_i=2; best_val=c[2];}
+      if (c[3]<best_val) {best_i=3; best_val=c[3];}
+
+      /* start from the plain +/-1 signs of the best configuration */
+      s_ij_jk = signs_confs[best_i][0];
+      s_ik_jk = signs_confs[best_i][1];
+      s_ij_ik = signs_confs[best_i][2];
+
+      /* optional J weighting: scale each sign by how clearly the best configuration beats its alternative */
+      if(J_weighting){
+        /* for each triangle side, find the best alternative */
+        alt_ij_jk = c[ALTS[0][best_i][0]];
+        if (c[ALTS[1][best_i][0]] < alt_ij_jk){
+          alt_ij_jk = c[ALTS[1][best_i][0]];
+        }
+
+        alt_ik_jk = c[ALTS[0][best_i][1]];
+        if (c[ALTS[1][best_i][1]] < alt_ik_jk){
+          alt_ik_jk = c[ALTS[1][best_i][1]];
+        }
+        alt_ij_ik = c[ALTS[0][best_i][2]];
+        if (c[ALTS[1][best_i][2]] < alt_ij_ik){
+          alt_ij_ik = c[ALTS[1][best_i][2]];
+        }
+
+        /* weight each sign by 1 - sqrt(best / alternative) */
+        s_ij_jk *= 1 - sqrt(best_val / alt_ij_jk);
+        s_ik_jk *= 1 - sqrt(best_val / alt_ik_jk);
+        s_ij_ik *= 1 - sqrt(best_val / alt_ij_ik);
+      }
+
+
+      /* accumulate into column i of new_vec (layout [pair*n + i]), so no two threads write the same element */
+      new_vec[ij*n + i] += s_ij_jk*vec[jk] + s_ij_ik*vec[ik];
+      new_vec[jk*n + i] += s_ij_jk*vec[ij] + s_ik_jk*vec[ik];
+      new_vec[ik*n + i] += s_ij_ik*vec[ij] + s_ik_jk*vec[jk];
+
+    } /* k */
+  } /* j */
+
+  return;
+};
+
+extern "C" __global__
+void pairs_probabilities(int n, double* Rijs, double P2, double A, double a, double B, double b, double x0, double* ln_f_ind, double* ln_f_arb)
+{
+  /* global 1-D thread index; this thread handles every triplet i<j<k for its own "i" */
+  unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+  /* surplus threads (index past n-1) have no work */
+  if(i >= n) return;
+
+  double c[4];
+  unsigned int j;
+  unsigned int k;
+  for(k=0;k<4;k++){c[k]=0;}
+  unsigned long ij, jk, ik;
+  int best_i;
+  double best_val;
+  double s_ij_jk, s_ik_jk, s_ij_ik;
+  double alt_ij_jk, alt_ij_ik, alt_ik_jk;
+  double f_ij_jk, f_ik_jk, f_ij_ik;
+
+
+  double *Rij, *Rjk, *Rik;
+  double JRijJ[9], JRjkJ[9], JRikJ[9];
+  double tmp[9];
+  /* NOTE(review): signs_confs is filled but never read in this kernel; "int a" below shadows parameter a inside that loop only */
+  int signs_confs[4][3];
+  for(int a=0; a<4; a++) { for(k=0; k<3; k++) { signs_confs[a][k]=1; } }
+  signs_confs[1][0]=-1; signs_confs[1][2]=-1;
+  signs_confs[2][0]=-1; signs_confs[2][1]=-1;
+  signs_confs[3][1]=-1; signs_confs[3][2]=-1;
+
+  /* initialize alternatives */
+  /* When we find the best J-configuration, we also compare it to the second-best alternative.
+   * This comparison is done independently for every pair in the triplet. To make sure that the
+   * alternative really differs with respect to the pair, we tabulate the differences between
+   * the configurations in advance:
+   * ALTS(:,best_conf,pair) = the two configurations in which J-sync differs from
+   * best_conf in relation to pair */
+
+  int ALTS[2][4][3];
+  ALTS[0][0][0]=1; ALTS[0][1][0]=0; ALTS[0][2][0]=0; ALTS[0][3][0]=1;
+  ALTS[1][0][0]=2; ALTS[1][1][0]=3; ALTS[1][2][0]=3; ALTS[1][3][0]=2;
+  ALTS[0][0][1]=2; ALTS[0][1][1]=2; ALTS[0][2][1]=0; ALTS[0][3][1]=0;
+  ALTS[1][0][1]=3; ALTS[1][1][1]=3; ALTS[1][2][1]=1; ALTS[1][3][1]=1;
+  ALTS[0][0][2]=1; ALTS[0][1][2]=0; ALTS[0][2][2]=1; ALTS[0][3][2]=0;
+  ALTS[1][0][2]=3; ALTS[1][1][2]=2; ALTS[1][2][2]=3; ALTS[1][3][2]=2;
+
+  /* loop over all triplets i<j<k; ij, ik, jk are the linear (PAIR_IDX) indices of the three pairs */
+  for(j=i+1; j< (n - 1); j++){
+    ij = PAIR_IDX(n, i, j);
+    for(k=j+1; k< n; k++){
+      ik = PAIR_IDX(n, i, k);
+      jk = PAIR_IDX(n, j, k);
+
+      /* c[0..3]: squared mismatch ||Rij*Rjk - Rik||^2 with none / Rij / Rjk / Rik J-conjugated */
+      Rij = Rijs + 9*ij;
+      Rjk = Rijs + 9*jk;
+      Rik = Rijs + 9*ik;
+
+      JRJ(Rij, JRijJ);
+      JRJ(Rjk, JRjkJ);
+      JRJ(Rik, JRikJ);
+
+      mult_3x3(tmp, Rij, Rjk);
+      c[0] = diff_norm_3x3(tmp, Rik);
+
+      mult_3x3(tmp, JRijJ, Rjk);
+      c[1] = diff_norm_3x3(tmp, Rik);
+
+      mult_3x3(tmp, Rij, JRjkJ);
+      c[2] = diff_norm_3x3(tmp, Rik);
+
+      mult_3x3(tmp, Rij, Rjk);
+      c[3] = diff_norm_3x3(tmp, JRikJ);
+
+      /* find best match (smallest mismatch; ties keep the lowest index) */
+      best_i=0; best_val=c[0];
+      if (c[1]<best_val) {best_i=1; best_val=c[1];}
+      if (c[2]<best_val) {best_i=2; best_val=c[2];}
+      if (c[3]<best_val) {best_i=3; best_val=c[3];}
+
+      /* for each triangle side, find the best alternative */
+      alt_ij_jk = c[ALTS[0][best_i][0]];
+      if (c[ALTS[1][best_i][0]] < alt_ij_jk){
+        alt_ij_jk = c[ALTS[1][best_i][0]];
+      }
+
+      alt_ik_jk = c[ALTS[0][best_i][1]];
+      if (c[ALTS[1][best_i][1]] < alt_ik_jk){
+        alt_ik_jk = c[ALTS[1][best_i][1]];
+      }
+      alt_ij_ik = c[ALTS[0][best_i][2]];
+      if (c[ALTS[1][best_i][2]] < alt_ij_ik){
+        alt_ij_ik = c[ALTS[1][best_i][2]];
+      }
+
+      /* triangle scores: 1 - sqrt(best / alternative) for each side */
+      s_ij_jk = 1 - sqrt(best_val / alt_ij_jk);
+      s_ik_jk = 1 - sqrt(best_val / alt_ik_jk);
+      s_ij_ik = 1 - sqrt(best_val / alt_ij_ik);
+
+
+      /* log of the probability of a pair to have the observed triangle scores,
+         given it has an indicative common line; accumulated into column i (layout [pair*n + i]) */
+      f_ij_jk = log( P2*(B*pow(1-s_ij_jk,b)*exp(-b/(1-x0)*(1-s_ij_jk))) + (1-P2)*A*pow((1-s_ij_jk),a) );
+      f_ik_jk = log( P2*(B*pow(1-s_ik_jk,b)*exp(-b/(1-x0)*(1-s_ik_jk))) + (1-P2)*A*pow((1-s_ik_jk),a) );
+      f_ij_ik = log( P2*(B*pow(1-s_ij_ik,b)*exp(-b/(1-x0)*(1-s_ij_ik))) + (1-P2)*A*pow((1-s_ij_ik),a) );
+      ln_f_ind[ij*n +i] += f_ij_jk + f_ij_ik;
+      ln_f_ind[jk*n +i] += f_ij_jk + f_ik_jk;
+      ln_f_ind[ik*n +i] += f_ik_jk + f_ij_ik;
+
+      /* log of the probability of a pair to have the observed triangle scores,
+         given it has an arbitrary common line; f_* are reused as temporaries */
+      f_ij_jk = log( A*pow((1-s_ij_jk),a) );
+      f_ik_jk = log( A*pow((1-s_ik_jk),a) );
+      f_ij_ik = log( A*pow((1-s_ij_ik),a) );
+      ln_f_arb[ij*n +i] += f_ij_jk + f_ij_ik;
+      ln_f_arb[jk*n +i] += f_ij_jk + f_ik_jk;
+      ln_f_arb[ik*n +i] += f_ik_jk + f_ij_ik;
+
+
+    } /* k */
+  } /* j */
+
+  return;
+};
+
+
+extern "C" __global__
+void triangle_scores_inner(int n, double* Rijs, int n_intervals, double* cum_scores, double* scores_hist)
+{
+  /* global 1-D thread index; this thread handles every triplet i<j<k for its own "i" */
+  unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+  /* surplus threads (index past n-1) have no work */
+  if(i >= n) return;
+
+  double c[4];
+  unsigned int j;
+  unsigned int k;
+  for(k=0;k<4;k++){c[k]=0;}
+  unsigned long ij, jk, ik;
+  int best_i;
+  double best_val;
+  double s_ij_jk, s_ik_jk, s_ij_ik;
+  double alt_ij_jk, alt_ij_ik, alt_ik_jk;
+  unsigned int l1,l2,l3;
+  double threshold;
+  double h = 1. / n_intervals;
+
+  double *Rij, *Rjk, *Rik;
+  double JRijJ[9], JRjkJ[9], JRikJ[9];
+  double tmp[9];
+
+  /* initialize alternatives */
+  /* When we find the best J-configuration, we also compare it to the second-best alternative.
+   * This comparison is done independently for every pair in the triplet. To make sure that the
+   * alternative really differs with respect to the pair, we tabulate the differences between
+   * the configurations in advance:
+   * ALTS(:,best_conf,pair) = the two configurations in which J-sync differs from
+   * best_conf in relation to pair */
+
+  int ALTS[2][4][3];
+  ALTS[0][0][0]=1; ALTS[0][1][0]=0; ALTS[0][2][0]=0; ALTS[0][3][0]=1;
+  ALTS[1][0][0]=2; ALTS[1][1][0]=3; ALTS[1][2][0]=3; ALTS[1][3][0]=2;
+  ALTS[0][0][1]=2; ALTS[0][1][1]=2; ALTS[0][2][1]=0; ALTS[0][3][1]=0;
+  ALTS[1][0][1]=3; ALTS[1][1][1]=3; ALTS[1][2][1]=1; ALTS[1][3][1]=1;
+  ALTS[0][0][2]=1; ALTS[0][1][2]=0; ALTS[0][2][2]=1; ALTS[0][3][2]=0;
+  ALTS[1][0][2]=3; ALTS[1][1][2]=2; ALTS[1][2][2]=3; ALTS[1][3][2]=2;
+
+  /* loop over all triplets i<j<k; ij, ik, jk are the linear (PAIR_IDX) indices of the three pairs */
+  for(j=i+1; j< (n - 1); j++){
+    ij = PAIR_IDX(n, i, j);
+    for(k=j+1; k< n; k++){
+      ik = PAIR_IDX(n, i, k);
+      jk = PAIR_IDX(n, j, k);
+
+      /* c[0..3]: squared mismatch ||Rij*Rjk - Rik||^2 with none / Rij / Rjk / Rik J-conjugated */
+      Rij = Rijs + 9*ij;
+      Rjk = Rijs + 9*jk;
+      Rik = Rijs + 9*ik;
+
+      JRJ(Rij, JRijJ);
+      JRJ(Rjk, JRjkJ);
+      JRJ(Rik, JRikJ);
+
+      mult_3x3(tmp, Rij, Rjk);
+      c[0] = diff_norm_3x3(tmp, Rik);
+
+      mult_3x3(tmp, JRijJ, Rjk);
+      c[1] = diff_norm_3x3(tmp, Rik);
+
+      mult_3x3(tmp, Rij, JRjkJ);
+      c[2] = diff_norm_3x3(tmp, Rik);
+
+      mult_3x3(tmp, Rij, Rjk);
+      c[3] = diff_norm_3x3(tmp, JRikJ);
+
+      /* find best match (smallest mismatch; ties keep the lowest index) */
+      best_i=0; best_val=c[0];
+      if (c[1]<best_val) {best_i=1; best_val=c[1];}
+      if (c[2]<best_val) {best_i=2; best_val=c[2];}
+      if (c[3]<best_val) {best_i=3; best_val=c[3];}
+
+      /* for each triangle side, find the best alternative */
+      alt_ij_jk = c[ALTS[0][best_i][0]];
+      if (c[ALTS[1][best_i][0]] < alt_ij_jk){
+        alt_ij_jk = c[ALTS[1][best_i][0]];
+      }
+
+      alt_ik_jk = c[ALTS[0][best_i][1]];
+      if (c[ALTS[1][best_i][1]] < alt_ik_jk){
+        alt_ik_jk = c[ALTS[1][best_i][1]];
+      }
+      alt_ij_ik = c[ALTS[0][best_i][2]];
+      if (c[ALTS[1][best_i][2]] < alt_ij_ik){
+        alt_ij_ik = c[ALTS[1][best_i][2]];
+      }
+
+      /* triangle scores: 1 - sqrt(best / alternative) for each side */
+      s_ij_jk = 1 - sqrt(best_val / alt_ij_jk);
+      s_ik_jk = 1 - sqrt(best_val / alt_ik_jk);
+      s_ij_ik = 1 - sqrt(best_val / alt_ij_ik);
+
+
+      /* update accumulated scores in column i (layout [pair*n + i]), so no two threads write the same element */
+      cum_scores[ij*n+i] += s_ij_jk + s_ij_ik;
+      cum_scores[jk*n+i] += s_ij_jk + s_ik_jk;
+      cum_scores[ik*n+i] += s_ik_jk + s_ij_ik;
+
+      /* update scores histogram: bin l is the first whose running threshold (l+1)*h exceeds the score; the last bin takes the rest */
+      threshold = 0;
+      for (l1=0; l1<n_intervals-1; l1++) {
+        threshold += h;
+        if (s_ij_jk < threshold) {break;}
+      }
+
+      threshold = 0;
+      for(l2=0; l2<n_intervals-1; l2++) {
+        threshold += h;
+        if(s_ik_jk < threshold) {break;}
+      }
+
+      threshold = 0;
+      for(l3=0; l3<n_intervals-1; l3++) {
+        threshold += h;
+        if (s_ij_ik < threshold) {break;}
+      }
+
+      scores_hist[l1*n+i] += 1;
+      scores_hist[l2*n+i] += 1;
+      scores_hist[l3*n+i] += 1;
+
+    } /* k */
+  } /* j */
+
+  return;
+};
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 7c9d65c81b..1f957168d0 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -1,4 +1,5 @@
 import logging
+import os.path
 
 import numpy as np
 from numpy.linalg import norm
@@ -37,7 +38,7 @@ def __init__(
         self,
         src,
         n_rad=None,
-        n_theta=None,
+        n_theta=360,
         max_shift=0.15,
         shift_step=1,
         epsilon=1e-2,
@@ -98,7 +99,7 @@ def __init__(
                 logger.info(
                     f"cupy and GPU {gpu_id} found by cuda runtime; enabling cupy."
                 )
-                self._gpu_module = _init_cupy_module()
+                self._gpu_module = self._init_cupy_module()
             else:
                 logger.info("GPU not found, defaulting to numpy.")
 
@@ -411,16 +412,16 @@ def _triangle_scores_inner_cupy(self, Rijs):
 
         # xxx I think we can safely remove cum_scores
         cum_scores_dev = cp.zeros(
-            (n_img * (n_img - 1) // 2, n_img), dtype=np.float64
+            (self.n_img * (self.n_img - 1) // 2, self.n_img), dtype=np.float64
         )  # n is for thread safety
 
         scores_hist_dev = cp.zeros(
-            (hist_intervals, n_img), dtype=np.float64
+            (self.hist_intervals, self.n_img), dtype=np.float64
         )  # n is for thread safety
 
         # call the kernel
         blkszx = 512
-        nblkx = (n_img + blkszx - 1) // blkszx
+        nblkx = (self.n_img + blkszx - 1) // blkszx
         triangle_scores(
             (nblkx,),
             (blkszx,),
@@ -614,12 +615,12 @@ def _triangle_scores(
             cum_scores /= len(Rijs)
 
         # Histogram decomposition: P & sigma evaluation
-        h = 1 / hist_intervals
+        h = 1 / self.hist_intervals
         hist_x = np.arange(h / 2, 1, h)
         # normalization factor of one component of the histogram
         A = (
             (self.n_img * (self.n_img - 1) * (self.n_img - 2) / 2)
-            / hist_intervals
+            / self.hist_intervals
             * (a + 1)
         )
         # normalization of 2nd component: B = P*N_delta/sum(f), where f is the component formula
@@ -656,7 +657,7 @@ def fun(x, B, P, b, x0, A=A, a=a):
         A = a + 1  # distribution 1st component normalization factor
         # distribution 2nd component normalization factor
         B = B / (
-            (self.n_img * (self.n_img - 1) * (self.n_img - 2) / 2) / hist_intervals
+            (self.n_img * (self.n_img - 1) * (self.n_img - 2) / 2) / self.hist_intervals
         )
 
         # Calculate probabilities
@@ -831,7 +832,7 @@ def _signs_times_v_host(self, Rijs, vec):
 
         c = np.empty((4))
         desc = "Computing signs_times_v"
-        if J_weighting:
+        if self.J_weighting:
             desc += " with J_weighting"
         for i in trange(self.n_img, desc=desc):
             for j in range(
@@ -911,13 +912,13 @@ def _signs_times_v_cupy(self, Rijs, vec):
         Rijs_dev = cp.array(Rijs)
         vec_dev = cp.array(vec)
         # 2d over i then accum to avoid race on i
-        new_vec_dev = cp.zeros((vec.shape[0], n))
+        new_vec_dev = cp.zeros((vec.shape[0], self.n_img))
 
         # call the kernel
         blkszx = 512
-        nblkx = (n + blkszx - 1) // blkszx
+        nblkx = (self.n_img + blkszx - 1) // blkszx
         signs_times_v(
-            (nblkx,), (blkszx,), (n, Rijs_dev, vec_dev, new_vec_dev, J_weighting)
+            (nblkx,), (blkszx,), (self.n_img, Rijs_dev, vec_dev, new_vec_dev, self.J_weighting)
         )
 
         # accumulate, can reuse the vec_dev array now.
@@ -937,8 +938,9 @@ def _init_cupy_module():
         import cupy as cp
 
         # Read in contents of file
-        with open("commonline_sync3n.cu", rb) as f:
-            module_code = f.read()
+        fp = os.path.join(os.path.dirname(__file__), "commonline_sync3n.cu")
+        with open(fp, 'r') as fh:
+            module_code = fh.read()
 
         # CUPY compile the CUDA code
         return cp.RawModule(code=module_code)
diff --git a/tests/test_commonline_sync3n_cupy.py b/tests/test_commonline_sync3n_cupy.py
new file mode 100644
index 0000000000..f80f6b12fc
--- /dev/null
+++ b/tests/test_commonline_sync3n_cupy.py
@@ -0,0 +1,104 @@
+import numpy as np
+import pytest
+
+from aspire.source import Simulation
+from aspire.abinitio.commonline_sync3n import CLSync3N
+
+DTYPE = np.float64
+N = 64
+n_pairs = N * (N - 1) // 2
+
+@pytest.fixture
+def src_fixture():
+    src = Simulation(n=N, L=32, C=1, dtype=DTYPE)
+    src = src.cache()
+    return src
+
+@pytest.fixture
+def cl3n_fixture(src_fixture):
+    cl = CLSync3N(src_fixture)
+    return cl
+
+@pytest.fixture
+def rijs_fixture():
+    Rijs = np.arange(n_pairs * 3 * 3).reshape(n_pairs, 3, 3)
+    Rijs = Rijs.astype(dtype=DTYPE, copy=False)
+    return Rijs
+
+def test_pairs_prob_host_vs_cupy(cl3n_fixture, rijs_fixture):
+    """
+    Compares pairs_probabilities  between host and cupy implementations.
+    """
+    
+    P2, A, a, B, b, x0 = 1, 2, 3, 4, 5, 6
+
+    # DTYPE is critical here (manually calling private method
+    params = np.array([P2, A, a, B, b, x0], dtype=np.float64)
+
+    # Execute CUPY
+    indscp, arbcp = cl3n_fixture._pairs_probabilities_cupy(rijs_fixture, *params)
+
+    # Execute host
+    indsh, arbh = cl3n_fixture._pairs_probabilities_host(rijs_fixture, *params)
+
+    # Compare host to cupy calls
+    np.testing.assert_allclose(indsh, indscp)
+    np.testing.assert_allclose(arbh, arbcp)
+
+def test_triangle_scores_host_vs_cupy(cl3n_fixture, rijs_fixture):
+    """
+    Compares triangle_scores between host and cupy implementations.
+    """
+    # DTYPE is critical here (manually calling private method
+
+    # Execute CUPY
+    cucp, hicp = cl3n_fixture._triangle_scores_inner_cupy(rijs_fixture)
+
+    # Execute host
+    cuh, hih = cl3n_fixture._triangle_scores_inner_host(rijs_fixture)
+
+    # Compare host to cupy calls
+    np.testing.assert_allclose(cucp,cuh)
+    np.testing.assert_allclose(hicp,hih)
+
+def test_stv_host_vs_cupy(cl3n_fixture, rijs_fixture):
+    """
+    Compares signs_times_v between host and cupy implementations.
+
+    Default J_weighting=False
+    """
+    # dummy data vector
+    vec = np.ones(n_pairs, dtype=DTYPE)
+
+    # J_weighting=False
+    assert cl3n_fixture.J_weighting == False
+
+    # Execute CUPY
+    new_vec_cp = cl3n_fixture._signs_times_v_cupy(rijs_fixture, vec)
+
+    # Execute host
+    new_vec_h = cl3n_fixture._signs_times_v_host(rijs_fixture, vec)
+
+    # Compare host to cupy calls
+    np.testing.assert_allclose(new_vec_cp, new_vec_h)
+
+def test_stvJwt_host_vs_cupy(cl3n_fixture, rijs_fixture):
+    """
+    Compares signs_times_v between host and cupy implementations.
+
+    Force J_weighting=True
+    """
+    # dummy data vector
+    vec = np.ones(n_pairs, dtype=DTYPE)
+
+    # J_weighting=True
+    cl3n_fixture.J_weighting = True
+
+    # Execute CUPY
+    new_vec_cp = cl3n_fixture._signs_times_v_cupy(rijs_fixture, vec)
+
+    # Execute host
+    new_vec_h = cl3n_fixture._signs_times_v_host(rijs_fixture, vec)
+
+    # Compare host to cupy calls
+    np.testing.assert_allclose(new_vec_cp, new_vec_h)

From 854a0bfe77baabbe94e8485d64eb486063f39acd Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 26 Apr 2024 10:47:29 -0400
Subject: [PATCH 29/60] cleanup cl3n compare test a little

---
 src/aspire/abinitio/commonline_sync3n.py |  6 ++++--
 tests/test_commonline_sync3n_cupy.py     | 25 +++++++++++++++---------
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 1f957168d0..bc6f7634d8 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -918,7 +918,9 @@ def _signs_times_v_cupy(self, Rijs, vec):
         blkszx = 512
         nblkx = (self.n_img + blkszx - 1) // blkszx
         signs_times_v(
-            (nblkx,), (blkszx,), (self.n_img, Rijs_dev, vec_dev, new_vec_dev, self.J_weighting)
+            (nblkx,),
+            (blkszx,),
+            (self.n_img, Rijs_dev, vec_dev, new_vec_dev, self.J_weighting),
         )
 
         # accumulate, can reuse the vec_dev array now.
@@ -939,7 +941,7 @@ def _init_cupy_module():
 
         # Read in contents of file
         fp = os.path.join(os.path.dirname(__file__), "commonline_sync3n.cu")
-        with open(fp, 'r') as fh:
+        with open(fp, "r") as fh:
             module_code = fh.read()
 
         # CUPY compile the CUDA code
diff --git a/tests/test_commonline_sync3n_cupy.py b/tests/test_commonline_sync3n_cupy.py
index f80f6b12fc..8068266e65 100644
--- a/tests/test_commonline_sync3n_cupy.py
+++ b/tests/test_commonline_sync3n_cupy.py
@@ -1,35 +1,39 @@
 import numpy as np
 import pytest
 
-from aspire.source import Simulation
 from aspire.abinitio.commonline_sync3n import CLSync3N
+from aspire.source import Simulation
 
-DTYPE = np.float64
-N = 64
+DTYPE = np.float64  # TODO, consider single precision.
+N = 64  # Number of images
 n_pairs = N * (N - 1) // 2
 
+
 @pytest.fixture
 def src_fixture():
     src = Simulation(n=N, L=32, C=1, dtype=DTYPE)
     src = src.cache()
     return src
 
+
 @pytest.fixture
 def cl3n_fixture(src_fixture):
     cl = CLSync3N(src_fixture)
     return cl
 
+
 @pytest.fixture
 def rijs_fixture():
     Rijs = np.arange(n_pairs * 3 * 3).reshape(n_pairs, 3, 3)
     Rijs = Rijs.astype(dtype=DTYPE, copy=False)
     return Rijs
 
+
 def test_pairs_prob_host_vs_cupy(cl3n_fixture, rijs_fixture):
     """
     Compares pairs_probabilities  between host and cupy implementations.
     """
-    
+
     P2, A, a, B, b, x0 = 1, 2, 3, 4, 5, 6
 
     # DTYPE is critical here (manually calling private method
@@ -45,6 +49,7 @@ def test_pairs_prob_host_vs_cupy(cl3n_fixture, rijs_fixture):
     np.testing.assert_allclose(indsh, indscp)
     np.testing.assert_allclose(arbh, arbcp)
 
+
 def test_triangle_scores_host_vs_cupy(cl3n_fixture, rijs_fixture):
     """
     Compares triangle_scores between host and cupy implementations.
@@ -58,8 +63,9 @@ def test_triangle_scores_host_vs_cupy(cl3n_fixture, rijs_fixture):
     cuh, hih = cl3n_fixture._triangle_scores_inner_host(rijs_fixture)
 
     # Compare host to cupy calls
-    np.testing.assert_allclose(cucp,cuh)
-    np.testing.assert_allclose(hicp,hih)
+    np.testing.assert_allclose(cucp, cuh)
+    np.testing.assert_allclose(hicp, hih)
+
 
 def test_stv_host_vs_cupy(cl3n_fixture, rijs_fixture):
     """
@@ -68,10 +74,10 @@ def test_stv_host_vs_cupy(cl3n_fixture, rijs_fixture):
     Default J_weighting=False
     """
     # dummy data vector
-    vec = np.ones(n_pairs, dtype=DTYPE)
+    vec = np.random.random(n_pairs).astype(dtype=DTYPE, copy=False)
 
     # J_weighting=False
-    assert cl3n_fixture.J_weighting == False
+    assert cl3n_fixture.J_weighting is False
 
     # Execute CUPY
     new_vec_cp = cl3n_fixture._signs_times_v_cupy(rijs_fixture, vec)
@@ -82,6 +88,7 @@ def test_stv_host_vs_cupy(cl3n_fixture, rijs_fixture):
     # Compare host to cupy calls
     np.testing.assert_allclose(new_vec_cp, new_vec_h)
 
+
 def test_stvJwt_host_vs_cupy(cl3n_fixture, rijs_fixture):
     """
     Compares signs_times_v between host and cupy implementations.
@@ -89,7 +96,7 @@ def test_stvJwt_host_vs_cupy(cl3n_fixture, rijs_fixture):
     Force J_weighting=True
     """
     # dummy data vector
-    vec = np.ones(n_pairs, dtype=DTYPE)
+    vec = np.random.random(n_pairs).astype(dtype=DTYPE, copy=False)
 
     # J_weighting=True
     cl3n_fixture.J_weighting = True

From 41e68c8f8bd56d1e61878557ea34340c1ece2a36 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 26 Apr 2024 10:49:42 -0400
Subject: [PATCH 30/60] rm merged test file

---
 x.py | 87 ------------------------------------------------------------
 1 file changed, 87 deletions(-)
 delete mode 100644 x.py

diff --git a/x.py b/x.py
deleted file mode 100644
index ebc5e6d768..0000000000
--- a/x.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import pickle
-import time
-from collections import defaultdict
-
-import cupy as cp
-import matplotlib.pyplot as plt
-import numpy as np
-
-from aspire.abinitio.commonline_sync3n import _signs_times_v_cupy, _signs_times_v_host
-from aspire.utils import all_pairs
-
-
-def time_test(n):
-    n_pairs = n * (n - 1) // 2
-    _, _pairs_to_linear = all_pairs(n, return_map=True)
-
-    vec = np.ones(n_pairs, dtype=np.float64)
-    # Rijs = np.random.randn(n_pairs*3*3).astype(dtype=np.float64)
-    Rijs = np.arange(n_pairs * 3 * 3).reshape(n_pairs, 3, 3).astype(dtype=np.float64)
-
-    tic0 = time.perf_counter()
-    new_vec = _signs_times_v_cupy(n, Rijs, vec, J_weighting=False)
-    tic1 = time.perf_counter()
-    gpu_time = tic1 - tic0
-    print("gpu\n", new_vec)
-
-    tic2 = time.perf_counter()
-    new_vec_host = _signs_times_v_host(
-        n, Rijs, vec, J_weighting=False, _ALTS=None, _pairs_to_linear=_pairs_to_linear
-    )
-    tic3 = time.perf_counter()
-    host_time = tic3 - tic2
-    print("host\n", new_vec_host)
-
-    print(f"\n\n\nSize:\t{n}")
-    print("Allclose? ", np.allclose(new_vec_host, new_vec))
-    print(f"gpu_time: {gpu_time}")
-    print(f"host_time: {host_time}")
-    speedup = host_time / gpu_time
-    print(f"speedup: {speedup}")
-
-    return host_time, gpu_time, speedup
-
-
-def plotit(results):
-    N = np.array(list(results.keys()))
-    H = np.array([v["host"] for v in results.values()])
-    G = np.array([v["gpu"] for v in results.values()])
-    S = np.array([v["speedup"] for v in results.values()])
-
-    plt.plot(N, H, label="host python")
-    plt.plot(N, G, label="cuda")
-    plt.title("Walltimes (s)")
-    plt.legend()
-    plt.show()
-    plt.savefig("walltimes.png")
-    plt.clf()
-
-    plt.plot(N, S)
-    plt.title("Speedup Ratio")
-    plt.show()
-    plt.savefig("speedups.png")
-    plt.clf()
-
-
-def main():
-    results = defaultdict(dict)
-    # too long...! for n in [4,16,64,100,128,200,256,512,1024,2048,3000, 4096, 10000]:
-    # for n in [4,16]: # test
-    for n in [4, 16, 64, 100, 128, 200, 512]:
-        h, g, s = time_test(n)
-        results[n]["host"] = h
-        results[n]["gpu"] = g
-        results[n]["speedup"] = s
-
-        # save in case we cancel
-        with open("saved_results.pkl", "wb") as f:
-            pickle.dump(results, f)
-
-    print()
-    print(results)
-    print()
-
-    plotit(results)
-
-
-time_test(64)

From 61d349819423ee113ca585c1fb9f09376f3b2758 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 26 Apr 2024 10:53:52 -0400
Subject: [PATCH 31/60] fixup manifest

---
 MANIFEST.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MANIFEST.in b/MANIFEST.in
index 4477aa87c0..ecc7484b40 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -17,6 +17,7 @@ recursive-include docs *.rst
 recursive-include docs Makefile
 recursive-include docs *.sh
 recursive-include src *.conf
+recursive-include src *.cu
 recursive-include src *.yaml
 prune docs/build
 prune docs/source

From d571349bc115b551a7578be22a9618c42f6e0c7c Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 26 Apr 2024 10:58:00 -0400
Subject: [PATCH 32/60] remove unused cum_scores

---
 src/aspire/abinitio/commonline_sync3n.cu |  8 +-----
 src/aspire/abinitio/commonline_sync3n.py | 35 ++++++------------------
 tests/test_commonline_sync3n_cupy.py     | 10 ++++---
 3 files changed, 15 insertions(+), 38 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index 3c0b0b9001..58ee75a98e 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -290,7 +290,7 @@ void pairs_probabilities(int n, double* Rijs, double P2, double A, double a, dou
 
 
 extern "C" __global__
-void triangle_scores_inner(int n, double* Rijs, int n_intervals, double* cum_scores, double* scores_hist)
+void triangle_scores_inner(int n, double* Rijs, int n_intervals, double* scores_hist)
 {
   /* thread index (1d), represents "i" index */
   unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -385,12 +385,6 @@ void triangle_scores_inner(int n, double* Rijs, int n_intervals, double* cum_sco
       s_ik_jk = 1 - sqrt(best_val / alt_ik_jk);
       s_ij_ik = 1 - sqrt(best_val / alt_ij_ik);
 
-
-      /* update cumulated scores */
-      cum_scores[ij*n+i] += s_ij_jk + s_ij_ik;
-      cum_scores[jk*n+i] += s_ij_jk + s_ik_jk;
-      cum_scores[ik*n+i] += s_ik_jk + s_ij_ik;
-
       /* update scores histogram */
       threshold = 0;
       for (l1=0; l1<n_intervals-1; l1++) {
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index bc6f7634d8..6f09350589 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -247,9 +247,7 @@ def _syncmatrix_weights(
 
         def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
             # Get inistial estimate for Pij
-            P, sigma, Pij, hist, cum_scores = self._triangle_scores(
-                Rijs, hist, Pmin, Pmax
-            )
+            P, sigma, Pij, hist = self._triangle_scores(Rijs, hist, Pmin, Pmax)
 
             # Check if P and Pij are consistent
             mean_Pij = np.mean(Pij)
@@ -307,18 +305,17 @@ def _triangle_scores_inner(self, Rijs):
 
         # host/gpu dispatch
         if self._gpu_module:
-            cum_scores, scores_hist = self._triangle_scores_inner_cupy(Rijs)
+            scores_hist = self._triangle_scores_inner_cupy(Rijs)
         else:
-            cum_scores, scores_hist = self._triangle_scores_inner_host(Rijs)
+            scores_hist = self._triangle_scores_inner_host(Rijs)
 
-        return cum_scores, scores_hist
+        return scores_hist
 
     def _triangle_scores_inner_host(self, Rijs):
 
         # The following is adopted from Matlab triangle_scores_mex.c
 
         # Initialize probability result arrays
-        cum_scores = np.zeros(len(Rijs), dtype=Rijs.dtype)
         scores_hist = np.zeros(self.hist_intervals, dtype=Rijs.dtype)
         h = 1 / self.hist_intervals
 
@@ -368,11 +365,6 @@ def _triangle_scores_inner_host(self, Rijs):
                     s_ik_jk = 1 - np.sqrt(best_val / alt_ik_jk)
                     s_ij_ik = 1 - np.sqrt(best_val / alt_ij_ik)
 
-                    # Update cumulated scores
-                    cum_scores[ij] += s_ij_jk + s_ij_ik
-                    cum_scores[jk] += s_ij_jk + s_ik_jk
-                    cum_scores[ik] += s_ik_jk + s_ij_ik
-
                     # Update histogram
                     threshold = 0
                     for _l1 in range(self.hist_intervals - 1):
@@ -396,7 +388,7 @@ def _triangle_scores_inner_host(self, Rijs):
                     scores_hist[_l2] += 1
                     scores_hist[_l3] += 1
 
-        return cum_scores, scores_hist
+        return scores_hist
 
     def _triangle_scores_inner_cupy(self, Rijs):
         """
@@ -410,11 +402,6 @@ def _triangle_scores_inner_cupy(self, Rijs):
 
         Rijs_dev = cp.array(Rijs)
 
-        # xxx I think we can safely remove cum_scores
-        cum_scores_dev = cp.zeros(
-            (self.n_img * (self.n_img - 1) // 2, self.n_img), dtype=np.float64
-        )  # n is for thread safety
-
         scores_hist_dev = cp.zeros(
             (self.hist_intervals, self.n_img), dtype=np.float64
         )  # n is for thread safety
@@ -429,16 +416,14 @@ def _triangle_scores_inner_cupy(self, Rijs):
                 self.n_img,
                 Rijs_dev,
                 self.hist_intervals,
-                cum_scores_dev,
                 scores_hist_dev,
             ),
         )
 
         # accumulate over thread results
-        cum_scores = cp.sum(cum_scores_dev, axis=1).get()
         scores_hist = cp.sum(scores_hist_dev, axis=1).get()
 
-        return cum_scores, scores_hist
+        return scores_hist
 
     def _pairs_probabilities(self, Rijs, P2, A, a, B, b, x0):
         # dtype is critical for passing into C code...
@@ -607,12 +592,8 @@ def _triangle_scores(
         Pmax = Pmax or 1
         Pmax = min(Pmax, 1)  # Clamp probability to [0,1]
 
-        cum_scores = None  # XXX Why do we even need cum_scores?
         if scores_hist is None:
-            cum_scores, scores_hist = self._triangle_scores_inner(Rijs)
-
-            # Normalize cumulated scores
-            cum_scores /= len(Rijs)
+            scores_hist = self._triangle_scores_inner(Rijs)
 
         # Histogram decomposition: P & sigma evaluation
         h = 1 / self.hist_intervals
@@ -672,7 +653,7 @@ def fun(x, B, P, b, x0, A=A, a=a):
             )
             Pij = np.nan_to_num(Pij)
 
-        return P, sigma, Pij, scores_hist, cum_scores
+        return P, sigma, Pij, scores_hist
 
     ###########################################
     # Primary Methods                         #
diff --git a/tests/test_commonline_sync3n_cupy.py b/tests/test_commonline_sync3n_cupy.py
index 8068266e65..3cc0245ad7 100644
--- a/tests/test_commonline_sync3n_cupy.py
+++ b/tests/test_commonline_sync3n_cupy.py
@@ -9,6 +9,9 @@
 n_pairs = N * (N - 1) // 2
 
 
+# XXX TODO, conditionally run these only if GPU present.
+
+
 @pytest.fixture
 def src_fixture():
     src = Simulation(n=N, L=32, C=1, dtype=DTYPE)
@@ -57,14 +60,13 @@ def test_triangle_scores_host_vs_cupy(cl3n_fixture, rijs_fixture):
     # DTYPE is critical here (manually calling private method
 
     # Execute CUPY
-    cucp, hicp = cl3n_fixture._triangle_scores_inner_cupy(rijs_fixture)
+    hist_cp = cl3n_fixture._triangle_scores_inner_cupy(rijs_fixture)
 
     # Execute host
-    cuh, hih = cl3n_fixture._triangle_scores_inner_host(rijs_fixture)
+    hist_h = cl3n_fixture._triangle_scores_inner_host(rijs_fixture)
 
     # Compare host to cupy calls
-    np.testing.assert_allclose(cucp, cuh)
-    np.testing.assert_allclose(hicp, hih)
+    np.testing.assert_allclose(hist_cp, hist_h)
 
 
 def test_stv_host_vs_cupy(cl3n_fixture, rijs_fixture):

From 1aacb72f9489c257109dc255a5d7298a7bc2a27b Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 29 Apr 2024 08:42:52 -0400
Subject: [PATCH 33/60] atomic stv

---
 src/aspire/abinitio/commonline_sync3n.cu | 6 +++---
 src/aspire/abinitio/commonline_sync3n.py | 8 ++------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index 58ee75a98e..884e5b44f4 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -151,9 +151,9 @@ void signs_times_v(int n, double* Rijs, const double* vec, double* new_vec, bool
 
 
       /* update multiplication */
-      new_vec[ij*n + i] += s_ij_jk*vec[jk] + s_ij_ik*vec[ik];
-      new_vec[jk*n + i] += s_ij_jk*vec[ij] + s_ik_jk*vec[ik];
-      new_vec[ik*n + i] += s_ij_ik*vec[ij] + s_ik_jk*vec[jk];
+      atomicAdd(&(new_vec[ij]), s_ij_jk*vec[jk] + s_ij_ik*vec[ik]);
+      atomicAdd(&(new_vec[jk]), s_ij_jk*vec[ij] + s_ik_jk*vec[ik]);
+      atomicAdd(&(new_vec[ik]), s_ij_ik*vec[ij] + s_ik_jk*vec[jk]);
 
     } /* k */
   } /* j */
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 6f09350589..8bc41cc4ca 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -892,8 +892,7 @@ def _signs_times_v_cupy(self, Rijs, vec):
 
         Rijs_dev = cp.array(Rijs)
         vec_dev = cp.array(vec)
-        # 2d over i then accum to avoid race on i
-        new_vec_dev = cp.zeros((vec.shape[0], self.n_img))
+        new_vec_dev = cp.zeros((vec.shape[0]))
 
         # call the kernel
         blkszx = 512
@@ -904,11 +903,8 @@ def _signs_times_v_cupy(self, Rijs, vec):
             (self.n_img, Rijs_dev, vec_dev, new_vec_dev, self.J_weighting),
         )
 
-        # accumulate, can reuse the vec_dev array now.
-        cp.sum(new_vec_dev, axis=1, out=vec_dev)
-
         # dtoh
-        new_vec = vec_dev.get()
+        new_vec = new_vec_dev.get()
 
         return new_vec
 

From 9fba485dc6d36230dc6c1211c16c1a74f69114ba Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 29 Apr 2024 10:01:37 -0400
Subject: [PATCH 34/60] convert remaining kernels to use atomics instead of
 naive array safety

---
 src/aspire/abinitio/commonline_sync3n.cu | 18 +++++++++---------
 src/aspire/abinitio/commonline_sync3n.py | 20 +++++++-------------
 tests/test_commonline_sync3n_cupy.py     |  6 +++---
 3 files changed, 19 insertions(+), 25 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index 884e5b44f4..aaff3d0e76 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -268,18 +268,18 @@ void pairs_probabilities(int n, double* Rijs, double P2, double A, double a, dou
       f_ij_jk = log( P2*(B*pow(1-s_ij_jk,b)*exp(-b/(1-x0)*(1-s_ij_jk))) + (1-P2)*A*pow((1-s_ij_jk),a) );
       f_ik_jk = log( P2*(B*pow(1-s_ik_jk,b)*exp(-b/(1-x0)*(1-s_ik_jk))) + (1-P2)*A*pow((1-s_ik_jk),a) );
       f_ij_ik = log( P2*(B*pow(1-s_ij_ik,b)*exp(-b/(1-x0)*(1-s_ij_ik))) + (1-P2)*A*pow((1-s_ij_ik),a) );
-      ln_f_ind[ij*n +i] += f_ij_jk + f_ij_ik;
-      ln_f_ind[jk*n +i] += f_ij_jk + f_ik_jk;
-      ln_f_ind[ik*n +i] += f_ik_jk + f_ij_ik;
+      atomicAdd(&(ln_f_ind[ij]), f_ij_jk + f_ij_ik);
+      atomicAdd(&(ln_f_ind[jk]), f_ij_jk + f_ik_jk);
+      atomicAdd(&(ln_f_ind[ik]), f_ik_jk + f_ij_ik);
 
       /* the probability of a pair ij to have the observed triangles scores,
          given it has an arbitrary common line */
       f_ij_jk = log( A*pow((1-s_ij_jk),a) );
       f_ik_jk = log( A*pow((1-s_ik_jk),a) );
       f_ij_ik = log( A*pow((1-s_ij_ik),a) );
-      ln_f_arb[ij*n +i] += f_ij_jk + f_ij_ik;
-      ln_f_arb[jk*n +i] += f_ij_jk + f_ik_jk;
-      ln_f_arb[ik*n +i] += f_ik_jk + f_ij_ik;
+      atomicAdd(&(ln_f_arb[ij]), f_ij_jk + f_ij_ik);
+      atomicAdd(&(ln_f_arb[jk]), f_ij_jk + f_ik_jk);
+      atomicAdd(&(ln_f_arb[ik]), f_ik_jk + f_ij_ik);
 
 
     } /* k */
@@ -404,9 +404,9 @@ void triangle_scores_inner(int n, double* Rijs, int n_intervals, double* scores_
         if (s_ij_ik < threshold) {break;}
       }
 
-      scores_hist[l1*n+i] += 1;
-      scores_hist[l2*n+i] += 1;
-      scores_hist[l3*n+i] += 1;
+      atomicAdd(&(scores_hist[l1]), 1);
+      atomicAdd(&(scores_hist[l2]), 1);
+      atomicAdd(&(scores_hist[l3]), 1);
 
     } /* k */
   } /* j */
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 8bc41cc4ca..463dc52d9c 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -402,9 +402,7 @@ def _triangle_scores_inner_cupy(self, Rijs):
 
         Rijs_dev = cp.array(Rijs)
 
-        scores_hist_dev = cp.zeros(
-            (self.hist_intervals, self.n_img), dtype=np.float64
-        )  # n is for thread safety
+        scores_hist_dev = cp.zeros((self.hist_intervals), dtype=np.float64)
 
         # call the kernel
         blkszx = 512
@@ -420,8 +418,8 @@ def _triangle_scores_inner_cupy(self, Rijs):
             ),
         )
 
-        # accumulate over thread results
-        scores_hist = cp.sum(scores_hist_dev, axis=1).get()
+        # d2h
+        scores_hist = scores_hist_dev.get()
 
         return scores_hist
 
@@ -540,12 +538,8 @@ def _pairs_probabilities_cupy(self, Rijs, P2, A, a, B, b, x0):
         pairs_probabilities = self._gpu_module.get_function("pairs_probabilities")
 
         Rijs_dev = cp.array(Rijs)
-        ln_f_ind_dev = cp.zeros(
-            (self.n_img * (self.n_img - 1) // 2, self.n_img)
-        )  # second dim is for thread safety
-        ln_f_arb_dev = cp.zeros(
-            (self.n_img * (self.n_img - 1) // 2, self.n_img)
-        )  # second dim  is for thread safety
+        ln_f_ind_dev = cp.zeros((self.n_img * (self.n_img - 1) // 2), dtype=np.float64)
+        ln_f_arb_dev = cp.zeros((self.n_img * (self.n_img - 1) // 2), dtype=np.float64)
 
         # call the kernel
         blkszx = 512
@@ -557,8 +551,8 @@ def _pairs_probabilities_cupy(self, Rijs, P2, A, a, B, b, x0):
         )
 
         # accumulate over thread results
-        ln_f_arb = cp.sum(ln_f_arb_dev, axis=1).get()
-        ln_f_ind = cp.sum(ln_f_ind_dev, axis=1).get()
+        ln_f_arb = ln_f_arb_dev.get()
+        ln_f_ind = ln_f_ind_dev.get()
 
         return ln_f_ind, ln_f_arb
 
diff --git a/tests/test_commonline_sync3n_cupy.py b/tests/test_commonline_sync3n_cupy.py
index 3cc0245ad7..81d967aa8e 100644
--- a/tests/test_commonline_sync3n_cupy.py
+++ b/tests/test_commonline_sync3n_cupy.py
@@ -12,20 +12,20 @@
 # XXX TODO, conditionally run these only if GPU present.
 
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def src_fixture():
     src = Simulation(n=N, L=32, C=1, dtype=DTYPE)
     src = src.cache()
     return src
 
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def cl3n_fixture(src_fixture):
     cl = CLSync3N(src_fixture)
     return cl
 
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def rijs_fixture():
     Rijs = np.arange(n_pairs * 3 * 3).reshape(n_pairs, 3, 3)
     Rijs = Rijs.astype(dtype=DTYPE, copy=False)

From a834b2f443db629c05cb8b8747b64dd533e0dcc5 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 29 Apr 2024 10:55:05 -0400
Subject: [PATCH 35/60] add some documentation

---
 src/aspire/abinitio/commonline_sync3n.py | 115 +++++++++++++++++------
 1 file changed, 85 insertions(+), 30 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 463dc52d9c..a7066a4abe 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -21,7 +21,7 @@ class CLSync3N(CLOrient3D, SyncVotingMixin):
     # Initialize alternatives
     #
     # When we find the best J-configuration, we also compare it to the alternative 2nd best one.
-    # this comparison is done for every pair in the triplete independently. to make sure that the
+    # this comparison is done for every pair in the triplet independently. to make sure that the
     # alternative is indeed different in relation to the pair, we document the differences between
     # the configurations in advance:
     # ALTS(:,best_conf,pair) = the two configurations in which J-sync differs from best_conf in relation to pair
@@ -43,7 +43,6 @@ def __init__(
         shift_step=1,
         epsilon=1e-2,
         max_iters=1000,
-        degree_res=1,
         seed=None,
         mask=True,
         S_weighting=False,
@@ -60,10 +59,15 @@ def __init__(
         :param shift_step: Resolution of shift estimation in pixels. Default = 1 pixel.
         :param epsilon: Tolerance for the power method.
         :param max_iter: Maximum iterations for the power method.
-        :param degree_res: Degree resolution for estimating in-plane rotations.
         :param seed: Optional seed for RNG.
         :param mask: Option to mask `src.images` with a fuzzy mask (boolean).
             Default, `True`, applies a mask.
+        :param S_weighting: Optionally apply probabilistic weighting
+            to the `S` matrix.
+        :param J_weighting: Optionally use `J` weights instead of
+            signs when computing `signs_times_v`.
+        :param hist_intervals: Number of histogram bins used to
+            compute triangle scores when `S_weighting` enabled.
         """
 
         super().__init__(
@@ -80,7 +84,6 @@ def __init__(
 
         self.epsilon = epsilon
         self.max_iters = max_iters
-        self.degree_res = degree_res
         self.seed = seed
 
         # Sync3N specific vars
@@ -241,11 +244,27 @@ def _syncmatrix_weights(
     ):
         """
         Given relative rotations matrix `Rij`,
-        compute and return probability weights for S.
+        compute and return probability weights `P` for S.
+
+        Default parameters here were taken from those in the MATLAB
+        code, with the original author noting they were found
+        empirically.
+
+        :param permitted_inconsistency: Consistency condition is
+            `mean(Pij)/permitted_inconsistency < P <
+            mean(Pij)*permitted_inconsistency`.
+        :param p_domain_limit: Domain of P is [Pmin,Pmax], with
+            Pmin=p_domain_limit*Pmax
+        :param max_iterations: Maximum iterations for P estimation.
+        :param min_p_permitted: Small value at which to stop
+            attempting to synchronize P.
         """
         logger.info("Computing synchronization matrix weights.")
 
-        def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
+        def _body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
+            """
+            Helper function to run and test triangle_scores.
+            """
             # Get inistial estimate for Pij
             P, sigma, Pij, hist = self._triangle_scores(Rijs, hist, Pmin, Pmax)
 
@@ -287,7 +306,7 @@ def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
         res = (None,) * 4
         inconsistent = True
         while inconsistent and i < max_iterations:
-            inconsistent, Pij, res = body(*res)
+            inconsistent, Pij, res = _body(*res)
             i += 1
 
         # Pack W
@@ -302,6 +321,13 @@ def body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
         return W
 
     def _triangle_scores_inner(self, Rijs):
+        """
+        Computes histogram of `triangle scores`.
+
+        Wrapper for cpu/gpu dispatch.
+
+        :param Rijs: nchoose2 by 3 by 3 array of rotations.
+        """
 
         # host/gpu dispatch
         if self._gpu_module:
@@ -312,6 +338,11 @@ def _triangle_scores_inner(self, Rijs):
         return scores_hist
 
     def _triangle_scores_inner_host(self, Rijs):
+        """
+        See _triangle_scores_inner.
+
+        CPU implementation.
+        """
 
         # The following is adopted from Matlab triangle_scores_mex.c
 
@@ -392,10 +423,11 @@ def _triangle_scores_inner_host(self, Rijs):
 
     def _triangle_scores_inner_cupy(self, Rijs):
         """
-        n: n_img
-        Rijs: nchoose2x3x3 array
+        See _triangle_scores_inner.
 
+        GPU implementation.
         """
+
         import cupy as cp
 
         triangle_scores = self._gpu_module.get_function("triangle_scores_inner")
@@ -424,6 +456,20 @@ def _triangle_scores_inner_cupy(self, Rijs):
         return scores_hist
 
     def _pairs_probabilities(self, Rijs, P2, A, a, B, b, x0):
+        """
+        This function computes the probability of a pair `ij` having
+        an observed value of triangles score under two priors.  Once
+        given it has an indicative common line, and again once given
+        it has an arbitrary common line.
+
+        The probability of the common line to be indicative can then
+        be derived by Bayes Theorem.
+
+        Wrapper for cpu/gpu dispatch.
+
+        :param Rijs: nchoose2 by 3 by 3 array of rotations.
+        XXX
+        """
         # dtype is critical for passing into C code...
         params = np.arary([P2, A, a, B, b, x0], dtype=np.float64)
         # host/gpu dispatch
@@ -435,6 +481,11 @@ def _pairs_probabilities(self, Rijs, P2, A, a, B, b, x0):
         return ln_f_ind, ln_f_arb
 
     def _pairs_probabilities_host(self, Rijs, P2, A, a, B, b, x0):
+        """
+        See _pairs_probabilities.
+
+        CPU implementation.
+        """
         # The following is adopted from Matlab pairs_probabilities_mex.c `looper`
 
         # Initialize probability result arrays
@@ -529,10 +580,11 @@ def _pairs_probabilities_host(self, Rijs, P2, A, a, B, b, x0):
 
     def _pairs_probabilities_cupy(self, Rijs, P2, A, a, B, b, x0):
         """
-        n: n_img
-        Rijs: nchoose2x3x3 array
+        See _pairs_probabilities.
 
+        GPU implementation.
         """
+
         import cupy as cp
 
         pairs_probabilities = self._gpu_module.get_function("pairs_probabilities")
@@ -569,16 +621,19 @@ def _triangle_scores(
         x0=0.78,
     ):
         """
-        Todo
+        Computes `triangle_scores`, attempts to fit curve to distribution, and uses estimated distribution to compute `pairs_probabilities`.
+
+        Default parameters here were taken from those in the MATLAB
+        code, with the original author noting they were found
+        empirically.
 
-        :param a: magic number
+        :param a:
         :param peak2sigma: empirical relation between the location of
             the peak of the histigram, and the mean error in the
             common lines estimations.
-            AKA, magic number
         :param P:
         :param b:
-        :param x0:
+        :param x0: Initial guess
         """
 
         Pmin = Pmin or 0
@@ -757,7 +812,8 @@ def _J_sync_power_method(self, Rijs):
         residual = 1
         itr = 0
 
-        # XXX, I don't like that epsilon>1 (residual) returns signs of random vector
+        # Todo
+        # I don't like that epsilon>1 (residual) returns signs of random vector
         #      maybe force to run once? or return vec as zeros in that case?
         #      Seems unintended, but easy to do.
 
@@ -778,7 +834,14 @@ def _J_sync_power_method(self, Rijs):
         return J_sync
 
     def _signs_times_v(self, Rijs, vec):
+        """
+        Multiplication of the J-synchronization matrix by a candidate eigenvector `vec`
 
+        Wrapper for cpu/gpu dispatch.
+
+        :param Rijs: An (n-choose-2) by 3 by 3 array of estimates of relative rotations
+        :param vec: The current candidate eigenvector of length n-choose-2 from the power method.
+        """
         # host/gpu dispatch
         if self._gpu_module:
             new_vec = self._signs_times_v_cupy(Rijs, vec)
@@ -789,14 +852,9 @@ def _signs_times_v(self, Rijs, vec):
 
     def _signs_times_v_host(self, Rijs, vec):
         """
-        Ported from _signs_times_v_mex.c
+        See `_signs_times_v`.
 
-        n: n_img
-        Rijs: nchoose2x3x3 array
-        vec: input array
-        new_vec: output array
-        J_weighting: bool
-        _ALTS= 2x4x3 const lut array
+        CPU implementation.
         """
 
         new_vec = np.zeros_like(vec)
@@ -872,13 +930,9 @@ def _signs_times_v_host(self, Rijs, vec):
 
     def _signs_times_v_cupy(self, Rijs, vec):
         """
-        Ported from _signs_times_v_mex.c
+        See `_signs_times_v`.
 
-        n: n_img
-        Rijs: nchoose2x3x3 array
-        vec: input array
-        new_vec: output array
-        J_weighting: bool
+        GPU implementation.
         """
         import cupy as cp
 
@@ -905,7 +959,8 @@ def _signs_times_v_cupy(self, Rijs, vec):
     @staticmethod
     def _init_cupy_module():
         """
-        Private utility method to read in CUDA source and return as compiled CUPY module.
+        Private utility method to read in CUDA source and return as
+        compiled CUPY module.
         """
 
         import cupy as cp

From 98aa274164051d3de622542487a6aa2ea44e7877 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 29 Apr 2024 11:18:40 -0400
Subject: [PATCH 36/60] more cleanup

---
 src/aspire/abinitio/commonline_sync3n.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index a7066a4abe..33cb7a292a 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -451,7 +451,7 @@ def _triangle_scores_inner_cupy(self, Rijs):
         )
 
         # d2h
-        scores_hist = scores_hist_dev.get()
+        scores_hist = scores_hist_dev.get().astype(self.dtype, copy=False)
 
         return scores_hist
 
@@ -468,10 +468,17 @@ def _pairs_probabilities(self, Rijs, P2, A, a, B, b, x0):
         Wrapper for cpu/gpu dispatch.
 
         :param Rijs: nchoose2 by 3 by 3 array of rotations.
-        XXX
+        :param P2: distribution parameter
+        :param A: distribution parameter
+        :param a: distribution parameter
+        :param B: distribution parameter
+        :param b: distribution parameter
+        :param x0: Initial guess
+
         """
-        # dtype is critical for passing into C code...
-        params = np.arary([P2, A, a, B, b, x0], dtype=np.float64)
+        # These param values are passed to C, force doubles.
+        params = np.array([P2, A, a, B, b, x0], dtype=np.float64)
+
         # host/gpu dispatch
         if self._gpu_module:
             ln_f_ind, ln_f_arb = self._pairs_probabilities_cupy(Rijs, *params)
@@ -603,8 +610,8 @@ def _pairs_probabilities_cupy(self, Rijs, P2, A, a, B, b, x0):
         )
 
         # accumulate over thread results
-        ln_f_arb = ln_f_arb_dev.get()
-        ln_f_ind = ln_f_ind_dev.get()
+        ln_f_arb = ln_f_arb_dev.get().astype(self.dtype, copy=False)
+        ln_f_ind = ln_f_ind_dev.get().astype(self.dtype, copy=False)
 
         return ln_f_ind, ln_f_arb
 
@@ -952,7 +959,7 @@ def _signs_times_v_cupy(self, Rijs, vec):
         )
 
         # dtoh
-        new_vec = new_vec_dev.get()
+        new_vec = new_vec_dev.get().astype(self.dtype, copy=False)
 
         return new_vec
 

From 8161a2100ee94652ad8585fa842785cd4f836c56 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 29 Apr 2024 11:31:37 -0400
Subject: [PATCH 37/60] looks like this actually needs double precision.

---
 src/aspire/abinitio/commonline_sync3n.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 33cb7a292a..4729bf7785 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -451,7 +451,7 @@ def _triangle_scores_inner_cupy(self, Rijs):
         )
 
         # d2h
-        scores_hist = scores_hist_dev.get().astype(self.dtype, copy=False)
+        scores_hist = scores_hist_dev.get()
 
         return scores_hist
 
@@ -610,8 +610,8 @@ def _pairs_probabilities_cupy(self, Rijs, P2, A, a, B, b, x0):
         )
 
         # accumulate over thread results
-        ln_f_arb = ln_f_arb_dev.get().astype(self.dtype, copy=False)
-        ln_f_ind = ln_f_ind_dev.get().astype(self.dtype, copy=False)
+        ln_f_arb = ln_f_arb_dev.get()
+        ln_f_ind = ln_f_ind_dev.get()
 
         return ln_f_ind, ln_f_arb
 
@@ -959,7 +959,7 @@ def _signs_times_v_cupy(self, Rijs, vec):
         )
 
         # dtoh
-        new_vec = new_vec_dev.get().astype(self.dtype, copy=False)
+        new_vec = new_vec_dev.get()
 
         return new_vec
 

From accdc935df83460d946df94da379047cea85fc27 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 29 Apr 2024 13:00:03 -0400
Subject: [PATCH 38/60] fix precision bug in CL sync3n power method.

---
 src/aspire/abinitio/commonline_sync3n.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 4729bf7785..6793c146ee 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -827,7 +827,8 @@ def _J_sync_power_method(self, Rijs):
         # Power method iterations
         while itr < max_iters and residual > epsilon:
             itr += 1
-            vec_new = self._signs_times_v(Rijs, vec)
+            # Todo, this code code actually needs double precision for accuracy... forcing.
+            vec_new = self._signs_times_v(Rijs, vec).astype(np.float64, copy=False)
             vec_new = vec_new / norm(vec_new)
             residual = norm(vec_new - vec)
             vec = vec_new

From 14dfb1c657efd7330233fee87575a4bb30858857 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 29 Apr 2024 13:50:18 -0400
Subject: [PATCH 39/60] fixup some of the dtypes

---
 src/aspire/abinitio/commonline_sync3n.cu |  2 +-
 src/aspire/abinitio/commonline_sync3n.py | 15 ++++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index aaff3d0e76..99582b3f24 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -290,7 +290,7 @@ void pairs_probabilities(int n, double* Rijs, double P2, double A, double a, dou
 
 
 extern "C" __global__
-void triangle_scores_inner(int n, double* Rijs, int n_intervals, double* scores_hist)
+void triangle_scores_inner(int n, double* Rijs, int n_intervals, unsigned int* scores_hist)
 {
   /* thread index (1d), represents "i" index */
   unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 6793c146ee..886557630f 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -347,7 +347,7 @@ def _triangle_scores_inner_host(self, Rijs):
         # The following is adopted from Matlab triangle_scores_mex.c
 
         # Initialize probability result arrays
-        scores_hist = np.zeros(self.hist_intervals, dtype=Rijs.dtype)
+        scores_hist = np.zeros(self.hist_intervals, dtype=np.uint32)
         h = 1 / self.hist_intervals
 
         c = np.empty((4), dtype=Rijs.dtype)
@@ -432,9 +432,10 @@ def _triangle_scores_inner_cupy(self, Rijs):
 
         triangle_scores = self._gpu_module.get_function("triangle_scores_inner")
 
-        Rijs_dev = cp.array(Rijs)
+        Rijs_dev = cp.array(Rijs, dtype=np.float64)
 
-        scores_hist_dev = cp.zeros((self.hist_intervals), dtype=np.float64)
+        # This holds integer counts
+        scores_hist_dev = cp.zeros((self.hist_intervals), dtype=np.uint32)
 
         # call the kernel
         blkszx = 512
@@ -596,7 +597,7 @@ def _pairs_probabilities_cupy(self, Rijs, P2, A, a, B, b, x0):
 
         pairs_probabilities = self._gpu_module.get_function("pairs_probabilities")
 
-        Rijs_dev = cp.array(Rijs)
+        Rijs_dev = cp.array(Rijs, dtype=np.float64)
         ln_f_ind_dev = cp.zeros((self.n_img * (self.n_img - 1) // 2), dtype=np.float64)
         ln_f_arb_dev = cp.zeros((self.n_img * (self.n_img - 1) // 2), dtype=np.float64)
 
@@ -946,9 +947,9 @@ def _signs_times_v_cupy(self, Rijs, vec):
 
         signs_times_v = self._gpu_module.get_function("signs_times_v")
 
-        Rijs_dev = cp.array(Rijs)
-        vec_dev = cp.array(vec)
-        new_vec_dev = cp.zeros((vec.shape[0]))
+        Rijs_dev = cp.array(Rijs, dtype=np.float64)
+        vec_dev = cp.array(vec, dtype=np.float64)
+        new_vec_dev = cp.zeros((vec.shape[0]), dtype=np.float64)
 
         # call the kernel
         blkszx = 512

From 1af210fcf053a0d2a0113a81759ed1aea40b7f4c Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 29 Apr 2024 14:22:24 -0400
Subject: [PATCH 40/60] conditionally run host-gpu comparison

---
 tests/test_commonline_sync3n_cupy.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/test_commonline_sync3n_cupy.py b/tests/test_commonline_sync3n_cupy.py
index 81d967aa8e..e13147f6ae 100644
--- a/tests/test_commonline_sync3n_cupy.py
+++ b/tests/test_commonline_sync3n_cupy.py
@@ -4,14 +4,15 @@
 from aspire.abinitio.commonline_sync3n import CLSync3N
 from aspire.source import Simulation
 
+# If cupy is not available, skip this entire module
+pytest.importorskip("cupy")
+
+
 DTYPE = np.float64  # TODO, consider single precision.
 N = 64  # Number of images
 n_pairs = N * (N - 1) // 2
 
 
-# XXX TODO, conditionally run these only if GPU present.
-
-
 @pytest.fixture(scope="module")
 def src_fixture():
     src = Simulation(n=N, L=32, C=1, dtype=DTYPE)

From a53cd209f3f1f382c41548cd956952070b6b1488 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 7 May 2024 10:39:17 -0400
Subject: [PATCH 41/60] add MATLAB comparison tests

---
 tests/test_commonline_sync3n_cupy.py | 107 +++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)

diff --git a/tests/test_commonline_sync3n_cupy.py b/tests/test_commonline_sync3n_cupy.py
index e13147f6ae..7165d368a7 100644
--- a/tests/test_commonline_sync3n_cupy.py
+++ b/tests/test_commonline_sync3n_cupy.py
@@ -112,3 +112,110 @@ def test_stvJwt_host_vs_cupy(cl3n_fixture, rijs_fixture):
 
     # Compare host to cupy calls
     np.testing.assert_allclose(new_vec_cp, new_vec_h)
+
+
+@pytest.fixture
+def matlab_ref_fixture():
+    """
+    Setup ASPIRE-Python objects using dummy data that is easily
+    constructed in MATLAB.
+    """
+    DTYPE = np.float64
+    n = 5
+    n_pairs = n * (n - 1) // 2
+
+    # Dummy input vector.
+    Rijs = np.transpose(
+        np.arange(1, n_pairs * 3 * 3 + 1, dtype=DTYPE).reshape(n_pairs, 3, 3), (0, 2, 1)
+    )
+    # Equivalent MATLAB
+    # n=5; np=n*(n-1)/2; rijs= reshape([1:np*3*3],[3,3,np])
+
+    # Create CL object for testing function calls
+    src = Simulation(L=8, n=n, C=1, dtype=DTYPE)
+    cl3n = CLSync3N(src, seed=314, S_weighting=False, J_weighting=False)
+
+    return Rijs, cl3n
+
+
+def test_triangles_scores(matlab_ref_fixture):
+    """
+    Compares output of identical dummy data between this
+    implementation and legacy MATLAB triangles_scores_mex.
+    """
+    Rijs, cl3n = matlab_ref_fixture
+
+    hist = cl3n._triangle_scores_inner(Rijs)
+
+    # Default is 100 histogram intervals,
+    # so the histogram reference is compressed.
+    ref_hist = np.zeros(cl3n.hist_intervals)
+    # Nonzeros, [[indices, ...], [values, ...]]
+    ref_compressed = np.array(
+        [[0, 10, 11, 12, 70, 71, 72, 76, 81, 89], [14, 2, 2, 2, 1, 1, 2, 1, 2, 3]]
+    )
+    # Pack the reference histogram
+    np.put(ref_hist, *ref_compressed)
+
+    np.testing.assert_allclose(hist, ref_hist)
+
+
+def test_pairs_prob_mex(matlab_ref_fixture):
+    """
+    Compares output of identical dummy data between this
+    implementation and legacy MATLAB pairs_probabilities_mex.
+    """
+    Rijs, cl3n = matlab_ref_fixture
+
+    params = np.arange(1, 7)
+
+    ln_f_ind, ln_f_arb = cl3n._pairs_probabilities_host(Rijs, *params)
+
+    ref_ln_f_ind = [
+        -24.1817,
+        -5.6554,
+        4.9117,
+        12.7047,
+        -12.9374,
+        -5.5158,
+        1.5289,
+        -9.0406,
+        -2.2067,
+        -7.3968,
+    ]
+
+    ref_ln_f_arb = [
+        -17.1264,
+        -6.7218,
+        -0.8876,
+        3.3437,
+        -10.7251,
+        -6.7051,
+        -2.9029,
+        -8.5061,
+        -4.8288,
+        -7.5608,
+    ]
+
+    np.testing.assert_allclose(ln_f_arb, ref_ln_f_arb, atol=5e-5)
+
+    np.testing.assert_allclose(ln_f_ind, ref_ln_f_ind, atol=5e-5)
+
+
+def test_signs_times_v_mex(matlab_ref_fixture):
+    """
+    Compares output of identical dummy data between this
+    implementation and legacy MATLAB signs_times_v.
+    """
+    Rijs, cl3n = matlab_ref_fixture
+
+    # Dummy input vector
+    vec = np.ones(len(Rijs), dtype=DTYPE)
+    # Equivalent matlab
+    # vec=ones([1,np]);
+
+    new_vec = cl3n._signs_times_v(Rijs, vec)
+
+    ref_vec = [0, -2, -2, 0, -6, -4, -2, -2, -2, 0]
+
+    np.testing.assert_allclose(new_vec, ref_vec)

From 73e3614d9a73a9419841bb4ecf033bbf404643a6 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 7 May 2024 11:45:13 -0400
Subject: [PATCH 42/60] Allow sync3n methods to run in singles via upcasting

---
 src/aspire/abinitio/commonline_sync3n.py |  8 ++---
 tests/test_commonline_sync3n_cupy.py     | 42 +++++++++++++++---------
 2 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 886557630f..36f251510f 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -611,8 +611,8 @@ def _pairs_probabilities_cupy(self, Rijs, P2, A, a, B, b, x0):
         )
 
         # accumulate over thread results
-        ln_f_arb = ln_f_arb_dev.get()
-        ln_f_ind = ln_f_ind_dev.get()
+        ln_f_arb = ln_f_arb_dev.get().astype(self.dtype, copy=False)
+        ln_f_ind = ln_f_ind_dev.get().astype(self.dtype, copy=False)
 
         return ln_f_ind, ln_f_arb
 
@@ -857,7 +857,7 @@ def _signs_times_v(self, Rijs, vec):
         else:
             new_vec = self._signs_times_v_host(Rijs, vec)
 
-        return new_vec
+        return new_vec.astype(vec.dtype, copy=False)
 
     def _signs_times_v_host(self, Rijs, vec):
         """
@@ -961,7 +961,7 @@ def _signs_times_v_cupy(self, Rijs, vec):
         )
 
         # dtoh
-        new_vec = new_vec_dev.get()
+        new_vec = new_vec_dev.get().astype(vec.dtype, copy=False)
 
         return new_vec
 
diff --git a/tests/test_commonline_sync3n_cupy.py b/tests/test_commonline_sync3n_cupy.py
index 7165d368a7..9bea14c21f 100644
--- a/tests/test_commonline_sync3n_cupy.py
+++ b/tests/test_commonline_sync3n_cupy.py
@@ -8,14 +8,19 @@
 pytest.importorskip("cupy")
 
 
-DTYPE = np.float64  # TODO, consider single precision.
-N = 64  # Number of images
+N = 32  # Number of images
 n_pairs = N * (N - 1) // 2
+DTYPES = [np.float32, np.float64]
+
+
+@pytest.fixture(scope="module", params=DTYPES, ids=lambda x: f"dtype={x}")
+def dtype(request):
+    return request.param
 
 
 @pytest.fixture(scope="module")
-def src_fixture():
-    src = Simulation(n=N, L=32, C=1, dtype=DTYPE)
+def src_fixture(dtype):
+    src = Simulation(n=N, L=32, C=1, dtype=dtype)
     src = src.cache()
     return src
 
@@ -27,9 +32,8 @@ def cl3n_fixture(src_fixture):
 
 
 @pytest.fixture(scope="module")
-def rijs_fixture():
-    Rijs = np.arange(n_pairs * 3 * 3).reshape(n_pairs, 3, 3)
-    Rijs = Rijs.astype(dtype=DTYPE, copy=False)
+def rijs_fixture(dtype):
+    Rijs = np.arange(n_pairs * 3 * 3, dtype=dtype).reshape(n_pairs, 3, 3)
     return Rijs
 
 
@@ -50,15 +54,17 @@ def test_pairs_prob_host_vs_cupy(cl3n_fixture, rijs_fixture):
     indsh, arbh = cl3n_fixture._pairs_probabilities_host(rijs_fixture, *params)
 
     # Compare host to cupy calls
-    np.testing.assert_allclose(indsh, indscp)
-    np.testing.assert_allclose(arbh, arbcp)
+    rtol = 1e-07  # np testing default
+    if rijs_fixture.dtype != np.float64:
+        rtol = 2e-5
+    np.testing.assert_allclose(indsh, indscp, rtol=rtol)
+    np.testing.assert_allclose(arbh, arbcp, rtol=rtol)
 
 
 def test_triangle_scores_host_vs_cupy(cl3n_fixture, rijs_fixture):
     """
     Compares triangle_scores between host and cupy implementations.
     """
-    # DTYPE is critical here (manually calling private method
 
     # Execute CUPY
     hist_cp = cl3n_fixture._triangle_scores_inner_cupy(rijs_fixture)
@@ -77,7 +83,7 @@ def test_stv_host_vs_cupy(cl3n_fixture, rijs_fixture):
     Default J_weighting=False
     """
     # dummy data vector
-    vec = np.random.random(n_pairs).astype(dtype=DTYPE, copy=False)
+    vec = np.ones(n_pairs, dtype=rijs_fixture.dtype)
 
     # J_weighting=False
     assert cl3n_fixture.J_weighting is False
@@ -99,7 +105,7 @@ def test_stvJwt_host_vs_cupy(cl3n_fixture, rijs_fixture):
     Force J_weighting=True
     """
     # dummy data vector
-    vec = np.random.random(n_pairs).astype(dtype=DTYPE, copy=False)
+    vec = np.ones(n_pairs, dtype=rijs_fixture.dtype)
 
     # J_weighting=True
     cl3n_fixture.J_weighting = True
@@ -111,7 +117,13 @@ def test_stvJwt_host_vs_cupy(cl3n_fixture, rijs_fixture):
     new_vec_h = cl3n_fixture._signs_times_v_host(rijs_fixture, vec)
 
     # Compare host to cupy calls
-    np.testing.assert_allclose(new_vec_cp, new_vec_h)
+    rtol = 1e-7  # np testing default
+    if vec.dtype != np.float64:
+        rtol = 3e-07
+    np.testing.assert_allclose(new_vec_cp, new_vec_h, rtol=rtol)
+
+
+# The following fixture and tests compare against the legacy MATLAB implementation
 
 
 @pytest.fixture
@@ -120,7 +132,7 @@ def matlab_ref_fixture():
     Setup ASPIRE-Python objects using dummy data that is easily
     constructed in MATLAB.
     """
-    DTYPE = np.float64
+    DTYPE = np.float64  # MATLAB code is doubles only
     n = 5
     n_pairs = n * (n - 1) // 2
 
@@ -210,7 +222,7 @@ def test_signs_times_v_mex(matlab_ref_fixture):
     Rijs, cl3n = matlab_ref_fixture
 
     # Dummy input vector
-    vec = np.ones(len(Rijs), dtype=DTYPE)
+    vec = np.ones(len(Rijs), dtype=Rijs.dtype)
     # Equivalent matlab
     # vec=ones([1,np]);
 

From 0a91eece68b003a3696dd30a0a368270d4b13087 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 3 Jul 2024 08:58:34 -0400
Subject: [PATCH 43/60] Update some docstrings

---
 src/aspire/abinitio/commonline_sync3n.py | 54 ++++++++++++++++++++----
 1 file changed, 45 insertions(+), 9 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 36f251510f..32db9d23e7 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -16,6 +16,15 @@
 class CLSync3N(CLOrient3D, SyncVotingMixin):
     """
     Define a class to estimate 3D orientations using common lines Sync3N methods (2017).
+
+    Ido Greenberg, Yoel Shkolnisky,
+    Common lines modeling for reference free Ab-initio reconstruction in cryo-EM,
+    Journal of Structural Biology,
+    Volume 200, Issue 2,
+    2017,
+    Pages 106-117,
+    ISSN 1047-8477,
+    https://doi.org/10.1016/j.jsb.2017.09.007.
     """
 
     # Initialize alternatives
@@ -136,13 +145,17 @@ def estimate_rotations(self):
         # Yield rotations from S
         self.rotations = self._sync3n_S_to_rot(S, W)
 
-    ###########################################
-    # The hackberries taste like hackberries  #
-    ###########################################
+    #######################
+    # Main Sync3N Methods #
+    #######################
     def _sync3n_S_to_rot(self, S, W=None, n_eigs=4):
         """
         Use eigen decomposition of S to estimate transforms,
         then project transforms to nearest rotations.
+
+        :param S: Numpy array represeting Synchronization matrix.
+        :param W: Optional weights array, default `None` is equal weighting of `S`.
+        :param n_eigs: Optional, number of eigenvalues to compute (min 3).
         """
 
         if n_eigs < 3:
@@ -214,6 +227,9 @@ def _sync3n_S_to_rot(self, S, W=None, n_eigs=4):
     def _construct_sync3n_matrix(self, Rij):
         """
         Construct sync3n matrix from estimated rotations Rij.
+
+        :param Rij: Numpy array of estimated rotations (all pairs).
+        :return: Synchronization matrix S, (3*N, 3*N).
         """
 
         # Initialize S with diag identity blocks
@@ -258,6 +274,7 @@ def _syncmatrix_weights(
         :param max_iterations: Maximum iterations for P estimation.
         :param min_p_permitted: Small value at which to stop
             attempting to synchronize P.
+        :return: Synchronization matrix weights `W`.
         """
         logger.info("Computing synchronization matrix weights.")
 
@@ -327,6 +344,7 @@ def _triangle_scores_inner(self, Rijs):
         Wrapper for cpu/gpu dispatch.
 
         :param Rijs: nchoose2 by 3 by 3 array of rotations.
+        :return: Histogram of triangle scores.
         """
 
         # host/gpu dispatch
@@ -475,7 +493,7 @@ def _pairs_probabilities(self, Rijs, P2, A, a, B, b, x0):
         :param B: distribution parameter
         :param b: distribution parameter
         :param x0: Initial guess
-
+        :return: (log indicative probabilities, log arbitrary probabilities)
         """
         # These param values are passed to C, force doubles.
         params = np.array([P2, A, a, B, b, x0], dtype=np.float64)
@@ -629,19 +647,23 @@ def _triangle_scores(
         x0=0.78,
     ):
         """
-        Computes `triangle_scores`, attempts to fit curve to distribution, and uses estimated distribution to compute `pairs_probabilities`.
+        Computes `triangle_scores`, attempts to fit curve to
+        distribution, and uses estimated distribution to compute
+        `pairs_probabilities`.
 
         Default parameters here were taken from those in the MATLAB
         code, with the original author noting they were found
         empirically.
 
-        :param a:
+        :param a: distribution parameter
         :param peak2sigma: empirical relation between the location of
             the peak of the histigram, and the mean error in the
             common lines estimations.
-        :param P:
-        :param b:
+        :param P: distribution parameter
+        :param b: distribution parameter
         :param x0: Initial guess
+        :return: Tuple of pairs probabilty Pij and related terms
+             (P, sigma, Pij, scores_hist)
         """
 
         Pmin = Pmin or 0
@@ -731,7 +753,17 @@ def _estimate_relative_viewing_directions(self):
         return Rijs
 
     def _global_J_sync(self, Rijs):
-        """ """
+        """
+        Apply global J-synchronization.
+
+        Given all pairs of estimated rotation matrices `Rijs` with
+        arbitrary handedness (J conjugation), attempt to detect and
+        conjugate entries of `Rijs` such that all rotations have same
+        handedness.
+
+        :param Rijs: Array of all pairs of rotation matrices
+        :return: Array of all pairs of J synchronized rotation matrices
+        """
 
         # Determine relative handedness of Rijs.
         sign_ij_J = self._J_sync_power_method(Rijs)
@@ -746,6 +778,9 @@ def _global_J_sync(self, Rijs):
     def _estimate_all_Rijs(self, clmatrix):
         """
         Estimate Rijs using the voting method.
+
+        :param clmatrix: Common lines matrix
+        :return: Estimated rotations
         """
         n_img = self.n_img
         n_theta = self.n_theta
@@ -850,6 +885,7 @@ def _signs_times_v(self, Rijs, vec):
 
         :param Rijs: An n-choose-2x3x3 array of estimates of relative rotations
         :param vec: The current candidate eigenvector of length n-choose-2 from the power method.
+        :return: New candidate eigenvector.
         """
         # host/gpu dispatch
         if self._gpu_module:

From 51ffdacad72c0dbf3aa539a78b0eaf71674302c8 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 3 Jul 2024 09:14:42 -0400
Subject: [PATCH 44/60] initial add cl sync3n test

---
 tests/test_commonline_sync3n.py | 101 ++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 tests/test_commonline_sync3n.py

diff --git a/tests/test_commonline_sync3n.py b/tests/test_commonline_sync3n.py
new file mode 100644
index 0000000000..f9be7a103d
--- /dev/null
+++ b/tests/test_commonline_sync3n.py
@@ -0,0 +1,101 @@
+import os
+
+import numpy as np
+import pytest
+
+from aspire.abinitio import CLSync3N
+from aspire.source import Simulation
+from aspire.utils import mean_aligned_angular_distance, rots_to_clmatrix
+from aspire.volume import AsymmetricVolume
+
+DATA_DIR = os.path.join(os.path.dirname(__file__), "saved_test_data")
+
+RESOLUTION = [
+    40,
+    41,
+]
+
+# `None` defaults to random offsets.
+OFFSETS = [
+    0,
+    #    pytest.param(None, marks=pytest.mark.expensive),
+]
+
+DTYPES = [
+    #    np.float32,
+    #    pytest.param(np.float64, marks=pytest.mark.expensive),
+    np.float64,
+]
+
+
+@pytest.fixture(params=RESOLUTION, ids=lambda x: f"resolution={x}")
+def resolution(request):
+    return request.param
+
+
+@pytest.fixture(params=OFFSETS, ids=lambda x: f"offsets={x}")
+def offsets(request):
+    return request.param
+
+
+@pytest.fixture(params=DTYPES, ids=lambda x: f"dtype={x}")
+def dtype(request):
+    return request.param
+
+
+@pytest.fixture
+def source_orientation_objs(resolution, offsets, dtype):
+    src = Simulation(
+        n=50,
+        L=resolution,
+        vols=AsymmetricVolume(L=resolution, C=1, K=100, seed=0).generate(),
+        offsets=offsets,
+        amplitudes=1,
+        seed=0,
+    ).cache()
+
+    # # Search for common lines over less shifts for 0 offsets.
+    # max_shift = 1 / resolution
+    # shift_step = 1
+    # if src.offsets.all() != 0:
+    #     max_shift = 0.20
+    #     shift_step = 0.25  # Reduce shift steps for non-integer offsets of Simulation.
+    # orient_est = CLSync3N(
+    #     src, max_shift=max_shift, shift_step=shift_step, mask=False
+
+    # )
+    orient_est = CLSync3N(src)
+
+    return src, orient_est
+
+
+def test_build_clmatrix(source_orientation_objs):
+    src, orient_est = source_orientation_objs
+
+    # Build clmatrix estimate.
+    orient_est.build_clmatrix()
+
+    gt_clmatrix = rots_to_clmatrix(src.rotations, orient_est.n_theta)
+
+    angle_diffs = abs(orient_est.clmatrix - gt_clmatrix) * 360 / orient_est.n_theta
+
+    # Count number of estimates within 5 degrees of ground truth.
+    within_5 = np.sum((angle_diffs - 360) % 360 < 5)
+
+    # Check that at least 98% of estimates are within 5 degrees.
+    tol = 0.98
+    if src.offsets.all() != 0:
+        # Set tolerance to 95% when using nonzero offsets.
+        tol = 0.95
+    assert within_5 / angle_diffs.size > tol
+
+
+def test_estimate_rotations(source_orientation_objs):
+    src, orient_est = source_orientation_objs
+
+    orient_est.estimate_rotations()
+
+    # Register estimates to ground truth rotations and compute the
+    # mean angular distance between them (in degrees).
+    # Assert that mean angular distance is less than 1 degree.
+    mean_aligned_angular_distance(orient_est.rotations, src.rotations, degree_tol=1)

From 4c5102ce745e1882dada18d9658f6d1dfc19fefc Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 3 Jul 2024 10:13:14 -0400
Subject: [PATCH 45/60] add minimal test

---
 src/aspire/abinitio/commonline_sync3n.py |  8 ++++++-
 tests/test_commonline_sync3n.py          | 29 ++++++++----------------
 2 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 32db9d23e7..45456f8a70 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -99,7 +99,13 @@ def __init__(
         self.S_weighting = S_weighting
         self.J_weighting = J_weighting
         self._D_null = 1e-13
-        self.hist_intervals = hist_intervals
+        self.hist_intervals = int(hist_intervals)
+        # Warn if histogram may be too sparse for curve fitting
+        if self.S_weighting and (src.n < hist_intervals):
+            logger.warning(
+                f"`hist_intervals` {hist_intervals} > src.n {src.n}."
+                "  Consider reducing if curve fitting is infeasable."
+            )
 
         # Auto configure GPU
         self._gpu_module = None
diff --git a/tests/test_commonline_sync3n.py b/tests/test_commonline_sync3n.py
index f9be7a103d..119eec2ae1 100644
--- a/tests/test_commonline_sync3n.py
+++ b/tests/test_commonline_sync3n.py
@@ -18,53 +18,42 @@
 # `None` defaults to random offsets.
 OFFSETS = [
     0,
-    #    pytest.param(None, marks=pytest.mark.expensive),
 ]
 
 DTYPES = [
-    #    np.float32,
+    np.float32,
     #    pytest.param(np.float64, marks=pytest.mark.expensive),
     np.float64,
 ]
 
 
-@pytest.fixture(params=RESOLUTION, ids=lambda x: f"resolution={x}")
+@pytest.fixture(params=RESOLUTION, ids=lambda x: f"resolution={x}", scope="module")
 def resolution(request):
     return request.param
 
 
-@pytest.fixture(params=OFFSETS, ids=lambda x: f"offsets={x}")
+@pytest.fixture(params=OFFSETS, ids=lambda x: f"offsets={x}", scope="module")
 def offsets(request):
     return request.param
 
 
-@pytest.fixture(params=DTYPES, ids=lambda x: f"dtype={x}")
+@pytest.fixture(params=DTYPES, ids=lambda x: f"dtype={x}", scope="module")
 def dtype(request):
     return request.param
 
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def source_orientation_objs(resolution, offsets, dtype):
     src = Simulation(
-        n=50,
+        n=100,
         L=resolution,
-        vols=AsymmetricVolume(L=resolution, C=1, K=100, seed=0).generate(),
+        vols=AsymmetricVolume(L=resolution, C=1, K=100, seed=123).generate(),
         offsets=offsets,
         amplitudes=1,
-        seed=0,
+        seed=456,
     ).cache()
 
-    # # Search for common lines over less shifts for 0 offsets.
-    # max_shift = 1 / resolution
-    # shift_step = 1
-    # if src.offsets.all() != 0:
-    #     max_shift = 0.20
-    #     shift_step = 0.25  # Reduce shift steps for non-integer offsets of Simulation.
-    # orient_est = CLSync3N(
-    #     src, max_shift=max_shift, shift_step=shift_step, mask=False
-
-    # )
-    orient_est = CLSync3N(src)
+    orient_est = CLSync3N(src, S_weighting=True, seed=789)
 
     return src, orient_est
 

From d0c2c0d48331af04f0f0f45bcaec1931adddda51 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 3 Jul 2024 10:20:52 -0400
Subject: [PATCH 46/60] actually test the different dtypes

---
 tests/test_orient_sync_voting.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_orient_sync_voting.py b/tests/test_orient_sync_voting.py
index 31d6b20e94..3e875e9467 100644
--- a/tests/test_orient_sync_voting.py
+++ b/tests/test_orient_sync_voting.py
@@ -52,7 +52,7 @@ def source_orientation_objs(resolution, offsets, dtype):
     src = Simulation(
         n=50,
         L=resolution,
-        vols=AsymmetricVolume(L=resolution, C=1, K=100, seed=0).generate(),
+        vols=AsymmetricVolume(L=resolution, C=1, K=100, seed=0, dtype=dtype).generate(),
         offsets=offsets,
         amplitudes=1,
         seed=0,

From 52a099e2a91c0a5ad87da14a4d0448e8ac5e8eda Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 3 Jul 2024 10:24:05 -0400
Subject: [PATCH 47/60] mark float64 and odd sync3n as expensive

---
 tests/test_commonline_sync3n.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_commonline_sync3n.py b/tests/test_commonline_sync3n.py
index 119eec2ae1..6640fa871f 100644
--- a/tests/test_commonline_sync3n.py
+++ b/tests/test_commonline_sync3n.py
@@ -12,18 +12,16 @@
 
 RESOLUTION = [
     40,
-    41,
+    pytest.param(41, marks=pytest.mark.expensive),
 ]
 
-# `None` defaults to random offsets.
 OFFSETS = [
     0,
 ]
 
 DTYPES = [
     np.float32,
-    #    pytest.param(np.float64, marks=pytest.mark.expensive),
-    np.float64,
+    pytest.param(np.float64, marks=pytest.mark.expensive),
 ]
 
 
@@ -47,7 +45,9 @@ def source_orientation_objs(resolution, offsets, dtype):
     src = Simulation(
         n=100,
         L=resolution,
-        vols=AsymmetricVolume(L=resolution, C=1, K=100, seed=123).generate(),
+        vols=AsymmetricVolume(
+            L=resolution, C=1, K=100, seed=123, dtype=dtype
+        ).generate(),
         offsets=offsets,
         amplitudes=1,
         seed=456,

From 1124033c904e9c769f364aa60ab5dc330372cbfe Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 19 Jul 2024 08:45:47 -0400
Subject: [PATCH 48/60] first pass addressing review remarks

---
 src/aspire/abinitio/commonline_sync3n.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 45456f8a70..4c06d882d9 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -122,7 +122,7 @@ def __init__(
                 logger.info("GPU not found, defaulting to numpy.")
 
         except ModuleNotFoundError:
-            logger.info("cupy not found, defaulting numpy.")
+            logger.info("cupy not found, defaulting to numpy.")
 
     ###########################################
     # High level algorithm steps              #
@@ -159,7 +159,7 @@ def _sync3n_S_to_rot(self, S, W=None, n_eigs=4):
         Use eigen decomposition of S to estimate transforms,
         then project transforms to nearest rotations.
 
-        :param S: Numpy array represeting Synchronization matrix.
+        :param S: Numpy array representing Synchronization matrix.
         :param W: Optional weights array, default `None` is equal weighting of `S`.
         :param n_eigs: Optional, number of eigenvalues to compute (min 3).
         """
@@ -177,7 +177,7 @@ def _sync3n_S_to_rot(self, S, W=None, n_eigs=4):
                     f" Received {W.shape}."
                 )
             # Initialize D
-            D = np.mean(W, axis=1)  # D, check axis
+            D = np.mean(W, axis=1)
 
             Dhalf = D
             # Compute mask of trouble D values
@@ -197,10 +197,10 @@ def _sync3n_S_to_rot(self, S, W=None, n_eigs=4):
                 logger.warning(f"Large Weights Matrix Normalization Error: {err}")
 
             # Make W of size 3Nx3N
-            W = np.kron(W, np.ones((3, 3)))
+            W = np.kron(W, np.ones((3, 3), dtype=self.dtype))
 
             # Make Dhalf of size 3Nx3N
-            Dhalf = np.diag(np.kron(np.diag(Dhalf), np.ones((1, 3)))[0])
+            Dhalf = np.diag(np.kron(np.diag(Dhalf), np.ones(3, dtype=self.dtype)))
 
             # Apply weights to S
             S = Dhalf @ (W * S) @ Dhalf
@@ -333,7 +333,7 @@ def _body(prev_too_low, Pmin, Pmax, hist, p_domain_limit=p_domain_limit):
             i += 1
 
         # Pack W
-        W = np.zeros((self.n_img, self.n_img))
+        W = np.zeros((self.n_img, self.n_img), dtype=self.dtype)
         idx = 0
         for i in range(self.n_img):
             for j in range(i + 1, self.n_img):
@@ -375,7 +375,7 @@ def _triangle_scores_inner_host(self, Rijs):
         h = 1 / self.hist_intervals
 
         c = np.empty((4), dtype=Rijs.dtype)
-        for i in trange(self.n_img, desc="Computing triangle scores"):
+        for i in trange(self.n_img - 2, desc="Computing triangle scores"):
             for j in range(
                 i + 1, self.n_img - 1
             ):  # check bound (taken from MATLAB mex)
@@ -525,7 +525,7 @@ def _pairs_probabilities_host(self, Rijs, P2, A, a, B, b, x0):
         ln_f_arb = np.zeros(len(Rijs), dtype=Rijs.dtype)
 
         c = np.empty((4), dtype=Rijs.dtype)
-        for i in trange(self.n_img, desc="Computing pair probabilities"):
+        for i in trange(self.n_img - 2, desc="Computing pair probabilities"):
             for j in range(i + 1, self.n_img - 1):
                 ij = self._pairs_to_linear[i, j]
                 Rij = Rijs[ij]
@@ -715,8 +715,7 @@ def fun(x, B, P, b, x0, A=A, a=a):
 
         # Derive P and sigma
         P = P ** (1 / 3)
-        peak = x0  # can rm later
-        sigma = (1 - peak) / peak2sigma
+        sigma = (1 - x0) / peak2sigma
 
         # Initialize probability computations
         # Local histograms analysis
@@ -918,7 +917,7 @@ def _signs_times_v_host(self, Rijs, vec):
         desc = "Computing signs_times_v"
         if self.J_weighting:
             desc += " with J_weighting"
-        for i in trange(self.n_img, desc=desc):
+        for i in trange(self.n_img - 2, desc=desc):
             for j in range(
                 i + 1, self.n_img - 1
             ):  # check bound (taken from MATLAB mex)

From 61c6a89ef1f246ee78242ea57bc5f3f15f7a8307 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 19 Jul 2024 08:51:42 -0400
Subject: [PATCH 49/60] move initial rotation estimate lines into
 estimate_rotations

---
 src/aspire/abinitio/commonline_sync3n.py | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 4c06d882d9..77f964c523 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -134,8 +134,14 @@ def estimate_rotations(self):
         :return: Array of rotation matrices, size n_imgx3x3.
         """
 
+        logger.info(f"Estimating relative viewing directions for {self.n_img} images.")
+
+        # Detect a single pair of common-lines between each pair of images
+        self.build_clmatrix()
+
         # Initial estimate of viewing directions
-        Rijs0 = self._estimate_relative_viewing_directions()
+        # Calculate relative rotations
+        Rijs0 = self._estimate_all_Rijs(self.clmatrix)
 
         # Compute and apply global handedness
         Rijs = self._global_J_sync(Rijs0)
@@ -743,20 +749,6 @@ def fun(x, B, P, b, x0, A=A, a=a):
     # Primary Methods                         #
     ###########################################
 
-    def _estimate_relative_viewing_directions(self):
-        """
-        Estimate the relative viewing directions vij = vi*vj^T, i<j, and vii = vi*vi^T, where
-        vi is the third row of the i'th rotation matrix Ri.
-        """
-        logger.info(f"Estimating relative viewing directions for {self.n_img} images.")
-        # Detect a single pair of common-lines between each pair of images
-        self.build_clmatrix()
-
-        # Calculate relative rotations
-        Rijs = self._estimate_all_Rijs(self.clmatrix)
-
-        return Rijs
-
     def _global_J_sync(self, Rijs):
         """
         Apply global J-synchronization.

From d8b03bf800fcfa80a2f97ab3c403be89fa8d9d87 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Fri, 9 Aug 2024 14:32:26 -0400
Subject: [PATCH 50/60] Add tqdm progress bar to the Rijs estimation loop

---
 src/aspire/abinitio/commonline_sync3n.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 77f964c523..8369d50045 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -6,7 +6,7 @@
 from scipy.optimize import curve_fit
 
 from aspire.abinitio import CLOrient3D, SyncVotingMixin
-from aspire.utils import J_conjugate, all_pairs, nearest_rotations, trange
+from aspire.utils import J_conjugate, all_pairs, nearest_rotations, tqdm, trange
 from aspire.utils.matlab_compat import stable_eigsh
 from aspire.utils.random import randn
 
@@ -783,7 +783,7 @@ def _estimate_all_Rijs(self, clmatrix):
         n_theta = self.n_theta
         Rijs = np.zeros((len(self._pairs), 3, 3))
 
-        for idx, (i, j) in enumerate(self._pairs):
+        for idx, (i, j) in enumerate(tqdm(self._pairs, desc="Estimate Rijs")):
             Rijs[idx] = self._syncmatrix_ij_vote_3n(
                 clmatrix, i, j, np.arange(n_img), n_theta
             )

From d4bf0bb2af02d66f38265da40904976b42388890 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 13 Aug 2024 15:37:49 -0400
Subject: [PATCH 51/60] Use trust region method for S weight least squares

---
 src/aspire/abinitio/commonline_sync3n.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 8369d50045..5c730e7aa2 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -716,6 +716,7 @@ def fun(x, B, P, b, x0, A=A, a=a):
             scores_hist.astype(np.float64, copy=False),
             p0=start_values,
             bounds=(lower_bounds, upper_bounds),
+            method="trf",  # MATLAB used method "LAR" with algo "Trust-Region"
         )
         B, P, b, x0 = popt
 

From e1774869ed195a92359c3493f7f28de3cbc9986e Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 19 Aug 2024 09:10:18 -0400
Subject: [PATCH 52/60] use class mangled names for gpu methods

---
 src/aspire/abinitio/commonline_sync3n.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 5c730e7aa2..e8149a4921 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -108,7 +108,7 @@ def __init__(
             )
 
         # Auto configure GPU
-        self._gpu_module = None
+        self.__gpu_module = None
         try:
             import cupy as cp
 
@@ -117,7 +117,7 @@ def __init__(
                 logger.info(
                     f"cupy and GPU {gpu_id} found by cuda runtime; enabling cupy."
                 )
-                self._gpu_module = self._init_cupy_module()
+                self.__gpu_module = self.__init_cupy_module()
             else:
                 logger.info("GPU not found, defaulting to numpy.")
 
@@ -360,7 +360,7 @@ def _triangle_scores_inner(self, Rijs):
         """
 
         # host/gpu dispatch
-        if self._gpu_module:
+        if self.__gpu_module:
             scores_hist = self._triangle_scores_inner_cupy(Rijs)
         else:
             scores_hist = self._triangle_scores_inner_host(Rijs)
@@ -460,7 +460,7 @@ def _triangle_scores_inner_cupy(self, Rijs):
 
         import cupy as cp
 
-        triangle_scores = self._gpu_module.get_function("triangle_scores_inner")
+        triangle_scores = self.__gpu_module.get_function("triangle_scores_inner")
 
         Rijs_dev = cp.array(Rijs, dtype=np.float64)
 
@@ -511,7 +511,7 @@ def _pairs_probabilities(self, Rijs, P2, A, a, B, b, x0):
         params = np.array([P2, A, a, B, b, x0], dtype=np.float64)
 
         # host/gpu dispatch
-        if self._gpu_module:
+        if self.__gpu_module:
             ln_f_ind, ln_f_arb = self._pairs_probabilities_cupy(Rijs, *params)
         else:
             ln_f_ind, ln_f_arb = self._pairs_probabilities_host(Rijs, *params)
@@ -625,7 +625,7 @@ def _pairs_probabilities_cupy(self, Rijs, P2, A, a, B, b, x0):
 
         import cupy as cp
 
-        pairs_probabilities = self._gpu_module.get_function("pairs_probabilities")
+        pairs_probabilities = self.__gpu_module.get_function("pairs_probabilities")
 
         Rijs_dev = cp.array(Rijs, dtype=np.float64)
         ln_f_ind_dev = cp.zeros((self.n_img * (self.n_img - 1) // 2), dtype=np.float64)
@@ -886,7 +886,7 @@ def _signs_times_v(self, Rijs, vec):
         :return: New candidate eigenvector.
         """
         # host/gpu dispatch
-        if self._gpu_module:
+        if self.__gpu_module:
             new_vec = self._signs_times_v_cupy(Rijs, vec)
         else:
             new_vec = self._signs_times_v_host(Rijs, vec)
@@ -979,7 +979,7 @@ def _signs_times_v_cupy(self, Rijs, vec):
         """
         import cupy as cp
 
-        signs_times_v = self._gpu_module.get_function("signs_times_v")
+        signs_times_v = self.__gpu_module.get_function("signs_times_v")
 
         Rijs_dev = cp.array(Rijs, dtype=np.float64)
         vec_dev = cp.array(vec, dtype=np.float64)
@@ -1000,7 +1000,7 @@ def _signs_times_v_cupy(self, Rijs, vec):
         return new_vec
 
     @staticmethod
-    def _init_cupy_module():
+    def __init_cupy_module():
         """
         Private utility method to read in CUDA source and return as
         compiled CUPY module.

From 8a2cd5315f33f4b0dfda41cbb2e9bb1b8c3c8477 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Mon, 19 Aug 2024 09:11:32 -0400
Subject: [PATCH 53/60] Fix typo in PAIR_IDX comment (indoces -> indices)

---
 src/aspire/abinitio/commonline_sync3n.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu
index 99582b3f24..eeaee723b9 100644
--- a/src/aspire/abinitio/commonline_sync3n.cu
+++ b/src/aspire/abinitio/commonline_sync3n.cu
@@ -1,5 +1,5 @@
 
-/* from i,j indoces to the common index in the N-choose-2 sized array */
+/* from i,j indices to the common index in the N-choose-2 sized array */
 #define PAIR_IDX(N,I,J) ((2*N-I-1)*I/2 + J-I-1)
 
 

From a89fb8f22678b637564c6d9d959ddc4fe0bfa847 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 27 Aug 2024 09:53:55 -0400
Subject: [PATCH 54/60] Add disable_gpu sync3n flag

---
 src/aspire/abinitio/commonline_sync3n.py | 31 ++++++++++++++----------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index e8149a4921..5e34491d42 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -57,6 +57,7 @@ def __init__(
         S_weighting=False,
         J_weighting=False,
         hist_intervals=100,
+        disable_gpu=False,
     ):
         """
         Initialize object for estimating 3D orientations.
@@ -77,6 +78,9 @@ def __init__(
             signs when computing `signs_times_v`.
         :param hist_intervals: Number of histogram bins used to
             compute triangle scores when `S_weighting` enabled.
+        :param disable_gpu: Disables GPU acceleration;
+            forces CPU only code for this module.
+            Defaults to automatically using GPU when available.
         """
 
         super().__init__(
@@ -109,20 +113,21 @@ def __init__(
 
         # Auto configure GPU
         self.__gpu_module = None
-        try:
-            import cupy as cp
-
-            if cp.cuda.runtime.getDeviceCount() >= 1:
-                gpu_id = cp.cuda.runtime.getDevice()
-                logger.info(
-                    f"cupy and GPU {gpu_id} found by cuda runtime; enabling cupy."
-                )
-                self.__gpu_module = self.__init_cupy_module()
-            else:
-                logger.info("GPU not found, defaulting to numpy.")
+        if not disable_gpu:
+            try:
+                import cupy as cp
+
+                if cp.cuda.runtime.getDeviceCount() >= 1:
+                    gpu_id = cp.cuda.runtime.getDevice()
+                    logger.info(
+                        f"cupy and GPU {gpu_id} found by cuda runtime; enabling cupy."
+                    )
+                    self.__gpu_module = self.__init_cupy_module()
+                else:
+                    logger.info("GPU not found, defaulting to numpy.")
 
-        except ModuleNotFoundError:
-            logger.info("cupy not found, defaulting to numpy.")
+            except ModuleNotFoundError:
+                logger.info("cupy not found, defaulting to numpy.")
 
     ###########################################
     # High level algorithm steps              #

From 3d8da44468b2b11f3a3d78d2deb69124daaa676c Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 27 Aug 2024 10:02:34 -0400
Subject: [PATCH 55/60] Fix docstring typo: weights are named W, not P

---
 src/aspire/abinitio/commonline_sync3n.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 5e34491d42..e2186a5bc6 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -277,7 +277,7 @@ def _syncmatrix_weights(
     ):
         """
         Given relative rotations matrix `Rij`,
-        compute and return probability weights `P` for S.
+        compute and return probability weights `W` for S.
 
         Default parameters here were taken from those in the MATLAB
         code, with the original author noting they were found

From ff56876b5040f53205b8aa08867101ec2b69fac3 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 27 Aug 2024 10:06:14 -0400
Subject: [PATCH 56/60] use more specific language instead of resolution

---
 src/aspire/abinitio/commonline_sync3n.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index e2186a5bc6..3b02bb8d2d 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -65,8 +65,8 @@ def __init__(
         :param src: The source object of 2D denoised or class-averaged images with metadata
         :param n_rad: The number of points in the radial direction
         :param n_theta: The number of points in the theta direction
-        :param max_shift: Maximum range for shifts as a proportion of resolution. Default = 0.15.
-        :param shift_step: Resolution of shift estimation in pixels. Default = 1 pixel.
+        :param max_shift: Maximum range for shifts as a proportion of box size. Default = 0.15.
+        :param shift_step: Step size of shift estimation in pixels. Default = 1 pixel.
         :param epsilon: Tolerance for the power method.
         :param max_iter: Maximum iterations for the power method.
         :param seed: Optional seed for RNG.

From a7f77b9d43641a24706298a80a50882d890a446c Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 27 Aug 2024 10:53:40 -0400
Subject: [PATCH 57/60] Replace histogram logic

---
 src/aspire/abinitio/commonline_sync3n.py | 29 +++++++-----------------
 1 file changed, 8 insertions(+), 21 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index 3b02bb8d2d..e3b1c50edc 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -383,9 +383,9 @@ def _triangle_scores_inner_host(self, Rijs):
 
         # Initialize probability result arrays
         scores_hist = np.zeros(self.hist_intervals, dtype=np.uint32)
-        h = 1 / self.hist_intervals
 
         c = np.empty((4), dtype=Rijs.dtype)
+        s = np.empty((3), dtype=Rijs.dtype)
         for i in trange(self.n_img - 2, desc="Computing triangle scores"):
             for j in range(
                 i + 1, self.n_img - 1
@@ -427,28 +427,15 @@ def _triangle_scores_inner_host(self, Rijs):
                         alt_ij_ik = c[self._ALTS[1][best_i][2]]
 
                     # Compute scores
-                    s_ij_jk = 1 - np.sqrt(best_val / alt_ij_jk)
-                    s_ik_jk = 1 - np.sqrt(best_val / alt_ik_jk)
-                    s_ij_ik = 1 - np.sqrt(best_val / alt_ij_ik)
+                    s[0] = 1 - np.sqrt(best_val / alt_ij_jk)  # s_ij_jk
+                    s[1] = 1 - np.sqrt(best_val / alt_ik_jk)  # s_ik_jk
+                    s[2] = 1 - np.sqrt(best_val / alt_ij_ik)  # s_ij_ik
 
                     # Update histogram
-                    threshold = 0
-                    for _l1 in range(self.hist_intervals - 1):
-                        threshold += h
-                        if s_ij_jk < threshold:
-                            break
-
-                    threshold = 0
-                    for _l2 in range(self.hist_intervals - 1):
-                        threshold += h
-                        if s_ik_jk < threshold:
-                            break
-
-                    threshold = 0
-                    for _l3 in range(self.hist_intervals - 1):
-                        threshold += h
-                        if s_ij_ik < threshold:
-                            break
+                    # Find integer bin [0,self.hist_intervals)
+                    _l1, _l2, _l3 = np.minimum(
+                        (self.hist_intervals * s).astype(int),  # implicit floor
+                        self.hist_intervals-1)  # clamp upper bound
 
                     scores_hist[_l1] += 1
                     scores_hist[_l2] += 1

From bd34d3d7dd3bdd9d1c432046437395124432e483 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Tue, 27 Aug 2024 11:27:51 -0400
Subject: [PATCH 58/60] factor out sync3n score body

---
 src/aspire/abinitio/commonline_sync3n.py | 126 ++++++++++-------------
 1 file changed, 56 insertions(+), 70 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index e3b1c50edc..e0ed1514e0 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -372,6 +372,54 @@ def _triangle_scores_inner(self, Rijs):
 
         return scores_hist
 
+    def _scores_inner_body(self, Rijs, c, s, i, j, k):
+        """
+        Private method to compute scores `s`
+        given rotations `Rijs` and indices `i`, `j`, `k`.
+
+        Note arrays `Rijs`, `c`, and `s` are passed by reference from caller.
+        """
+
+        ij = self._pairs_to_linear[i, j]
+        ik = self._pairs_to_linear[i, k]
+        jk = self._pairs_to_linear[j, k]
+        Rij = Rijs[ij]
+        Rik = Rijs[ik]
+        Rjk = Rijs[jk]
+
+        # Compute conjugated rots
+        Rij_J = J_conjugate(Rij)
+        Rik_J = J_conjugate(Rik)
+        Rjk_J = J_conjugate(Rjk)
+
+        # Compute R muls and norms
+        c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
+        c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
+        c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
+        c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
+
+        # Find best match
+        best_i = np.argmin(c)
+        best_val = c[best_i]
+
+        # For each triangle side, find the best alternative
+        alt_ij_jk = c[self._ALTS[0][best_i][0]]
+        if c[self._ALTS[1][best_i][0]] < alt_ij_jk:
+            alt_ij_jk = c[self._ALTS[1][best_i][0]]
+
+        alt_ik_jk = c[self._ALTS[0][best_i][1]]
+        if c[self._ALTS[1][best_i][1]] < alt_ik_jk:
+            alt_ik_jk = c[self._ALTS[1][best_i][1]]
+
+        alt_ij_ik = c[self._ALTS[0][best_i][2]]
+        if c[self._ALTS[1][best_i][2]] < alt_ij_ik:
+            alt_ij_ik = c[self._ALTS[1][best_i][2]]
+
+        # Compute scores
+        s[0] = 1 - np.sqrt(best_val / alt_ij_jk)  # s_ij_jk
+        s[1] = 1 - np.sqrt(best_val / alt_ik_jk)  # s_ik_jk
+        s[2] = 1 - np.sqrt(best_val / alt_ij_ik)  # s_ij_ik
+
     def _triangle_scores_inner_host(self, Rijs):
         """
         See _triangle_scores_inner.
@@ -390,52 +438,17 @@ def _triangle_scores_inner_host(self, Rijs):
             for j in range(
                 i + 1, self.n_img - 1
             ):  # check bound (taken from MATLAB mex)
-                ij = self._pairs_to_linear[i, j]
-                Rij = Rijs[ij]
                 for k in range(j + 1, self.n_img):
-                    ik = self._pairs_to_linear[i, k]
-                    jk = self._pairs_to_linear[j, k]
-                    Rik = Rijs[ik]
-                    Rjk = Rijs[jk]
-
-                    # Compute conjugated rotats
-                    Rij_J = J_conjugate(Rij)
-                    Rik_J = J_conjugate(Rik)
-                    Rjk_J = J_conjugate(Rjk)
-
-                    # Compute R muls and norms
-                    c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
-                    c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
-                    c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
-                    c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
-
-                    # Find best match
-                    best_i = np.argmin(c)
-                    best_val = c[best_i]
-
-                    # For each triangle side, find the best alternative
-                    alt_ij_jk = c[self._ALTS[0][best_i][0]]
-                    if c[self._ALTS[1][best_i][0]] < alt_ij_jk:
-                        alt_ij_jk = c[self._ALTS[1][best_i][0]]
-
-                    alt_ik_jk = c[self._ALTS[0][best_i][1]]
-                    if c[self._ALTS[1][best_i][1]] < alt_ik_jk:
-                        alt_ik_jk = c[self._ALTS[1][best_i][1]]
-
-                    alt_ij_ik = c[self._ALTS[0][best_i][2]]
-                    if c[self._ALTS[1][best_i][2]] < alt_ij_ik:
-                        alt_ij_ik = c[self._ALTS[1][best_i][2]]
 
                     # Compute scores
-                    s[0] = 1 - np.sqrt(best_val / alt_ij_jk)  # s_ij_jk
-                    s[1] = 1 - np.sqrt(best_val / alt_ik_jk)  # s_ik_jk
-                    s[2] = 1 - np.sqrt(best_val / alt_ij_ik)  # s_ij_ik
+                    self._scores_inner_body(Rijs, c, s, i, j, k)
 
                     # Update histogram
                     # Find integer bin [0,self.hist_intervals)
                     _l1, _l2, _l3 = np.minimum(
                         (self.hist_intervals * s).astype(int),  # implicit floor
-                        self.hist_intervals-1)  # clamp upper bound
+                        self.hist_intervals - 1,
+                    )  # clamp upper bound
 
                     scores_hist[_l1] += 1
                     scores_hist[_l2] += 1
@@ -523,46 +536,19 @@ def _pairs_probabilities_host(self, Rijs, P2, A, a, B, b, x0):
         ln_f_arb = np.zeros(len(Rijs), dtype=Rijs.dtype)
 
         c = np.empty((4), dtype=Rijs.dtype)
+        s = np.empty((3), dtype=Rijs.dtype)
         for i in trange(self.n_img - 2, desc="Computing pair probabilities"):
             for j in range(i + 1, self.n_img - 1):
                 ij = self._pairs_to_linear[i, j]
-                Rij = Rijs[ij]
                 for k in range(j + 1, self.n_img):
                     ik = self._pairs_to_linear[i, k]
                     jk = self._pairs_to_linear[j, k]
-                    Rik = Rijs[ik]
-                    Rjk = Rijs[jk]
-
-                    # Compute conjugated rotats
-                    Rij_J = J_conjugate(Rij)
-                    Rik_J = J_conjugate(Rik)
-                    Rjk_J = J_conjugate(Rjk)
-
-                    # Compute R muls and norms
-                    c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
-                    c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
-                    c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
-                    c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
-
-                    # Find best match
-                    best_i = np.argmin(c)
-                    best_val = c[best_i]
-
-                    # For each triangle side, find the best alternative
-                    alt_ij_jk = c[self._ALTS[0][best_i][0]]
-                    if c[self._ALTS[1][best_i][0]] < alt_ij_jk:
-                        alt_ij_jk = c[self._ALTS[1][best_i][0]]
-                    alt_ik_jk = c[self._ALTS[0][best_i][1]]
-                    if c[self._ALTS[1][best_i][1]] < alt_ik_jk:
-                        alt_ik_jk = c[self._ALTS[1][best_i][1]]
-                    alt_ij_ik = c[self._ALTS[0][best_i][2]]
-                    if c[self._ALTS[1][best_i][2]] < alt_ij_ik:
-                        alt_ij_ik = c[self._ALTS[1][best_i][2]]
 
                     # Compute scores
-                    s_ij_jk = 1 - np.sqrt(best_val / alt_ij_jk)
-                    s_ik_jk = 1 - np.sqrt(best_val / alt_ik_jk)
-                    s_ij_ik = 1 - np.sqrt(best_val / alt_ij_ik)
+                    self._scores_inner_body(Rijs, c, s, i, j, k)
+
+                    # Unpack scores to local formula vars
+                    s_ij_jk, s_ik_jk, s_ij_ik = s
 
                     # Update probabilities
                     # # Probability of pair ij having score given indicicative common line

From 068ec9afb58370c1c3649b8bdcb7cf446ada021e Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 28 Aug 2024 08:01:31 -0400
Subject: [PATCH 59/60] Revert "factor out sync3n score body"

This reverts commit bd34d3d7dd3bdd9d1c432046437395124432e483.
---
 src/aspire/abinitio/commonline_sync3n.py | 126 +++++++++++++----------
 1 file changed, 70 insertions(+), 56 deletions(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index e0ed1514e0..e3b1c50edc 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -372,54 +372,6 @@ def _triangle_scores_inner(self, Rijs):
 
         return scores_hist
 
-    def _scores_inner_body(self, Rijs, c, s, i, j, k):
-        """
-        Private method to compute scores `s`
-        given rotations `Rijs` and indices `i`, `j`, `k`.
-
-        Note arrays `Rijs`, `c`, and `s` are passed by reference from caller.
-        """
-
-        ij = self._pairs_to_linear[i, j]
-        ik = self._pairs_to_linear[i, k]
-        jk = self._pairs_to_linear[j, k]
-        Rij = Rijs[ij]
-        Rik = Rijs[ik]
-        Rjk = Rijs[jk]
-
-        # Compute conjugated rots
-        Rij_J = J_conjugate(Rij)
-        Rik_J = J_conjugate(Rik)
-        Rjk_J = J_conjugate(Rjk)
-
-        # Compute R muls and norms
-        c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
-        c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
-        c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
-        c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
-
-        # Find best match
-        best_i = np.argmin(c)
-        best_val = c[best_i]
-
-        # For each triangle side, find the best alternative
-        alt_ij_jk = c[self._ALTS[0][best_i][0]]
-        if c[self._ALTS[1][best_i][0]] < alt_ij_jk:
-            alt_ij_jk = c[self._ALTS[1][best_i][0]]
-
-        alt_ik_jk = c[self._ALTS[0][best_i][1]]
-        if c[self._ALTS[1][best_i][1]] < alt_ik_jk:
-            alt_ik_jk = c[self._ALTS[1][best_i][1]]
-
-        alt_ij_ik = c[self._ALTS[0][best_i][2]]
-        if c[self._ALTS[1][best_i][2]] < alt_ij_ik:
-            alt_ij_ik = c[self._ALTS[1][best_i][2]]
-
-        # Compute scores
-        s[0] = 1 - np.sqrt(best_val / alt_ij_jk)  # s_ij_jk
-        s[1] = 1 - np.sqrt(best_val / alt_ik_jk)  # s_ik_jk
-        s[2] = 1 - np.sqrt(best_val / alt_ij_ik)  # s_ij_ik
-
     def _triangle_scores_inner_host(self, Rijs):
         """
         See _triangle_scores_inner.
@@ -438,17 +390,52 @@ def _triangle_scores_inner_host(self, Rijs):
             for j in range(
                 i + 1, self.n_img - 1
             ):  # check bound (taken from MATLAB mex)
+                ij = self._pairs_to_linear[i, j]
+                Rij = Rijs[ij]
                 for k in range(j + 1, self.n_img):
+                    ik = self._pairs_to_linear[i, k]
+                    jk = self._pairs_to_linear[j, k]
+                    Rik = Rijs[ik]
+                    Rjk = Rijs[jk]
+
+                    # Compute conjugated rotats
+                    Rij_J = J_conjugate(Rij)
+                    Rik_J = J_conjugate(Rik)
+                    Rjk_J = J_conjugate(Rjk)
+
+                    # Compute R muls and norms
+                    c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
+                    c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
+                    c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
+                    c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
+
+                    # Find best match
+                    best_i = np.argmin(c)
+                    best_val = c[best_i]
+
+                    # For each triangle side, find the best alternative
+                    alt_ij_jk = c[self._ALTS[0][best_i][0]]
+                    if c[self._ALTS[1][best_i][0]] < alt_ij_jk:
+                        alt_ij_jk = c[self._ALTS[1][best_i][0]]
+
+                    alt_ik_jk = c[self._ALTS[0][best_i][1]]
+                    if c[self._ALTS[1][best_i][1]] < alt_ik_jk:
+                        alt_ik_jk = c[self._ALTS[1][best_i][1]]
+
+                    alt_ij_ik = c[self._ALTS[0][best_i][2]]
+                    if c[self._ALTS[1][best_i][2]] < alt_ij_ik:
+                        alt_ij_ik = c[self._ALTS[1][best_i][2]]
 
                     # Compute scores
-                    self._scores_inner_body(Rijs, c, s, i, j, k)
+                    s[0] = 1 - np.sqrt(best_val / alt_ij_jk)  # s_ij_jk
+                    s[1] = 1 - np.sqrt(best_val / alt_ik_jk)  # s_ik_jk
+                    s[2] = 1 - np.sqrt(best_val / alt_ij_ik)  # s_ij_ik
 
                     # Update histogram
                     # Find integer bin [0,self.hist_intervals)
                     _l1, _l2, _l3 = np.minimum(
                         (self.hist_intervals * s).astype(int),  # implicit floor
-                        self.hist_intervals - 1,
-                    )  # clamp upper bound
+                        self.hist_intervals-1)  # clamp upper bound
 
                     scores_hist[_l1] += 1
                     scores_hist[_l2] += 1
@@ -536,19 +523,46 @@ def _pairs_probabilities_host(self, Rijs, P2, A, a, B, b, x0):
         ln_f_arb = np.zeros(len(Rijs), dtype=Rijs.dtype)
 
         c = np.empty((4), dtype=Rijs.dtype)
-        s = np.empty((3), dtype=Rijs.dtype)
         for i in trange(self.n_img - 2, desc="Computing pair probabilities"):
             for j in range(i + 1, self.n_img - 1):
                 ij = self._pairs_to_linear[i, j]
+                Rij = Rijs[ij]
                 for k in range(j + 1, self.n_img):
                     ik = self._pairs_to_linear[i, k]
                     jk = self._pairs_to_linear[j, k]
+                    Rik = Rijs[ik]
+                    Rjk = Rijs[jk]
 
-                    # Compute scores
-                    self._scores_inner_body(Rijs, c, s, i, j, k)
+                    # Compute conjugated rotats
+                    Rij_J = J_conjugate(Rij)
+                    Rik_J = J_conjugate(Rik)
+                    Rjk_J = J_conjugate(Rjk)
+
+                    # Compute R muls and norms
+                    c[0] = np.sum(((Rij @ Rjk) - Rik) ** 2)
+                    c[1] = np.sum(((Rij_J @ Rjk) - Rik) ** 2)
+                    c[2] = np.sum(((Rij @ Rjk_J) - Rik) ** 2)
+                    c[3] = np.sum(((Rij @ Rjk) - Rik_J) ** 2)
 
-                    # Unpack scores to local formula vars
-                    s_ij_jk, s_ik_jk, s_ij_ik = s
+                    # Find best match
+                    best_i = np.argmin(c)
+                    best_val = c[best_i]
+
+                    # For each triangle side, find the best alternative
+                    alt_ij_jk = c[self._ALTS[0][best_i][0]]
+                    if c[self._ALTS[1][best_i][0]] < alt_ij_jk:
+                        alt_ij_jk = c[self._ALTS[1][best_i][0]]
+                    alt_ik_jk = c[self._ALTS[0][best_i][1]]
+                    if c[self._ALTS[1][best_i][1]] < alt_ik_jk:
+                        alt_ik_jk = c[self._ALTS[1][best_i][1]]
+                    alt_ij_ik = c[self._ALTS[0][best_i][2]]
+                    if c[self._ALTS[1][best_i][2]] < alt_ij_ik:
+                        alt_ij_ik = c[self._ALTS[1][best_i][2]]
+
+                    # Compute scores
+                    s_ij_jk = 1 - np.sqrt(best_val / alt_ij_jk)
+                    s_ik_jk = 1 - np.sqrt(best_val / alt_ik_jk)
+                    s_ij_ik = 1 - np.sqrt(best_val / alt_ij_ik)
 
                     # Update probabilities
                     # # Probability of pair ij having score given indicicative common line

From f094f03fbc63c6cc7be6cb1de2313c0d65e50002 Mon Sep 17 00:00:00 2001
From: Garrett Wright <garrettwrong@gmail.com>
Date: Wed, 28 Aug 2024 08:04:05 -0400
Subject: [PATCH 60/60] black style

---
 src/aspire/abinitio/commonline_sync3n.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py
index e3b1c50edc..3c40eb3ac5 100644
--- a/src/aspire/abinitio/commonline_sync3n.py
+++ b/src/aspire/abinitio/commonline_sync3n.py
@@ -435,7 +435,8 @@ def _triangle_scores_inner_host(self, Rijs):
                     # Find integer bin [0,self.hist_intervals)
                     _l1, _l2, _l3 = np.minimum(
                         (self.hist_intervals * s).astype(int),  # implicit floor
-                        self.hist_intervals-1)  # clamp upper bound
+                        self.hist_intervals - 1,
+                    )  # clamp upper bound
 
                     scores_hist[_l1] += 1
                     scores_hist[_l2] += 1