From ca2d15d35f40d8fcb910d097efa84aee144e7e92 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Wed, 14 Aug 2024 10:53:00 -0400 Subject: [PATCH 01/45] quick and dirty cupy build_clmatrix --- src/aspire/abinitio/commonline_base.py | 144 +++++++++++++++++++++++-- 1 file changed, 135 insertions(+), 9 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py index c0c3718803..6c222eafa8 100644 --- a/src/aspire/abinitio/commonline_base.py +++ b/src/aspire/abinitio/commonline_base.py @@ -1,10 +1,12 @@ import logging import math +from time import perf_counter import numpy as np import scipy.sparse as sparse from aspire.image import Image +from aspire.numeric import fft, xp from aspire.operators import PolarFT from aspire.utils import common_line_from_rots, fuzzy_mask, tqdm from aspire.utils.random import choice @@ -129,6 +131,22 @@ def estimate_rotations(self): raise NotImplementedError("subclasses should implement this") def build_clmatrix(self): + """dispatch, compare, dispair""" + + tic1 = perf_counter() + clmatrix_orig = self.build_clmatrix_orig() + tic2 = perf_counter() + clmatrix_cupy = self.build_clmatrix_cupy() + tic3 = perf_counter() + print(f"Orig CL build: {tic2-tic1}") + print(f"Cupy CL build: {tic3-tic2}") + + np.testing.assert_allclose(clmatrix_orig[0], clmatrix_cupy[0]) + np.testing.assert_allclose(clmatrix_orig[1], clmatrix_cupy[1]) + + self.shifts_1d, self.clmatrix = clmatrix_cupy + + def build_clmatrix_orig(self): """ Build common-lines matrix from Fourier stack of 2D images """ @@ -177,10 +195,11 @@ def build_clmatrix(self): shifts, shift_phases, h = self._generate_shift_phase_and_filter( r_max, max_shift, shift_step ) + shifts, shift_phases = xp.asnumpy(shifts), xp.asnumpy(shift_phases) # Apply bandpass filter, normalize each ray of each image # Note that only use half of each ray - pf = self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h) + pf = xp.asnumpy(self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h)) # Setup a progress bar _total_pairs_to_test = self.n_img * (self.n_check - 1) // 2 @@ -233,8 +252,115 @@ def build_clmatrix(self): pbar.update() pbar.close() - self.clmatrix = clmatrix - self.shifts_1d = shifts_1d + return shifts_1d, clmatrix + + def build_clmatrix_cupy(self): + """ + Build common-lines matrix from Fourier stack of 2D images + """ + + n_img = self.n_img + n_check = self.n_check + + if self.n_theta % 2 == 1: + msg = "n_theta must be even" + logger.error(msg) + raise NotImplementedError(msg) + + n_theta_half = self.n_theta // 2 + + # need to do a copy to prevent modifying self.pf for other functions + # also places on GPU when xp is in cupy mode. + pf = xp.array(self.pf) + + # Allocate local variables for return + # clmatrix represents the common lines matrix. + # Namely, clmatrix[i,j] contains the index in image i of + # the common line with image j. Note the common line index + # starts from 0 instead of 1 as Matlab version. -1 means + # there is no common line such as clmatrix[i,i]. + clmatrix = -xp.ones((n_img, n_img), dtype=self.dtype) + # When cl_dist[i, j] is not -1, it stores the maximum value + # of correlation between image i and j for all possible 1D shifts. + # We will use cl_dist[i, j] = -1 (including j<=i) to + # represent that there is no need to check common line + # between i and j. Since it is symmetric, + # only above the diagonal entries are necessary. + cl_dist = -xp.ones((n_img, n_img), dtype=self.dtype) + + # Allocate variables used for shift estimation + + # set maximum value of 1D shift (in pixels) to search + # between common-lines. + max_shift = self.max_shift + # Set resolution of shift estimation in pixels. Note that + # shift_step can be any positive real number. + shift_step = self.shift_step + # 1D shift between common-lines + shifts_1d = xp.zeros((n_img, n_img)) + + # Prepare the shift phases to try and generate filter for common-line detection + r_max = pf.shape[2] + shifts, shift_phases, h = self._generate_shift_phase_and_filter( + r_max, max_shift, shift_step + ) + + # Apply bandpass filter, normalize each ray of each image + # Note that only use half of each ray + pf = self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h) + + # Setup a progress bar + _total_pairs_to_test = self.n_img * (self.n_check - 1) // 2 + pbar = tqdm(desc="Searching over common line pairs", total=_total_pairs_to_test) + + # Search for common lines between [i, j] pairs of images. + # Creating pf and building common lines are different to the Matlab version. + # The random selection is implemented. + for i in range(n_img - 1): + p1 = pf[i] + p1_real = xp.real(p1) + p1_imag = xp.imag(p1) + + # build the subset of j images if n_check < n_img + n_remaining = n_img - i - 1 + n_j = min(n_remaining, n_check) + subset_j = np.sort(choice(n_remaining, n_j, replace=False) + i + 1) + + for j in subset_j: + p2_flipped = xp.conj(pf[j]) + + for shift in range(len(shifts)): + shift_phase = shift_phases[shift] + p2_shifted_flipped = (shift_phase * p2_flipped).T + # Compute correlations in the positive r direction + part1 = p1_real.dot(p2_shifted_flipped.real) + # Compute correlations in the negative r direction + part2 = p1_imag.dot(p2_shifted_flipped.imag) + + c1 = part1 - part2 + sidx = c1.argmax() + cl1, cl2 = np.unravel_index(sidx, c1.shape) + sval = c1[cl1, cl2] + + c2 = part1 + part2 + sidx = c2.argmax() + cl1_2, cl2_2 = np.unravel_index(sidx, c2.shape) + sval2 = c2[cl1_2, cl2_2] + + if sval2 > sval: + cl1 = cl1_2 + cl2 = cl2_2 + n_theta_half + sval = sval2 + sval = 2 * sval + if sval > cl_dist[i, j]: + clmatrix[i, j] = cl1 + clmatrix[j, i] = cl2 + cl_dist[i, j] = sval + shifts_1d[i, j] = shifts[shift] + pbar.update() + pbar.close() + + return xp.asnumpy(shifts_1d), xp.asnumpy(clmatrix) def estimate_shifts(self, equations_factor=1, max_memory=4000): """ @@ -488,13 +614,13 @@ def _generate_shift_phase_and_filter(self, r_max, max_shift, shift_step): n_shifts = int(np.ceil(2 * max_shift / shift_step + 1)) # only half of ray, excluding the DC component. - rk = np.arange(1, r_max + 1) + rk = xp.arange(1, r_max + 1) # Generate all shift phases - shifts = -max_shift + shift_step * np.arange(n_shifts) - shift_phases = np.exp(np.outer(shifts, -2 * np.pi * 1j * rk / (2 * r_max + 1))) + shifts = -max_shift + shift_step * xp.arange(n_shifts) + shift_phases = xp.exp(xp.outer(shifts, -2 * xp.pi * 1j * rk / (2 * r_max + 1))) # Set filter for common-line detection - h = np.sqrt(np.abs(rk)) * np.exp(-np.square(rk) / (2 * (r_max / 4) ** 2)) + h = xp.sqrt(xp.abs(rk)) * xp.exp(-xp.square(rk) / (2 * (r_max / 4) ** 2)) return shifts, shift_phases, h @@ -556,11 +682,11 @@ def _apply_filter_and_norm(self, subscripts, pf, r_max, h): # Note if we'd rather not have the dtype and casting args, # we can control h.dtype instead. - np.einsum(subscripts, pf, h, out=pf, dtype=pf.dtype, casting="same_kind") + xp.einsum(subscripts, pf, h, out=pf, dtype=pf.dtype, casting="same_kind") # This is a high pass filter, cutting out the lowest frequency # (DC has already been removed). pf[..., 0] = 0 - pf /= np.linalg.norm(pf, axis=-1)[..., np.newaxis] + pf /= xp.linalg.norm(pf, axis=-1)[..., xp.newaxis] return pf From 99e0bf281fadb442a1338caebfd6b2b26552b9a7 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Thu, 15 Aug 2024 14:04:28 -0400 Subject: [PATCH 02/45] stashing --- src/aspire/abinitio/commonline_base.cu | 87 ++++++++++++++ src/aspire/abinitio/commonline_base.py | 159 +++++++++++++++---------- src/aspire/config_default.yaml | 4 +- 3 files changed, 182 insertions(+), 68 deletions(-) create mode 100644 src/aspire/abinitio/commonline_base.cu diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu new file mode 100644 index 0000000000..78d39586fb --- /dev/null +++ b/src/aspire/abinitio/commonline_base.cu @@ -0,0 +1,87 @@ + +extern "C" __global__ +void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, double* cl_dist, double* shifts_1d, int n_shifts, int* shifts, double* shift_phases) +{ + /* n n_img */ + /* m,r st (n, m, r) = pf.shape, ie len(pf[i]) */ + + /* thread index (1d), represents "i" index */ + unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int j = blockDim.y * blockIdx.y + threadIdx.y; + + /* no-op when out of bounds */ + if(i >= n) return; + if(j >= n) return; + /* no-op lower triangle */ + if(j <= i) return; + + int ind; + int k; + int s; + int cl1, cl2; + int best_cl1, best_cl2, best_s; + double dist, best_cl_dist; + double p1,p2; + + double p1_realk, p1_imagk; + double p2conj_realk, p2conj_imagk; + double p2_realk, p2_imagk; + double shift_phases_real, shift_phases_imag; + + + best_s = -1; + best_cl1 = -1; + best_cl2 = -1; + best_cl_dist = -1/0; + + for(s=0; s best_cl_dist){ + best_cl_dist = dist; + best_cl1 = cl1; + best_cl2 = cl2; + best_s = s; + } + + dist = p1 + p2; + if(dist > best_cl_dist){ + best_cl_dist = dist; + best_cl1 = cl1; + best_cl2 = cl2 + m; // m is pf.shape[1], which should be n_theta//2... + best_s = s; + } + + } /* cl2 */ + }/* cl1 */ + } /* s */ + + /* update global best for i, j*/ + ind = i*n + j; + clmatrix[ind] = best_cl1; + clmatrix[j*n+i] = best_cl2; /* [j,i] */ + cl_dist[ind] = 2*best_cl_dist; // 2 of mystery + shifts_1d[ind] = shifts[best_s]; + +} diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py index 6c222eafa8..5da7bbf04a 100644 --- a/src/aspire/abinitio/commonline_base.py +++ b/src/aspire/abinitio/commonline_base.py @@ -1,5 +1,6 @@ import logging import math +import os from time import perf_counter import numpy as np @@ -70,6 +71,23 @@ def __init__( self.rotations = None self._pf = None + # Auto configure GPU + self.__gpu_module = None + try: + import cupy as cp + + if cp.cuda.runtime.getDeviceCount() >= 1: + gpu_id = cp.cuda.runtime.getDevice() + logger.info( + f"cupy and GPU {gpu_id} found by cuda runtime; enabling cupy." + ) + self.__gpu_module = self.__init_cupy_module() + else: + logger.info("GPU not found, defaulting to numpy.") + + except ModuleNotFoundError: + logger.info("cupy not found, defaulting to numpy.") + self._build() def _build(self): @@ -133,18 +151,22 @@ def estimate_rotations(self): def build_clmatrix(self): """dispatch, compare, dispair""" + logger.info("Begin building Common Lines Matrix") + tic1 = perf_counter() - clmatrix_orig = self.build_clmatrix_orig() + clmatrix_cu = self.build_clmatrix_cu() tic2 = perf_counter() - clmatrix_cupy = self.build_clmatrix_cupy() + print(f"Cuda CL build: {tic2-tic1}") + tic3 = perf_counter() - print(f"Orig CL build: {tic2-tic1}") - print(f"Cupy CL build: {tic3-tic2}") + clmatrix_orig = self.build_clmatrix_orig() + tic4 = perf_counter() + print(f"Orig CL build: {tic4-tic3}") - np.testing.assert_allclose(clmatrix_orig[0], clmatrix_cupy[0]) - np.testing.assert_allclose(clmatrix_orig[1], clmatrix_cupy[1]) + # np.testing.assert_allclose(clmatrix_cu[1], clmatrix_orig[1]) + # np.testing.assert_allclose( clmatrix_cu[0], clmatrix_orig[0]) - self.shifts_1d, self.clmatrix = clmatrix_cupy + self.shifts_1d, self.clmatrix = clmatrix_cu def build_clmatrix_orig(self): """ @@ -203,7 +225,9 @@ def build_clmatrix_orig(self): # Setup a progress bar _total_pairs_to_test = self.n_img * (self.n_check - 1) // 2 - pbar = tqdm(desc="Searching over common line pairs", total=_total_pairs_to_test) + pbar = tqdm( + desc="Searching over common line pairs (orig)", total=_total_pairs_to_test + ) # Search for common lines between [i, j] pairs of images. # Creating pf and building common lines are different to the Matlab version. @@ -219,6 +243,7 @@ def build_clmatrix_orig(self): subset_j = np.sort(choice(n_remaining, n_j, replace=False) + i + 1) for j in subset_j: + # for j in range(i+1,n_img): p2_flipped = np.conj(pf[j]) for shift in range(len(shifts)): @@ -254,11 +279,13 @@ def build_clmatrix_orig(self): return shifts_1d, clmatrix - def build_clmatrix_cupy(self): + def build_clmatrix_cu(self): """ Build common-lines matrix from Fourier stack of 2D images """ + import cupy as cp + n_img = self.n_img n_check = self.n_check @@ -267,11 +294,12 @@ def build_clmatrix_cupy(self): logger.error(msg) raise NotImplementedError(msg) - n_theta_half = self.n_theta // 2 + n_theta_half = self.n_theta // 2 # rm # need to do a copy to prevent modifying self.pf for other functions - # also places on GPU when xp is in cupy mode. - pf = xp.array(self.pf) + # also places on GPU + pf = cp.array(self.pf) + assert pf.shape[1] == n_theta_half # Allocate local variables for return # clmatrix represents the common lines matrix. @@ -279,14 +307,14 @@ def build_clmatrix_cupy(self): # the common line with image j. Note the common line index # starts from 0 instead of 1 as Matlab version. -1 means # there is no common line such as clmatrix[i,i]. - clmatrix = -xp.ones((n_img, n_img), dtype=self.dtype) + clmatrix = -cp.ones((n_img, n_img), dtype=np.float64) # When cl_dist[i, j] is not -1, it stores the maximum value # of correlation between image i and j for all possible 1D shifts. # We will use cl_dist[i, j] = -1 (including j<=i) to # represent that there is no need to check common line # between i and j. Since it is symmetric, # only above the diagonal entries are necessary. - cl_dist = -xp.ones((n_img, n_img), dtype=self.dtype) + cl_dist = -cp.ones((n_img, n_img), dtype=np.float64) # Allocate variables used for shift estimation @@ -297,7 +325,7 @@ def build_clmatrix_cupy(self): # shift_step can be any positive real number. shift_step = self.shift_step # 1D shift between common-lines - shifts_1d = xp.zeros((n_img, n_img)) + shifts_1d = cp.zeros((n_img, n_img), dtype=np.float64) # check dtype # Prepare the shift phases to try and generate filter for common-line detection r_max = pf.shape[2] @@ -309,58 +337,40 @@ def build_clmatrix_cupy(self): # Note that only use half of each ray pf = self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h) - # Setup a progress bar - _total_pairs_to_test = self.n_img * (self.n_check - 1) // 2 - pbar = tqdm(desc="Searching over common line pairs", total=_total_pairs_to_test) - - # Search for common lines between [i, j] pairs of images. - # Creating pf and building common lines are different to the Matlab version. - # The random selection is implemented. - for i in range(n_img - 1): - p1 = pf[i] - p1_real = xp.real(p1) - p1_imag = xp.imag(p1) - - # build the subset of j images if n_check < n_img - n_remaining = n_img - i - 1 - n_j = min(n_remaining, n_check) - subset_j = np.sort(choice(n_remaining, n_j, replace=False) + i + 1) - - for j in subset_j: - p2_flipped = xp.conj(pf[j]) - - for shift in range(len(shifts)): - shift_phase = shift_phases[shift] - p2_shifted_flipped = (shift_phase * p2_flipped).T - # Compute correlations in the positive r direction - part1 = p1_real.dot(p2_shifted_flipped.real) - # Compute correlations in the negative r direction - part2 = p1_imag.dot(p2_shifted_flipped.imag) - - c1 = part1 - part2 - sidx = c1.argmax() - cl1, cl2 = np.unravel_index(sidx, c1.shape) - sval = c1[cl1, cl2] - - c2 = part1 + part2 - sidx = c2.argmax() - cl1_2, cl2_2 = np.unravel_index(sidx, c2.shape) - sval2 = c2[cl1_2, cl2_2] + # Get kernel + build_clmatrix_kernel = self.__gpu_module.get_function("build_clmatrix_kernel") + + # Configure grid of blocks + blkszx = 32 + nblkx = (self.n_img + blkszx - 2) // blkszx + blkszy = 32 + nblky = (self.n_img + blkszy - 1) // blkszy + + # Launch + logger.info("Launching `build_clmatrix_kernel`.") + build_clmatrix_kernel( + (nblkx, nblky), + (blkszx, blkszy), + ( + pf.shape[0], + pf.shape[1], + pf.shape[2], + pf.astype(np.complex128, copy=False).view(np.float64), + clmatrix, + cl_dist, + shifts_1d, + len(shifts), + shifts, + shift_phases.astype(np.complex128, copy=False).view(np.float64), + ), + ) - if sval2 > sval: - cl1 = cl1_2 - cl2 = cl2_2 + n_theta_half - sval = sval2 - sval = 2 * sval - if sval > cl_dist[i, j]: - clmatrix[i, j] = cl1 - clmatrix[j, i] = cl2 - cl_dist[i, j] = sval - shifts_1d[i, j] = shifts[shift] - pbar.update() - pbar.close() + # Copy result device arrays to host + shifts_1d = shifts_1d.get().astype(self.dtype, copy=False) + clmatrix = clmatrix.get().astype(self.dtype, copy=False) + # cl_dist = cl_dist.get().astype(self.dtype, copy=False) - return xp.asnumpy(shifts_1d), xp.asnumpy(clmatrix) + return shifts_1d, clmatrix def estimate_shifts(self, equations_factor=1, max_memory=4000): """ @@ -682,7 +692,7 @@ def _apply_filter_and_norm(self, subscripts, pf, r_max, h): # Note if we'd rather not have the dtype and casting args, # we can control h.dtype instead. - xp.einsum(subscripts, pf, h, out=pf, dtype=pf.dtype, casting="same_kind") + pf = xp.einsum(subscripts, pf, h, dtype=pf.dtype) # casting="same_kind" # This is a high pass filter, cutting out the lowest frequency # (DC has already been removed). @@ -690,3 +700,20 @@ def _apply_filter_and_norm(self, subscripts, pf, r_max, h): pf /= xp.linalg.norm(pf, axis=-1)[..., xp.newaxis] return pf + + @staticmethod + def __init_cupy_module(): + """ + Private utility method to read in CUDA source and return as + compiled CUPY module. + """ + + import cupy as cp + + # Read in contents of file + fp = os.path.join(os.path.dirname(__file__), "commonline_base.cu") + with open(fp, "r") as fh: + module_code = fh.read() + + # CUPY compile the CUDA code + return cp.RawModule(code=module_code) diff --git a/src/aspire/config_default.yaml b/src/aspire/config_default.yaml index 9af8732dd1..6e32ead3d2 100644 --- a/src/aspire/config_default.yaml +++ b/src/aspire/config_default.yaml @@ -1,9 +1,9 @@ version: 0.13.0 common: # numeric module to use - one of numpy/cupy - numeric: numpy + numeric: cupy # fft backend to use - one of pyfftw/scipy/cupy/mkl - fft: scipy + fft: cupy # Set cache directory for ASPIRE example data. # By default the cache location will be set by pooch.os_cache(), From 80d9acdd86c5fd47ccd236f19020f0dae4e5ebe6 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Thu, 15 Aug 2024 16:13:36 -0400 Subject: [PATCH 03/45] stashing 2, clmatrix match, good speedup [skip ci] --- src/aspire/abinitio/commonline_base.cu | 12 +++++------- src/aspire/abinitio/commonline_base.py | 9 ++++----- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu index 78d39586fb..8dabbb4533 100644 --- a/src/aspire/abinitio/commonline_base.cu +++ b/src/aspire/abinitio/commonline_base.cu @@ -21,7 +21,7 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do int cl1, cl2; int best_cl1, best_cl2, best_s; double dist, best_cl_dist; - double p1,p2; + double p1, p2; double p1_realk, p1_imagk; double p2conj_realk, p2conj_imagk; @@ -29,14 +29,14 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do double shift_phases_real, shift_phases_imag; - best_s = -1; + best_s = -99999; best_cl1 = -1; best_cl2 = -1; best_cl_dist = -1/0; - for(s=0; s Date: Mon, 19 Aug 2024 10:04:42 -0400 Subject: [PATCH 04/45] cleanup --- src/aspire/abinitio/commonline_base.cu | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu index 8dabbb4533..7673b68af0 100644 --- a/src/aspire/abinitio/commonline_base.cu +++ b/src/aspire/abinitio/commonline_base.cu @@ -4,7 +4,7 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do { /* n n_img */ /* m,r st (n, m, r) = pf.shape, ie len(pf[i]) */ - + /* thread index (1d), represents "i" index */ unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; unsigned int j = blockDim.y * blockIdx.y + threadIdx.y; @@ -25,10 +25,10 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do double p1_realk, p1_imagk; double p2conj_realk, p2conj_imagk; - double p2_realk, p2_imagk; + double p2_realk, p2_imagk; double shift_phases_real, shift_phases_imag; - + best_s = -99999; best_cl1 = -1; best_cl2 = -1; @@ -71,9 +71,10 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do best_s = s; } - } /* cl2 */ - }/* cl1 */ - } /* s */ + } /* s */ + } /* cl2 */ + }/* cl1 */ + /* update global best for i, j*/ ind = i*n + j; @@ -81,5 +82,5 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do clmatrix[j*n+i] = best_cl2; /* [j,i] */ cl_dist[ind] = 2*best_cl_dist; // 2 of mystery shifts_1d[ind] = shifts[best_s]; - -} + +} From 6cb90d1205e5617708ef1634fe54e049e5f202bb Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Mon, 19 Aug 2024 10:52:58 -0400 Subject: [PATCH 05/45] implement transpose PF for better mem patterns --- src/aspire/abinitio/commonline_base.cu | 8 ++++---- src/aspire/abinitio/commonline_base.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu index 7673b68af0..670eb5fa39 100644 --- a/src/aspire/abinitio/commonline_base.cu +++ b/src/aspire/abinitio/commonline_base.cu @@ -42,10 +42,10 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do /* inner most dim of dot (matmul) */ for(k=0; k Date: Mon, 19 Aug 2024 15:06:36 -0400 Subject: [PATCH 06/45] use cu complex for CL kernel --- src/aspire/abinitio/commonline_base.cu | 26 +++++++------------------- src/aspire/abinitio/commonline_base.py | 12 ++++++------ 2 files changed, 13 insertions(+), 25 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu index 670eb5fa39..2d159b87a6 100644 --- a/src/aspire/abinitio/commonline_base.cu +++ b/src/aspire/abinitio/commonline_base.cu @@ -1,6 +1,7 @@ +#include extern "C" __global__ -void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, double* cl_dist, double* shifts_1d, int n_shifts, int* shifts, double* shift_phases) +void build_clmatrix_kernel(int n, int m, int r, const complex* __restrict__ pf, double* __restrict__ clmatrix, double* __restrict__ cl_dist, double* __restrict__ shifts_1d, int n_shifts, int* __restrict__ shifts, const complex* __restrict__ shift_phases) { /* n n_img */ /* m,r st (n, m, r) = pf.shape, ie len(pf[i]) */ @@ -22,12 +23,7 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do int best_cl1, best_cl2, best_s; double dist, best_cl_dist; double p1, p2; - - double p1_realk, p1_imagk; - double p2conj_realk, p2conj_imagk; - double p2_realk, p2_imagk; - double shift_phases_real, shift_phases_imag; - + complex pfik, pfjk; best_s = -99999; best_cl1 = -1; @@ -41,18 +37,10 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do p2 = 0; /* inner most dim of dot (matmul) */ for(k=0; k ijk", pf, r_max, h) + # Tranpose `pf` for better (CUDA) memory access pattern, and cast as needed. + pf = cp.ascontiguousarray(pf.T, dtype=np.complex128) # Get kernel build_clmatrix_kernel = self.__gpu_module.get_function("build_clmatrix_kernel") @@ -351,16 +351,16 @@ def build_clmatrix_cu(self): (nblkx, nblky), (blkszx, blkszy), ( - pf.shape[0], + n_img, pf.shape[1], - pf.shape[2], - cp.ascontiguousarray(pf.T, dtype=np.complex128).view(np.float64), + r_max, + pf, clmatrix, cl_dist, shifts_1d, len(shifts), shifts, - shift_phases.astype(np.complex128, copy=False).view(np.float64), + shift_phases.astype(np.complex128, copy=False), ), ) From 6a4689a174201a1360ebead1085e6baca0cfbac1 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Tue, 20 Aug 2024 12:49:44 -0400 Subject: [PATCH 07/45] fixed shifts, dtype --- src/aspire/abinitio/commonline_base.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu index 2d159b87a6..a6bad78aea 100644 --- a/src/aspire/abinitio/commonline_base.cu +++ b/src/aspire/abinitio/commonline_base.cu @@ -1,7 +1,7 @@ #include extern "C" __global__ -void build_clmatrix_kernel(int n, int m, int r, const complex* __restrict__ pf, double* __restrict__ clmatrix, double* __restrict__ cl_dist, double* __restrict__ shifts_1d, int n_shifts, int* __restrict__ shifts, const complex* __restrict__ shift_phases) +void build_clmatrix_kernel(int n, int m, int r, const complex* __restrict__ pf, double* __restrict__ clmatrix, double* __restrict__ cl_dist, double* __restrict__ shifts_1d, int n_shifts, double* __restrict__ shifts, const complex* __restrict__ shift_phases) { /* n n_img */ /* m,r st (n, m, r) = pf.shape, ie len(pf[i]) */ From 6a0e2769a867d8ac0bfc295fb04541dc9a18ebf6 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Tue, 20 Aug 2024 13:57:26 -0400 Subject: [PATCH 08/45] revert some unused optimizations, fix minor casting/dtype issue, begin cleanup --- src/aspire/abinitio/commonline_base.py | 42 ++++++++++++-------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py index cf749e0f1f..2fbf12d8ea 100644 --- a/src/aspire/abinitio/commonline_base.py +++ b/src/aspire/abinitio/commonline_base.py @@ -153,22 +153,16 @@ def build_clmatrix(self): logger.info("Begin building Common Lines Matrix") - tic1 = perf_counter() - clmatrix_cu = self.build_clmatrix_cu() - tic2 = perf_counter() - print(f"Cuda CL build: {tic2-tic1}") - - tic3 = perf_counter() - clmatrix_orig = self.build_clmatrix_orig() - tic4 = perf_counter() - print(f"Orig CL build: {tic4-tic3}") - - np.testing.assert_allclose(clmatrix_cu[1], clmatrix_orig[1]) - # np.testing.assert_allclose( clmatrix_cu[0], clmatrix_orig[0]) + # host/gpu dispatch + if self.__gpu_module: + res = self.build_clmatrix_cu() + else: + res = self.build_clmatrix_host() - self.shifts_1d, self.clmatrix = clmatrix_cu + # Unpack result + self.shifts_1d, self.clmatrix = res - def build_clmatrix_orig(self): + def build_clmatrix_host(self): """ Build common-lines matrix from Fourier stack of 2D images """ @@ -217,11 +211,10 @@ def build_clmatrix_orig(self): shifts, shift_phases, h = self._generate_shift_phase_and_filter( r_max, max_shift, shift_step ) - shifts, shift_phases = xp.asnumpy(shifts), xp.asnumpy(shift_phases) # Apply bandpass filter, normalize each ray of each image # Note that only use half of each ray - pf = xp.asnumpy(self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h)) + pf = self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h) # Setup a progress bar _total_pairs_to_test = self.n_img * (self.n_check - 1) // 2 @@ -329,6 +322,9 @@ def build_clmatrix_cu(self): shifts, shift_phases, h = self._generate_shift_phase_and_filter( r_max, max_shift, shift_step ) + # Transfer to device, dtypes must match kernel header. + shifts = cp.asarray(shifts, dtype=np.float64) + shift_phases = cp.asarray(shift_phases, dtype=np.complex128) # Apply bandpass filter, normalize each ray of each image # Note that only use half of each ray @@ -360,7 +356,7 @@ def build_clmatrix_cu(self): shifts_1d, len(shifts), shifts, - shift_phases.astype(np.complex128, copy=False), + shift_phases, ), ) @@ -623,13 +619,13 @@ def _generate_shift_phase_and_filter(self, r_max, max_shift, shift_step): n_shifts = int(np.ceil(2 * max_shift / shift_step + 1)) # only half of ray, excluding the DC component. - rk = xp.arange(1, r_max + 1) + rk = np.arange(1, r_max + 1, dtype=self.dtype) # Generate all shift phases - shifts = -max_shift + shift_step * xp.arange(n_shifts) - shift_phases = xp.exp(xp.outer(shifts, -2 * xp.pi * 1j * rk / (2 * r_max + 1))) + shifts = -max_shift + shift_step * np.arange(n_shifts, dtype=self.dtype) + shift_phases = np.exp(np.outer(shifts, -2 * np.pi * 1j * rk / (2 * r_max + 1))) # Set filter for common-line detection - h = xp.sqrt(xp.abs(rk)) * xp.exp(-xp.square(rk) / (2 * (r_max / 4) ** 2)) + h = np.sqrt(np.abs(rk)) * np.exp(-np.square(rk) / (2 * (r_max / 4) ** 2)) return shifts, shift_phases, h @@ -691,12 +687,12 @@ def _apply_filter_and_norm(self, subscripts, pf, r_max, h): # Note if we'd rather not have the dtype and casting args, # we can control h.dtype instead. - pf = xp.einsum(subscripts, pf, h, dtype=pf.dtype) # casting="same_kind" + pf = np.einsum(subscripts, pf, h, dtype=pf.dtype) # This is a high pass filter, cutting out the lowest frequency # (DC has already been removed). pf[..., 0] = 0 - pf /= xp.linalg.norm(pf, axis=-1)[..., xp.newaxis] + pf /= np.linalg.norm(pf, axis=-1)[..., np.newaxis] return pf From 0930e69790ff377dc8c46d96c895f665b6c4fa9e Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Tue, 20 Aug 2024 14:26:18 -0400 Subject: [PATCH 09/45] tox check cleanup --- src/aspire/abinitio/commonline_base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py index 2fbf12d8ea..54ee26b290 100644 --- a/src/aspire/abinitio/commonline_base.py +++ b/src/aspire/abinitio/commonline_base.py @@ -1,7 +1,6 @@ import logging import math import os -from time import perf_counter import numpy as np import scipy.sparse as sparse @@ -280,7 +279,6 @@ def build_clmatrix_cu(self): import cupy as cp n_img = self.n_img - n_check = self.n_check if self.n_theta % 2 == 1: msg = "n_theta must be even" From 997da4517eb09f585d61304d765b1924a19e5650 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Mon, 26 Aug 2024 13:59:45 -0400 Subject: [PATCH 10/45] stashing stub --- src/aspire/abinitio/commonline_sync3n.cu | 17 +++++++++++++++++ src/aspire/abinitio/commonline_sync3n.py | 22 ++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index eeaee723b9..7ec1cbfd05 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -413,3 +413,20 @@ void triangle_scores_inner(int n, double* Rijs, int n_intervals, unsigned int* s return; }; + +extern "C" __global__ +void estimate_all_Rijs(int n,double* __restrict__ clmatrix) +{ + /* n n_img */ + + /* thread index (2d), represents "i" index, "j" index */ + unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int j = blockDim.y * blockIdx.y + threadIdx.y; + + /* no-op when out of bounds */ + if(i >= n) return; + if(j >= n) return; + /* no-op lower triangle */ + if(j <= i) return; + +} diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py index 6a7e5390d3..d6b12ef1d1 100644 --- a/src/aspire/abinitio/commonline_sync3n.py +++ b/src/aspire/abinitio/commonline_sync3n.py @@ -823,6 +823,28 @@ def _estimate_all_Rijs(self, clmatrix): :param clmatrix: Common lines matrix :return: Estimated rotations """ + # host/gpu dispatch + if self.__gpu_module: + res = self._estimate_all_Rijs_cupy(Rijs) + else: + res = self._estimate_all_Rijs_host(Rijs) + + return res + + def _estimate_all_Rijs_cupy(self, clmatrix): + import cupy as cp + + estimate_all_Rijs = self.__gpu_module.get_function("estimate_all_Rijs") + + return + + def _estimate_all_Rijs_host(self, clmatrix): + """ + Estimate Rijs using the voting method. + + :param clmatrix: Common lines matrix + :return: Estimated rotations + """ n_img = self.n_img n_theta = self.n_theta Rijs = np.zeros((len(self._pairs), 3, 3)) From 425d3ee187f86eb8a8b3beb58967c5493caad70d Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Wed, 4 Sep 2024 09:56:15 -0400 Subject: [PATCH 11/45] stashing stubs --- src/aspire/abinitio/commonline_sync3n.cu | 74 ++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index 7ec1cbfd05..2adfe80084 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -422,6 +422,14 @@ void estimate_all_Rijs(int n,double* __restrict__ clmatrix) /* thread index (2d), represents "i" index, "j" index */ unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; unsigned int j = blockDim.y * blockIdx.y + threadIdx.y; + int cl_diff1, cl_diff2, cl_diff3; + double theta1, theta2, theta3; + double c1, c2, c3; + double cond; + double cos_phi2; + double alpha; + double angles[3]; + double r[3][3]; /* no-op when out of bounds */ if(i >= n) return; @@ -429,4 +437,70 @@ void estimate_all_Rijs(int n,double* __restrict__ clmatrix) /* no-op lower triangle */ if(j <= i) return; + + // vote_ij creates good_k list per (i,j) + + // We shouldn't need this... confirm and rm later. + if(clmatrix[i*n + j] == -1) return; + + int cl_idx12 = clmatrix[i*n + j]; + int cl_idx21 = clmatrix[j*n + i]; + + /* Assume that k_list starts as [0,n] */ + for(k=0; k1){ + cos_phi2 = 1; + } + if(cos_phi2 <-1){ + cos_phi2 = -1; + } + + // // _rotratio_eulerangle_vec loops over good_k list per (i,j) + // alpha = arccos(cos_phi2); + // // convert Euler angle to rotation matrix + // angles[0] = cl_idx12 * 2 * pi / n + pi / 2; + // angles[1] = alpha; + // angles[2] = -pi / 2 - cl_idx21 * 2 * pi / n; + // //r = from_euler(angles); + + + + + } From 5b0d7ccb9fec1351638db28672c0680b9971f2e1 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Wed, 4 Sep 2024 16:02:11 -0400 Subject: [PATCH 12/45] stashing init kernel port --- src/aspire/abinitio/commonline_sync3n.cu | 109 +++++++++++++++++++---- 1 file changed, 91 insertions(+), 18 deletions(-) diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index 2adfe80084..8e83aee9aa 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -1,7 +1,28 @@ +# define M_PI 3.14159265358979323846 /* pi */ /* from i,j indices to the common index in the N-choose-2 sized array */ #define PAIR_IDX(N,I,J) ((2*N-I-1)*I/2 + J-I-1) +/* convert euler angles (a,b,c) in ZYZ to rotation matrix r */ +inline void ang2orth(double* r, double a, double b, double c){ + double sa = sin(a); + double sb = sin(b); + double sc = sin(c); + double ca = cos(a); + double cb = cos(b); + double cc = cos(c); + + r[0] = ca*cb*cc - sa*sc; + r[1] = -ca*cb*cc - sa*sc; + r[2] = ca*sb; + r[3] = sa*cb*cc + ca*sc; + r[4] = -sa*cb*cc + ca*sc; + r[5] = sa*sb; + r[6] = -sb*cc; + r[7] = sb*sc; + r[8] = cb; +} + inline void mult_3x3(double *out, double *R1, double *R2) { /* 3X3 matrices multiplication: out = R1*R2 @@ -415,21 +436,24 @@ void triangle_scores_inner(int n, double* Rijs, int n_intervals, unsigned int* s }; extern "C" __global__ -void estimate_all_Rijs(int n,double* __restrict__ clmatrix) +void estimate_all_Rijs(int n, int n_theta, double* __restrict__ clmatrix, double* __restrict__ hist, int* __restrict__ kmap, double* rotations) { + // try toget kmap as uint16_t /* n n_img */ /* thread index (2d), represents "i" index, "j" index */ unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; unsigned int j = blockDim.y * blockIdx.y + threadIdx.y; + int k; + int kk; int cl_diff1, cl_diff2, cl_diff3; double theta1, theta2, theta3; double c1, c2, c3; double cond; double cos_phi2; - double alpha; double angles[3]; - double r[3][3]; + double r[9]; + int cnt; /* no-op when out of bounds */ if(i >= n) return; @@ -445,7 +469,16 @@ void estimate_all_Rijs(int n,double* __restrict__ clmatrix) int cl_idx12 = clmatrix[i*n + j]; int cl_idx21 = clmatrix[j*n + i]; - + int cl_idx13, cl_idx31; + int cl_idx23, cl_idx32; + const int ntics = 60; + const double sigma = 3.0; + double ga; + double angle; + int b; + int peak_idx; + double peak; + /* Assume that k_list starts as [0,n] */ for(k=0; k peak){ + peak = hist[PAIR_IDX(n,i,j)*ntics + b]; + peak_idx = b; + } + } + /* find mean of rotations */ + // initialize + cnt = 0; + // _rotratio_eulerangle_vec loops over good_k list per (i,j) + // find satisfying indices + for(k=0; k Date: Tue, 17 Sep 2024 11:58:39 -0400 Subject: [PATCH 13/45] update kernel with some of the angs work --- src/aspire/abinitio/commonline_sync3n.cu | 123 +++++++++++++++-------- 1 file changed, 82 insertions(+), 41 deletions(-) diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index 8e83aee9aa..71b2cdc365 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -20,9 +20,9 @@ inline void ang2orth(double* r, double a, double b, double c){ r[5] = sa*sb; r[6] = -sb*cc; r[7] = sb*sc; - r[8] = cb; + r[8] = cb; } - + inline void mult_3x3(double *out, double *R1, double *R2) { /* 3X3 matrices multiplication: out = R1*R2 @@ -436,7 +436,16 @@ void triangle_scores_inner(int n, double* Rijs, int n_intervals, unsigned int* s }; extern "C" __global__ -void estimate_all_Rijs(int n, int n_theta, double* __restrict__ clmatrix, double* __restrict__ hist, int* __restrict__ kmap, double* rotations) +void estimate_all_Rijs(int n, + int n_theta, + double hist_bin_width, + int full_width, + double sigma, + int sync, + double* __restrict__ clmatrix, + double* __restrict__ hist, + int* __restrict__ kmap, + double* rotations) { // try toget kmap as uint16_t /* n n_img */ @@ -445,14 +454,12 @@ void estimate_all_Rijs(int n, int n_theta, double* __restrict__ clmatrix, double unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; unsigned int j = blockDim.y * blockIdx.y + threadIdx.y; int k; - int kk; int cl_diff1, cl_diff2, cl_diff3; double theta1, theta2, theta3; double c1, c2, c3; double cond; double cos_phi2; double angles[3]; - double r[9]; int cnt; /* no-op when out of bounds */ @@ -471,14 +478,15 @@ void estimate_all_Rijs(int n, int n_theta, double* __restrict__ clmatrix, double int cl_idx21 = clmatrix[j*n + i]; int cl_idx13, cl_idx31; int cl_idx23, cl_idx32; - const int ntics = 60; - const double sigma = 3.0; + int ntics = 180 / hist_bin_width; + const double TOL_idx = 1e-12; + bool ind1, ind2; double ga; double angle; int b; int peak_idx; double peak; - + /* Assume that k_list starts as [0,n] */ for(k=0; k (M_PI + TOL_idx)) | ( + (theta1 < -TOL_idx) & (theta1 > -M_PI) + ); + ind2 = (theta2 > (M_PI + TOL_idx)) | ( + (theta2 < -TOL_idx) & (theta2 > -M_PI) + ); + if( (ind1 & ~ind2) | (~ind1 & ind2)){ + /* Apply sync */ + cos_phi2 = -cos_phi2; + } + + } + else{ + + cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2)); + } + //end _get_cos_phis // clip [-1,1] @@ -523,19 +556,29 @@ void estimate_all_Rijs(int n, int n_theta, double* __restrict__ clmatrix, double if(cos_phi2 <-1){ cos_phi2 = -1; } - + /* compute histogram contribution and index map */ angle = acos(cos_phi2) * 180. / M_PI; // angle's bin kmap[PAIR_IDX(n,i,j)*n + k] = angle / ntics; for(b=0; b Date: Tue, 17 Sep 2024 12:22:28 -0400 Subject: [PATCH 14/45] start populating eastimate_Rijs kernel call (stash) --- src/aspire/abinitio/commonline_sync3n.py | 69 ++++++++++++++++++++---- 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py index d6b12ef1d1..f9755257da 100644 --- a/src/aspire/abinitio/commonline_sync3n.py +++ b/src/aspire/abinitio/commonline_sync3n.py @@ -824,19 +824,70 @@ def _estimate_all_Rijs(self, clmatrix): :return: Estimated rotations """ # host/gpu dispatch - if self.__gpu_module: - res = self._estimate_all_Rijs_cupy(Rijs) - else: - res = self._estimate_all_Rijs_host(Rijs) - + # if self.__gpu_module: + # res = self._estimate_all_Rijs_cupy(Rijs) + # else: + # res = self._estimate_all_Rijs_host(Rijs) + res = self._estimate_all_Rijs_cupy(clmatrix) + res_host = self._estimate_all_Rijs_host(clmatrix) + if not np.allclose(res, res_host): + breakpoint() + return res - + def _estimate_all_Rijs_cupy(self, clmatrix): import cupy as cp estimate_all_Rijs = self.__gpu_module.get_function("estimate_all_Rijs") - - return + + full_width = self.full_width + if str(full_width).lower() == "adaptive": + full_width = -1 + + sigma = 3.0 + sync = 1 + + # transfer input to device + clmatrix = cp.asarray(self.clmatrix, dtype=int) + + # workspace arrays + ntics = int(180 / self.hist_bin_width) + n_pairs = (self.n_img * (self.n_img - 1)) // 2 + hist = cp.zeros((n_pairs, ntics), dtype=np.float64) + # kmap stores the mapping of k indices to histogram bins + kmap = cp.zeros( + (n_pairs, self.n_img), dtype=int + ) # note, high memory , probably need to change... + rotations = cp.empty((n_pairs, 3, 3), dtype=np.float64) + + # Configure grid of blocks + blkszx = 32 + nblkx = (self.n_img + blkszx - 1) // blkszx + blkszy = 32 + nblky = (self.n_img + blkszy - 1) // blkszy + + logger.info("Launching `estimate_all_Rijs` kernel.") + estimate_all_Rijs( + (nblkx, nblky), + (blkszx, blkszy), + ( + self.n_img, + self.n_theta, + self.hist_bin_width, # converted to double + full_width, + sigma, + sync, + clmatrix, # input + hist, # tmp + kmap, # tmp + rotations, # output + ), + ) + + # transfer results device to host + rotations = rotations.get() + + return rotations def _estimate_all_Rijs_host(self, clmatrix): """ @@ -844,7 +895,7 @@ def _estimate_all_Rijs_host(self, clmatrix): :param clmatrix: Common lines matrix :return: Estimated rotations - """ + """ n_img = self.n_img n_theta = self.n_theta Rijs = np.zeros((len(self._pairs), 3, 3)) From 41665b7670de18ce58e8fec4920fad94fe2a4ad8 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Tue, 17 Sep 2024 15:06:56 -0400 Subject: [PATCH 15/45] breakout angles and add angles map (stash) --- src/aspire/abinitio/commonline_sync3n.cu | 71 +++++++++++++++--------- src/aspire/abinitio/commonline_sync3n.py | 55 +++++++++++++----- 2 files changed, 87 insertions(+), 39 deletions(-) diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index 71b2cdc365..fa8b9261e3 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -436,18 +436,19 @@ void triangle_scores_inner(int n, double* Rijs, int n_intervals, unsigned int* s }; extern "C" __global__ -void estimate_all_Rijs(int n, - int n_theta, - double hist_bin_width, - int full_width, - double sigma, - int sync, - double* __restrict__ clmatrix, - double* __restrict__ hist, - int* __restrict__ kmap, - double* rotations) +void estimate_all_angles(int n, + int n_theta, + double hist_bin_width, + int full_width, + double sigma, + int sync, + int* __restrict__ clmatrix, + double* __restrict__ hist, + int* __restrict__ k_map, + double* __restrict__ angles_map, + double* __restrict__ angles) { - // try toget kmap as uint16_t + // try toget k_map as uint16_t /* n n_img */ /* thread index (2d), represents "i" index, "j" index */ @@ -459,7 +460,6 @@ void estimate_all_Rijs(int n, double c1, c2, c3; double cond; double cos_phi2; - double angles[3]; int cnt; /* no-op when out of bounds */ @@ -470,6 +470,8 @@ void estimate_all_Rijs(int n, // vote_ij creates good_k list per (i,j) + const int pair_idx = PAIR_IDX(n,i,j); + int map_idx; /* tmp index var */ // We shouldn't need this... confirm and rm later. if(clmatrix[i*n + j] == -1) return; @@ -560,12 +562,14 @@ void estimate_all_Rijs(int n, /* compute histogram contribution and index map */ angle = acos(cos_phi2) * 180. / M_PI; // angle's bin - kmap[PAIR_IDX(n,i,j)*n + k] = angle / ntics; + map_idx = pair_idx*n + k; + k_map[map_idx] = angle / ntics; + angles_map[map_idx] = angle; for(b=0; b peak){ - peak = hist[PAIR_IDX(n,i,j)*ntics + b]; + map_idx = pair_idx*ntics + b; + if(hist[map_idx] > peak){ + peak = hist[map_idx]; peak_idx = b; } } /* find mean of rotations */ // initialize - angles[0] = cl_idx12 * 2 * M_PI / n + M_PI / 2; - angles[1] = 0; - angles[2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n; + map_idx = pair_idx*3; + angles[map_idx ] = cl_idx12 * 2 * M_PI / n + M_PI / 2; + angles[map_idx + 1] = 0; + angles[map_idx + 2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n; cnt = 0; if(full_width!=-1){ @@ -601,9 +607,10 @@ void estimate_all_Rijs(int n, // find satisfying indices for(k=0; k= n) return; + + ang2orth(&(rotations[i*9]), angles[i*3],angles[i*3+1],angles[i*3+2]); + +} diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py index f9755257da..835788bc5a 100644 --- a/src/aspire/abinitio/commonline_sync3n.py +++ b/src/aspire/abinitio/commonline_sync3n.py @@ -830,15 +830,16 @@ def _estimate_all_Rijs(self, clmatrix): # res = self._estimate_all_Rijs_host(Rijs) res = self._estimate_all_Rijs_cupy(clmatrix) res_host = self._estimate_all_Rijs_host(clmatrix) - if not np.allclose(res, res_host): - breakpoint() + # if not np.allclose(res, res_host): + # breakpoint() return res def _estimate_all_Rijs_cupy(self, clmatrix): import cupy as cp - estimate_all_Rijs = self.__gpu_module.get_function("estimate_all_Rijs") + estimate_all_angles = self.__gpu_module.get_function("estimate_all_angles") + angles_to_rots = self.__gpu_module.get_function("angles_to_rots") full_width = self.full_width if str(full_width).lower() == "adaptive": @@ -854,11 +855,14 @@ def _estimate_all_Rijs_cupy(self, clmatrix): ntics = int(180 / self.hist_bin_width) n_pairs = (self.n_img * (self.n_img - 1)) // 2 hist = cp.zeros((n_pairs, ntics), dtype=np.float64) - # kmap stores the mapping of k indices to histogram bins - kmap = cp.zeros( + # k_map stores the mapping of k indices to histogram bins + k_map = cp.empty( (n_pairs, self.n_img), dtype=int ) # note, high memory , probably need to change... - rotations = cp.empty((n_pairs, 3, 3), dtype=np.float64) + angles_map = cp.empty( + (n_pairs, self.n_img), dtype=int + ) # note, high memory , probably need to change... + angles = cp.empty((n_pairs, 3), dtype=np.float64) # Configure grid of blocks blkszx = 32 @@ -866,21 +870,44 @@ def _estimate_all_Rijs_cupy(self, clmatrix): blkszy = 32 nblky = (self.n_img + blkszy - 1) // blkszy - logger.info("Launching `estimate_all_Rijs` kernel.") - estimate_all_Rijs( + logger.info("Launching `estimate_all_angles` kernel.") + estimate_all_angles( (nblkx, nblky), (blkszx, blkszy), ( self.n_img, self.n_theta, - self.hist_bin_width, # converted to double - full_width, - sigma, - sync, + np.float64(self.hist_bin_width), + int(full_width), + np.float64(sigma), + int(sync), clmatrix, # input hist, # tmp - kmap, # tmp - rotations, # output + k_map, # tmp + angles_map, # tmp + angles, # output + ), + ) + + # no longer need workspace vars + del hist + del k_map + del angles_map + + # convert angles to rots + rotations = cp.empty((n_pairs, 3, 3), dtype=np.float64) + + blkszx = 1024 + nblkx = (n_pairs + blkszx - 1) // blkszx + + logger.info("Launching `angles_to_rots` kernel.") + angles_to_rots( + (nblkx,), + (blkszx,), + ( + n_pairs, + angles, + rotations, ), ) From be329599961503fa9f54cf5e0d6b620e237f4a58 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Tue, 17 Sep 2024 17:05:08 -0400 Subject: [PATCH 16/45] pair_idx doesn't include diag --- src/aspire/abinitio/commonline_sync3n.cu | 59 +++++++++++++----------- src/aspire/abinitio/commonline_sync3n.py | 21 ++++++--- tests/test_commonline_sync3n.py | 4 +- 3 files changed, 51 insertions(+), 33 deletions(-) diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index fa8b9261e3..bc84ce71eb 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -1,8 +1,10 @@ # define M_PI 3.14159265358979323846 /* pi */ /* from i,j indices to the common index in the N-choose-2 sized array */ +// careful, this is stricly the upper triangle! #define PAIR_IDX(N,I,J) ((2*N-I-1)*I/2 + J-I-1) + /* convert euler angles (a,b,c) in ZYZ to rotation matrix r */ inline void ang2orth(double* r, double a, double b, double c){ double sa = sin(a); @@ -452,8 +454,8 @@ void estimate_all_angles(int n, /* n n_img */ /* thread index (2d), represents "i" index, "j" index */ - unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int j = blockDim.y * blockIdx.y + threadIdx.y; + const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; + const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y; int k; int cl_diff1, cl_diff2, cl_diff3; double theta1, theta2, theta3; @@ -470,14 +472,15 @@ void estimate_all_angles(int n, // vote_ij creates good_k list per (i,j) - const int pair_idx = PAIR_IDX(n,i,j); + const int pair_idx = PAIR_IDXD(n,i,j); int map_idx; /* tmp index var */ // We shouldn't need this... confirm and rm later. + //xxx if(clmatrix[i*n + j] == -1) return; - int cl_idx12 = clmatrix[i*n + j]; - int cl_idx21 = clmatrix[j*n + i]; + const int cl_idx12 = clmatrix[i*n + j]; + const int cl_idx21 = clmatrix[j*n + i]; int cl_idx13, cl_idx31; int cl_idx23, cl_idx32; int ntics = 180 / hist_bin_width; @@ -498,10 +501,10 @@ void estimate_all_angles(int n, cl_idx32 = clmatrix[k*n + j]; // test `k` values - if(k==i) return; - if(cl_idx13 == -1) return; // i, k - if(cl_idx31 == -1) return; // k, i - if(cl_idx23 == -1) return; // j, k + if(k==i) continue; + if(cl_idx13 == -1) continue; // i, k + if(cl_idx31 == -1) continue; // k, i + if(cl_idx23 == -1) continue; // j, k // if(cl_idx32 == -1) return; // k, j // self._get_cos_phis(cl_diff1, cl_diff2, cl_diff3, n_theta) @@ -519,12 +522,12 @@ void estimate_all_angles(int n, // test if we have a good_idx cond = 1 + 2 * c1 * c2 * c3 - (c1*c1 + c2*c2 + c3*c3); - if(cond < 1e-5) return; + if(cond <= 1e-5) continue; // this value of k is not good /* Calculated cos values of angle between i and j images */ if( sync == 1){ - cos_phi2 = (c3 - c1 * c2) / (sqrt(1 - c1*c1) * sqrt(1 - c2 *c2)); + cos_phi2 = (c3 - c1*c2) / (sqrt(1 - c1*c1) * sqrt(1 - c2*c2)); /* Some synchronization must be applied when common line is out by 180 degrees. Here fix the angles between c_ij(c_ji) and c_ik(c_jk) to be smaller than pi/2, @@ -535,21 +538,19 @@ void estimate_all_angles(int n, ind1 = (theta1 > (M_PI + TOL_idx)) | ( (theta1 < -TOL_idx) & (theta1 > -M_PI) ); - ind2 = (theta2 > (M_PI + TOL_idx)) | ( - (theta2 < -TOL_idx) & (theta2 > -M_PI) + ind2 = (theta2 > (M_PI + TOL_idx)) || ( + (theta2 < -TOL_idx) && (theta2 > -M_PI) ); - if( (ind1 & ~ind2) | (~ind1 & ind2)){ + if( (ind1 && !ind2) || (!ind1 && ind2)){ /* Apply sync */ cos_phi2 = -cos_phi2; } - } + } // end sync else{ cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2)); - } - - //end _get_cos_phis + } // end not sync // clip [-1,1] if(cos_phi2 >1){ @@ -564,7 +565,7 @@ void estimate_all_angles(int n, // angle's bin map_idx = pair_idx*n + k; k_map[map_idx] = angle / ntics; - angles_map[map_idx] = angle; + angles_map[map_idx] = angle; // degree for(b=0; b Date: Tue, 17 Sep 2024 23:39:45 -0400 Subject: [PATCH 17/45] angles matching (Stash), bug in angles to rot func --- src/aspire/abinitio/commonline_sync3n.cu | 50 +++++++++++------------- src/aspire/abinitio/commonline_sync3n.py | 25 ++++++------ 2 files changed, 35 insertions(+), 40 deletions(-) diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index bc84ce71eb..82cf859222 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -7,21 +7,22 @@ /* convert euler angles (a,b,c) in ZYZ to rotation matrix r */ inline void ang2orth(double* r, double a, double b, double c){ - double sa = sin(a); + double sa = sin(c); double sb = sin(b); double sc = sin(c); double ca = cos(a); double cb = cos(b); double cc = cos(c); + // bugs here r[0] = ca*cb*cc - sa*sc; - r[1] = -ca*cb*cc - sa*sc; - r[2] = ca*sb; - r[3] = sa*cb*cc + ca*sc; - r[4] = -sa*cb*cc + ca*sc; - r[5] = sa*sb; - r[6] = -sb*cc; - r[7] = sb*sc; + r[1] = sa*cb*cc + ca*sc; + r[2] = -sb*cc; + r[3] = -ca*cb*sc - sa*cc; + r[4] = -sa*cb*sc + ca*cc; + r[5] = sb*sc; + r[6] = ca*sb; + r[7] = sa*sb; r[8] = cb; } @@ -472,18 +473,18 @@ void estimate_all_angles(int n, // vote_ij creates good_k list per (i,j) - const int pair_idx = PAIR_IDXD(n,i,j); + const int pair_idx = PAIR_IDX(n,i,j); int map_idx; /* tmp index var */ // We shouldn't need this... confirm and rm later. //xxx - if(clmatrix[i*n + j] == -1) return; + // if(clmatrix[i*n + j] == -1) return; - const int cl_idx12 = clmatrix[i*n + j]; - const int cl_idx21 = clmatrix[j*n + i]; + int cl_idx12 = clmatrix[i*n + j]; + int cl_idx21 = clmatrix[j*n + i]; int cl_idx13, cl_idx31; int cl_idx23, cl_idx32; - int ntics = 180 / hist_bin_width; + int ntics = 180. / hist_bin_width; const double TOL_idx = 1e-12; bool ind1, ind2; double ga; @@ -503,7 +504,7 @@ void estimate_all_angles(int n, // test `k` values if(k==i) continue; if(cl_idx13 == -1) continue; // i, k - if(cl_idx31 == -1) continue; // k, i + //if(cl_idx31 == -1) continue; // k, i if(cl_idx23 == -1) continue; // j, k // if(cl_idx32 == -1) return; // k, j @@ -522,7 +523,7 @@ void estimate_all_angles(int n, // test if we have a good_idx cond = 1 + 2 * c1 * c2 * c3 - (c1*c1 + c2*c2 + c3*c3); - if(cond <= 1e-5) continue; // this value of k is not good + if(cond <= 1e-5) continue; // that value of k is not good, skip /* Calculated cos values of angle between i and j images */ if( sync == 1){ @@ -540,7 +541,7 @@ void estimate_all_angles(int n, ); ind2 = (theta2 > (M_PI + TOL_idx)) || ( (theta2 < -TOL_idx) && (theta2 > -M_PI) - ); + ); if( (ind1 && !ind2) || (!ind1 && ind2)){ /* Apply sync */ cos_phi2 = -cos_phi2; @@ -548,7 +549,6 @@ void estimate_all_angles(int n, } // end sync else{ - cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2)); } // end not sync @@ -564,14 +564,14 @@ void estimate_all_angles(int n, angle = acos(cos_phi2) * 180. / M_PI; // angle's bin map_idx = pair_idx*n + k; - k_map[map_idx] = angle / ntics; + k_map[map_idx] = angle / hist_bin_width; //check angles_map[map_idx] = angle; // degree for(b=0; b= n) return; - ang2orth(&(rotations[i*9]), angles[i*3],angles[i*3+1],angles[i*3+2]); + ang2orth(&(rotations[i*9]), angles[i*3], angles[i*3+1], angles[i*3+2]); } diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py index c753426bdc..d28315ea71 100644 --- a/src/aspire/abinitio/commonline_sync3n.py +++ b/src/aspire/abinitio/commonline_sync3n.py @@ -831,8 +831,7 @@ def _estimate_all_Rijs(self, clmatrix): print(f"clmatrix {clmatrix}") res = self._estimate_all_Rijs_cupy(clmatrix) res_host = self._estimate_all_Rijs_host(clmatrix) - # if not np.allclose(res, res_host): - # breakpoint() + breakpoint() return res @@ -846,22 +845,21 @@ def _estimate_all_Rijs_cupy(self, clmatrix): if str(full_width).lower() == "adaptive": full_width = -1 raise - #XXX debug (check rest of kernel works before porting adaptive) - + # XXX debug (check rest of kernel works before porting adaptive) sigma = 3.0 sync = 1 # transfer input to device - clmatrix = cp.asarray(self.clmatrix, dtype=int) + clmatrix = cp.asarray(clmatrix, dtype=np.int32) # workspace arrays ntics = int(180 / self.hist_bin_width) - n_pairs = (self.n_img * (self.n_img - 1)) // 2 + n_pairs = self.n_img * (self.n_img - 1) // 2 hist = cp.zeros((n_pairs, ntics), dtype=np.float64) # k_map stores the mapping of k indices to histogram bins k_map = cp.zeros( - (n_pairs, self.n_img), dtype=int + (n_pairs, self.n_img), dtype=np.int32 ) # note, high memory , probably need to change... angles_map = cp.zeros( (n_pairs, self.n_img), dtype=np.float64 @@ -879,12 +877,12 @@ def _estimate_all_Rijs_cupy(self, clmatrix): (nblkx, nblky), (blkszx, blkszy), ( - int(self.n_img), - int(self.n_theta), + self.n_img, + self.n_theta, np.float64(self.hist_bin_width), - int(full_width), + full_width, np.float64(sigma), - int(sync), + sync, clmatrix, # input hist, # tmp k_map, # tmp @@ -899,6 +897,7 @@ def _estimate_all_Rijs_cupy(self, clmatrix): del angles_map print(f"First three angles\n{angles.get()[:3]}") + np.save("cupy_angles.npy", angles.get()) # convert angles to rots rotations = cp.empty((n_pairs, 3, 3), dtype=np.float64) @@ -964,8 +963,8 @@ def _syncmatrix_ij_vote_3n(self, clmatrix, i, j, k_list, n_theta): angles[2] = -np.pi / 2 - clmatrix[j, i] * 2 * np.pi / n_theta rot = Rotation.from_euler(angles).matrices - if i == 0 and j<=3: - print(f'host angles {i} {j} {angles}') + if i == 0 and j <= 3: + print(f"host angles {i} {j} {angles}") else: # This is for the case that images i and j correspond to the same From 558fddc3aa7418380aa421740b900b65dc4c4fed Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Wed, 18 Sep 2024 10:00:04 -0400 Subject: [PATCH 18/45] fixed zyz angles conversion --- src/aspire/abinitio/commonline_sync3n.cu | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index 82cf859222..297b5dbb6d 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -7,22 +7,23 @@ /* convert euler angles (a,b,c) in ZYZ to rotation matrix r */ inline void ang2orth(double* r, double a, double b, double c){ - double sa = sin(c); + double sa = sin(a); double sb = sin(b); double sc = sin(c); double ca = cos(a); double cb = cos(b); double cc = cos(c); - // bugs here + // ZYZ Proper Euler angles + // https://en.wikipedia.org/wiki/Euler_angles#Rotation_matrix r[0] = ca*cb*cc - sa*sc; - r[1] = sa*cb*cc + ca*sc; - r[2] = -sb*cc; - r[3] = -ca*cb*sc - sa*cc; - r[4] = -sa*cb*sc + ca*cc; - r[5] = sb*sc; - r[6] = ca*sb; - r[7] = sa*sb; + r[1] = -cc*sa -ca*cb*sc; + r[2] = ca*sb; + r[3] = ca*sc + cb*cc*sa; + r[4] = ca*cc - cb*sa*sc; + r[5] = sa*sb; + r[6] = -cc*sb; + r[7] = sb*sc; r[8] = cb; } From cd26b1a944cc0ead45d4fe7af428f11724bbf655 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Wed, 18 Sep 2024 10:01:46 -0400 Subject: [PATCH 19/45] remove dbg prints --- src/aspire/abinitio/commonline_sync3n.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py index d28315ea71..e8736533fc 100644 --- a/src/aspire/abinitio/commonline_sync3n.py +++ b/src/aspire/abinitio/commonline_sync3n.py @@ -828,10 +828,8 @@ def _estimate_all_Rijs(self, clmatrix): # res = self._estimate_all_Rijs_cupy(Rijs) # else: # res = self._estimate_all_Rijs_host(Rijs) - print(f"clmatrix {clmatrix}") res = self._estimate_all_Rijs_cupy(clmatrix) res_host = self._estimate_all_Rijs_host(clmatrix) - breakpoint() return res @@ -896,9 +894,6 @@ def _estimate_all_Rijs_cupy(self, clmatrix): del k_map del angles_map - print(f"First three angles\n{angles.get()[:3]}") - np.save("cupy_angles.npy", angles.get()) - # convert angles to rots rotations = cp.empty((n_pairs, 3, 3), dtype=np.float64) @@ -963,9 +958,6 @@ def _syncmatrix_ij_vote_3n(self, clmatrix, i, j, k_list, n_theta): angles[2] = -np.pi / 2 - clmatrix[j, i] * 2 * np.pi / n_theta rot = Rotation.from_euler(angles).matrices - if i == 0 and j <= 3: - print(f"host angles {i} {j} {angles}") - else: # This is for the case that images i and j correspond to the same # viewing direction and differ only by in-plane rotation. From 9e7a3b460b4d220a3c25123383b969b95c8e96cf Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Wed, 18 Sep 2024 10:10:36 -0400 Subject: [PATCH 20/45] add adaptive width to kernel --- src/aspire/abinitio/commonline_base.py | 4 +++- src/aspire/abinitio/commonline_sync3n.cu | 28 +++++++++++++++++++----- src/aspire/abinitio/commonline_sync3n.py | 8 +------ 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py index 54ee26b290..c5d99e96b7 100644 --- a/src/aspire/abinitio/commonline_base.py +++ b/src/aspire/abinitio/commonline_base.py @@ -62,7 +62,9 @@ def __init__( self.n_theta = n_theta self.n_check = n_check self.hist_bin_width = hist_bin_width - self.full_width = full_width + if str(full_width).lower() == "adaptive": + full_width = -1 + self.full_width = int(full_width) self.clmatrix = None self.max_shift = math.ceil(max_shift * self.n_res) self.shift_step = shift_step diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index 297b5dbb6d..c40dd32c84 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -465,6 +465,8 @@ void estimate_all_angles(int n, double cond; double cos_phi2; int cnt; + double w_theta_need; + /* no-op when out of bounds */ if(i >= n) return; @@ -605,7 +607,23 @@ void estimate_all_angles(int n, cnt = 0; - if(full_width!=-1){ + if(full_width==-1){ + /* adaptive width*/ + w_theta_need = 0; + cnt = 0; + while(cnt == 0){ + w_theta_need += hist_bin_width; + for(k=0; k Date: Thu, 19 Sep 2024 07:57:18 -0400 Subject: [PATCH 21/45] 1d rij kernel --- src/aspire/abinitio/commonline_sync3n.cu | 291 ++++++++++++----------- src/aspire/abinitio/commonline_sync3n.py | 26 +- 2 files changed, 162 insertions(+), 155 deletions(-) diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index c40dd32c84..ec24101546 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -455,10 +455,10 @@ void estimate_all_angles(int n, // try toget k_map as uint16_t /* n n_img */ - /* thread index (2d), represents "i" index, "j" index */ + /* thread index (1d), represents "i" index */ const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; - const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y; - int k; + + int j, k; int cl_diff1, cl_diff2, cl_diff3; double theta1, theta2, theta3; double c1, c2, c3; @@ -466,25 +466,15 @@ void estimate_all_angles(int n, double cos_phi2; int cnt; double w_theta_need; - /* no-op when out of bounds */ if(i >= n) return; - if(j >= n) return; - /* no-op lower triangle */ - if(j <= i) return; - // vote_ij creates good_k list per (i,j) - const int pair_idx = PAIR_IDX(n,i,j); + int pair_idx; int map_idx; /* tmp index var */ - // We shouldn't need this... confirm and rm later. - //xxx - // if(clmatrix[i*n + j] == -1) return; - - int cl_idx12 = clmatrix[i*n + j]; - int cl_idx21 = clmatrix[j*n + i]; + int cl_idx12, cl_idx21; int cl_idx13, cl_idx31; int cl_idx23, cl_idx32; int ntics = 180. / hist_bin_width; @@ -496,151 +486,166 @@ void estimate_all_angles(int n, int peak_idx; double peak; - /* Assume that k_list starts as [0,n] */ - for(k=0; k (M_PI + TOL_idx)) | ( - (theta1 < -TOL_idx) & (theta1 > -M_PI) - ); - ind2 = (theta2 > (M_PI + TOL_idx)) || ( - (theta2 < -TOL_idx) && (theta2 > -M_PI) - ); - if( (ind1 && !ind2) || (!ind1 && ind2)){ - /* Apply sync */ - cos_phi2 = -cos_phi2; - } - - } // end sync - else{ - cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2)); - } // end not sync + for(j=i+1; j1){ - cos_phi2 = 1; + /* initialize */ + for(k=0; k (M_PI + TOL_idx)) | ( + (theta1 < -TOL_idx) & (theta1 > -M_PI) + ); + ind2 = (theta2 > (M_PI + TOL_idx)) || ( + (theta2 < -TOL_idx) && (theta2 > -M_PI) + ); + if( (ind1 && !ind2) || (!ind1 && ind2)){ + /* Apply sync */ + cos_phi2 = -cos_phi2; + } + + } // end sync + else{ + cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2)); + } // end not sync + + // clip [-1,1] + if(cos_phi2 >1){ + cos_phi2 = 1; + } + if(cos_phi2 <-1){ + cos_phi2 = -1; + } + + /* compute histogram contribution and index map */ + angle = acos(cos_phi2) * 180. / M_PI; + // angle's bin + map_idx = i*n + k; + k_map[map_idx] = angle / hist_bin_width; //check + angles_map[map_idx] = angle; // degree + for(b=0; b peak){ - peak = hist[map_idx]; - peak_idx = b; + map_idx = i*ntics + b; + if(hist[map_idx] > peak){ + peak = hist[map_idx]; + peak_idx = b; + } } - } - /* find mean of rotations */ - // initialize - map_idx = pair_idx*3; - angles[map_idx ] = cl_idx12 * 2 * M_PI / n_theta + M_PI / 2; - angles[map_idx + 1] = 0; - angles[map_idx + 2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n_theta; + /* find mean of rotations */ + // initialize + map_idx = pair_idx*3; + angles[map_idx ] = cl_idx12 * 2 * M_PI / n_theta + M_PI / 2; + angles[map_idx + 1] = 0; + angles[map_idx + 2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n_theta; - cnt = 0; - - if(full_width==-1){ - /* adaptive width*/ - w_theta_need = 0; cnt = 0; - while(cnt == 0){ - w_theta_need += hist_bin_width; + + if(full_width==-1){ + /* adaptive width*/ + w_theta_need = 0; + cnt = 0; + while(cnt == 0){ + w_theta_need += hist_bin_width; + for(k=0; k Date: Thu, 19 Sep 2024 08:19:12 -0400 Subject: [PATCH 22/45] implement nvcc backend and int16_t --- src/aspire/abinitio/commonline_base.cu | 3 ++- src/aspire/abinitio/commonline_base.py | 10 ++++++++-- src/aspire/abinitio/commonline_sync3n.cu | 10 ++++++++-- src/aspire/abinitio/commonline_sync3n.py | 6 +++--- 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu index a6bad78aea..eb493acf44 100644 --- a/src/aspire/abinitio/commonline_base.cu +++ b/src/aspire/abinitio/commonline_base.cu @@ -1,7 +1,8 @@ +#include #include extern "C" __global__ -void build_clmatrix_kernel(int n, int m, int r, const complex* __restrict__ pf, double* __restrict__ clmatrix, double* __restrict__ cl_dist, double* __restrict__ shifts_1d, int n_shifts, double* __restrict__ shifts, const complex* __restrict__ shift_phases) +void build_clmatrix_kernel(int n, int m, int r, const complex* __restrict__ pf, int16_t* __restrict__ clmatrix, double* __restrict__ cl_dist, double* __restrict__ shifts_1d, int n_shifts, double* __restrict__ shifts, const complex* __restrict__ shift_phases) { /* n n_img */ /* m,r st (n, m, r) = pf.shape, ie len(pf[i]) */ diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py index c5d99e96b7..efb33280e2 100644 --- a/src/aspire/abinitio/commonline_base.py +++ b/src/aspire/abinitio/commonline_base.py @@ -72,6 +72,12 @@ def __init__( self.rotations = None self._pf = None + # Sanity limit to match potential clmatrix dtype of int16. + if self.n_img > (2**15 - 1): + raise NotImplementedError( + "Commonlines implementation limited to <2**15 images." + ) + # Auto configure GPU self.__gpu_module = None try: @@ -297,7 +303,7 @@ def build_clmatrix_cu(self): # the common line with image j. Note the common line index # starts from 0 instead of 1 as Matlab version. -1 means # there is no common line such as clmatrix[i,i]. - clmatrix = -cp.ones((n_img, n_img), dtype=np.float64) + clmatrix = -cp.ones((n_img, n_img), dtype=np.int16) # When cl_dist[i, j] is not -1, it stores the maximum value # of correlation between image i and j for all possible 1D shifts. # We will use cl_dist[i, j] = -1 (including j<=i) to @@ -711,4 +717,4 @@ def __init_cupy_module(): module_code = fh.read() # CUPY compile the CUDA code - return cp.RawModule(code=module_code) + return cp.RawModule(code=module_code, backend="nvcc") diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index ec24101546..a36744bef5 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -1,3 +1,5 @@ +#include "stdint.h" + # define M_PI 3.14159265358979323846 /* pi */ /* from i,j indices to the common index in the N-choose-2 sized array */ @@ -6,6 +8,7 @@ /* convert euler angles (a,b,c) in ZYZ to rotation matrix r */ +__host__ __device__ inline void ang2orth(double* r, double a, double b, double c){ double sa = sin(a); double sb = sin(b); @@ -28,6 +31,7 @@ inline void ang2orth(double* r, double a, double b, double c){ } +__host__ __device__ inline void mult_3x3(double *out, double *R1, double *R2) { /* 3X3 matrices multiplication: out = R1*R2 * Note, this differs from the MATLAB mult_3x3. @@ -45,6 +49,7 @@ inline void mult_3x3(double *out, double *R1, double *R2) { } } +__host__ __device__ inline void JRJ(double *R, double *A) { /* multiple 3X3 matrix by J from both sizes: A = JRJ */ A[0]=R[0]; @@ -58,6 +63,7 @@ inline void JRJ(double *R, double *A) { A[8]=R[8]; } +__host__ __device__ inline double diff_norm_3x3(const double *R1, const double *R2) { /* difference 2 matrices and return squared norm: ||R1-R2||^2 */ int i; @@ -446,9 +452,9 @@ void estimate_all_angles(int n, int full_width, double sigma, int sync, - int* __restrict__ clmatrix, + int16_t* __restrict__ clmatrix, double* __restrict__ hist, - int* __restrict__ k_map, + uint16_t* __restrict__ k_map, double* __restrict__ angles_map, double* __restrict__ angles) { diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py index e753960095..7c98ac69f7 100644 --- a/src/aspire/abinitio/commonline_sync3n.py +++ b/src/aspire/abinitio/commonline_sync3n.py @@ -851,14 +851,14 @@ def _estimate_all_Rijs_cupy(self, clmatrix): sync = 1 # transfer input to device - clmatrix = cp.asarray(clmatrix, dtype=np.int32) + clmatrix = cp.asarray(clmatrix, dtype=np.int16) # workspace arrays ntics = int(180 / self.hist_bin_width) n_pairs = self.n_img * (self.n_img - 1) // 2 hist = cp.zeros((self.n_img, ntics), dtype=np.float64) # k_map stores the mapping of k indices to histogram bins - k_map = cp.zeros((self.n_img, self.n_img), dtype=np.int32) + k_map = cp.zeros((self.n_img, self.n_img), dtype=np.uint16) angles_map = cp.zeros((self.n_img, self.n_img), dtype=np.float64) angles = cp.zeros((n_pairs, 3), dtype=np.float64) @@ -1157,4 +1157,4 @@ def __init_cupy_module(): module_code = fh.read() # CUPY compile the CUDA code - return cp.RawModule(code=module_code) + return cp.RawModule(code=module_code, backend="nvcc") From 306e35efbe1ade24d879f28300aa9bb58bce98b6 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Thu, 19 Sep 2024 14:10:09 -0400 Subject: [PATCH 23/45] split kernels --- src/aspire/abinitio/commonline_sync3n.cu | 349 +++++++++++++---------- src/aspire/abinitio/commonline_sync3n.py | 55 ++-- 2 files changed, 234 insertions(+), 170 deletions(-) diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index a36744bef5..09d4aa80be 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -446,17 +446,18 @@ void triangle_scores_inner(int n, double* Rijs, int n_intervals, unsigned int* s }; extern "C" __global__ -void estimate_all_angles(int n, - int n_theta, - double hist_bin_width, - int full_width, - double sigma, - int sync, - int16_t* __restrict__ clmatrix, - double* __restrict__ hist, - uint16_t* __restrict__ k_map, - double* __restrict__ angles_map, - double* __restrict__ angles) +void estimate_all_angles1(int j, + int n, + int n_theta, + double hist_bin_width, + int full_width, + double sigma, + int sync, + int16_t* __restrict__ clmatrix, + double* __restrict__ hist, + uint16_t* __restrict__ k_map, + double* __restrict__ angles_map, + double* __restrict__ angles) { // try toget k_map as uint16_t /* n n_img */ @@ -464,7 +465,7 @@ void estimate_all_angles(int n, /* thread index (1d), represents "i" index */ const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; - int j, k; + int k; int cl_diff1, cl_diff2, cl_diff3; double theta1, theta2, theta3; double c1, c2, c3; @@ -489,169 +490,213 @@ void estimate_all_angles(int n, double ga; double angle; int b; - int peak_idx; - double peak; - for(j=i+1; j (M_PI + TOL_idx)) | ( + (theta1 < -TOL_idx) & (theta1 > -M_PI) + ); + ind2 = (theta2 > (M_PI + TOL_idx)) || ( + (theta2 < -TOL_idx) && (theta2 > -M_PI) + ); + if( (ind1 && !ind2) || (!ind1 && ind2)){ + /* Apply sync */ + cos_phi2 = -cos_phi2; + } + + } // end sync + else{ + cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2)); + } // end not sync + + // clip [-1,1] + if(cos_phi2 >1){ + cos_phi2 = 1; } - for(k=0; k<3; k++){ - angles[pair_idx*3+k] = 0; + if(cos_phi2 <-1){ + cos_phi2 = -1; } - cl_idx12 = clmatrix[i*n + j]; - cl_idx21 = clmatrix[j*n + i]; + /* compute histogram contribution and index map */ + angle = acos(cos_phi2) * 180. / M_PI; + // angle's bin + map_idx = i*n + k; + k_map[map_idx] = angle / hist_bin_width; //check + angles_map[map_idx] = angle; // degree + for(b=0; b (M_PI + TOL_idx)) | ( - (theta1 < -TOL_idx) & (theta1 > -M_PI) - ); - ind2 = (theta2 > (M_PI + TOL_idx)) || ( - (theta2 < -TOL_idx) && (theta2 > -M_PI) - ); - if( (ind1 && !ind2) || (!ind1 && ind2)){ - /* Apply sync */ - cos_phi2 = -cos_phi2; - } - } // end sync - else{ - cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2)); - } // end not sync - // clip [-1,1] - if(cos_phi2 >1){ - cos_phi2 = 1; - } - if(cos_phi2 <-1){ - cos_phi2 = -1; - } +extern "C" __global__ +void estimate_all_angles2(int j, + int n, + int n_theta, + double hist_bin_width, + int full_width, + double* __restrict__ hist, + uint16_t* __restrict__ k_map, + double* __restrict__ angles_map, + double* __restrict__ angles) +{ + // try toget k_map as uint16_t + /* n n_img */ - /* compute histogram contribution and index map */ - angle = acos(cos_phi2) * 180. / M_PI; - // angle's bin - map_idx = i*n + k; - k_map[map_idx] = angle / hist_bin_width; //check - angles_map[map_idx] = angle; // degree - for(b=0; b peak){ - peak = hist[map_idx]; - peak_idx = b; - } + /* thread index (1d), represents "i" index */ + const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; + + int k; + int cnt; + double w_theta_need; + + /* no-op when out of bounds */ + if(i >= n) return; + + // vote_ij creates good_k list per (i,j) + int pair_idx; + int map_idx; /* tmp index var */ + + int ntics = 180. / hist_bin_width; + int b; + int peak_idx; + double peak; + + //for(j=i+1; j peak){ + peak = hist[map_idx]; + peak_idx = b; } + } - /* find mean of rotations */ - // initialize - map_idx = pair_idx*3; - angles[map_idx ] = cl_idx12 * 2 * M_PI / n_theta + M_PI / 2; - angles[map_idx + 1] = 0; - angles[map_idx + 2] = -M_PI / 2 - cl_idx21 * 2 * M_PI / n_theta; + /* find mean of rotations */ - cnt = 0; + cnt = 0; - if(full_width==-1){ - /* adaptive width*/ - w_theta_need = 0; - cnt = 0; - while(cnt == 0){ - w_theta_need += hist_bin_width; - for(k=0; k Date: Thu, 19 Sep 2024 15:32:09 -0400 Subject: [PATCH 24/45] general cleanup --- src/aspire/abinitio/commonline_sync3n.cu | 126 ++++++++++------------- src/aspire/abinitio/commonline_sync3n.py | 1 - 2 files changed, 57 insertions(+), 70 deletions(-) diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index 09d4aa80be..5d3d0340d1 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -17,8 +17,8 @@ inline void ang2orth(double* r, double a, double b, double c){ double cb = cos(b); double cc = cos(c); - // ZYZ Proper Euler angles - // https://en.wikipedia.org/wiki/Euler_angles#Rotation_matrix + /* ZYZ Proper Euler angles */ + /* https://en.wikipedia.org/wiki/Euler_angles#Rotation_matrix */ r[0] = ca*cb*cc - sa*sc; r[1] = -cc*sa -ca*cb*sc; r[2] = ca*sb; @@ -459,11 +459,11 @@ void estimate_all_angles1(int j, double* __restrict__ angles_map, double* __restrict__ angles) { - // try toget k_map as uint16_t /* n n_img */ /* thread index (1d), represents "i" index */ const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; + /* j is image j index */ int k; int cl_diff1, cl_diff2, cl_diff3; @@ -477,8 +477,6 @@ void estimate_all_angles1(int j, /* no-op when out of bounds */ if(i >= n) return; - // vote_ij creates good_k list per (i,j) - int pair_idx; int map_idx; /* tmp index var */ int cl_idx12, cl_idx21; @@ -487,28 +485,22 @@ void estimate_all_angles1(int j, int ntics = 180. / hist_bin_width; const double TOL_idx = 1e-12; bool ind1, ind2; - double ga; - double angle; + double grid_angle, angle_diff, angle; int b; + const double two_sigma_sq = 2*sigma*sigma; - // for(j=i+1; j1){ cos_phi2 = 1; } @@ -575,18 +568,19 @@ void estimate_all_angles1(int j, cos_phi2 = -1; } - /* compute histogram contribution and index map */ + /* compute histogram contribution, angle mapping, and index mappings. */ angle = acos(cos_phi2) * 180. / M_PI; - // angle's bin + /* angle's bin */ map_idx = i*n + k; - k_map[map_idx] = angle / hist_bin_width; //check - angles_map[map_idx] = angle; // degree + k_map[map_idx] = angle / hist_bin_width; // check + angles_map[map_idx] = angle; /* degrees */ for(b=0; b= n) return; - // vote_ij creates good_k list per (i,j) - int pair_idx; int map_idx; /* tmp index var */ int ntics = 180. / hist_bin_width; @@ -644,10 +634,9 @@ void estimate_all_angles2(int j, int peak_idx; double peak; - //for(j=i+1; j Date: Thu, 19 Sep 2024 16:14:20 -0400 Subject: [PATCH 25/45] threads over k --- src/aspire/abinitio/commonline_sync3n.cu | 181 ++++++++++++----------- src/aspire/abinitio/commonline_sync3n.py | 14 +- 2 files changed, 101 insertions(+), 94 deletions(-) diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index 5d3d0340d1..6841ad2d5d 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -464,18 +464,19 @@ void estimate_all_angles1(int j, /* thread index (1d), represents "i" index */ const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; /* j is image j index */ + const unsigned int k = blockDim.y * blockIdx.y + threadIdx.y; - int k; int cl_diff1, cl_diff2, cl_diff3; double theta1, theta2, theta3; double c1, c2, c3; double cond; double cos_phi2; - int cnt; double w_theta_need; /* no-op when out of bounds */ if(i >= n) return; + if(k >= n) return; + int map_idx; /* tmp index var */ @@ -493,96 +494,96 @@ void estimate_all_angles1(int j, const int pair_idx = PAIR_IDX(n,i,j); /* initialize */ - for(b=0; b (M_PI + TOL_idx)) | ( - (theta1 < -TOL_idx) & (theta1 > -M_PI) - ); - ind2 = (theta2 > (M_PI + TOL_idx)) || ( - (theta2 < -TOL_idx) && (theta2 > -M_PI) - ); - if( (ind1 && !ind2) || (!ind1 && ind2)){ - /* Apply sync */ - cos_phi2 = -cos_phi2; - } - } /* end sync */ - else{ - cos_phi2 = (c3 - c1*c2 ) / (sin(theta1) * sin(theta2)); - } /* end not sync */ - - /* clip cosine phi between [-1,1] */ - if(cos_phi2 >1){ - cos_phi2 = 1; - } - if(cos_phi2 <-1){ - cos_phi2 = -1; + cl_idx13 = clmatrix[i*n + k]; + cl_idx31 = clmatrix[k*n + i]; + cl_idx23 = clmatrix[j*n + k]; + cl_idx32 = clmatrix[k*n + j]; + + /* test `k` values */ + if(k==i) return; + if(cl_idx13 == -1) return; /* i, k */ + //if(cl_idx31 == -1) return; // k, i ? + if(cl_idx23 == -1) return; /* j, k */ + // if(cl_idx32 == -1) return; // k, j ? + + /* get cosine angles */ + cl_diff1 = cl_idx13 - cl_idx12; + cl_diff2 = cl_idx23 - cl_idx21; + cl_diff3 = cl_idx32 - cl_idx31; + + theta1 = cl_diff1 * 2 * M_PI / n_theta; + theta2 = cl_diff2 * 2 * M_PI / n_theta; + theta3 = cl_diff3 * 2 * M_PI / n_theta; + + c1 = cos(theta1); + c2 = cos(theta2); + c3 = cos(theta3); + + /* test if we have a good index */ + cond = 1 + 2 * c1 * c2 * c3 - (c1*c1 + c2*c2 + c3*c3); + if(cond <= 1e-5) return; /* current value of k is not good, skip */ + + /* Calculated cos values of angle between i and j images */ + if( sync == 1){ + + cos_phi2 = (c3 - c1*c2) / (sqrt(1 - c1*c1) * sqrt(1 - c2*c2)); + + /* + Some synchronization must be applied when common line is out by 180 degrees. + Here fix the angles between c_ij(c_ji) and c_ik(c_jk) to be smaller than pi/2, + otherwise there will be an ambiguity between alpha and pi-alpha. + */ + + /* Check sync conditions */ + ind1 = (theta1 > (M_PI + TOL_idx)) | ( + (theta1 < -TOL_idx) & (theta1 > -M_PI) + ); + ind2 = (theta2 > (M_PI + TOL_idx)) || ( + (theta2 < -TOL_idx) && (theta2 > -M_PI) + ); + if( (ind1 && !ind2) || (!ind1 && ind2)){ + /* Apply sync */ + cos_phi2 = -cos_phi2; } - /* compute histogram contribution, angle mapping, and index mappings. */ - angle = acos(cos_phi2) * 180. / M_PI; - /* angle's bin */ - map_idx = i*n + k; - k_map[map_idx] = angle / hist_bin_width; // check - angles_map[map_idx] = angle; /* degrees */ - for(b=0; b1){ + cos_phi2 = 1; + } + if(cos_phi2 <-1){ + cos_phi2 = -1; + } + + /* compute histogram contribution, angle mapping, and index mappings. */ + angle = acos(cos_phi2) * 180. / M_PI; + /* angle's bin */ + map_idx = i*n + k; + k_map[map_idx] = angle / hist_bin_width; // check + angles_map[map_idx] = angle; /* degrees */ + for(b=0; b Date: Thu, 19 Sep 2024 17:06:07 -0400 Subject: [PATCH 26/45] continue cleanup threads over k --- src/aspire/abinitio/commonline_base.py | 1 - src/aspire/abinitio/commonline_sync3n.py | 55 ++++++++++++++---------- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py index efb33280e2..66c960dd2d 100644 --- a/src/aspire/abinitio/commonline_base.py +++ b/src/aspire/abinitio/commonline_base.py @@ -6,7 +6,6 @@ import scipy.sparse as sparse from aspire.image import Image -from aspire.numeric import fft, xp from aspire.operators import PolarFT from aspire.utils import common_line_from_rots, fuzzy_mask, tqdm from aspire.utils.random import choice diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py index 9b87a8f220..36a2e1356a 100644 --- a/src/aspire/abinitio/commonline_sync3n.py +++ b/src/aspire/abinitio/commonline_sync3n.py @@ -824,24 +824,14 @@ def _estimate_all_Rijs(self, clmatrix): :return: Estimated rotations """ # host/gpu dispatch - # if self.__gpu_module: - # res = self._estimate_all_Rijs_cupy(Rijs) - # else: - # res = self._estimate_all_Rijs_host(Rijs) - from time import perf_counter - - tic = perf_counter() - res = self._estimate_all_Rijs_cupy(clmatrix) - toc1 = perf_counter() - print("_estimate_all_Rijs_cupy", toc1 - tic) - - res_host = self._estimate_all_Rijs_host(clmatrix) - toc2 = perf_counter() - print("_estimate_all_Rijs_host", toc2 - toc1) + if self.__gpu_module: + res = self._estimate_all_Rijs_cu(clmatrix) + else: + res = self._estimate_all_Rijs_host(clmatrix) return res - def _estimate_all_Rijs_cupy(self, clmatrix): + def _estimate_all_Rijs_cu(self, clmatrix): import cupy as cp estimate_all_angles1 = self.__gpu_module.get_function("estimate_all_angles1") @@ -863,16 +853,24 @@ def _estimate_all_Rijs_cupy(self, clmatrix): angles_map = cp.zeros((self.n_img, self.n_img), dtype=np.float64) angles = cp.zeros((n_pairs, 3), dtype=np.float64) - # Configure grid of blocks + # Configure 2d grid of blocks (kernel part 1) blkszx = 32 nblkx = (self.n_img + blkszx - 1) // blkszx blkszy = 32 nblky = (self.n_img + blkszy - 1) // blkszy + + # Configure 1d grid of blocks (kernel part 2) blksz = 1024 nblk = (self.n_img + blksz - 1) // blksz + from time import perf_counter + logger.info("Launching `estimate_all_angles` kernel.") + toc0 = perf_counter() for j in range(0, self.n_img): + + # ------------------- + # Vote into histogram estimate_all_angles1( (nblkx, nblky), (blkszx, blkszy), @@ -892,6 +890,8 @@ def _estimate_all_Rijs_cupy(self, clmatrix): ), ) + # ------------------------- + # Solve hist for mean angle estimate_all_angles2( (nblk,), (blksz,), @@ -907,21 +907,28 @@ def _estimate_all_Rijs_cupy(self, clmatrix): ), ) - # no longer need workspace vars + # Force all kernels to complete + cp.cuda.runtime.deviceSynchronize() + toc1 = perf_counter() + logger.info(f"estimate_all_angles: {toc1-toc0}s") + + # Inform CuPy we longer need these workspace vars del hist del k_map del angles_map - # convert angles to rots + # --------------------------- + # Convert angles to rotations rotations = cp.empty((n_pairs, 3, 3), dtype=np.float64) - blkszx = 1024 - nblkx = (n_pairs + blkszx - 1) // blkszx + # Configure 1d grid of blocks + blksz = 1024 + nblk = (n_pairs + blksz - 1) // blksz logger.info("Launching `angles_to_rots` kernel.") angles_to_rots( - (nblkx,), - (blkszx,), + (nblk,), + (blksz,), ( n_pairs, angles, @@ -929,6 +936,10 @@ def _estimate_all_Rijs_cupy(self, clmatrix): ), ) + cp.cuda.runtime.deviceSynchronize() + toc2 = perf_counter() + logger.info(f"angles_to_rots: {toc2-toc1}s") + # transfer results device to host rotations = rotations.get() From 0e219cc153664083d1925991aacb25b415b7c43b Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Mon, 23 Sep 2024 06:41:49 -0400 Subject: [PATCH 27/45] fix j= n) return; if(k >= n) return; - + if(i >= j) return; int map_idx; /* tmp index var */ @@ -502,6 +502,11 @@ void estimate_all_angles1(int j, cl_idx12 = clmatrix[i*n + j]; cl_idx21 = clmatrix[j*n + i]; + /* + MATLAB code indicated this condition might occur outside i==j; + Ask Yoel what other reasons this would occur. + */ + if(cl_idx12 == -1) return; /* Assume that k_list starts as all n images */ @@ -628,6 +633,7 @@ void estimate_all_angles2(int j, /* no-op when out of bounds */ if(i >= n) return; + if(i >= j) return; int map_idx; /* tmp index var */ From 3eaef56d02c15a3f9e65111c9aa816e942e2aca4 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Mon, 23 Sep 2024 11:09:51 -0400 Subject: [PATCH 28/45] fix adative param oversight bug --- src/aspire/abinitio/sync_voting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aspire/abinitio/sync_voting.py b/src/aspire/abinitio/sync_voting.py index fb626a8a91..82bfcf8870 100644 --- a/src/aspire/abinitio/sync_voting.py +++ b/src/aspire/abinitio/sync_voting.py @@ -156,7 +156,7 @@ def _vote_ij(self, clmatrix, n_theta, i, j, k_list, sync=False): # that accidentally fall near the peak. peak_idx = angles_hist.argmax() - if str(self.full_width).lower() == "adaptive": + if self.full_width==-1: # Adaptive width (MATLAB) # Look for the estimations in the peak of the histogram w_theta_needed = 0 From 1748a691f8cd167c7a7585b33bdbc0fe0a319286 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Mon, 23 Sep 2024 11:34:41 -0400 Subject: [PATCH 29/45] parallel case bug --- src/aspire/abinitio/commonline_sync3n.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index 725f1b0758..4b61767bbd 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -517,6 +517,7 @@ void estimate_all_angles1(int j, /* test `k` values */ if(k==i) return; + if(k==j) return; if(cl_idx13 == -1) return; /* i, k */ //if(cl_idx31 == -1) return; // k, i ? if(cl_idx23 == -1) return; /* j, k */ From 90ff5b3d49899bf0f4dc83581c0c7ad24441d2ac Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Tue, 24 Sep 2024 14:31:01 -0400 Subject: [PATCH 30/45] C order, sigh --- src/aspire/abinitio/commonline_sync3n.cu | 35 +++++++++++------------- src/aspire/abinitio/commonline_sync3n.py | 7 ++++- src/aspire/abinitio/sync_voting.py | 2 +- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index 4b61767bbd..c89064a4e2 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -1,6 +1,6 @@ #include "stdint.h" -# define M_PI 3.14159265358979323846 /* pi */ +#define M_PI 3.14159265358979323846 /* pi */ /* from i,j indices to the common index in the N-choose-2 sized array */ // careful, this is stricly the upper triangle! @@ -477,13 +477,19 @@ void estimate_all_angles1(int j, if(i >= n) return; if(k >= n) return; if(i >= j) return; + /* + These are also tested later via the clmatrix values, + testing now avoids extra reads. + */ + if(k==i) return; + if(k==j) return; int map_idx; /* tmp index var */ int cl_idx12, cl_idx21; int cl_idx13, cl_idx31; int cl_idx23, cl_idx32; - int ntics = 180. / hist_bin_width; + const int ntics = 180. / hist_bin_width; const double TOL_idx = 1e-12; bool ind1, ind2; double grid_angle, angle_diff, angle; @@ -493,13 +499,6 @@ void estimate_all_angles1(int j, const int pair_idx = PAIR_IDX(n,i,j); - /* initialize */ - if((k==0) ^ (k==n-2)){ /* unique per ijk grid */ - for(b=0; b (M_PI + TOL_idx)) | ( - (theta1 < -TOL_idx) & (theta1 > -M_PI) - ); + ind1 = (theta1 > (M_PI + TOL_idx)) || ( + (theta1 < -TOL_idx) && (theta1 > -M_PI) + ); ind2 = (theta2 > (M_PI + TOL_idx)) || ( (theta2 < -TOL_idx) && (theta2 > -M_PI) ); @@ -586,9 +583,9 @@ void estimate_all_angles1(int j, // todo, just compute in radians to avoid extra arithmetic grid_angle = b * (180./ntics); /* grid angle */ // check, I think off by one-ish (grid vs bins) angle_diff = (grid_angle - angle); - /* accumulate histogram contribution */ - hist[i*ntics + b ] += exp( - -(angle_diff*angle_diff) / ( two_sigma_sq)); + /* accumulate histogram contribution, atomic due to `k` */ + atomicAdd(&(hist[i*ntics + b ]), + exp(-(angle_diff*angle_diff) / ( two_sigma_sq))); } /* bins */ /* @@ -600,7 +597,7 @@ void estimate_all_angles1(int j, */ /* Initialize euler angles */ - if((k==0) ^ (k==n-2)) /* unique per ijk grid */ + /* can be done once? */ { map_idx = pair_idx*3; angles[map_idx ] = cl_idx12 * 2 * M_PI / n_theta + M_PI / 2; @@ -638,7 +635,7 @@ void estimate_all_angles2(int j, int map_idx; /* tmp index var */ - int ntics = 180. / hist_bin_width; + const int ntics = 180. / hist_bin_width; int b; int peak_idx; double peak; diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py index 36a2e1356a..3110e179c3 100644 --- a/src/aspire/abinitio/commonline_sync3n.py +++ b/src/aspire/abinitio/commonline_sync3n.py @@ -842,7 +842,7 @@ def _estimate_all_Rijs_cu(self, clmatrix): sync = 1 # transfer input to device - clmatrix = cp.asarray(clmatrix, dtype=np.int16) + clmatrix = cp.asarray(clmatrix, order="C", dtype=np.int16) # workspace arrays ntics = int(180 / self.hist_bin_width) @@ -869,6 +869,11 @@ def _estimate_all_Rijs_cu(self, clmatrix): toc0 = perf_counter() for j in range(0, self.n_img): + # ------------------------------------------ + # Zero histogram and k mapping for each `j`. + hist[:] = 0 + k_map[:] = 0 + # ------------------- # Vote into histogram estimate_all_angles1( diff --git a/src/aspire/abinitio/sync_voting.py b/src/aspire/abinitio/sync_voting.py index 82bfcf8870..63cb6a923f 100644 --- a/src/aspire/abinitio/sync_voting.py +++ b/src/aspire/abinitio/sync_voting.py @@ -156,7 +156,7 @@ def _vote_ij(self, clmatrix, n_theta, i, j, k_list, sync=False): # that accidentally fall near the peak. peak_idx = angles_hist.argmax() - if self.full_width==-1: + if self.full_width == -1: # Adaptive width (MATLAB) # Look for the estimations in the peak of the histogram w_theta_needed = 0 From 627d4c5c2d3dac6602ff4270cd447542f2d91da3 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Fri, 4 Oct 2024 11:43:02 -0400 Subject: [PATCH 31/45] remove unused vars from build cl kernel --- src/aspire/abinitio/commonline_base.cu | 12 +++++++++--- src/aspire/abinitio/commonline_base.py | 15 +-------------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu index eb493acf44..acce2fbba2 100644 --- a/src/aspire/abinitio/commonline_base.cu +++ b/src/aspire/abinitio/commonline_base.cu @@ -2,7 +2,15 @@ #include extern "C" __global__ -void build_clmatrix_kernel(int n, int m, int r, const complex* __restrict__ pf, int16_t* __restrict__ clmatrix, double* __restrict__ cl_dist, double* __restrict__ shifts_1d, int n_shifts, double* __restrict__ shifts, const complex* __restrict__ shift_phases) +void build_clmatrix_kernel( + int n, + int m, + int r, + const complex* __restrict__ pf, + int16_t* const __restrict__ clmatrix, + const int n_shifts, + double* __restrict__ shifts, + const complex* const __restrict__ shift_phases) { /* n n_img */ /* m,r st (n, m, r) = pf.shape, ie len(pf[i]) */ @@ -69,7 +77,5 @@ void build_clmatrix_kernel(int n, int m, int r, const complex* __restric ind = i*n + j; clmatrix[ind] = best_cl1; clmatrix[j*n+i] = best_cl2; /* [j,i] */ - cl_dist[ind] = 2*best_cl_dist; // 2 of mystery - shifts_1d[ind] = shifts[best_s]; } diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py index 66c960dd2d..d70618cee1 100644 --- a/src/aspire/abinitio/commonline_base.py +++ b/src/aspire/abinitio/commonline_base.py @@ -303,13 +303,6 @@ def build_clmatrix_cu(self): # starts from 0 instead of 1 as Matlab version. -1 means # there is no common line such as clmatrix[i,i]. clmatrix = -cp.ones((n_img, n_img), dtype=np.int16) - # When cl_dist[i, j] is not -1, it stores the maximum value - # of correlation between image i and j for all possible 1D shifts. - # We will use cl_dist[i, j] = -1 (including j<=i) to - # represent that there is no need to check common line - # between i and j. Since it is symmetric, - # only above the diagonal entries are necessary. - cl_dist = -cp.ones((n_img, n_img), dtype=np.float64) # Allocate variables used for shift estimation @@ -319,8 +312,6 @@ def build_clmatrix_cu(self): # Set resolution of shift estimation in pixels. Note that # shift_step can be any positive real number. shift_step = self.shift_step - # 1D shift between common-lines - shifts_1d = cp.zeros((n_img, n_img), dtype=np.float64) # check dtype # Prepare the shift phases to try and generate filter for common-line detection r_max = pf.shape[2] @@ -357,8 +348,6 @@ def build_clmatrix_cu(self): r_max, pf, clmatrix, - cl_dist, - shifts_1d, len(shifts), shifts, shift_phases, @@ -366,11 +355,9 @@ def build_clmatrix_cu(self): ) # Copy result device arrays to host - shifts_1d = shifts_1d.get().astype(self.dtype, copy=False) clmatrix = clmatrix.get().astype(self.dtype, copy=False) - cl_dist = cl_dist.get().astype(self.dtype, copy=False) # diagnostic - return shifts_1d, clmatrix + return None, clmatrix def estimate_shifts(self, equations_factor=1, max_memory=4000): """ From 230fd0f31b333e9ed4b1df1807bd8974fce6d6f4 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Fri, 4 Oct 2024 12:19:57 -0400 Subject: [PATCH 32/45] remove unused vars from build cl kernel --- src/aspire/abinitio/commonline_base.cu | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu index acce2fbba2..7ec2fad014 100644 --- a/src/aspire/abinitio/commonline_base.cu +++ b/src/aspire/abinitio/commonline_base.cu @@ -29,12 +29,11 @@ void build_clmatrix_kernel( int k; int s; int cl1, cl2; - int best_cl1, best_cl2, best_s; + int best_cl1, best_cl2; double dist, best_cl_dist; double p1, p2; complex pfik, pfjk; - best_s = -99999; best_cl1 = -1; best_cl2 = -1; best_cl_dist = -1/0; @@ -57,7 +56,6 @@ void build_clmatrix_kernel( best_cl_dist = dist; best_cl1 = cl1; best_cl2 = cl2; - best_s = s; } dist = p1 + p2; @@ -65,7 +63,6 @@ void build_clmatrix_kernel( best_cl_dist = dist; best_cl1 = cl1; best_cl2 = cl2 + m; // m is pf.shape[1], which should be n_theta//2... - best_s = s; } } /* s */ From b64164e0c3cceb2de993734531e402b4ec27a2ad Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Thu, 10 Oct 2024 09:45:23 -0400 Subject: [PATCH 33/45] continue removing unused vars --- src/aspire/abinitio/commonline_base.cu | 4 ++-- src/aspire/abinitio/commonline_base.py | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu index 7ec2fad014..3fbc5306e4 100644 --- a/src/aspire/abinitio/commonline_base.cu +++ b/src/aspire/abinitio/commonline_base.cu @@ -1,4 +1,5 @@ #include +#include #include extern "C" __global__ @@ -9,7 +10,6 @@ void build_clmatrix_kernel( const complex* __restrict__ pf, int16_t* const __restrict__ clmatrix, const int n_shifts, - double* __restrict__ shifts, const complex* const __restrict__ shift_phases) { /* n n_img */ @@ -36,7 +36,7 @@ void build_clmatrix_kernel( best_cl1 = -1; best_cl2 = -1; - best_cl_dist = -1/0; + best_cl_dist = -INFINITY; for(cl1=0; cl1 Date: Thu, 10 Oct 2024 09:49:50 -0400 Subject: [PATCH 34/45] update constants --- src/aspire/abinitio/commonline_base.cu | 10 +++++----- src/aspire/abinitio/commonline_sync3n.cu | 3 +-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu index 3fbc5306e4..37e112e4b8 100644 --- a/src/aspire/abinitio/commonline_base.cu +++ b/src/aspire/abinitio/commonline_base.cu @@ -4,9 +4,9 @@ extern "C" __global__ void build_clmatrix_kernel( - int n, - int m, - int r, + const int n, + const int m, + const int r, const complex* __restrict__ pf, int16_t* const __restrict__ clmatrix, const int n_shifts, @@ -16,8 +16,8 @@ void build_clmatrix_kernel( /* m,r st (n, m, r) = pf.shape, ie len(pf[i]) */ /* thread index (1d), represents "i" index */ - unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int j = blockDim.y * blockIdx.y + threadIdx.y; + const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; + const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y; /* no-op when out of bounds */ if(i >= n) return; diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index c89064a4e2..14e27669b9 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -1,6 +1,5 @@ #include "stdint.h" - -#define M_PI 3.14159265358979323846 /* pi */ +#include "math.h" /* from i,j indices to the common index in the N-choose-2 sized array */ // careful, this is stricly the upper triangle! From 7e6415dc61fbd1e5ef39686eaa1047809756cacd Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Thu, 10 Oct 2024 10:26:59 -0400 Subject: [PATCH 35/45] add single precision build CL kernel and launching code --- src/aspire/abinitio/commonline_base.cu | 75 ++++++++++++++++++++++++++ src/aspire/abinitio/commonline_base.py | 21 ++++++-- 2 files changed, 91 insertions(+), 5 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu index 37e112e4b8..b889cad62d 100644 --- a/src/aspire/abinitio/commonline_base.cu +++ b/src/aspire/abinitio/commonline_base.cu @@ -70,6 +70,81 @@ void build_clmatrix_kernel( }/* cl1 */ + /* update global best for i, j*/ + ind = i*n + j; + clmatrix[ind] = best_cl1; + clmatrix[j*n+i] = best_cl2; /* [j,i] */ + +} + +extern "C" __global__ +void fbuild_clmatrix_kernel( + const int n, + const int m, + const int r, + const complex* __restrict__ pf, + int16_t* const __restrict__ clmatrix, + const int n_shifts, + const complex* const __restrict__ shift_phases) +{ + /* n n_img */ + /* m,r st (n, m, r) = pf.shape, ie len(pf[i]) */ + + /* thread index (1d), represents "i" index */ + const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; + const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y; + + /* no-op when out of bounds */ + if(i >= n) return; + if(j >= n) return; + /* no-op lower triangle */ + if(j <= i) return; + + int ind; + int k; + int s; + int cl1, cl2; + int best_cl1, best_cl2; + float dist, best_cl_dist; + float p1, p2; + complex pfik, pfjk; + + best_cl1 = -1; + best_cl2 = -1; + best_cl_dist = -INFINITY; + + for(cl1=0; cl1 best_cl_dist){ + best_cl_dist = dist; + best_cl1 = cl1; + best_cl2 = cl2; + } + + dist = p1 + p2; + if(dist > best_cl_dist){ + best_cl_dist = dist; + best_cl1 = cl1; + best_cl2 = cl2 + m; // m is pf.shape[1], which should be n_theta//2... + } + + } /* s */ + } /* cl2 */ + }/* cl1 */ + + /* update global best for i, j*/ ind = i*n + j; clmatrix[ind] = best_cl1; diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py index 6ac23f7c4e..df01578abe 100644 --- a/src/aspire/abinitio/commonline_base.py +++ b/src/aspire/abinitio/commonline_base.py @@ -7,7 +7,7 @@ from aspire.image import Image from aspire.operators import PolarFT -from aspire.utils import common_line_from_rots, fuzzy_mask, tqdm +from aspire.utils import common_line_from_rots, complex_type, fuzzy_mask, tqdm from aspire.utils.random import choice logger = logging.getLogger(__name__) @@ -319,16 +319,23 @@ def build_clmatrix_cu(self): r_max, max_shift, shift_step ) # Transfer to device, dtypes must match kernel header. - shift_phases = cp.asarray(shift_phases, dtype=np.complex128) + shift_phases = cp.asarray(shift_phases, dtype=complex_type(self.dtype)) # Apply bandpass filter, normalize each ray of each image # Note that only use half of each ray pf = self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h) # Tranpose `pf` for better (CUDA) memory access pattern, and cast as needed. - pf = cp.ascontiguousarray(pf.T, dtype=np.complex128) + pf = cp.ascontiguousarray(pf.T, dtype=complex_type(self.dtype)) # Get kernel - build_clmatrix_kernel = self.__gpu_module.get_function("build_clmatrix_kernel") + if self.dtype == np.float64: + build_clmatrix_kernel = self.__gpu_module.get_function( + "build_clmatrix_kernel" + ) + elif self.dtype == np.float32: + build_clmatrix_kernel = self.__gpu_module.get_function( + "fbuild_clmatrix_kernel" + ) # Configure grid of blocks blkszx = 32 @@ -701,4 +708,8 @@ def __init_cupy_module(): module_code = fh.read() # CUPY compile the CUDA code - return cp.RawModule(code=module_code, backend="nvcc") + return cp.RawModule( + code=module_code, + backend="nvcc", + options=("-O3", "--use_fast_math", "--extra-device-vectorization"), + ) From a5701557ad43bcd3ab0cb41b7ca8596961f2d9c7 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Thu, 10 Oct 2024 13:39:53 -0400 Subject: [PATCH 36/45] revert accidental config commit --- src/aspire/config_default.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/aspire/config_default.yaml b/src/aspire/config_default.yaml index 6e32ead3d2..9af8732dd1 100644 --- a/src/aspire/config_default.yaml +++ b/src/aspire/config_default.yaml @@ -1,9 +1,9 @@ version: 0.13.0 common: # numeric module to use - one of numpy/cupy - numeric: cupy + numeric: numpy # fft backend to use - one of pyfftw/scipy/cupy/mkl - fft: cupy + fft: scipy # Set cache directory for ASPIRE example data. # By default the cache location will be set by pooch.os_cache(), From bc4c3b8187ab98055b6eb9a7952de52c884452cc Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Thu, 10 Oct 2024 13:52:24 -0400 Subject: [PATCH 37/45] cleanup base cuda code a little --- src/aspire/abinitio/commonline_base.cu | 38 ++++++++++++-------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu index b889cad62d..26529a03cc 100644 --- a/src/aspire/abinitio/commonline_base.cu +++ b/src/aspire/abinitio/commonline_base.cu @@ -13,9 +13,11 @@ void build_clmatrix_kernel( const complex* const __restrict__ shift_phases) { /* n n_img */ - /* m,r st (n, m, r) = pf.shape, ie len(pf[i]) */ + /* m angular componentns, n_theta//2 */ + /* r radial componentns */ + /* (n, m, r) = pf.shape in python (before transpose for CUDA kernel) */ - /* thread index (1d), represents "i" index */ + /* thread index (2d), represents "i" and "j" indices */ const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y; @@ -25,7 +27,6 @@ void build_clmatrix_kernel( /* no-op lower triangle */ if(j <= i) return; - int ind; int k; int s; int cl1, cl2; @@ -49,7 +50,7 @@ void build_clmatrix_kernel( pfjk = conj(pf[k*m*n + cl2*n + j]) * shift_phases[s*r + k]; p1 += real(pfik) * real(pfjk); p2 += imag(pfik) * imag(pfjk); - } + } /* k */ dist = p1 - p2; if(dist > best_cl_dist){ @@ -62,20 +63,18 @@ void build_clmatrix_kernel( if(dist > best_cl_dist){ best_cl_dist = dist; best_cl1 = cl1; - best_cl2 = cl2 + m; // m is pf.shape[1], which should be n_theta//2... + best_cl2 = cl2 + m; /* m is pf.shape[1], which should be n_theta//2 */ } } /* s */ } /* cl2 */ }/* cl1 */ - - /* update global best for i, j*/ - ind = i*n + j; - clmatrix[ind] = best_cl1; + /* update global best for i, j */ + clmatrix[i*n + j] = best_cl1; clmatrix[j*n+i] = best_cl2; /* [j,i] */ -} +} /* build_clmatrix_kernel */ extern "C" __global__ void fbuild_clmatrix_kernel( @@ -88,9 +87,11 @@ void fbuild_clmatrix_kernel( const complex* const __restrict__ shift_phases) { /* n n_img */ - /* m,r st (n, m, r) = pf.shape, ie len(pf[i]) */ + /* m angular componentns, n_theta//2 */ + /* r radial componentns */ + /* (n, m, r) = pf.shape in python (before transpose for CUDA kernel) */ - /* thread index (1d), represents "i" index */ + /* thread index (2d), represents "i" and "j" indices */ const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y; @@ -100,7 +101,6 @@ void fbuild_clmatrix_kernel( /* no-op lower triangle */ if(j <= i) return; - int ind; int k; int s; int cl1, cl2; @@ -124,7 +124,7 @@ void fbuild_clmatrix_kernel( pfjk = conj(pf[k*m*n + cl2*n + j]) * shift_phases[s*r + k]; p1 += real(pfik) * real(pfjk); p2 += imag(pfik) * imag(pfjk); - } + } /* k */ dist = p1 - p2; if(dist > best_cl_dist){ @@ -137,17 +137,15 @@ void fbuild_clmatrix_kernel( if(dist > best_cl_dist){ best_cl_dist = dist; best_cl1 = cl1; - best_cl2 = cl2 + m; // m is pf.shape[1], which should be n_theta//2... + best_cl2 = cl2 + m; /* m is pf.shape[1], which should be n_theta//2 */ } } /* s */ } /* cl2 */ }/* cl1 */ - - /* update global best for i, j*/ - ind = i*n + j; - clmatrix[ind] = best_cl1; + /* update global best for i, j */ + clmatrix[i*n + j] = best_cl1; clmatrix[j*n+i] = best_cl2; /* [j,i] */ -} +} /* fbuild_clmatrix_kernel */ From c4ca481f32bcc81e521353b9fe244068dc7bd977 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Thu, 10 Oct 2024 15:19:25 -0400 Subject: [PATCH 38/45] self review cleanup --- src/aspire/abinitio/commonline_base.py | 49 +++++++++++++------- src/aspire/abinitio/commonline_sync3n.cu | 58 +++++++++++++----------- src/aspire/abinitio/commonline_sync3n.py | 26 +++++++---- src/aspire/abinitio/sync_voting.py | 2 +- 4 files changed, 80 insertions(+), 55 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py index df01578abe..91049890ad 100644 --- a/src/aspire/abinitio/commonline_base.py +++ b/src/aspire/abinitio/commonline_base.py @@ -155,7 +155,11 @@ def estimate_rotations(self): raise NotImplementedError("subclasses should implement this") def build_clmatrix(self): - """dispatch, compare, dispair""" + """ + Build common-lines matrix from Fourier stack of 2D images + + Wrapper for cpu/gpu dispatch. + """ logger.info("Begin building Common Lines Matrix") @@ -242,7 +246,6 @@ def build_clmatrix_host(self): subset_j = np.sort(choice(n_remaining, n_j, replace=False) + i + 1) for j in subset_j: - # for j in range(i+1,n_img): # for testing, rm later p2_flipped = np.conj(pf[j]) for shift in range(len(shifts)): @@ -286,14 +289,15 @@ def build_clmatrix_cu(self): import cupy as cp n_img = self.n_img + r = pf.shape[2] if self.n_theta % 2 == 1: msg = "n_theta must be even" logger.error(msg) raise NotImplementedError(msg) - # need to do a copy to prevent modifying self.pf for other functions - # also places on GPU + # Copy to prevent modifying self.pf for other functions + # Simultaneously place on GPU pf = cp.array(self.pf) # Allocate local variables for return @@ -305,25 +309,26 @@ def build_clmatrix_cu(self): clmatrix = -cp.ones((n_img, n_img), dtype=np.int16) # Allocate variables used for shift estimation - - # set maximum value of 1D shift (in pixels) to search + # + # Set maximum value of 1D shift (in pixels) to search # between common-lines. - max_shift = self.max_shift # Set resolution of shift estimation in pixels. Note that # shift_step can be any positive real number. - shift_step = self.shift_step - + # # Prepare the shift phases to try and generate filter for common-line detection - r_max = pf.shape[2] + # + # Note the CUDA implementation has been optimized to not + # compute or return diagnostic 1d shifts. _, shift_phases, h = self._generate_shift_phase_and_filter( - r_max, max_shift, shift_step + r, self.max_shift, self.shift_step ) # Transfer to device, dtypes must match kernel header. shift_phases = cp.asarray(shift_phases, dtype=complex_type(self.dtype)) # Apply bandpass filter, normalize each ray of each image - # Note that only use half of each ray - pf = self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h) + # Note that this only uses half of each ray + pf = self._apply_filter_and_norm("ijk, k -> ijk", pf, r, h) + # Tranpose `pf` for better (CUDA) memory access pattern, and cast as needed. pf = cp.ascontiguousarray(pf.T, dtype=complex_type(self.dtype)) @@ -336,11 +341,17 @@ def build_clmatrix_cu(self): build_clmatrix_kernel = self.__gpu_module.get_function( "fbuild_clmatrix_kernel" ) + else: + raise NotImplementedError( + "build_clmatrix_kernel only implemented for float32 and float64." + ) # Configure grid of blocks blkszx = 32 - nblkx = (self.n_img + blkszx - 2) // blkszx # n_img-1 + # Enough blocks to cover n_img-1 + nblkx = (self.n_img + blkszx - 2) // blkszx blkszy = 32 + # Enough blocks to cover n_img nblky = (self.n_img + blkszy - 1) // blkszy # Launch @@ -351,7 +362,7 @@ def build_clmatrix_cu(self): ( n_img, pf.shape[1], - r_max, + r, pf, clmatrix, len(shift_phases), @@ -362,6 +373,7 @@ def build_clmatrix_cu(self): # Copy result device arrays to host clmatrix = clmatrix.get().astype(self.dtype, copy=False) + # Note diagnostic 1d shifts are not computed in the CUDA implementation. return None, clmatrix def estimate_shifts(self, equations_factor=1, max_memory=4000): @@ -697,7 +709,7 @@ def _apply_filter_and_norm(self, subscripts, pf, r_max, h): def __init_cupy_module(): """ Private utility method to read in CUDA source and return as - compiled CUPY module. + compiled CuPy module. """ import cupy as cp @@ -707,7 +719,10 @@ def __init_cupy_module(): with open(fp, "r") as fh: module_code = fh.read() - # CUPY compile the CUDA code + # CuPy compile the CUDA code + # Note these optimizations are to steer aggresive optimization + # for single precision code. Fast math will potentionally + # reduce accuracy in single precision. return cp.RawModule( code=module_code, backend="nvcc", diff --git a/src/aspire/abinitio/commonline_sync3n.cu b/src/aspire/abinitio/commonline_sync3n.cu index 14e27669b9..65ab91a77d 100644 --- a/src/aspire/abinitio/commonline_sync3n.cu +++ b/src/aspire/abinitio/commonline_sync3n.cu @@ -1,8 +1,8 @@ #include "stdint.h" #include "math.h" -/* from i,j indices to the common index in the N-choose-2 sized array */ -// careful, this is stricly the upper triangle! +/* From i,j indices to the common index in the N-choose-2 sized array */ +/* Careful, this is strictly the upper triangle! */ #define PAIR_IDX(N,I,J) ((2*N-I-1)*I/2 + J-I-1) @@ -459,10 +459,11 @@ void estimate_all_angles1(int j, double* __restrict__ angles) { /* n n_img */ + /* j is image j index */ - /* thread index (1d), represents "i" index */ + /* thread index represents "i" index */ const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; - /* j is image j index */ + /* thread index represents "k" index */ const unsigned int k = blockDim.y * blockIdx.y + threadIdx.y; int cl_diff1, cl_diff2, cl_diff3; @@ -515,9 +516,7 @@ void estimate_all_angles1(int j, /* test `k` values */ if(cl_idx13 == -1) return; /* i, k */ - //if(cl_idx31 == -1) return; // k, i ? if(cl_idx23 == -1) return; /* j, k */ - // if(cl_idx32 == -1) return; // k, j ? /* get cosine angles */ cl_diff1 = cl_idx13 - cl_idx12; @@ -565,38 +564,43 @@ void estimate_all_angles1(int j, } /* end not sync */ /* clip cosine phi between [-1,1] */ - if(cos_phi2 >1){ + if(cos_phi2 > 1){ cos_phi2 = 1; } - if(cos_phi2 <-1){ + if(cos_phi2 < -1){ cos_phi2 = -1; } /* compute histogram contribution, angle mapping, and index mappings. */ angle = acos(cos_phi2) * 180. / M_PI; - /* angle's bin */ + /* index of angle's bin */ map_idx = i*n + k; - k_map[map_idx] = angle / hist_bin_width; // check + /* + For each k, keep track of bin and angles. + Note, this is slightly different than the host + which uses slightly different angle/hist grids (likely an oversight). + */ + k_map[map_idx] = angle / hist_bin_width; angles_map[map_idx] = angle; /* degrees */ for(b=0; b Date: Thu, 10 Oct 2024 15:22:03 -0400 Subject: [PATCH 39/45] use adaptive width mode for sync3n tests --- tests/test_commonline_sync3n.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_commonline_sync3n.py b/tests/test_commonline_sync3n.py index 53fa544ea6..68f1eb84e9 100644 --- a/tests/test_commonline_sync3n.py +++ b/tests/test_commonline_sync3n.py @@ -63,7 +63,7 @@ def source_orientation_objs(resolution, offsets, dtype): shift_step = 0.25 # Reduce shift steps for non-integer offsets of Simulation. orient_est = CLSync3N( - src, max_shift=max_shift, shift_step=shift_step, seed=789, full_width=2 + src, max_shift=max_shift, shift_step=shift_step, seed=789, ) # Estimate rotations once for all tests. From 706c2bdfb6893184ce7946da1d351c7ee31dbe4a Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Thu, 10 Oct 2024 15:41:17 -0400 Subject: [PATCH 40/45] add additional sync3n code paths --- src/aspire/abinitio/commonline_base.py | 2 +- tests/test_commonline_sync3n.py | 33 +++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py index 91049890ad..cda8d51d78 100644 --- a/src/aspire/abinitio/commonline_base.py +++ b/src/aspire/abinitio/commonline_base.py @@ -289,7 +289,7 @@ def build_clmatrix_cu(self): import cupy as cp n_img = self.n_img - r = pf.shape[2] + r = self.pf.shape[2] if self.n_theta % 2 == 1: msg = "n_theta must be even" diff --git a/tests/test_commonline_sync3n.py b/tests/test_commonline_sync3n.py index 68f1eb84e9..815381f303 100644 --- a/tests/test_commonline_sync3n.py +++ b/tests/test_commonline_sync3n.py @@ -63,7 +63,10 @@ def source_orientation_objs(resolution, offsets, dtype): shift_step = 0.25 # Reduce shift steps for non-integer offsets of Simulation. orient_est = CLSync3N( - src, max_shift=max_shift, shift_step=shift_step, seed=789, + src, + max_shift=max_shift, + shift_step=shift_step, + seed=789, ) # Estimate rotations once for all tests. @@ -138,3 +141,31 @@ def test_estimate_rotations(source_orientation_objs): if src.offsets.all() != 0: tol = 4 mean_aligned_angular_distance(orient_est.rotations, src.rotations, degree_tol=tol) + + +@pytest.mark.expensive +def test_weighted_sync3n(source_orientation_objs): + """ + Test alternative Sync3N configuration code paths. + """ + src, _ = source_orientation_objs + + orient_est = CLSync3N( + src, seed=789, S_weighting=True, J_weighting=True, full_width=2, sigma=3.1415 + ) + # Estimate rotations + orient_est.estimate_rotations() + + gt_clmatrix = rots_to_clmatrix(src.rotations, orient_est.n_theta) + + angle_diffs = abs(orient_est.clmatrix - gt_clmatrix) * 360 / orient_est.n_theta + + # Count number of estimates within 5 degrees of ground truth. + within_5 = np.sum((angle_diffs - 360) % 360 < 5) + + # Check that at least 98% of estimates are within 5 degrees. + tol = 0.98 + if src.offsets.all() != 0: + # Set tolerance to 75% when using nonzero offsets. + tol = 0.75 + assert within_5 / angle_diffs.size > tol From f2025c8865f38e0952503f7ef3f50543f7313ccc Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Fri, 11 Oct 2024 15:40:17 -0400 Subject: [PATCH 41/45] must use smaller shift step for unit test size problem --- tests/test_commonline_sync3n.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/test_commonline_sync3n.py b/tests/test_commonline_sync3n.py index 815381f303..8305aa85e9 100644 --- a/tests/test_commonline_sync3n.py +++ b/tests/test_commonline_sync3n.py @@ -150,8 +150,22 @@ def test_weighted_sync3n(source_orientation_objs): """ src, _ = source_orientation_objs + # Search for common lines over less shifts for 0 offsets. + max_shift = 1 / src.L + shift_step = 1 + if src.offsets.all() != 0: + max_shift = 0.20 + shift_step = 0.25 # Reduce shift steps for non-integer offsets of Simulation. + orient_est = CLSync3N( - src, seed=789, S_weighting=True, J_weighting=True, full_width=2, sigma=3.1415 + src, + max_shift=max_shift, + shift_step=shift_step, + seed=789, + S_weighting=True, + J_weighting=True, + full_width=2, + sigma=2.9, ) # Estimate rotations orient_est.estimate_rotations() From 13cc09fe4b1afb29c67bd7a40b12f88c48c37c23 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Wed, 16 Oct 2024 10:33:38 -0400 Subject: [PATCH 42/45] Remove missed debug string change --- src/aspire/abinitio/commonline_base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.py b/src/aspire/abinitio/commonline_base.py index cda8d51d78..3329e117a3 100644 --- a/src/aspire/abinitio/commonline_base.py +++ b/src/aspire/abinitio/commonline_base.py @@ -228,9 +228,7 @@ def build_clmatrix_host(self): # Setup a progress bar _total_pairs_to_test = self.n_img * (self.n_check - 1) // 2 - pbar = tqdm( - desc="Searching over common line pairs (orig)", total=_total_pairs_to_test - ) + pbar = tqdm(desc="Searching over common line pairs", total=_total_pairs_to_test) # Search for common lines between [i, j] pairs of images. # Creating pf and building common lines are different to the Matlab version. From 78030fe1dea3b8cc69e90baafd5e7631c9e4d5b8 Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Wed, 16 Oct 2024 10:35:01 -0400 Subject: [PATCH 43/45] Remove range(0,...) --- src/aspire/abinitio/commonline_sync3n.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py index eb9364aa77..9dd394e00b 100644 --- a/src/aspire/abinitio/commonline_sync3n.py +++ b/src/aspire/abinitio/commonline_sync3n.py @@ -873,7 +873,7 @@ def _estimate_all_Rijs_cu(self, clmatrix): logger.info("Launching `estimate_all_angles` kernel.") toc0 = perf_counter() - for j in range(0, self.n_img): + for j in range(self.n_img): # ------------------------------------------ # Zero histogram and k mapping for each `j`. From 6a99b0589e2deea3c13486a44acbd9557936c6cc Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Fri, 18 Oct 2024 09:22:43 -0400 Subject: [PATCH 44/45] change var name from dist to xcorr --- src/aspire/abinitio/commonline_base.cu | 32 +++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/aspire/abinitio/commonline_base.cu b/src/aspire/abinitio/commonline_base.cu index 26529a03cc..a3ee96ceed 100644 --- a/src/aspire/abinitio/commonline_base.cu +++ b/src/aspire/abinitio/commonline_base.cu @@ -31,13 +31,13 @@ void build_clmatrix_kernel( int s; int cl1, cl2; int best_cl1, best_cl2; - double dist, best_cl_dist; + double xcorr, best_cl_xcorr; double p1, p2; complex pfik, pfjk; best_cl1 = -1; best_cl2 = -1; - best_cl_dist = -INFINITY; + best_cl_xcorr = -INFINITY; for(cl1=0; cl1 best_cl_dist){ - best_cl_dist = dist; + xcorr = p1 - p2; + if(xcorr > best_cl_xcorr){ + best_cl_xcorr = xcorr; best_cl1 = cl1; best_cl2 = cl2; } - dist = p1 + p2; - if(dist > best_cl_dist){ - best_cl_dist = dist; + xcorr = p1 + p2; + if(xcorr > best_cl_xcorr){ + best_cl_xcorr = xcorr; best_cl1 = cl1; best_cl2 = cl2 + m; /* m is pf.shape[1], which should be n_theta//2 */ } @@ -105,13 +105,13 @@ void fbuild_clmatrix_kernel( int s; int cl1, cl2; int best_cl1, best_cl2; - float dist, best_cl_dist; + float xcorr, best_cl_xcorr; float p1, p2; complex pfik, pfjk; best_cl1 = -1; best_cl2 = -1; - best_cl_dist = -INFINITY; + best_cl_xcorr = -INFINITY; for(cl1=0; cl1 best_cl_dist){ - best_cl_dist = dist; + xcorr = p1 - p2; + if(xcorr > best_cl_xcorr){ + best_cl_xcorr = xcorr; best_cl1 = cl1; best_cl2 = cl2; } - dist = p1 + p2; - if(dist > best_cl_dist){ - best_cl_dist = dist; + xcorr = p1 + p2; + if(xcorr > best_cl_xcorr){ + best_cl_xcorr = xcorr; best_cl1 = cl1; best_cl2 = cl2 + m; /* m is pf.shape[1], which should be n_theta//2 */ } From c47b626c508a5777c4dfb7cc1f57f8c617ed993c Mon Sep 17 00:00:00 2001 From: Garrett Wright Date: Fri, 18 Oct 2024 09:26:24 -0400 Subject: [PATCH 45/45] Remove kernel timing --- src/aspire/abinitio/commonline_sync3n.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/aspire/abinitio/commonline_sync3n.py b/src/aspire/abinitio/commonline_sync3n.py index 9dd394e00b..879728436d 100644 --- a/src/aspire/abinitio/commonline_sync3n.py +++ b/src/aspire/abinitio/commonline_sync3n.py @@ -869,10 +869,6 @@ def _estimate_all_Rijs_cu(self, clmatrix): blksz = 1024 nblk = (self.n_img + blksz - 1) // blksz - from time import perf_counter - - logger.info("Launching `estimate_all_angles` kernel.") - toc0 = perf_counter() for j in range(self.n_img): # ------------------------------------------ @@ -918,10 +914,8 @@ def _estimate_all_Rijs_cu(self, clmatrix): ), ) - # Force all kernels to complete before timing + # Force all kernels to complete cp.cuda.runtime.deviceSynchronize() - toc1 = perf_counter() - logger.info(f"estimate_all_angles: {toc1-toc0}s") # Explicitly inform CuPy we longer need these workspace vars del hist @@ -947,9 +941,8 @@ def _estimate_all_Rijs_cu(self, clmatrix): ), ) + # Force all kernels to complete cp.cuda.runtime.deviceSynchronize() - toc2 = perf_counter() - logger.info(f"angles_to_rots: {toc2-toc1}s") # transfer results device to host rotations = rotations.get()