Skip to content

Commit c20c236

Browse files
committed
use cu complex for CL kernel
1 parent 42c0e2e commit c20c236

File tree

2 files changed

+13
-25
lines changed

2 files changed

+13
-25
lines changed

src/aspire/abinitio/commonline_base.cu

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1+
#include <cupy/complex.cuh>
12

23
extern "C" __global__
3-
void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, double* cl_dist, double* shifts_1d, int n_shifts, int* shifts, double* shift_phases)
4+
void build_clmatrix_kernel(int n, int m, int r, const complex<double>* __restrict__ pf, double* __restrict__ clmatrix, double* __restrict__ cl_dist, double* __restrict__ shifts_1d, int n_shifts, int* __restrict__ shifts, const complex<double>* __restrict__ shift_phases)
45
{
56
/* n n_img */
67
/* m,r st (n, m, r) = pf.shape, ie len(pf[i]) */
@@ -22,12 +23,7 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do
2223
int best_cl1, best_cl2, best_s;
2324
double dist, best_cl_dist;
2425
double p1, p2;
25-
26-
double p1_realk, p1_imagk;
27-
double p2conj_realk, p2conj_imagk;
28-
double p2_realk, p2_imagk;
29-
double shift_phases_real, shift_phases_imag;
30-
26+
complex<double> pfik, pfjk;
3127

3228
best_s = -99999;
3329
best_cl1 = -1;
@@ -41,18 +37,10 @@ void build_clmatrix_kernel(int n, int m, int r, double* pf, double* clmatrix, do
4137
p2 = 0;
4238
/* inner most dim of dot (matmul) */
4339
for(k=0; k<r; k++){
44-
/* factor */
45-
p1_realk = pf[2*(k*m*n + cl1*n + i)];
46-
p1_imagk = pf[2*(k*m*n + cl1*n + i) + 1];
47-
p2conj_realk = pf[2*(k*m*n + cl2*n + j)];
48-
p2conj_imagk = -pf[2*(k*m*n + cl2*n + j) + 1];
49-
shift_phases_real = shift_phases[2*(s*r + k)];
50-
shift_phases_imag = shift_phases[2*(s*r + k) +1];
51-
p2_realk = (p2conj_realk * shift_phases_real) - (p2conj_imagk*shift_phases_imag);
52-
p2_imagk = (p2conj_imagk * shift_phases_real) + (p2conj_realk*shift_phases_imag);
53-
p1 += p1_realk * p2_realk;
54-
p2 += p1_imagk * p2_imagk;
55-
40+
pfik = pf[k*m*n + cl1*n + i];
41+
pfjk = conj(pf[k*m*n + cl2*n + j]) * shift_phases[s*r + k];
42+
p1 += real(pfik) * real(pfjk);
43+
p2 += imag(pfik) * imag(pfjk);
5644
}
5745

5846
dist = p1 - p2;

src/aspire/abinitio/commonline_base.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -279,8 +279,6 @@ def build_clmatrix_cu(self):
279279
logger.error(msg)
280280
raise NotImplementedError(msg)
281281

282-
n_theta_half = self.n_theta // 2 # rm
283-
284282
# need to do a copy to prevent modifying self.pf for other functions
285283
# also places on GPU
286284
pf = cp.array(self.pf)
@@ -320,6 +318,8 @@ def build_clmatrix_cu(self):
320318
# Apply bandpass filter, normalize each ray of each image
321319
# Note that only use half of each ray
322320
pf = self._apply_filter_and_norm("ijk, k -> ijk", pf, r_max, h)
321+
# Tranpose `pf` for better (CUDA) memory access pattern, and cast as needed.
322+
pf = cp.ascontiguousarray(pf.T, dtype=np.complex128)
323323

324324
# Get kernel
325325
build_clmatrix_kernel = self.__gpu_module.get_function("build_clmatrix_kernel")
@@ -336,16 +336,16 @@ def build_clmatrix_cu(self):
336336
(nblkx, nblky),
337337
(blkszx, blkszy),
338338
(
339-
pf.shape[0],
339+
n_img,
340340
pf.shape[1],
341-
pf.shape[2],
342-
cp.ascontiguousarray(pf.T, dtype=np.complex128).view(np.float64),
341+
r_max,
342+
pf,
343343
clmatrix,
344344
cl_dist,
345345
shifts_1d,
346346
len(shifts),
347347
shifts,
348-
shift_phases.astype(np.complex128, copy=False).view(np.float64),
348+
shift_phases.astype(np.complex128, copy=False),
349349
),
350350
)
351351

0 commit comments

Comments
 (0)