In [5]:
import numpy as np
import cupy as cp
import sys
import time
from numba import cuda
from numba import vectorize
import math

In [6]:
@cuda.jit(device=True)
def calc_epsilon(dedx):
    """Return the pair ``(alpha, epsilon)`` for a single ``dedx`` value.

    ``epsilon`` is ``beta / (density * efield) * dedx`` evaluated with the
    constants hard-coded below.  ``alpha`` is handed back unchanged so the
    caller receives both terms from one call.
    """
    rho = 1.38      # density, g/cm^3
    a = 0.847       # returned as-is (first element of the result)
    b = 0.2061
    # Electric field.  The original note gave the unit as "V/sm", which looks
    # like a typo -- TODO confirm the intended unit (presumably per-cm).
    field = 0.500

    # Same evaluation order as before: divide first, then scale by dedx.
    scale = b / (rho * field)
    return a, scale * dedx

@vectorize(['float64(float64)'], target='cuda')
def modified_box(dedx):
    """Compute ``log(alpha + epsilon) / epsilon`` for one ``dedx`` value.

    ``alpha`` and ``epsilon`` are obtained from ``calc_epsilon(dedx)``.  The
    decorator registers this scalar function as a ``float64 -> float64``
    ufunc with the ``cuda`` target.
    """
    a, eps = calc_epsilon(dedx)
    numerator = math.log(a + eps)
    return numerator / eps

In [20]:
%%time
npts = int(1e8)
dedx = np.random.random(npts) * 0.5 + 2.
result = np.empty_like(dedx)
np_dedx_device = cuda.to_device(dedx)
np_result_device = cuda.device_array_like(np_dedx)

CPU times: user 881 ms, sys: 144 ms, total: 1.03 s
Wall time: 1.02 s


In [22]:
%%time
npts = int(1e8)
cp_dedx_device = cp.random.random(npts) * 0.5 + 2.
cp_result_device = cp.zeros_like(cp_dedx_device)

CPU times: user 0 ns, sys: 6.79 ms, total: 6.79 ms
Wall time: 6.09 ms


In [23]:
# Baseline: hand the host NumPy array `dedx` straight to the CUDA ufunc.
# NOTE(review): presumably this timing includes host<->device copies -- confirm.
%timeit result = modified_box(dedx)

463 ms ± 35.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
# Same ufunc, but reading from and writing into the Numba device arrays
# (`np_dedx_device` -> `np_result_device`) via the `out=` argument.
# NOTE(review): no explicit synchronize here; confirm the timing covers kernel completion.
%timeit modified_box(np_dedx_device, out=np_result_device)

1.94 ms ± 213 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [25]:
# Same ufunc again, this time fed the CuPy arrays
# (`cp_dedx_device` -> `cp_result_device`) via the `out=` argument.
# NOTE(review): no explicit synchronize here; confirm the timing covers kernel completion.
%timeit modified_box(cp_dedx_device, out=cp_result_device)

1.99 ms ± 89.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
