In [1]:
'''This code is a prototype to demonstrate the use
of numba for GPU computing. It takes a mock dataset of
track positions and energy deposits and computes the
recombination factor, longitudinal and transverse diffusion,
lifetime and number of electrons.'''

import numpy as np
import cupy as cp
import pandas as pd
import sys
import time
import numba
from numba import cuda
from numba import vectorize
import math

In [84]:
'''Functions to declare constants to calculate recombination.
A function decorated with device=True resides only
in GPU memory and can be called only from other GPU code.'''

@cuda.jit(device=True)
def calc_epsilon(dedx):
    """Device helper returning the two terms used by the recombination formula.

    Args:
        dedx: value that is scaled linearly into the second returned term.

    Returns:
        Tuple ``(alpha, epsilon)`` where ``alpha`` is the constant 0.930 and
        ``epsilon = 0.212 / (1.38 * 0.500) * dedx``.
    """
    rho       = 1.38    # g/cm^3
    field     = 0.500   # NOTE(review): original comment read "V/sm"; presumably kV/cm -- confirm
    box_alpha = 0.930
    box_beta  = 0.212

    # Constant prefactor first, then scale by the input (same evaluation order as before).
    prefactor = box_beta / (rho * field)
    return box_alpha, prefactor * dedx

'''Function to calculate the recombination factor.
Decorating a function with vectorize is a way to
define a numpy ufunc (universal function) that does
element-wise calculation on an array on the GPU.'''

@vectorize(['float64(float64, float64, float64, float64, float64, float64, float64)'], target='cuda')
def get_recomb_factor(x_start,x_end,y_start,y_end,z_start,z_end,de):
    """Element-wise recombination factor for one track segment.

    Args:
        x_start, x_end, y_start, y_end, z_start, z_end: segment end points.
        de: energy deposit of the segment; only its absolute value is used.

    Returns:
        ``log(alpha + epsilon) / epsilon`` with ``(alpha, epsilon)`` taken from
        ``calc_epsilon(|de| * 1e3 / length)``, where ``length`` is the Euclidean
        distance between the two end points.

    Note:
        The computation divides by the segment length and by ``epsilon``
        without checking either for zero.
    """
    span_x = x_end - x_start
    span_y = y_end - y_start
    span_z = z_end - z_start
    length = math.sqrt(span_x ** 2 + span_y ** 2 + span_z ** 2)

    deposit_per_length = abs(de) * 1e3 / length
    alpha, epsilon = calc_epsilon(deposit_per_length)
    return math.log(alpha + epsilon) / epsilon

In [79]:
@cuda.jit(device=True)
def get_diffusion_consts():
    """Device helper bundling the constants read by the lifetime/diffusion kernel.

    Returns:
        Tuple ``(tpc_z, vdrift, time_scale, long_diff, trans_diff)``, in that order.
    """
    plane_z        = 0
    drift_velocity = 0.153812   # cm/us
    # Originally named `_msTous`. NOTE(review): 10e3 is 1.0e4, not a 1e3
    # ms->us factor, and the kernel in this file divides the drift time by it
    # inside exp(), so it presumably acts as a lifetime in us -- confirm.
    time_scale     = 10e3       # us
    diff_long      = 6.2e-6     # cm^2/us
    diff_trans     = 16.3e-6    # cm^2/us
    return plane_z, drift_velocity, time_scale, diff_long, diff_trans

'''Function to calculate lifetime and diffusion coefficients.
Traditional GPU kernel function, executed by the threads of every block
in the grid. The vectorize decorator is not used here so that we can calculate
3 different quantities in the same call and write them out together.'''

@cuda.jit
def get_lifetime_diffusion_coeff(z_start, z_end, dE, lifetime, long_diffusion, trans_diffusion):
    """CUDA kernel filling the lifetime and diffusion outputs for every segment.

    Each thread starts at its absolute grid index and advances by the total
    number of launched threads (grid-stride loop), so any launch configuration
    covers all ``z_start.shape[0]`` elements.

    Args:
        z_start, z_end: 1-D arrays with the z coordinate of each segment's end points.
        dE: accepted for interface compatibility; not read by this kernel.
        lifetime: output array, receives ``exp(-drift_time / time_scale)``.
        long_diffusion: output array, receives ``sqrt(drift_time) * lDiff``.
        trans_diffusion: output array, receives
            ``sqrt(drift_time) * tDiff / vdrift + drift_time``.

    The constants come from ``get_diffusion_consts()``. Nothing is returned;
    results are written into the three output arrays in place.
    """
    start  = cuda.grid(1)      # absolute index of this thread in the 1-D grid
    stride = cuda.gridsize(1)  # total number of threads in the 1-D grid

    _tpcPlaneZ, _vdrift, _msTous, _lDiff, _tDiff = get_diffusion_consts()
    for i in range(start, z_start.shape[0], stride):
        # Segment position along z is the midpoint of its end points. The
        # previous (z_end - z_start)/2 was the half-extent, which made the
        # drift distance independent of where the segment actually sits.
        z = (z_end[i] + z_start[i]) / 2.
        drift_time = abs(z - _tpcPlaneZ) / _vdrift
        sqrt_drift = math.sqrt(drift_time)  # shared by both diffusion terms

        lifetime[i]        = math.exp(-drift_time / _msTous)
        long_diffusion[i]  = sqrt_drift * _lDiff
        # NOTE(review): adding drift_time to a diffusion width looks
        # dimensionally inconsistent; kept as in the original -- confirm intent.
        trans_diffusion[i] = sqrt_drift * _tDiff / _vdrift + drift_time

In [95]:
'''Vectorize to declare a numpy ufunc equivalent for the GPU.
This shows numba's power in declaring custom kernel functions:
numpy.multiply (or its GPU equivalent cupy.multiply)
exists to multiply two arrays, whereas this directly multiplies
3 arrays with a constant in equivalent time.'''

@vectorize(['float64(float64, float64, float64)'], target='cuda')
def get_nElectrons(de,recomb, lifetime):
    """Element-wise product of the three inputs and a fixed conversion factor.

    Args:
        de: energy deposit.
        recomb: recombination factor.
        lifetime: lifetime attenuation factor.

    Returns:
        ``de * recomb * lifetime * 4.237e+04``.
    """
    electrons_per_mev = 4.237e+04  # named `_MeVToElectrons` in the original
    attenuated = de * recomb * lifetime
    return attenuated * electrons_per_mev

In [85]:
'''Function to copy track information to GPU arrays
and declare new GPU arrays for computation.'''

def initialize_gpu_arrays(tracks):
    """Publish the input columns and empty output buffers as module-level GPU arrays.

    Args:
        tracks: table indexable by column name whose columns provide
            ``.to_numpy()``; the columns 'x_start', 'x_end', 'y_start',
            'y_end', 'z_start', 'z_end' and 'dE' are read.

    Side effects:
        Rebinds the globals ``x_start_device`` ... ``dE_device`` to device
        copies of the corresponding columns, and ``recomb_device``,
        ``lifetime_device``, ``long_diffusion_device``,
        ``trans_diffusion_device`` and ``nelec_device`` to fresh device
        arrays shaped like ``x_start_device``.
    """
    global x_start_device, x_end_device
    global y_start_device, y_end_device
    global z_start_device, z_end_device
    global dE_device
    global recomb_device, lifetime_device, nelec_device
    global long_diffusion_device, trans_diffusion_device

    def upload(column):
        # Host column -> numpy array -> device array.
        return cuda.to_device(tracks[column].to_numpy())

    # Inputs, copied in the same order as the columns are listed above.
    x_start_device = upload('x_start')
    x_end_device   = upload('x_end')
    y_start_device = upload('y_start')
    y_end_device   = upload('y_end')
    z_start_device = upload('z_start')
    z_end_device   = upload('z_end')
    dE_device      = upload('dE')

    # Outputs: every buffer mirrors the layout of the first input array.
    template = x_start_device

    def empty_output():
        return cuda.device_array_like(template)

    recomb_device          = empty_output()
    lifetime_device        = empty_output()
    long_diffusion_device  = empty_output()
    trans_diffusion_device = empty_output()
    nelec_device           = empty_output()

In [122]:
def main():
    """Run the full GPU pipeline on 'tracks.txt'.

    Steps:
        1. Read the whitespace-separated track table from 'tracks.txt'.
        2. Upload the inputs and allocate outputs via ``initialize_gpu_arrays``
           (which publishes them as module-level ``*_device`` globals).
        3. Launch the lifetime/diffusion kernel, then the recombination and
           electron-count ufuncs.
        4. Copy every result back to the host and store it as a new column
           of the local ``tracks`` frame.

    Returns:
        None. The augmented frame is local to this function.
    """
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True and parses the file identically.
    tracks = pd.read_csv('tracks.txt', sep=r'\s+')

    initialize_gpu_arrays(tracks)

    threads_per_block = 128
    blocks_per_grid   = 30

    get_lifetime_diffusion_coeff[blocks_per_grid, threads_per_block](
        z_start_device,
        z_end_device,
        dE_device,
        lifetime_device,
        long_diffusion_device,
        trans_diffusion_device)

    # recomb_device and nelec_device are assigned here without a `global`
    # statement, so they are locals of main(); the module-level buffers of the
    # same name created by initialize_gpu_arrays() are left untouched.
    recomb_device = get_recomb_factor(x_start_device,
                                      x_end_device,
                                      y_start_device,
                                      y_end_device,
                                      z_start_device,
                                      z_end_device,
                                      dE_device)

    nelec_device = get_nElectrons(dE_device, recomb_device, lifetime_device)

    tracks['recomb_factor']         = recomb_device.copy_to_host()
    tracks['lifetime']              = lifetime_device.copy_to_host()
    tracks['longitudnal_diffusion'] = long_diffusion_device.copy_to_host()
    tracks['transverse_diffusion']  = trans_diffusion_device.copy_to_host()
    tracks['nElectrons']            = nelec_device.copy_to_host()

In [123]:
main()

In [124]:
%timeit main()

14.6 ms ± 224 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [119]:
%%time
# Same launch configuration as in main(): 30 blocks of 128 threads each.
threads_per_block = 128
blocks_per_grid   = 30
        
# Uses the module-level *_device arrays published by initialize_gpu_arrays(),
# which is called from main(); main() must therefore have run in this kernel
# session before this cell.
# NOTE(review): numba CUDA kernel launches are presumably asynchronous, so the
# timing below may cover only the launch, not the kernel execution -- confirm
# (e.g. by synchronizing before the timer stops).
get_lifetime_diffusion_coeff[blocks_per_grid, threads_per_block](z_start_device, 
                                     z_end_device, 
                                     dE_device, 
                                     lifetime_device, 
                                     long_diffusion_device,
                                     trans_diffusion_device)

CPU times: user 667 µs, sys: 0 ns, total: 667 µs
Wall time: 601 µs


In [120]:
%time nelec_device = get_nElectrons(dE_device, recomb_device, lifetime_device)

CPU times: user 3 µs, sys: 3.77 ms, total: 3.77 ms
Wall time: 3.14 ms


In [121]:
%%time 
# Times the recombination ufunc on its own. Inputs are the module-level
# *_device arrays published by initialize_gpu_arrays() (called from main()).
# Because this runs at cell level, it rebinds the module-level recomb_device
# to the ufunc's result.
recomb_device = get_recomb_factor(x_start_device,
                                         x_end_device,
                                         y_start_device,
                                         y_end_device,
                                         z_start_device,
                                         z_end_device,
                                         dE_device)

CPU times: user 5.25 ms, sys: 74 µs, total: 5.32 ms
Wall time: 4.57 ms
