In [5]:
# Serial reduction (Dot product)
import numpy as np
from numba import cuda

N = 2**16

def dot(u, v):
    """Return the dot product of ``u`` and ``v`` using a plain serial loop.

    Args:
        u: Indexable vector exposing ``shape``; its first dimension sets
            the number of terms summed.
        v: Indexable vector read at the same positions as ``u``.

    Returns:
        The running sum of ``u[k] * v[k]`` over ``k`` in
        ``range(u.shape[0])`` (``0`` when ``u`` is empty).
    """
    total = 0
    length = u.shape[0]
    k = 0
    # Walk the vectors front to back, accumulating one product per step.
    while k < length:
        total += u[k] * v[k]
        k += 1
    return total

def main():
    """Print the serial dot product of two length-``N`` float32 vectors of ones."""
    u = np.ones(N, dtype=np.float32)
    v = np.ones(N, dtype=np.float32)

    # Reuse dot() instead of repeating its accumulation loop inline, so the
    # serial reference implementation lives in exactly one place.
    accum = dot(u, v)
    print("Serial result: ", accum)

if __name__ == '__main__':
    main()

Serial result:  65536.0


In [21]:
# Reductions
# Issues with accessing global memory during parallel computations
# Must control the order of execution
# Use "atomic operations" to eliminate race conditions: each "read-increment-write" completes as one indivisible step, so no other thread can read or modify the value partway through
# Other atomic operations
# cuda.atomic.max() and cuda.atomic.min()

import os
os.environ['NUMBA_ENABLE_CUDASIM'] = '1'

import numpy as np
from numba import cuda , float32

N = 2**16
TPB = 32

@cuda.jit
def dot_kernel( d_accum , d_u , d_v):
    """Kernel: each in-range thread atomically adds one product to ``d_accum[0]``.

    Args:
        d_accum: Array whose element 0 receives the accumulated sum.
        d_u: First input vector; its length bounds the valid thread indices.
        d_v: Second input vector, read at the same index as ``d_u``.
    """
    tid = cuda.grid(1)

    # Threads whose global index falls past the end of the data do nothing.
    if tid < d_u.shape[0]:
        # Every contributing thread targets the same slot, so the add has to
        # be atomic. The original author's note: this ends up slower than the
        # serial version.
        cuda.atomic.add(d_accum, 0, d_u[tid] * d_v[tid])

def dot(u, v):
    """Compute the dot product of ``u`` and ``v`` by launching ``dot_kernel``.

    Copies both vectors and a one-element float32 zero accumulator to the
    device, launches enough blocks of ``TPB`` threads to cover every element
    of ``u``, and returns element 0 of the accumulator copied back to the host.
    """
    length = u.shape[0]
    host_total = np.zeros(1, dtype=np.float32)

    dev_u = cuda.to_device(u)
    dev_v = cuda.to_device(v)
    dev_total = cuda.to_device(host_total)

    # Ceiling division: the last block may be only partially filled.
    threads_per_block = TPB
    n_blocks = (length + threads_per_block - 1) // threads_per_block

    dot_kernel[n_blocks, threads_per_block](dev_total, dev_u, dev_v)

    result = dev_total.copy_to_host()
    return result[0]

def main():
    """Run the atomic-add dot product eight times on vectors of ones, printing each result."""
    ones_a = np.ones(N, dtype=np.float32)
    ones_b = np.ones(N, dtype=np.float32)

    # Eight independent runs of the same computation, one printed line each.
    for _ in range(8):
        result = dot(ones_a, ones_b)
        print("Naive parallel result: ", result)

if __name__ == '__main__':
    main()

TypeError: __init__() got an unexpected keyword argument 'debug'

In [None]:
# Better to create shared memory arrays for parallel reductions
# Split reduction into blocks that can be summed together at the end

import os
os.environ['NUMBA_ENABLE_CUDASIM'] = '1'

import numpy as np
from numba import cuda , float32

N = 2**16
TPB = 512

@cuda.jit('void(f4[:],f4[:],f4[:])')
def dot_kernel(d_accum, d_u, d_v):
    """Kernel: per-block partial dot product, combined with one atomic add per block.

    Each thread writes its element product into a block-shared array of
    ``TPB`` entries. After a barrier, thread 0 of the block sums the shared
    array and atomically adds that block total to ``d_accum[0]``.

    Args:
        d_accum: float32 array; element 0 receives the sum of all block totals.
        d_u: float32 input vector; its length bounds the valid global indices.
        d_v: float32 input vector, read at the same index as ``d_u``.
    """
    i = cuda.grid(1)
    tIdx = cuda.threadIdx.x
    n = d_u.shape[0]
    sh_w = cuda.shared.array(shape = TPB, dtype = float32)  # one slot per thread in the block
    # Zero the slot first so out-of-range threads in the last block contribute nothing.
    sh_w[tIdx] = 0

    if i < n:  # bounds check
        sh_w[tIdx] = d_u[i]*d_v[i]  # store this thread's element product
    cuda.syncthreads()  # all products must be stored before any are summed

    if tIdx == 0:  # thread 0 computes the block sum
        block_sum = 0
        for j in range(cuda.blockDim.x):
            # Fix: index by j. The original read sh_w[tIdx] (always sh_w[0]
            # here), giving blockDim.x * sh_w[0] -- correct only when every
            # product in the block is identical, as in the all-ones demo.
            block_sum += sh_w[j]
        cuda.atomic.add(d_accum, 0, block_sum)

def dot(u, v):
    """Compute the dot product of ``u`` and ``v`` with the shared-memory kernel.

    Copies the inputs and a one-element float32 zero accumulator to the
    device, launches ``ceil(u.size / TPB)`` blocks of ``TPB`` threads, and
    returns element 0 of the accumulator copied back to the host.
    """
    dev_u = cuda.to_device(u)
    dev_v = cuda.to_device(v)

    # Enough blocks to cover every element; the last one may be partly idle.
    n_blocks = int(np.ceil(u.size / TPB))
    threads_per_block = TPB

    dev_total = cuda.to_device(np.zeros(1, dtype=np.float32))

    dot_kernel[n_blocks, threads_per_block](dev_total, dev_u, dev_v)

    host_total = dev_total.copy_to_host()
    return host_total[0]

def main():
    """Run the shared-memory dot product eight times on vectors of ones, printing each result."""
    u = np.ones(N, dtype=np.float32)
    v = np.ones(N, dtype=np.float32)

    for _ in range(8):
        accum = dot(u, v)
        # Label corrected: this cell uses the shared-memory block reduction;
        # the previous "Naive parallel result" text was copied from the
        # atomic-only cell and mislabelled the output.
        print("Shared-memory parallel result: ", accum)

if __name__ == '__main__':
    main()

# A more complex and efficient version is available in the ch7_atomic reductions

In [None]:
# numba built in reduction
# @cuda.reduce
# This is a sum reduction

import os
os.environ['NUMBA_ENABLE_CUDASIM'] = '1'

import numpy as np
from numba import cuda , float32

N = 2**16
TPB = 1024

@cuda.reduce # decorator that turns this binary function into a reduction
def reduction (u, v):
    """Binary combiner for the reduction: the sum of the two operands.

    Returning ``max(u, v)`` instead would make this a max reduction.
    """
    combined = u + v
    return combined

@cuda.jit('void (f4 [:] , f4 [:] , f4 [:])')
def dot_kernel(d_w , d_u , d_v):
    """Kernel: write the element-wise product of ``d_u`` and ``d_v`` into ``d_w``.

    Args:
        d_w: float32 output array; entry ``k`` receives ``d_u[k] * d_v[k]``.
        d_u: float32 input vector; its length bounds the valid thread indices.
        d_v: float32 input vector, read at the same index as ``d_u``.
    """
    k = cuda.grid(1)

    # Threads whose global index falls past the end of the data do nothing.
    if k < d_u.shape [0]:
        d_w[k] = d_u[k]*d_v[k]

def dot(u, v):
    """Compute the dot product of ``u`` and ``v`` via ``dot_kernel`` plus ``reduction``.

    The kernel fills a device array with the element-wise products; that
    array is then handed to the ``@cuda.reduce``-decorated ``reduction``,
    whose result is returned unchanged.
    """
    length = u.shape [0]

    dev_u = cuda.to_device(u)
    dev_v = cuda.to_device(v)
    dev_products = cuda.device_array(length, dtype = np.float32)

    # Enough blocks of TPB threads to give every element its own thread.
    n_blocks = int(np.ceil(length / TPB))
    threads_per_block = TPB
    dot_kernel[n_blocks, threads_per_block](dev_products, dev_u, dev_v)

    # The reduction is invoked with the array to be reduced, not with the
    # two scalar operands its definition declares.
    return reduction(dev_products)

def main():
    """Run the ``@cuda.reduce``-based dot product eight times on vectors of ones, printing each result."""
    u = np.ones(N, dtype=np.float32)
    v = np.ones(N, dtype=np.float32)

    for _ in range(8):
        accum = dot(u, v)
        # Label corrected: this cell uses the built-in cuda.reduce path; the
        # previous "Naive parallel result" text was copied from the
        # atomic-only cell and mislabelled the output.
        print("cuda.reduce result: ", accum)

if __name__ == '__main__':
    main()
