In [1]:
from numba import jit, njit, vectorize, cuda
import numpy as np
import cupy as cp
from timeit import default_timer as timer

In [2]:
dev = cuda.gpus
len(dev)

1

In [3]:
N = 500000000

In [3]:
# Function to multiply two array elements and also update the results on the second array

@jit(nopython=True)
def multiply(p_cpu, q_cpu):
    for i in range(N):
        q_cpu[i] = p_cpu[i]*q_cpu[i]

def main():
    p = np.zeros(N, dtype=np.double)
    q = np.zeros(N, dtype=np.double)
    # Fill the arrays
    p.fill(23.0)
    q.fill(12.0)
    # Time the function
    begin = timer()
    multiply(p,q)
    numpy_cpu_time = timer() - begin
    # Report computation time
    print("Multiply function took %s time" % numpy_cpu_time)

    return 0

if __name__=="__main__":
    main()

Multiply function took 0.42176981503143907 time


In [4]:
# The @vectorize decorator turns the function into a GPU-based vectorized function

@vectorize(["double(double, double)"], target='cuda')
def vector_multiply_gpu(a, b):
    return a*b

def main():
    p = np.zeros(N, dtype=np.double)
    q = np.zeros(N, dtype=np.double)
    # Fill the arrays
    p.fill(23.0)
    q.fill(12.0)
    # Time the function
    begin = timer()
    vector_multiply_gpu(p, q)
    vector_multiply_gpu_time = timer() - begin
    # Report computation time
    print("Multiply function took %s time" % vector_multiply_gpu_time)

    return 0

if __name__=="__main__":
    main()



Multiply function took 1.1493102160748094 time


In [4]:
# Interoperability between Cupy and numba

@cuda.jit
def multiply(p,q):
    # Thread id in a 1D block
    tx = cuda.threadIdx.x
    # Block id in a 1D grid
    ty = cuda.blockIdx.x
    # Number  of threads per block
    bw = cuda.blockDim.x
    index = tx + ty*bw
    if index < N: # Check array size limit
        q[index] = p[index]*q[index]

def main():
    a_source = cp.zeros(N, dtype=cp.double)
    b_source = cp.zeros(N, dtype=cp.double)
    a_source.fill(23)
    b_source.fill(23)

    threadsperblock = 1024
    blockspergrid = (N + (threadsperblock - 1)) // threadsperblock

    # Time the GPU function
    begin = timer()
    multiply[blockspergrid, threadsperblock](a_source, b_source)
    vector_multiply_gpu_time = timer() - begin
    print("GPU function took %f seconds." % vector_multiply_gpu_time)

if __name__ == "__main__":
    main()

GPU function took 0.186319 seconds.
