In [1]:
import math
import time
import numpy as np
from numba import cuda, vectorize, jit
from timeit import default_timer as timer

### Check if CUDA is available
If it is not available, there are two likely causes:
- you are not using a GPU-enabled VM
- you are not using the correct Docker image (you should use the `aztk/python:spark2.2.0-python3.6.2-gpu` image)

In [2]:
# Print one of two status messages depending on whether numba's CUDA
# support reports itself as available.
print(
    "CUDA is available!"
    if cuda.is_available()
    else "CUDA is not available. Please select a GPU enabled VM."
)

CUDA is available!


### Create my CPU function

In [3]:
def cpu_pow(a, b):
    """Return ``a`` raised to the power ``b``.

    Uses the same dispatch as the ``**`` operator, so NumPy array operands
    are combined element-wise.
    """
    result = pow(a, b)
    return result

def cpu_work(vec_size):
    """Run one CPU benchmark step on a freshly generated random vector.

    A float32 vector of ``vec_size`` uniform random samples is created and
    passed to ``cpu_pow`` as both the base and the exponent.

    Args:
        vec_size: Number of elements in the random vector. The original
            author's note suggests 100000000 as a good size to show the
            CPU/GPU difference.

    Returns:
        None. The computed array is discarded; the function exists only so
        that its run time can be measured.
    """
    # The same array is deliberately used for both operands (the original
    # code bound `a = b = ...`), so only one random vector is generated.
    values = np.array(np.random.sample(vec_size), dtype=np.float32)

    # No output buffer is preallocated: cpu_pow returns a new array, so a
    # np.zeros(vec_size) buffer would be allocated only to be thrown away.
    cpu_pow(values, values)

### Create the same function, using numba to utilize GPUs

In [4]:
from numba import vectorize

@vectorize(['float32(float32, float32)'], target='cuda')
def gpu_pow(a, b):
    """Scalar ``a ** b`` kernel wrapped by numba's ``vectorize`` for the CUDA target.

    The signature list declares two float32 inputs and a float32 result.
    """
    return a ** b

def gpu_work(vec_size):
    """Run one GPU benchmark step on a freshly generated random vector.

    A float32 vector of ``vec_size`` uniform random samples is created on the
    host and passed to ``gpu_pow`` as both the base and the exponent.

    Args:
        vec_size: Number of elements in the random vector. The original
            author's note suggests 100000000 as a good size to show the
            CPU/GPU difference.

    Returns:
        None. The computed array is discarded; the function exists only so
        that its run time can be measured.
    """
    # The same array is deliberately used for both operands (the original
    # code bound `a = b = ...`), so only one random vector is generated.
    # NOTE(review): the inputs are host NumPy arrays, so the measured time
    # presumably includes host<->device transfers -- confirm against the
    # Numba CUDA ufunc documentation.
    values = np.array(np.random.sample(vec_size), dtype=np.float32)

    # No output buffer is preallocated: gpu_pow returns a new array, so a
    # np.zeros(vec_size) buffer would be allocated only to be thrown away.
    gpu_pow(values, values)

### Compare on single node
On the master node, let's run the `gpu_work` function against the `cpu_work` function. We will see a considerable speedup when using GPUs.

In [6]:
%%timeit
gpu_work(10000000)

170 ms ± 23.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit
cpu_work(10000000)

853 ms ± 1.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Compare across the Spark Cluster
Let's run the same computation, but in parallel across the Spark cluster. Once again, we will see a considerable speedup when using GPUs.

In [8]:
# Build an RDD of 100 identical work items, each one a vector size.
# NOTE(review): `sc` is not defined in this notebook; it is presumably the
# SparkContext injected by the kernel -- confirm.
rdd = sc.parallelize([10000000] * 100)
print("Partitions", rdd.getNumPartitions())

Partitions 12


In [9]:
%%timeit
res = rdd.map(lambda x: gpu_work(x))
res.collect()

2.56 s ± 112 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%timeit
res = rdd.map(lambda x: cpu_work(x))
res.collect()

10.5 s ± 47.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
