# 5.8. Writing massively parallel code for NVIDIA graphics cards (GPUs) with CUDA

In [None]:
import math
import numpy as np
from numba import cuda
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Number of CUDA devices that Numba detects on this machine.
len(cuda.gpus)

In [None]:
# Name of the first detected device (index 0).
cuda.gpus[0].name

In [None]:
@cuda.jit
def mandelbrot_numba(m, iterations):
    """CUDA kernel: fill `m` with Mandelbrot escape-iteration counts.

    Each thread handles one element `m[i, j]`. Column `j` maps to the real
    part of `c` (starting at -2, spanning 3 units across the width) and row
    `i` maps to the imaginary part (starting at 1.5, spanning 3 units down
    the height). The stored value is the index of the last iteration at
    which `abs(z) <= 10` still held.

    Parameters
    ----------
    m : 2D array
        Output array, written in place. It may be rectangular; each axis
        is scaled by its own length.
    iterations : int
        Maximum number of iterations. If it is not positive, `m` is left
        untouched.
    """
    # Matrix index.
    i, j = cuda.grid(2)
    height = m.shape[0]
    width = m.shape[1]
    # Skip threads outside the matrix (the grid may overshoot the array).
    if i >= height or j >= width:
        return
    # Point of the complex plane associated with this element.
    c = (-2 + 3. / width * j +
         1j * (1.5 - 3. / height * i))
    # Start from a complex zero so `z` keeps a single type in the loop.
    z = 0j
    # Track the count in a local variable and store it once at the end,
    # instead of writing to the output array on every iteration.
    last = -1
    for n in range(iterations):
        if abs(z) <= 10:
            z = z * z + c
            last = n
        else:
            break
    if last >= 0:
        m[i, j] = last

In [None]:
# Side length of the square output array (rows and columns).
size = 400
# Upper bound on the number of iterations run per element by the kernel.
iterations = 100

In [None]:
# Host-side output array, initialized to zeros; the kernel fills it in place.
m = np.zeros((size, size))

In [None]:
# Threads per block along each axis (square bs x bs blocks).
bs = 16
# Blocks per grid along each axis: integer ceiling of size / bs, so that
# the grid covers the whole array even when size is not a multiple of bs.
bpg = (size + bs - 1) // bs
# Bind the launch configuration (grid dimensions, block dimensions)
# to the kernel; the result is called like a regular function.
f = mandelbrot_numba[(bpg,) * 2, (bs,) * 2]

In [None]:
# Launch the kernel on the host array `m`; the result is written into `m`.
f(m, iterations)

In [None]:
# Display the logarithm of the iteration counts as a heat map, axes hidden.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(np.log(m), cmap="hot")
ax.set_axis_off()

In [None]:
# Time the kernel call when it is given the host (NumPy) array `m`.
%timeit -n10 -r100 f(m, iterations)

In [None]:
# Time only the copy of `m` from the host to the device.
%timeit -n10 -r100 cuda.to_device(m)

In [None]:
%%timeit -n10 -r100 m_gpu = cuda.to_device(m)
f(m_gpu, iterations)

In [None]:
# Copy `m` to the device and keep a handle to the device array.
m_gpu = cuda.to_device(m)

In [None]:
# Time only the copy of the device array back to the host.
%timeit -n10 -r100 m_gpu.copy_to_host()

```
# Thread id in a 1D block
tx = cuda.threadIdx.x
# Block id in a 1D grid
ty = cuda.blockIdx.x
# Block width, i.e. number of threads per block
bw = cuda.blockDim.x
# Compute flattened index inside the array
pos = tx + ty * bw
if pos < an_array.size:  # Check array boundaries
    # One can access `an_array[pos]`
```