In [1]:
# use an nvidia shell command to check the gpu
! nvidia-smi

Tue Apr 18 09:27:50 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# N8 Python GPU Workshop April 2023



## Our first GPU Kernel

In [2]:
# vector_add that adds two input vectors together and stores the sum in another vector

def vector_add(A, B, C, size):
  for item in range(0, size):
    C[item] = A[item] + B[item]

In [5]:
import numpy as np

size = 1024

a_cpu = np.random.rand(size)
b_cpu = np.random.rand(size)
c_cpu = np.zeros(size)

vector_add(a_cpu, b_cpu, c_cpu, size)

In [6]:
print(c_cpu)

[1.35561226 0.5990934  1.62263847 ... 1.60657024 0.56377069 1.67242285]


In [21]:
# converting the above code into a CUDA kernel

import cupy 

# CUDA kernel

vector_add_cuda_code = r'''
extern "C"
__global__ void vector_add(const float * A, const float * B, float * C, const int size)
{
  int item = threadIdx.x;
  C[item] = A[item] + B[item];
}
'''

In [22]:
# compile the kernel into a function
# we can run, through CuPy

vector_add_gpu = cupy.RawKernel(vector_add_cuda_code, "vector_add")

In [23]:
# run the GPU version of vector_add

size = 1024

# create cupy arrays from numpy arrays
a_gpu = cupy.asarray(a_cpu, dtype=cupy.float32)
b_gpu = cupy.asarray(b_cpu, dtype=cupy.float32)
c_gpu = cupy.zeros(size, dtype=cupy.float32)

In [24]:
# run our custom kernel on the GPU

# specified 3 tuples, grid configuration, block configuration, kernel arguments
vector_add_gpu((1, 1, 1), (size, 1, 1), (a_gpu, b_gpu, c_gpu, size))

In [25]:
if np.allclose(c_cpu, c_gpu):
  print("Correct results!")

Correct results!


In [26]:
# using data larger than 1024

size = 2048

a_gpu = cupy.random.rand(size, dtype=cupy.float32)
b_gpu = cupy.random.rand(size, dtype=cupy.float32)
c_gpu = cupy.zeros(size, dtype=cupy.float32)

In [27]:
# this errors because we've requested more than 1024 threads per block
# in the second tuple argument
vector_add_gpu((1,1,1), (size, 1, 1), (a_gpu, b_gpu, c_gpu, size))

CUDADriverError: ignored

In [28]:
# can we get around the block size limit
# by request 2 blocks with half the size as number of threads

vector_add_gpu((2,1,1), (size // 2, 1, 1), (a_gpu, b_gpu, c_gpu, size))

In [29]:
# test if this has worked as expected

a_cpu = cupy.asnumpy(a_gpu)
b_cpu = cupy.asnumpy(b_gpu)
c_cpu = np.zeros(size, dtype=np.float32)

# call the python version of vector add
vector_add(a_cpu, b_cpu, c_cpu, size)

In [30]:
if np.allclose(c_cpu, c_gpu):
  print("Right results!")
else:
  print("Wrong results!")

Wrong results!


In [31]:
print(c_gpu)

[0.6206742  0.32335612 1.0443004  ... 0.         0.         0.        ]


In [32]:
print(c_cpu)

[0.6206742  0.32335612 1.0443004  ... 0.16475175 0.7256685  0.9482078 ]


In [33]:
# CUDA kernel that accounts for multiple blocks

# we need to use additional CUDA variables
# to help calculate the right index for each thread in each block


vector_add_cuda_code = r'''
extern "C"
__global__ void vector_add(const float * A, const float * B, float * C, const int size)
{
  int item = (blockIdx.x * blockDim.x) + threadIdx.x;
  C[item] = A[item] + B[item];
}
'''

In [34]:
vector_add_gpu = cupy.RawKernel(vector_add_cuda_code, "vector_add")

vector_add_gpu((2, 1, 1), (size // 2, 1, 1), (a_gpu, b_gpu, c_gpu, size))

In [35]:
if np.allclose(c_cpu, c_gpu):
  print("All correct!")
else:
  print("Wrong results!")

All correct!


In [36]:
# updating the kernel to handle arbitrary values

vector_add_cuda_code = r'''
extern "C"
__global__ void vector_add(const float * A, const float * B, float * C, const int size)
{
  int item = (blockIdx.x * blockDim.x) + threadIdx.x;
  if ( item < size )
  {
    C[item] = A[item] + B[item];
  }
}
'''

In [37]:
# using data of an arbitrary size

size = 10_000

a_gpu = cupy.random.rand(size, dtype=cupy.float32)
b_gpu = cupy.random.rand(size, dtype=cupy.float32)
c_gpu = cupy.zeros(size, dtype=cupy.float32)

In [38]:
vector_add_gpu = cupy.RawKernel(vector_add_cuda_code, "vector_add")

vector_add_gpu((2, 1, 1), (size // 2, 1, 1), (a_gpu, b_gpu, c_gpu, size))

CUDADriverError: ignored

In [39]:
# calculating the required number of blocks
# in our CUDA grid to run data of arbitrary size

import math 

# specify that our blocks should always have
# 1024 threads in the x dimension
block_size = (1024, 1, 1)

grid_size = (int(math.ceil(size / 1024)), 1, 1)

In [40]:
print(grid_size)

(10, 1, 1)


In [41]:
# now we can run our vector_add code on data of an arbitrary size
# using Python logic to help calculate the CUDA grid
vector_add_gpu(grid_size, block_size, (a_gpu, b_gpu, c_gpu, size))

In [42]:
a_cpu = cupy.asnumpy(a_gpu)
b_cpu = cupy.asnumpy(b_gpu)
c_cpu = np.zeros(size, dtype=np.float32)

vector_add(a_cpu, b_cpu, c_cpu, size)

In [43]:
# check if our results match

if np.allclose(c_cpu, c_gpu):
  print("Correct results!")
else:
  print("Oh no!")

Correct results!
