# CUDA Kernels

This notebook uses CuPy to compile and launch a CUDA C kernel that adds two vectors element-wise, directly from the Python runtime.

In [1]:
# Prerequisite: enable a T4 GPU for this runtime (Runtime > Change runtime type).
# Then run nvidia-smi to confirm a GPU is visible and to see the driver and
# CUDA versions before any kernel is compiled.
!nvidia-smi

Thu Apr 17 02:06:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   44C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# CUDA Kernel

In [2]:
# CUDA C source for an element-wise vector addition kernel, stored as a raw
# string so the text reaches the compiler exactly as written.
#   x, y : input arrays of float (read-only)
#   out  : destination array of float; out[i] = x[i] + y[i]
#   N    : number of elements to process
# Each thread handles the single element at global index
# blockIdx.x * blockDim.x + threadIdx.x. The `i < N` guard makes threads past
# the end of the arrays do nothing, so the grid may be rounded up to a whole
# number of blocks.
# extern "C" disables C++ name mangling so the symbol is exactly `vector_add`.
kernel_code = r'''
extern "C" __global__
void vector_add(const float* x, const float* y, float* out, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        out[i] = x[i] + y[i];
    }
}
'''

# Main

In [3]:
import cupy as cp

# Wrap the CUDA source string in a CuPy RawModule, then look up the kernel by
# name. The name passed to get_function must match the extern "C" symbol
# declared in kernel_code ('vector_add').
module = cp.RawModule(code=kernel_code)
vector_add_kernel = module.get_function('vector_add')

In [4]:
# Problem size: 10 * 1024 = 10,240 single-precision elements.
N = 10 * 1024

# Two random float32 input vectors created through CuPy (x is drawn first,
# then y), plus an uninitialised output buffer with x's shape and dtype.
x, y = (cp.random.rand(N, dtype=cp.float32) for _ in range(2))
out = cp.empty_like(x)

In [5]:
# 256 threads per block; the grid is ceil(N / threads_per_block) blocks so that
# every element gets a thread even when N is not a multiple of the block size.
threads_per_block = 256
blocks_per_grid = -(-N // threads_per_block)  # ceiling division on integers

In [6]:
# Launch the kernel with a 1-D grid of `blocks_per_grid` blocks and
# `threads_per_block` threads per block; the last tuple holds the kernel
# arguments in the order of the CUDA signature (x, y, out, N).
#
# CuPy does not validate or convert raw-kernel arguments, and a bare Python
# int is passed as a 64-bit `long long`. The kernel declares `int N`, so pass
# an explicit 32-bit scalar to match the signature.
vector_add_kernel((blocks_per_grid,), (threads_per_block,), (x, y, out, cp.int32(N)))

In [8]:
# Verify the custom kernel against CuPy's built-in element-wise addition
# before displaying anything, so a wrong result fails loudly instead of
# relying on eyeballing a few printed values.
cp.testing.assert_allclose(out, x + y)

# Show the (abbreviated) inputs and the result; each entry of `out` should be
# the sum of the matching entries of `x` and `y`.
print(x)
print(y)
print(out)

[0.8099163  0.05843949 0.76927894 ... 0.4224384  0.34039268 0.16255459]
[0.8101021  0.15584847 0.60862285 ... 0.4157467  0.96168965 0.00377935]
[1.6200185  0.21428797 1.3779018  ... 0.8381851  1.3020823  0.16633394]
