# CUDA Kernels

We will use CuPy to execute CudaC Kernel from the Python runtime

In [1]:
#enable T4 gpu in Runtime > Change Runtime Type
!nvidia-smi

Wed Apr 16 13:29:25 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   61C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## CUDA Kernel

In [13]:
min_kernel_code = r'''
extern "C" __global__
void reduce_min(const float* arr, float* result, int N) {
    __shared__ float sdata[256];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = (i < N) ? arr[i] : 1e20f;  // use large number as "max"
    __syncthreads();

    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s)
            sdata[tid] = fminf(sdata[tid], sdata[tid + s]);
        __syncthreads();
    }

    if (tid == 0 && blockIdx.x == 0)
        result[0] = sdata[0];
}
'''

In [18]:
max_kernel_code = r'''
extern "C" __global__
void reduce_max(const float* arr, float* result, int N) {
    __shared__ float sdata[256];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = (i < N) ? arr[i] : -1e20f;  // use small number as "min"
    __syncthreads();

    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s)
            sdata[tid] = fmaxf(sdata[tid], sdata[tid + s]);
        __syncthreads();
    }

    if (tid == 0 && blockIdx.x == 0)
        result[0] = sdata[0];
}
'''

## Main

In [29]:
import cupy as cp

N = 1024
threads = 256
blocks = (N + threads - 1) // threads

arr = cp.arange(1, N + 1, dtype=cp.float32)
cp.random.shuffle(arr)

# Result buffers
d_min = cp.zeros(1, dtype=cp.float32)
d_max = cp.zeros(1, dtype=cp.float32)
d_mean = cp.zeros(1, dtype=cp.float32)

In [30]:
min_kernel = cp.RawKernel(min_kernel_code, 'reduce_min')
max_kernel = cp.RawKernel(max_kernel_code, 'reduce_max')

In [33]:
%%time
min_kernel((blocks,), (threads,), (arr, d_min, N))
max_kernel((blocks,), (threads,), (arr, d_max, N))

CPU times: user 714 µs, sys: 0 ns, total: 714 µs
Wall time: 634 µs


In [34]:
print(arr)
print("Min:", float(d_min[0]))
print("Max:", float(d_max[0]))

[384. 666. 784. ... 276. 586. 974.]
Min: 1.0
Max: 1023.0
