In [1]:
# Install the pinned numba-cuda 0.4.0 release into the system Python
# environment (--system), quietly (-q), forcing a reinstall even if some
# version is already present (--force-reinstall).
!uv pip install -q --system numba-cuda==0.4.0 --force-reinstall

In [2]:
from numba import config
# Turn on Numba's pynvjitlink option (going by the flag name, this selects
# pynvjitlink for linking CUDA code). It is set in this cell, before
# `numba.cuda` is imported in a later cell.
# NOTE(review): confirm the flag is still required with numba-cuda 0.4.0.
config.CUDA_ENABLE_PYNVJITLINK = 1

In [6]:
from numba import cuda
import numpy as np

# Kernels decorated with `@cuda.jit` do not return values; results are
# written into an output array passed as an argument.
# No explicit type signature is required.
@cuda.jit
def add_kernel(x, y, out):
    """Element-wise addition on the GPU: ``out[i] = x[i] + y[i]``.

    Each thread processes at most one element, selected by its absolute
    position in a one-dimensional grid.

    Parameters
    ----------
    x, y : 1-D arrays
        Input operands. They are expected to hold at least ``out.size``
        elements each (not checked inside the kernel).
    out : 1-D array
        Destination array; its size bounds the indices that are processed.
    """
    # Absolute index of this thread in a 1-D grid (the argument 1 is the
    # number of grid dimensions). Equivalent to:
    #   cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    idx = cuda.grid(1)
    # A launch may contain more threads than there are elements (the grid
    # size is a whole number of blocks), so surplus threads must do nothing
    # instead of reading/writing past the end of the arrays.
    if idx < out.size:
        out[idx] = x[idx] + y[idx]


n = 4096
h_x = np.arange(n).astype(np.float32)  # [0.0 ... 4095.0]
h_y = np.ones_like(h_x)                # [1.0 ... 1.0]

# Copy the inputs to the device and allocate an output buffer with the same
# shape and dtype as d_x.
d_x = cuda.to_device(h_x)
d_y = cuda.to_device(h_y)
d_out = cuda.device_array_like(d_x)

# We need one thread per element. Derive the number of blocks from n with a
# ceiling division instead of hard-coding it, so the launch configuration
# stays in sync if n or threads_per_block changes (32 blocks for n = 4096).
# Note: when n is not a multiple of threads_per_block the last block has
# surplus threads, which requires the kernel to bounds-check its index.
threads_per_block = 128
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

add_kernel[blocks_per_grid, threads_per_block](d_x, d_y, d_out)
# Not strictly needed here: the device-to-host copy below is expected to wait
# for the kernel to finish. Kept as an explicit synchronization point.
cuda.synchronize()
# The int16 cast is only for compact display and is safe for values up to
# 32767; widen it if n grows. Expected result: [1 ... 4096]
print(d_out.copy_to_host().astype(np.int16))

[   1    2    3 ... 4094 4095 4096]


