In [1]:
!uv pip install -q --system numba-cuda==0.4.0
import numpy as np

import time
import os

# Enable the CUDA simulator. This MUST be set BEFORE numba imports or kernel definitions.
os.environ["NUMBA_ENABLE_CUDASIM"] = "1"
from numba import cuda
from numba import config

# --- Configuration & Data Preparation ---

config.CUDA_ENABLE_PYNVJITLINK = 1

In [2]:
# Prepare character data (ACII values for A-H, 8 charcaters total)
characters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
data = np.array([ord(c) for c in characters], dtype=np.uint8)
data_size = len(data) # 8 elementos

# --- 1D Kernel Definition (using gidx, bidx, tidx) ---

@cuda.jit
def kernel_1d_dims(arr):
  # gidx: Global 1D index (Thread ID in the entire grid)
  gidx = cuda.grid(1)

  #bidx: Block ID (Block index in the grid)
  bidx = cuda.blockIdx.x
  # tidx: Thread ID (Thread index within within the block)
  tidx = cuda.threadIdx.x

  if gidx < arr.size:
    # Standard Python print works via the simulator
    print(f"BID: {bidx}, TID: {tidx}, GID: {gidx}, Char: {chr(arr[gidx])}")
#
# Example 1: Block, 8 Threads per Block
#

blocks_per_grid_ex1 = 1
threads_per_block_ex1 = 8
# Total threads = 1 * 8 = 8

kernel_1d_dims[blocks_per_grid_ex1, threads_per_block_ex1](data)
cuda.synchronize

BID: 0, TID: 0, GID: 0, Char: A
BID: 0, TID: 1, GID: 1, Char: B
BID: 0, TID: 2, GID: 2, Char: C
BID: 0, TID: 3, GID: 3, Char: D
BID: 0, TID: 4, GID: 4, Char: E
BID: 0, TID: 5, GID: 5, Char: F
BID: 0, TID: 6, GID: 6, Char: G
BID: 0, TID: 7, GID: 7, Char: H


In [3]:
@cuda.jit
def whoami():
    # Compute block id in a 3D grid
    block_id = (
        cuda.blockIdx.x +
        cuda.blockIdx.y * cuda.gridDim.x +
        cuda.gridDim.x * cuda.gridDim.y
    )

    # Threads per block
    threads_per_block = (
        cuda.blockDim.x * cuda.blockDim.y
    )

    # Offset of this block
    block_offset = block_id * threads_per_block

    # Compute thread id inside block
    thread_offset = (
        cuda.threadIdx.x +
        cuda.threadIdx.y * cuda.blockDim.x +
        cuda.blockDim.x * cuda.blockDim.y
    )

    # Global thread id across all blocks
    global_id = block_offset + thread_offset


    print(f"{global_id:03d} | Block[x, y]({cuda.blockIdx.x} {cuda.blockIdx.y}) = {block_id:3d} | "
          f"Thread[x, y] ({cuda.threadIdx.x} {cuda.threadIdx.y} ) = {thread_offset:3d} BlockDim.x {cuda.blockDim.x} BlockDim.y {cuda.blockDim.y} GridDim.x {cuda.gridDim.x} GridDim.y {cuda.gridDim.y}")


b_x, b_y = 2, 2
t_x, t_y = 4, 1

blocks_per_grid = (b_x, b_y)
threads_per_block = (t_x, t_y)

total_blocks = b_x * b_y
total_threads = t_x * t_y
print(f"{total_blocks} blocks/grid")
print(f"{total_threads} threads/block")
print(f"{total_blocks * total_threads} total threads\n")

# Launch kernel
whoami[blocks_per_grid, threads_per_block]()

# Wait for GPU to finish (like cudaDeviceSynchronize)
cuda.synchronize()

4 blocks/grid
4 threads/block
16 total threads

020 | Block[x, y](0 0) =   4 | Thread[x, y] (0 0 ) =   4 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 GridDim.y 2
021 | Block[x, y](0 0) =   4 | Thread[x, y] (1 0 ) =   5 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 GridDim.y 2
022 | Block[x, y](0 0) =   4 | Thread[x, y] (2 0 ) =   6 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 GridDim.y 2
023 | Block[x, y](0 0) =   4 | Thread[x, y] (3 0 ) =   7 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 GridDim.y 2
028 | Block[x, y](0 1) =   6 | Thread[x, y] (0 0 ) =   4 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 GridDim.y 2
029 | Block[x, y](0 1) =   6 | Thread[x, y] (1 0 ) =   5 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 GridDim.y 2
030 | Block[x, y](0 1) =   6 | Thread[x, y] (2 0 ) =   6 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 GridDim.y 2
031 | Block[x, y](0 1) =   6 | Thread[x, y] (3 0 ) =   7 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 GridDim.y 2
024 | Block[x, y](1 0) =   5 | Thread[x, y] (0 0 ) =   4 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 G