In [1]:
!uv pip install -q --system numba-cuda==0.4.0
import numpy as np
from numba import cuda
import time
import os
from numba import config
config.CUDA_ENABLE_PYNVJITLINK = 1

In [2]:
from numba.cuda.simulator import kernel
# CUDA STEPS:
# Initializing data on CPU
# Transfer from CPU to GPU
# Run Kernel with defined Grid/Block size (threads)
# Transfer results from GPU to CPU
# Clear memory

# CUDA Kernel Device
@cuda.jit
def first_kernel(a,result):
  idx = cuda.grid(1)  # index thread
  if idx < a.size:
    result[idx] = a[idx]

# Host CPU
def main():
  # 1. Initialize data on CPU
  N = 10_000_000
  a_cpu = np.arange(N, dtype=np.float32)


  # ---------------------------------
  # CPU computation
  # ---------------------------------
  start = time.time()
  result_cpu = a_cpu
  cpu_time = time.time() - start
  print(f"CPU time: {cpu_time * 1e6:.2f} us")

  # ---------------------------------
  # GPU computation
  # ---------------------------------
  # 2. Transfer from CPU to GPU
  start = time.time()
  a_gpu = cuda.to_device(a_cpu)
  result_gpu = cuda.device_array_like(a_cpu) # reserve memory
  transfer_in_time = time.time() - start

  # Kernel launch
  threads_per_block = 128
  blocks_per_grid = (N + threads_per_block - 1) // threads_per_block # (10_000_000 + 127) // 128 = 78,125 blocks
  start = time.time()
  first_kernel[blocks_per_grid, threads_per_block](a_gpu, result_gpu) # lunch kernel
  cuda.synchronize()
  kernel_time = time.time() - start

  # Copy back
  start = time.time()
  result_from_gpu = result_gpu.copy_to_host()
  cuda.synchronize()
  transfer_out_time = time.time() - start

  # Report
  print(f"GPU trnasfer to device: {transfer_in_time * 1e3:.2f} ms")
  print(f"GPU kernel execution:   {kernel_time * 1e3:.2f} ms")
  print(f"GPU transfer to host:   {transfer_out_time * 1e3:.2f} ms")
  print(f"TOTAL GPU time:         {(transfer_in_time + kernel_time + transfer_out_time ) * 1e3:.2f} ms")

  # Cleanup
  del a_gpu, result_gpu
  cuda.close()

if __name__ == "__main__":
    main()

CPU time: 1.43 us


ImportError: Using pynvjitlink requires the pynvjitlink package to be available