In [1]:
from numba import cuda
import numpy as np
from timeit import default_timer as timer

# Normal function to run on CPU
def func(a):								 
    for i in range(len(a)): 
        a[i] += 1	

# Function optimized to run on GPU
@cuda.jit
def func2(a): 
    idx = cuda.grid(1)  # Get the unique thread index in the grid
    if idx < a.size:     # Ensure we do not go out of bounds
        a[idx] += 1

if __name__ == "__main__": 
    n = 100000000							
    a = np.ones(n, dtype=np.float64) 
    
    # Run on CPU
    start = timer() 
    func(a) 
    print("Without GPU:", timer() - start)	 
    
    # Allocate memory for GPU and copy array
    a_gpu = cuda.to_device(a)
    
    # Run on GPU
    threads_per_block = 256
    blocks_per_grid = (a_gpu.size + (threads_per_block - 1)) // threads_per_block
    
    start = timer() 
    func2[blocks_per_grid, threads_per_block](a_gpu)  # Launch the kernel
    a_gpu.copy_to_host(a)  # Copy result back to host
    print("With GPU:", timer() - start)


Without GPU: 31.51335750000726


CudaSupportError: Error at driver init: Call to cuInit results in CUDA_ERROR_NO_DEVICE (100)