# CUDA - prvi deo

In [2]:
from numba import cuda

@cuda.jit
def add_kernel(x, y, out):
    """Element-wise addition kernel: ``out[i] = x[i] + y[i]``, one element per thread.

    Parameters
    ----------
    x, y : 1-D arrays
        Input operands; indexed with the thread's global 1-D index.
    out : 1-D array
        Destination array; written at the same index.
    """
    # Global 1-D index of this thread across the whole grid.
    idx = cuda.grid(1)

    # Guard against launches that have more threads than elements: without it,
    # the surplus threads would read and write past the end of the arrays.
    if idx < x.shape[0]:
        out[idx] = x[idx] + y[idx]

In [3]:
import numpy as np

# Launch geometry: 32 blocks of 128 threads each.
threads_per_block = 128
blocks_per_grid = 32

# One array element per launched thread (128 * 32 elements).
n = blocks_per_grid * threads_per_block

# Host inputs: x = [0, 1, ..., n-1] and y = all ones, both int32.
x = np.arange(n, dtype=np.int32)
y = np.ones(x.shape, dtype=x.dtype)

# Copy the inputs to the device and allocate an output array with the
# same shape and dtype as d_x.
d_x, d_y = cuda.to_device(x), cuda.to_device(y)
d_out = cuda.device_array_like(d_x)

In [4]:
# The launch configuration goes in square brackets: [blocks, threads per block].
configured_add = add_kernel[blocks_per_grid, threads_per_block]
configured_add(d_x, d_y, d_out)

# Wait for the device to finish before reading the result back.
cuda.synchronize()

# Copy the result to the host and show it.
host_out = d_out.copy_to_host()
print(host_out)

NvvmSupportError: libNVVM cannot be found. Do `conda install cudatoolkit`:
[WinError 3] The system cannot find the path specified: 'c:\\program files\\python36\\Library\\bin'

In [None]:
from numba import cuda

@cuda.jit
def add_kernel(x, y, out):
    """Element-wise addition ``out[i] = x[i] + y[i]`` using a grid-stride loop.

    Each thread starts at its own global index and then advances by the
    total number of threads in the grid, so every element of ``x`` is
    handled regardless of how many threads were launched.
    """
    first = cuda.grid(1)      # this thread's global 1-D index
    step = cuda.gridsize(1)   # total number of threads in the grid
    total = x.shape[0]

    for pos in range(first, total, step):
        out[pos] = x[pos] + y[pos]

In [None]:
import numpy as np

# Problem size: deliberately larger than the number of threads launched below
# (30 * 128), so a kernel must loop to cover every element.
n = 10 ** 5

# Host inputs: x = [0, 1, ..., n-1] and y = all ones, both int32.
x = np.arange(n, dtype=np.int32)
y = np.ones(x.shape, dtype=x.dtype)

# Device copies of the inputs and an output array shaped like d_x.
d_x, d_y = cuda.to_device(x), cuda.to_device(y)
d_out = cuda.device_array_like(d_x)

# Launch geometry for the strided kernel.
blocks_per_grid = 30
threads_per_block = 128

In [None]:
from math import hypot

@cuda.jit
def hypot_stride(a, b, c):
    """Write ``hypot(a[i], b[i])`` into ``c[i]`` for every index of ``a``.

    Uses a grid-stride loop: each thread begins at its global index and
    steps by the total thread count of the grid.
    """
    first = cuda.grid(1)
    step = cuda.gridsize(1)
    length = a.shape[0]

    for pos in range(first, length, step):
        c[pos] = hypot(a[pos], b[pos])

In [None]:
# One million float32 samples per operand, drawn uniformly from [-12, 12).
n = 10 ** 6
a = np.random.uniform(low=-12, high=12, size=n).astype(np.float32)
b = np.random.uniform(low=-12, high=12, size=n).astype(np.float32)

# Device copies of the operands and an output array shaped like d_b.
d_a, d_b = cuda.to_device(a), cuda.to_device(b)
d_c = cuda.device_array_like(d_b)

# Launch geometry: 128 blocks of 64 threads.
blocks = 128
threads_per_block = 64

configured_hypot = hypot_stride[blocks, threads_per_block]
configured_hypot(d_a, d_b, d_c)

In [None]:
# Baseline: time NumPy's hypot on the host arrays a and b.
%timeit np.hypot(a, b)

In [None]:
from numba import jit

# nopython=True is explicit so that compilation fails loudly instead of
# silently falling back to slow object mode on older Numba releases.
@jit(nopython=True)
def numba_hypot(a, b):
    """Return ``np.hypot(a, b)`` computed inside a Numba-compiled CPU function.

    Parameters
    ----------
    a, b : array-like
        Operands passed straight to ``np.hypot``.

    Returns
    -------
    The element-wise result of ``np.hypot(a, b)``.
    """
    return np.hypot(a, b)

In [None]:
# Time the Numba-compiled CPU version on the same host arrays a and b.
%timeit numba_hypot(a, b)

In [None]:
# Launch with 1 block of 1 thread: the single thread strides over every element.
# cuda.synchronize() is part of the timed line so the timing includes waiting
# for the kernel to finish.
%time hypot_stride[1, 1](d_a, d_b, d_c); cuda.synchronize()

In [None]:
# Same kernel with 128 blocks of 64 threads, for comparison with the [1, 1] launch.
# cuda.synchronize() is part of the timed line so the timing includes waiting
# for the kernel to finish.
%time hypot_stride[128, 64](d_a, d_b, d_c); cuda.synchronize()

### Race condition

In [None]:
@cuda.jit
def thread_counter_race_condition(global_counter):
    """Increment ``global_counter[0]`` with a plain ``+=`` (intentionally unsafe).

    Every launched thread performs the same unsynchronised update on the
    same element, which is the race condition this demo illustrates.
    """
    # Plain read-modify-write: concurrent threads can read the same old value
    # and overwrite each other's update, so increments may be lost.
    global_counter[0] += 1  # This is bad
    
@cuda.jit
def thread_counter_safe(global_counter):
    """Atomically add 1 to ``global_counter[0]`` from every launched thread."""
    slot = 0        # index of the shared counter element
    increment = 1   # amount each thread contributes
    # The atomic add performs the read-modify-write as one indivisible operation.
    cuda.atomic.add(global_counter, slot, increment)

In [None]:
# Start from a zeroed one-element int32 counter on the device.
global_counter = cuda.to_device(np.zeros(1, dtype=np.int32))

# 64 blocks x 64 threads all perform the unsynchronised increment.
thread_counter_race_condition[64, 64](global_counter)

# Print the expected total next to the value actually stored in the counter.
print('Ocekivano %d:' % (64 * 64), global_counter.copy_to_host())

In [None]:
# This works correctly: every thread uses an atomic add on the shared counter.
# Start from a zeroed one-element int32 counter on the device.
global_counter = cuda.to_device(np.array([0], dtype=np.int32))

# 64 blocks x 64 threads, each contributing exactly one increment.
thread_counter_safe[64, 64](global_counter)

# Message fixed to match the race-condition cell (a stray "be" was removed).
print('Ocekivano %d:' % (64*64), global_counter.copy_to_host())