https://numba.readthedocs.io/en/stable/cuda/memory.html

In [1]:
from numba import cuda
import numpy as np

In [2]:
ary = np.arange(10)
d_ary = cuda.to_device(ary)
d_ary

<numba.cuda.cudadrv.devicearray.DeviceNDArray at 0x7f6be04b29e0>

In [3]:
stream = cuda.stream()
d_ary = cuda.to_device(ary, stream=stream)
d_ary

<numba.cuda.cudadrv.devicearray.DeviceNDArray at 0x7f6bcc12c970>

In [4]:
hary = d_ary.copy_to_host()
hary

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [5]:
ary = np.empty(shape=d_ary.shape, dtype=d_ary.dtype)
d_ary.copy_to_host(ary)
ary

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [6]:
hary = d_ary.copy_to_host(stream=stream)
hary

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [7]:
@cuda.jit
def my_kernel(arr):
    for i in range(len(arr)):
        arr[i] += 1

arr = np.arange(1000)

d_arr = cuda.to_device(arr)

my_kernel[100, 100](d_arr)

result_array = d_arr.copy_to_host()

result_array[0:10]



array([ 1,  3,  6,  8,  8,  9, 10, 11, 12, 14])

In [8]:
d_arr = d_arr.reshape(20, 50, order='F')
d_arr.shape

(20, 50)

In [9]:
@cuda.jit
def kernel_func(x):
   dyn_arr = cuda.shared.array(0, dtype=np.float32)

In [10]:
kernel_func[32, 32, 0, 128](arr)



In [11]:
@cuda.jit
def f():
   f32_arr = cuda.shared.array(0, dtype=np.float32)
   i32_arr = cuda.shared.array(0, dtype=np.int32)
   f32_arr[0] = 3.14
   print(f32_arr[0])
   print(i32_arr[0])

f[1, 1, 0, 4]()
cuda.synchronize()

3.140000
1078523331




In [12]:
@cuda.jit
def f_with_view():
   f32_arr = cuda.shared.array(0, dtype=np.float32)
   i32_arr = cuda.shared.array(0, dtype=np.int32)[1:] # 1 int32 = 4 bytes
   f32_arr[0] = 3.14
   i32_arr[0] = 1
   print(f32_arr[0])
   print(i32_arr[0])

f_with_view[1, 1, 0, 8]()
cuda.synchronize()

3.140000
1


