In [2]:
import pycuda.driver as cuda

# Initialise the CUDA driver, then report how many devices it exposes.
cuda.init()
device_count = cuda.Device.count()
print("%d device(s) found." % device_count)

1 device(s) found.


In [3]:
# For every CUDA device: print name, compute capability and total memory,
# followed by the full attribute table in alphabetical order.
for i in range(cuda.Device.count()):
    dev = cuda.Device(i)
    print("Device %d: %s" % (i, dev.name()))
    # compute_capability() supplies both values consumed by the "%d.%d" format.
    print("  Compute Capability: %d.%d" % dev.compute_capability())
    # Integer-divide by 1024 to obtain the figure printed with the KB label.
    print("  Total Memory: %s KB" % (dev.total_memory() // (1024)))
    # Attribute keys are converted to strings so the pairs sort by name.
    atts = sorted((str(att), value) for att, value in dev.get_attributes().items())
    for att, value in atts:
        print("  %s: %s" % (att, value))

Device 0: NVIDIA GeForce RTX 2060 with Max-Q Design
  Compute Capability: 7.5
  Total Memory: 6291136 KB
  ASYNC_ENGINE_COUNT: 2
  CAN_MAP_HOST_MEMORY: 1
  CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: 0
  CLOCK_RATE: 1185000
  COMPUTE_CAPABILITY_MAJOR: 7
  COMPUTE_CAPABILITY_MINOR: 5
  COMPUTE_MODE: DEFAULT
  COMPUTE_PREEMPTION_SUPPORTED: 1
  CONCURRENT_KERNELS: 1
  CONCURRENT_MANAGED_ACCESS: 0
  DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: 0
  ECC_ENABLED: 0
  GENERIC_COMPRESSION_SUPPORTED: 0
  GLOBAL_L1_CACHE_SUPPORTED: 1
  GLOBAL_MEMORY_BUS_WIDTH: 192
  GPU_OVERLAP: 1
  HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: 1
  HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: 0
  HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: 0
  HOST_NATIVE_ATOMIC_SUPPORTED: 0
  INTEGRATED: 0
  KERNEL_EXEC_TIMEOUT: 1
  L2_CACHE_SIZE: 3145728
  LOCAL_L1_CACHE_SUPPORTED: 1
  MANAGED_MEMORY: 1
  MAXIMUM_SURFACE1D_LAYERED_LAYERS: 2048
  MAXIMUM_SURFACE1D_LAYERED_WIDTH: 32768
  MAXIMUM_SURFACE1D_WIDTH: 32768
  MAXIMUM_SURFACE2D_HEIGHT: 65536
  

thread -->> block -->> grid
sp     -->> sm    -->> device
A warp is a group of 32 threads in a block.

In [16]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

import numpy as np


def test_gpu():
    """Double a random 4x4 float32 matrix on the GPU with a hand-written kernel.

    The matrix is uploaded to a device buffer, doubled in place by the
    ``doublify`` kernel, downloaded into a fresh host array, and both the
    original and the doubled matrix are printed. Returns nothing.
    """
    host_in = np.random.randn(4, 4).astype(np.float32)

    # Device buffer of the same byte size as the host array, then host -> device copy.
    device_buf = cuda.mem_alloc(host_in.nbytes)
    cuda.memcpy_htod(device_buf, host_in)

    # __global__ marks a kernel: it is launched from the host and runs on the device.
    # threadIdx is the thread's index inside its block (blockIdx, unused here,
    # would be the block's index inside the grid). The y index is multiplied by 4
    # because each row of the 4x4 array holds 4 elements.
    kernel_source = """
        __global__ void doublify(float *a)
        {
            int idx = threadIdx.x + threadIdx.y*4;
            a[idx] *= 2;
        }
        """
    module = SourceModule(kernel_source)
    doublify = module.get_function("doublify")

    # block=(x, y, z) gives the thread counts per block: 4 * 4 * 1 threads,
    # i.e. one thread per matrix element. No grid argument is passed.
    doublify(device_buf, block=(4, 4, 1))

    # Device -> host copy of the doubled values.
    host_out = np.empty_like(host_in)
    cuda.memcpy_dtoh(host_out, device_buf)

    print(host_in)
    print(host_out)

%time test_gpu()

[[-0.5788051  -1.2548146  -1.2591091   0.6281901 ]
 [ 1.2621292  -0.4714631   0.05602671 -1.0856785 ]
 [ 1.060138   -0.4933093  -0.3339168   0.33785075]
 [ 1.07119     0.6540993   1.1726604  -0.5625655 ]]
[[-1.1576102  -2.5096292  -2.5182183   1.2563802 ]
 [ 2.5242584  -0.9429262   0.11205342 -2.171357  ]
 [ 2.120276   -0.9866186  -0.6678336   0.6757015 ]
 [ 2.14238     1.3081986   2.3453207  -1.125131  ]]
CPU times: user 2.93 ms, sys: 0 ns, total: 2.93 ms
Wall time: 2.27 ms


In [17]:
def test_cpu():
    """CPU baseline: double a random 4x4 float32 matrix with NumPy.

    Prints the original matrix followed by the doubled one. Returns nothing.
    """
    original = np.random.randn(4, 4).astype(np.float32)
    doubled = original * 2
    for matrix in (original, doubled):
        print(matrix)

%time test_cpu()

[[-0.19378187  1.0278679  -1.1469375   0.1396176 ]
 [ 1.5364314   0.13483281 -0.7757695   0.6898246 ]
 [-0.4454204  -0.6694119  -0.49060008 -1.7487348 ]
 [ 0.28382403 -0.46344396 -0.9994953  -0.17561226]]
[[-0.38756374  2.0557358  -2.293875    0.2792352 ]
 [ 3.0728629   0.26966563 -1.551539    1.3796492 ]
 [-0.8908408  -1.3388238  -0.98120016 -3.4974697 ]
 [ 0.56764805 -0.9268879  -1.9989907  -0.3512245 ]]
CPU times: user 1.15 ms, sys: 255 µs, total: 1.4 ms
Wall time: 1.35 ms


In [18]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy

from pycuda.compiler import SourceModule

# Element-wise product of two float32 vectors on the GPU.
# Each thread handles the element at its own threadIdx.x, so the launch below
# uses a single block with exactly one thread per element.
VECTOR_LEN = 400

mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")
multiply_them = mod.get_function("multiply_them")

a = numpy.random.randn(VECTOR_LEN).astype(numpy.float32)
b = numpy.random.randn(VECTOR_LEN).astype(numpy.float32)
dest = numpy.zeros_like(a)

# drv.In / drv.Out wrap the host arrays as kernel inputs and output.
launch_config = dict(block=(VECTOR_LEN, 1, 1), grid=(1, 1))
multiply_them(drv.Out(dest), drv.In(a), drv.In(b), **launch_config)

# Difference between the GPU result and the NumPy product.
print(dest - a * b)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [19]:
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
import numpy


def test_gpu():
    """Double a random 4x4 float32 matrix using the high-level gpuarray API.

    Prints the doubled matrix (fetched back to the host with ``.get()``) and
    then the GPU array holding the original values. Returns nothing.

    NOTE: this reuses the name ``test_gpu`` from the earlier raw-driver cell,
    so running this cell replaces that definition.
    """
    host_values = numpy.random.randn(4, 4).astype(numpy.float32)
    a_gpu = gpuarray.to_gpu(host_values)
    # The multiplication is written against the GPU array; .get() copies the result back.
    doubled_gpu = 2 * a_gpu
    print(doubled_gpu.get())
    print(a_gpu)

%time test_gpu()

[[ 2.4968941   0.02882253  2.7008696   0.9127767 ]
 [-0.4148375   3.3071177   1.6443638  -0.8705558 ]
 [-1.5854768  -2.152403    2.9916284  -2.2097363 ]
 [ 1.045465    2.6737626   0.12718704 -1.452502  ]]
[[ 1.2484471   0.01441126  1.3504348   0.45638835]
 [-0.20741875  1.6535589   0.8221819  -0.4352779 ]
 [-0.7927384  -1.0762016   1.4958142  -1.1048682 ]
 [ 0.5227325   1.3368813   0.06359352 -0.726251  ]]
CPU times: user 14.9 ms, sys: 8.86 ms, total: 23.8 ms
Wall time: 453 ms


In [25]:
"""
void matrix_mul(float *a, float *b, float *c, int n)
{
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
        {
            float sum = 0;
            for (int k = 0; k < n; k++)
            {
                sum += a[i*n+k] * b[k*n+j];
            }
            c[i*n+j] = sum;
        }
    }
}
"""

# __global__ void matrix_mul(float *a, float *b, float *c)
# {
#     int row = threadIdx.y;
#     int col = threadIdx.x;
#     float sum = 0;
#     for (int i = 0; i < 4; i++)
#     {
#         sum += a[row*4+i] * b[i*4+col];
#     }
#     c[row*4+col] = sum;
# }


from pycuda import driver, compiler, gpuarray, tools
import numpy as np

# Square matrix multiplication on the GPU, checked against NumPy.
# NOTE(review): this cell does not import pycuda.autoinit itself; it presumably
# relies on the CUDA context created by an earlier cell — confirm on a fresh kernel.
MATRIX_SIZE = 4

# The kernel source is an f-string:
#   * {MATRIX_SIZE} is replaced by the Python value before compilation, so the
#     matrix dimension appears as a literal constant in the generated CUDA code;
#   * {{ and }} are escaped braces that come out as the literal { and } of the C code.
# Each thread computes one output element: threadIdx.y selects the row,
# threadIdx.x selects the column.
kernel_code = f"""
__global__ void matrix_mul(float *a, float *b, float *c)
{{
    int row = threadIdx.y;
    int col = threadIdx.x;
    float sum = 0;
    for (int i = 0; i < {MATRIX_SIZE}; i++)
    {{
        sum += a[row*{MATRIX_SIZE}+i] * b[i*{MATRIX_SIZE}+col];
    }}
    c[row*{MATRIX_SIZE}+col] = sum;
}}
"""
print(kernel_code)

mod = compiler.SourceModule(kernel_code)
matrix_mul = mod.get_function("matrix_mul")

# Random float32 operands on the host, plus the NumPy reference product.
a_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)
b_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)
c_cpu = a_cpu.dot(b_cpu)

# Upload the operands and allocate an uninitialised output array on the GPU.
a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), dtype=np.float32)

# One block of MATRIX_SIZE x MATRIX_SIZE threads: one thread per output element.
launch_block = (MATRIX_SIZE, MATRIX_SIZE, 1)
matrix_mul(a_gpu, b_gpu, c_gpu, block=launch_block)

# Element-wise difference between the GPU result and the NumPy reference.
print(c_gpu.get() - c_cpu)


__global__ void matrix_mul(float *a, float *b, float *c)
{
    int row = threadIdx.y;
    int col = threadIdx.x;
    float sum = 0;
    for (int i = 0; i < 4; i++)
    {
        sum += a[row*4+i] * b[i*4+col];
    }
    c[row*4+col] = sum;
}

[[-2.3841858e-07  0.0000000e+00  0.0000000e+00  0.0000000e+00]
 [ 0.0000000e+00  1.1920929e-07  0.0000000e+00  0.0000000e+00]
 [ 0.0000000e+00  0.0000000e+00  0.0000000e+00 -2.9802322e-08]
 [-2.9802322e-08  0.0000000e+00  1.4901161e-08  0.0000000e+00]]
