In [2]:
# Downgrade both to keep them in lockstep
!python -m pip uninstall -y cuda-bindings cuda-python
!python -m pip install --no-cache-dir "cuda-bindings==12.8.0" "cuda-python==12.8.0"
!python -m pip install --no-cache-dir nvidia-cutlass-dsl pycuda

# Then restart the runtime (Runtime → Restart runtime) and run:
# import cutlass.cute
# import cuda.bindings.driver as cu; cu.cuInit(0)


[0mCollecting cuda-bindings==12.8.0
  Downloading cuda_bindings-12.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting cuda-python==12.8.0
  Downloading cuda_python-12.8.0-py3-none-any.whl.metadata (15 kB)
Downloading cuda_bindings-12.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cuda_python-12.8.0-py3-none-any.whl (11 kB)
Installing collected packages: cuda-bindings, cuda-python
Successfully installed cuda-bindings-12.8.0 cuda-python-12.8.0


In [2]:
import cutlass.cute
import cuda.bindings.driver as cu; cu.cuInit(0)

(<CUresult.CUDA_SUCCESS: 0>,)

In [6]:
import cutlass
import cutlass.cute as cute
from cutlass.cute.runtime import from_dlpack
import cupy as cp

import time


@cute.kernel
def elementwise_add_kernel(A, B, C):
    tidx, _, _ = cute.arch.thread_idx()
    bidx, _, _ = cute.arch.block_idx()
    bdim, _, _ = cute.arch.block_dim()

    global_x = bidx * bdim + tidx

    nx, ny = A.shape
    ix = global_x % nx;
    iy = global_x // nx;

    C[ix, iy] = A[ix, iy] + B[ix, iy]
    #C[0,0] = 1.23
    #C[ix, iy] = 123.0

@cute.jit
def elementwise_add(A, B, C):
    num_threads_per_block = 256
    nx, ny = A.shape
    kernel = elementwise_add_kernel(A, B, C)
    kernel.launch(
            grid=((nx * ny) // num_threads_per_block, 1, 1),
            block=(num_threads_per_block, 1, 1)
            )

    #cute.print_tensor(A)
    #cute.print_tensor(B)
    #cute.print_tensor(C)

Nx = 2048
Ny = 2048
A = cp.random.uniform(0.0, 1.0, (Nx, Ny)).astype(cp.float32)
B = cp.random.uniform(0.0, 1.0, (Nx, Ny)).astype(cp.float32)
C = cp.empty( (Nx, Ny), dtype=cp.float32)

#A_ = from_dlpack(A, assumed_align=32)
#B_ = from_dlpack(B, assumed_align=32)
#C_ = from_dlpack(C, assumed_align=32)
A_ = from_dlpack(A)
B_ = from_dlpack(B)
C_ = from_dlpack(C)

cutlass.cuda.initialize_cuda_context()
start_time = time.time()
for i in range(100):
    elementwise_add(A_, B_, C_)
print(f"Total time: {time.time()-start_time}")


print("C[0:3, 0:3] =\n", cp.asnumpy(C[:3, :3]))


Total time: 1.4251413345336914
C[0:3, 0:3] =
 [[1.3828324  0.5965689  1.2344048 ]
 [0.7937813  0.8457347  0.80628765]
 [1.0233462  1.1294141  0.8820095 ]]


In [5]:
import cutlass
import cutlass.cute as cute
from cutlass.cute.runtime import from_dlpack
import cupy as cp

import time


@cute.kernel
def elementwise_add_kernel(A, B, C):
    tidx, _, _ = cute.arch.thread_idx()
    bidx, _, _ = cute.arch.block_idx()
    bdim, _, _ = cute.arch.block_dim()

    global_x = bidx * bdim + tidx

    #nx, ny = A.shape
    #ix = global_x % nx;
    #iy = global_x // nx;

    #C[ix, iy] = A[ix, iy] + B[ix, iy]

    # Map thread index to logical index of input tensor
    m, n = A.shape[1]       # thread-domain
    ni = global_x % n
    mi = global_x // n

    # Map logical index to physical address via tensor layout
    a_val = A[(None, (mi, ni))].load()
    b_val = B[(None, (mi, ni))].load()
    #print(f"[DSL INFO] sliced gA = {A[(None, (mi, ni))]}")
    #print(f"[DSL INFO] sliced gB = {B[(None, (mi, ni))]}")

    # Perform element-wise addition
    C[(None, (mi, ni))] = a_val + b_val


@cute.jit
def elementwise_add(A, B, C):
    num_threads_per_block = 256

    gA = cute.zipped_divide(A, (1, 4))
    gB = cute.zipped_divide(B, (1, 4))
    gC = cute.zipped_divide(C, (1, 4))

    #print(f"[DSL INFO] Tiled Tensors:")
    #print(f"[DSL INFO]   gA = {gA}")
    #print(f"[DSL INFO]   gB = {gB}")
    #print(f"[DSL INFO]   gC = {gC}")

    elementwise_add_kernel(gA, gB, gC).launch(
        grid=(cute.size(gC, mode=[1]) // num_threads_per_block, 1, 1),
        block=(num_threads_per_block, 1, 1),
    )



Nx = 2048
Ny = 2048
A = cp.random.uniform(0.0, 1.0, (Nx, Ny)).astype(cp.float32)
B = cp.random.uniform(0.0, 1.0, (Nx, Ny)).astype(cp.float32)
C = cp.empty( (Nx, Ny), dtype=cp.float32)

A_ = from_dlpack(A)
B_ = from_dlpack(B)
C_ = from_dlpack(C)

cutlass.cuda.initialize_cuda_context()
start_time = time.time()
for i in range(100):
    elementwise_add(A_, B_, C_)
print(f"Total time: {time.time()-start_time}")


print("C[0:3, 0:3] =\n", cp.asnumpy(C[:3, :3]))


Total time: 1.9427759647369385
C[0:3, 0:3] =
 [[-8.9898463e+28  1.8275932e+00 -1.1929849e-14]
 [ 1.9503593e+20  1.6870652e+00  3.5662110e-38]
 [-1.0447428e+01  1.7868803e+00 -7.1491721e+37]]
