In [8]:
import numpy as np
import numpy.testing
import hidet


def matmul_func(m_size, n_size, k_size):
    """Build a tiled CUDA matrix-multiplication function with hidet script.

    The generated kernel computes ``c = a @ b`` for float32 matrices ``a`` of
    shape ``[m_size, k_size]`` and ``b`` of shape ``[k_size, n_size]``. Each
    thread block has ``tn x tm`` threads and produces one ``tm x tn`` tile of
    ``c``; it walks the reduction dimension in steps of ``tk`` and stages the
    matching tiles of ``a`` and ``b`` in shared memory before accumulating.

    Parameters
    ----------
    m_size, n_size, k_size : int
        Problem dimensions. Each must be a multiple of its tile size
        (``tm = 32``, ``tn = 32`` and ``tk = 128`` respectively) because the
        kernel performs no boundary checks.

    Returns
    -------
    The object returned by ``hidet.driver.build_ir_module`` for the packed
    function named ``'matmul'``; it is invoked as ``func(a, b, c)``.

    Raises
    ------
    AssertionError
        If a dimension is not a multiple of its tile size.
    """
    # tile sizes: a thread block computes a tm x tn tile of c and consumes
    # tk elements of the reduction dimension per iteration
    tm, tn, tk = 32, 32, 128

    # validate before importing hidet so that bad sizes fail fast;
    # the cooperative loads below split the tk dimension evenly across threads
    assert tk % tm == 0, 'tk must be a multiple of tm'
    assert tk % tn == 0, 'tk must be a multiple of tn'
    # make sure the matrix size is divisible by the tile size
    assert m_size % tm == 0 and n_size % tn == 0 and k_size % tk == 0, (
        'm_size, n_size and k_size must be multiples of {}, {} and {}'.format(tm, tn, tk)
    )

    from hidet.lang import attr, f32, tensor
    from hidet.lang import repeat
    from hidet.lang.cuda import threadIdx, blockIdx, blockDim, syncthreads
    from hidet.transforms.tools import add_packed_func

    with hidet.script_module() as script_module:

        @hidet.script
        def kernel(
            a: f32[m_size, k_size],
            b: f32[k_size, n_size],
            c: f32[m_size, n_size]
        ):
            attr.func_kind = 'cuda_kernel'
            # x indexes the columns of the output tile (tn), y indexes its rows (tm)
            attr.cuda_block_dim = tn, tm
            # floor division keeps the launch dimensions integral; the sizes
            # were asserted to be divisible by the tile sizes above
            attr.cuda_grid_dim = n_size // tn, m_size // tm

            smem_a = tensor(scope='shared', dtype='float32', shape=[tm, tk])
            smem_b = tensor(scope='shared', dtype='float32', shape=[tk, tn])

            acc = f32(0.0)
            for k_tile in range(k_size // tk):
                # views of a and b that start at this block's current tiles
                gmem_a = a[blockIdx.y * tm: , k_tile * tk: ]
                gmem_b = b[k_tile * tk: , blockIdx.x * tn: ]

                # load data from global memory to shared memory
                # linear thread index inside the block: a row of threads is
                # blockDim.x (= tn) wide, so that is the stride for threadIdx.y
                tid = threadIdx.x + threadIdx.y * blockDim.x

                # the composed mapping spans the (tm, tk) tile of a, shared
                # among the tm * tn threads of the block
                for i, k in repeat(1, tk // tn).spatial(tm, tn).on(tid):
                    smem_a[i, k] = gmem_a[i, k]

                # likewise for the (tk, tn) tile of b
                for k, j in repeat(tk // tm, 1).spatial(tm, tn).on(tid):
                    smem_b[k, j] = gmem_b[k, j]

                # wait until both tiles are completely loaded
                syncthreads()

                # compute
                for k in range(tk):
                    acc += smem_a[threadIdx.y, k] * smem_b[k, threadIdx.x]
                # do not overwrite the tiles while other threads still read them
                syncthreads()

            # write result
            gi, gj = blockIdx.y * tm + threadIdx.y, blockIdx.x * tn + threadIdx.x
            c[gi, gj] = acc

    ir_module = script_module.ir_module()
    add_packed_func(ir_module, func=kernel, pack_func_name='matmul')
    return hidet.driver.build_ir_module(ir_module, func_name='matmul')


# Square 1024 x 1024 x 1024 problem used for the rest of the notebook.
m_size = n_size = k_size = 1024

# Build the kernel once, then dump the generated source with syntax colouring.
matmul = matmul_func(m_size, n_size, k_size)
print(matmul.source(color=True))

[38;5;64m#[39m[38;5;64minclude[39m[38;5;250m [39m[38;5;248;03m<stdint.h>[39;00m
[38;5;64m#[39m[38;5;64minclude[39m[38;5;250m [39m[38;5;248;03m<cuda_fp16.h>[39;00m
[38;5;64m#[39m[38;5;64minclude[39m[38;5;250m [39m[38;5;248;03m<cuda_bf16.h>[39;00m
[38;5;64m#[39m[38;5;64minclude[39m[38;5;250m [39m[38;5;248;03m<hidet/runtime/cuda_context.h>[39;00m
[38;5;64m#[39m[38;5;64minclude[39m[38;5;250m [39m[38;5;248;03m<hidet/runtime/cpu_context.h>[39;00m
[38;5;19mtypedef[39m[38;5;250m [39m[38;5;37mfloat[39m[38;5;250m [39mtfloat32_t;
[38;5;64m#[39m[38;5;64mdefine __float_to_tf32(x) (x)[39m
[38;5;19mextern[39m[38;5;250m [39m[38;5;130m"[39m[38;5;130mC[39m[38;5;130m"[39m[38;5;250m [39m{

[38;5;19m__global__[39m[38;5;250m [39m[38;5;37mvoid[39m[38;5;250m [39m__launch_bounds__([38;5;30m1024[39m)[38;5;250m [39mhidet_kernel([38;5;37mfloat[39m[38;5;250m [39m*[38;5;250m [39m[38;5;37m__restrict__[39m[38;5;250m [39ma,[38;5;250

In [19]:

# Random inputs and an uninitialised output buffer, each moved with .cuda().
a = hidet.randn([m_size, k_size]).cuda()
b = hidet.randn([k_size, n_size]).cuda()
c = hidet.empty([m_size, n_size]).cuda()

# Run the generated kernel once; it writes the product into c.
matmul(a, b, c)

# Reference result computed by NumPy on host copies of the same inputs.
np_a, np_b = a.cpu().numpy(), b.cpu().numpy()
np_c = np_a @ np_b

# Compare the kernel output with the reference; raises AssertionError on mismatch.
numpy.testing.assert_allclose(
    actual=c.cpu().numpy(),
    desired=np_c,
    rtol=1e-4,
    atol=1e-4,
)
print('Correctness: Pass')

# Time repeated launches of the kernel on the buffers created above.
latency = hidet.utils.benchmark_func(
    lambda: matmul(a, b, c),
    number=20,
    repeat=20,
)
print('    Latency: {:.2f} ms'.format(latency))

Correctness: Pass
    Latency: 0.69 ms
