# Vector Addition

In this example, we will learn how to implement a vector addition kernel in Hidet Script, build it into a callable function, and run it on CUDA tensors.


In [1]:
import hidet

def vector_addition(n):
    """Build a callable that launches a CUDA vector-addition kernel.

    A hidet script kernel computing ``c[i] = a[i] + b[i]`` over three
    ``f32[n]`` buffers is defined, wrapped in a packed function named
    ``'add'`` and built with ``hidet.driver.build_ir_module``.

    Parameters
    ----------
    n
        Number of elements in each vector. It is captured by the kernel
        definition, so it fixes both the parameter types (``f32[n]``) and
        the launch configuration. Expected to be a positive Python int.

    Returns
    -------
    The result of ``hidet.driver.build_ir_module`` for the packed function
    ``'add'``; it is invoked as ``add(a, b, c)``.
    """
    # the hidet.lang module is the submodule that implements hidet script
    from hidet.lang import attr, f32

    # import cuda specific extern variables
    from hidet.lang.cuda import threadIdx, blockIdx, blockDim

    from hidet.transforms.tools import add_packed_func

    with hidet.script_module() as script_module:

        @hidet.script
        def kernel(a: f32[n], b: f32[n], c: f32[n]):

            # mark this function as a cuda kernel
            attr.func_kind = 'cuda_kernel'

            # set the block dimension and grid dimensions:
            # 256 threads per block and ceil(n / 256) blocks. Floor division
            # is required here: with a Python int `n`, true division (`/`)
            # would produce a fractional block count such as 1.015625.
            attr.cuda_block_dim = 256
            attr.cuda_grid_dim = (n + 255) // 256

            # get the index of the thread among all threads
            idx = threadIdx.x + blockIdx.x * blockDim.x

            # the last block may contain threads past the end of the vectors
            if idx < n:
                c[idx] = a[idx] + b[idx]

    ir_module = script_module.ir_module()

    # because we cannot run the cuda kernel directly, we create a packed function to launch it
    add_packed_func(ir_module, func=kernel, pack_func_name='add')

    return hidet.driver.build_ir_module(ir_module, func_name='add')

# Problem size: each vector holds 5 elements.
n = 5

# Build the launcher for vectors of that length; the next cell calls it.
add_func = vector_addition(n=n)

# Display the generated source; color=True emits ANSI color escape codes.
print(add_func.source(color=True))

#include <stdint.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include <hidet/runtime/cuda_context.h>
#include <hidet/runtime/cpu_context.h>
typedef float tfloat32_t;
#define __float_to_tf32(x) (x)
extern "C" {

__global__ void __launch_bounds__(256) hidet_kernel(float * __restrict__ a,

In [2]:
# Two input vectors of small integer values (drawn with low=0, high=3),
# converted to float32 and moved to the GPU. `a` is generated before `b`.
a, b = (
    hidet.randint(low=0, high=3, shape=[n]).to('float32').cuda()
    for _ in range(2)
)

# Output buffer; its initial random contents are overwritten by the kernel.
c = hidet.randn([n]).cuda()

# Show the inputs, one tensor per print record.
print(a, b, sep='\n')

# Launch the kernel through the packed function, then show the result.
add_func(a, b, c)
print(c)

Tensor(shape=(5,), dtype='float32', device='cuda:0')
[2. 0. 2. 0. 1.]
Tensor(shape=(5,), dtype='float32', device='cuda:0')
[0. 1. 1. 2. 2.]
Tensor(shape=(5,), dtype='float32', device='cuda:0')
[2. 1. 3. 2. 3.]
