In [2]:
import triton 
import torch
import triton.language as tl

In [9]:
@triton.jit
def add2vector(inputptr1, inputptr2, outputptr, n, block_size: tl.constexpr):
    """Element-wise addition kernel: output[i] = input1[i] + input2[i] for i < n.

    Each program instance (indexed along grid axis 0) handles one run of
    ``block_size`` consecutive elements.

    Args:
        inputptr1: Base pointer of the first operand.
        inputptr2: Base pointer of the second operand.
        outputptr: Base pointer of the destination buffer.
        n: Number of valid elements; lanes at or beyond ``n`` are masked off
            in both loads and in the store.
        block_size: Compile-time number of elements handled per program.
    """
    pid = tl.program_id(axis=0)

    # Element indices owned by this program instance.
    idx = pid * block_size + tl.arange(0, block_size)

    # The last program may run past the end of the data; guard every access.
    in_bounds = idx < n

    lhs = tl.load(inputptr1 + idx, mask=in_bounds)
    rhs = tl.load(inputptr2 + idx, mask=in_bounds)
    tl.store(outputptr + idx, lhs + rhs, mask=in_bounds)
    

In [11]:
def test():
    """Run ``add2vector`` on two random CUDA vectors and verify the result.

    Allocates two float32 vectors of one million elements on the GPU, launches
    the kernel with one program per ``BLOCK_SIZE`` elements, checks the output
    against PyTorch's own element-wise sum, and prints the output tensor.

    Raises:
        AssertionError: If the kernel output does not match ``data1 + data2``.
    """
    n = 1000000
    data1 = torch.randn(n, device='cuda', dtype=torch.float32)
    data2 = torch.randn(n, device='cuda', dtype=torch.float32)

    output2 = torch.zeros_like(data1)
    BLOCK_SIZE = 256
    # cdiv rounds up so the final, partially filled block is still launched;
    # the kernel masks the lanes that fall past n.
    num_blocks = triton.cdiv(n, BLOCK_SIZE)
    add2vector[(num_blocks,)](data1, data2, output2, n, block_size=BLOCK_SIZE)

    # Without a reference comparison this function would pass even if the
    # kernel wrote garbage, so check against PyTorch's addition.
    torch.testing.assert_close(output2, data1 + data2)
    print(f"New way result: {output2}")

# Entry point: run the demo when this code is executed directly (as a script,
# or in a notebook cell, where __name__ is also "__main__").
if __name__ == "__main__":
    test()

New way result: tensor([ 1.2217, -0.4200, -2.3043,  ..., -0.3393,  1.6395,  1.3889],
       device='cuda:0')
