In [1]:
import triton 
import torch
import triton.language as tl

In [13]:
@triton.jit
def addrelu(inptr1, inptr2, outptr, m, blocksize: tl.constexpr):
    """Fused elementwise add followed by ReLU.

    For every index i < m, writes to outptr[i] the value inptr1[i] + inptr2[i]
    when that sum is strictly positive, and 0 otherwise.

    Args:
        inptr1: base address of the first input buffer.
        inptr2: base address of the second input buffer.
        outptr: base address of the output buffer.
        m: number of valid elements; indices >= m are neither read nor written.
        blocksize: compile-time number of elements handled by one program
            instance.
    """
    # Index of this program instance along launch axis 0.
    pid = tl.program_id(0)
    # Element indices owned by this instance: one contiguous run of
    # `blocksize` entries starting at pid * blocksize.
    idx = pid * blocksize + tl.arange(0, blocksize)
    # The last instance may extend past the end of the data, so every memory
    # access below is restricted to lanes with idx < m.
    in_bounds = idx < m
    lhs = tl.load(inptr1 + idx, mask=in_bounds)
    rhs = tl.load(inptr2 + idx, mask=in_bounds)
    total = lhs + rhs
    # ReLU: keep strictly positive sums, replace everything else with 0.
    activated = tl.where(total > 0, total, 0)
    tl.store(outptr + idx, activated, mask=in_bounds)

In [54]:
def test(m=500, blocksize=128):
    """Run `addrelu` on two random CUDA vectors and check it against PyTorch.

    Two float32 vectors of length `m` are drawn from a standard normal
    distribution on the CUDA device, the kernel is launched with enough
    program instances to cover all `m` elements, and the inputs and the
    result are printed. The result is then compared with
    `torch.relu(vector_a + vector_b)`.

    Args:
        m: number of elements in each vector (default 500, deliberately not
            a multiple of the default blocksize so the masked tail is
            exercised).
        blocksize: number of elements handled by each kernel program
            instance (default 128).

    Raises:
        AssertionError: if the kernel output does not match the PyTorch
            reference.
    """
    vector_a = torch.randn(m, device='cuda', dtype=torch.float32)
    vector_b = torch.randn(m, device='cuda', dtype=torch.float32)
    vector_c = torch.zeros_like(vector_a)
    # Ceiling division: one program instance per `blocksize` chunk, with an
    # extra instance for any partial chunk at the end.
    noofblock = triton.cdiv(m, blocksize)
    addrelu[(noofblock,)](vector_a, vector_b, vector_c, m, blocksize)
    print(vector_a)
    print(vector_b)
    print(vector_c)

    # Printing alone verifies nothing; compare against the PyTorch reference
    # so a wrong kernel fails loudly instead of relying on eyeballing output.
    expected = torch.relu(vector_a + vector_b)
    torch.testing.assert_close(vector_c, expected)

    
    

In [55]:
# Entry point: run the kernel demo when this code is executed directly
# (as a script or as a notebook cell) rather than imported.
if __name__ == "__main__":
    test()

tensor([ 2.3215, -0.0299, -0.9560,  0.1787,  0.2025, -0.1617,  0.7445, -0.1346,
         0.2441, -0.4707,  0.0543,  1.3179, -0.5354,  0.5873, -1.1280, -0.5593,
         1.0811, -0.2767, -0.5366, -0.1407,  0.0572,  1.1267,  1.0083,  0.8906,
        -0.1748, -1.7069, -0.7855,  0.7537, -0.4147,  1.5721, -0.6286,  0.2159,
         1.8299, -0.5284,  3.6455, -0.5095,  0.8809,  1.1681,  2.7848, -1.0551,
        -1.6946,  0.4969,  0.3774, -0.6015, -2.1354,  0.3633,  1.0295, -0.7661,
        -0.5028,  0.3671, -0.2926, -0.6580,  0.0230, -1.6875,  1.3753, -1.4851,
         0.2391, -0.1628, -1.1032, -0.6824,  0.9899, -0.7036, -0.6938, -1.3151,
         0.0119,  0.2264,  0.7243, -0.3058, -0.0676, -0.4264, -1.2726, -1.1453,
        -0.4714, -0.5245, -1.3367,  1.0400, -0.7517,  1.5651, -0.3441,  0.0561,
        -1.1072,  0.3202,  1.3057,  0.3557,  1.0742, -0.9793, -1.9338,  1.2487,
         0.5232, -1.0249, -1.3499, -0.8370,  0.9949, -0.2556, -0.2703, -1.3451,
        -0.6342, -0.6660, -1.0141, -0.93