In [1]:
import triton 
import torch
import triton.language as tl

In [2]:
@triton.jit
def addrelu(inptr1, outptr, m, blocksize: tl.constexpr):
    """Elementwise ReLU kernel: out[i] = in[i] if in[i] > 0 else 0.

    Despite the name, no addition is performed; each value is only clamped
    at zero.

    Args:
        inptr1: pointer to the first input element.
        outptr: pointer to the first output element.
        m: number of elements to process.
        blocksize: compile-time count of elements handled by one program.
    """
    pid = tl.program_id(0)
    # Each program covers the contiguous index range
    # [pid * blocksize, (pid + 1) * blocksize).
    idx = blocksize * pid + tl.arange(0, blocksize)
    # Lanes at or beyond m are excluded from both the load and the store.
    in_bounds = idx < m
    x = tl.load(inptr1 + idx, mask=in_bounds)
    clamped = tl.where(x > 0, x, 0)
    tl.store(outptr + idx, clamped, mask=in_bounds)

In [5]:
def test():
    """Launch `addrelu` on a random CUDA vector and check it against torch.relu.

    Prints the input and the kernel output, then raises AssertionError if
    the output differs from the PyTorch reference.
    """
    # 500 is deliberately not a multiple of the block size, so the last
    # program runs with part of its lanes masked off.
    m = 500
    vector_a = torch.randn(m, device='cuda', dtype=torch.float32)
    vector_b = torch.zeros_like(vector_a)
    blocksize = 128
    # One program per block of `blocksize` elements, rounded up.
    noofblock = triton.cdiv(m, blocksize)
    addrelu[(noofblock,)](vector_a, vector_b, m, blocksize)
    print(vector_a)
    print(vector_b)

    # ReLU only selects between the input value and zero, so the result
    # must match the reference exactly (no floating-point tolerance).
    expected = torch.relu(vector_a)
    torch.testing.assert_close(vector_b, expected, rtol=0, atol=0)

    
    

In [6]:
# Run the check only when this code is executed directly (a notebook cell or
# a script), not when it is imported as a module.
if __name__ == "__main__":
    test()

tensor([-1.2105, -0.3093,  0.3727,  1.6774, -2.1794,  0.3832,  0.5094,  0.1192,
        -1.0801,  0.4817,  1.6915, -3.2598,  0.7989, -0.5359,  0.6640,  0.3425,
        -0.4539, -1.1492, -0.0449,  0.4946, -0.0781,  0.1905, -0.1515, -1.2684,
        -1.5063,  0.6914,  0.0051,  0.4601,  1.0120, -0.1773, -0.9306, -0.8558,
         0.1983, -0.8884,  0.2512, -0.0525,  0.1433,  0.1707, -1.5819, -1.1430,
        -1.4076, -0.4468,  0.6361,  0.7358,  0.5840, -0.9123, -1.2743, -0.8465,
        -1.1013, -0.4737,  0.6564,  1.6716, -0.7081, -1.2951,  0.7579,  0.7095,
         0.4827, -0.6133, -0.9323, -2.0065,  0.2352, -0.5849, -0.0048,  1.6966,
        -0.3938,  1.0447,  0.8331, -1.7182,  0.1041,  0.5254,  1.6273, -1.5504,
        -2.1668, -1.5960, -0.7003,  1.4643,  0.9674, -0.2163, -0.9116, -1.6568,
         1.3556, -1.4203,  1.1190, -1.8048, -0.9675, -0.9038, -0.8760, -0.0487,
         1.7397,  0.2841,  0.7566, -0.9278, -0.7687, -0.6032, -0.6101,  0.5778,
        -0.6079, -0.5309,  1.4584, -0.06