In [1]:
import triton 
import torch
import triton.language as tl

In [3]:
@triton.jit
def addrelu(inptr1,outptr,m,alpha,blocksize:tl.constexpr):
    """Element-wise kernel: write ``x`` where ``x > 0``, otherwise ``x * alpha``.

    Each program instance handles one contiguous chunk of ``blocksize``
    elements, selected by its index along launch axis 0.

    Args:
        inptr1: pointer to the first input element.
        outptr: pointer to the first output element.
        m: number of valid elements; offsets ``>= m`` are masked out.
        alpha: factor applied to elements that are not strictly positive.
        blocksize: compile-time number of elements handled per program.
    """
    pid = tl.program_id(0)
    # Element indices covered by this program instance.
    idx = pid * blocksize + tl.arange(0, blocksize)
    # The last chunk may run past the end of the data; guard those lanes.
    in_bounds = idx < m
    x = tl.load(inptr1 + idx, mask=in_bounds)
    y = tl.where(x > 0, x, x * alpha)
    # The same mask guards the store, so masked-out lanes are never written.
    tl.store(outptr + idx, y, mask=in_bounds)

In [4]:
def test(m=500, alpha=0.01, blocksize=128):
    """Run ``addrelu`` on random CUDA data and verify it against PyTorch.

    Fills a float32 vector of length ``m`` with random values, launches the
    kernel with one program per ``blocksize`` elements, prints the input and
    the output, and then asserts that the output matches the same formula
    evaluated with ``torch.where``.

    Args:
        m: number of elements in the test vector.
        alpha: factor the kernel applies to non-positive elements.
        blocksize: elements per program instance. NOTE(review): ``tl.arange``
            is documented to require a power-of-two extent -- keep this a
            power of two.

    Raises:
        AssertionError: if the kernel output differs from the reference.
    """
    vector_a = torch.randn(m, device='cuda', dtype=torch.float32)
    vector_b = torch.zeros_like(vector_a)
    # cdiv rounds up so a partial final chunk still gets a program instance.
    noofblock = triton.cdiv(m, blocksize)
    addrelu[(noofblock,)](vector_a, vector_b, m, alpha, blocksize)
    print(vector_a)
    print(vector_b)
    # Reference result: same select/scale formula as the kernel, in PyTorch.
    expected = torch.where(vector_a > 0, vector_a, vector_a * alpha)
    assert torch.allclose(vector_b, expected), \
        "addrelu output does not match the PyTorch reference"

    
    

In [5]:
# Run the demo only when this code executes as the main module.
if __name__ == "__main__":
    test()

tensor([-2.2943e+00, -1.7625e-01,  3.5393e-01,  7.3282e-01, -9.9733e-01,
        -4.7105e-01, -8.4926e-02,  3.0288e-02,  1.0693e+00,  1.8056e+00,
         1.7899e+00,  7.4371e-01,  5.1959e-02,  4.4756e-01,  1.1062e+00,
         8.4512e-02, -2.8226e-01, -3.6292e-01, -1.0222e-01,  5.3512e-01,
         1.5349e+00,  1.4131e+00, -1.0352e+00, -1.1144e-01, -6.4032e-01,
         5.1725e-01, -3.6418e-01,  1.6522e-03,  1.0524e+00, -3.6783e-01,
        -2.3976e-02, -9.8250e-01, -1.0825e+00, -2.9201e-01,  1.8058e+00,
        -1.3402e+00, -4.9282e-01,  1.4332e-01,  1.3459e-01, -2.9321e-01,
         6.5894e-01, -1.6069e+00,  1.1110e+00,  1.1631e+00, -1.4599e-01,
        -8.1063e-01, -6.0599e-02,  1.6416e-01, -2.0047e+00,  1.5046e-01,
        -1.4963e+00,  8.1861e-01,  7.8490e-01,  1.9784e-01,  2.5576e-01,
        -5.7283e-01, -1.9473e+00,  5.9864e-01, -9.7368e-01,  1.1494e+00,
        -3.6365e-01, -1.1934e+00, -1.4670e+00, -8.0259e-01, -1.8496e-01,
         1.1241e+00, -2.6499e-01,  4.2589e-01,  6.4