In [None]:
import triton
import triton.language as tl

@triton.jit
def conv1d_kernel(
    input_ptr, kernel_ptr, output_ptr,
    input_size, kernel_size,
    BLOCK_SIZE: tl.constexpr
):
    """Compute a block of outputs of a 1-D sliding dot product (no padding).

    For every output index ``i`` in ``[0, input_size - kernel_size + 1)``
    this writes ``sum_j input[i + j] * kernel[j]`` for ``j`` in
    ``[0, kernel_size)``. The kernel taps are applied in storage order
    (they are not reversed).

    Each program instance handles ``BLOCK_SIZE`` consecutive output indices,
    selected by ``tl.program_id(0)``.

    Args:
        input_ptr: Input buffer; reinterpreted here as a float32 pointer.
        kernel_ptr: Kernel taps; reinterpreted here as a float32 pointer.
        output_ptr: Output buffer; reinterpreted here as a float32 pointer.
        input_size: Number of elements in the input.
        kernel_size: Number of kernel taps.
        BLOCK_SIZE: Compile-time number of outputs per program instance.
    """
    # Reinterpret the incoming arguments as float32 pointers before doing
    # any address arithmetic on them.
    src = input_ptr.to(tl.pointer_type(tl.float32))
    taps = kernel_ptr.to(tl.pointer_type(tl.float32))
    dst = output_ptr.to(tl.pointer_type(tl.float32))

    # Output indices owned by this program instance.
    n_outputs = input_size - kernel_size + 1
    out_idx = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    in_range = out_idx < n_outputs

    acc = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
    for tap in range(0, kernel_size):
        # Lanes outside the mask are not given a fill value; that is fine
        # because the final store applies the same mask and drops them.
        window = tl.load(src + out_idx + tap, mask=in_range)
        weight = tl.load(taps + tap)
        acc = acc + window * weight

    tl.store(dst + out_idx, acc, mask=in_range)


def solve(input_ptr: int, kernel_ptr: int, output_ptr: int, input_size: int, kernel_size: int) -> None:
    """Launch ``conv1d_kernel`` over every valid output position.

    The number of outputs is ``input_size - kernel_size + 1``. When that
    count is zero or negative (the kernel is longer than the input) there is
    nothing to compute and the function returns without launching anything.

    Args:
        input_ptr: Address of the input buffer, forwarded to the kernel.
        kernel_ptr: Address of the kernel taps, forwarded to the kernel.
        output_ptr: Address of the output buffer, forwarded to the kernel.
        input_size: Number of elements in the input.
        kernel_size: Number of kernel taps.
    """
    n_outputs = input_size - kernel_size + 1
    if n_outputs <= 0:
        # Without this guard the ceil-division below would produce a zero or
        # negative grid size, and the kernel would still be invoked.
        return

    BLOCK_SIZE = 1024
    # One program instance per BLOCK_SIZE outputs, rounding up so that the
    # tail is covered; the kernel masks off the excess lanes.
    grid = (triton.cdiv(n_outputs, BLOCK_SIZE),)

    conv1d_kernel[grid](
        input_ptr, kernel_ptr, output_ptr,
        input_size, kernel_size,
        BLOCK_SIZE=BLOCK_SIZE,
    )