In [1]:
import logging

import helion
import helion.language as hl
import torch
from torch import Tensor

# Root logger level for this notebook. Per the original note, switching this
# to logging.INFO makes the generated Triton code show up in the cell output.
logging.getLogger().setLevel(logging.WARNING)

In [2]:
from triton.testing import do_bench
def test_kernel(kernel_fn, spec_fn, *args):
    """Test a Helion kernel against a reference implementation."""
    # Run our implementation
    result = kernel_fn(*args)
    # Run reference implementation
    expected = spec_fn(*args)

    # Check if results match
    torch.testing.assert_close(result, expected)
    print("✅ Results Match ✅")

def benchmark_kernel(kernel_fn, *args, **kwargs):
    """Time a single kernel with ``do_bench`` and print the measurement.

    The benchmark harness receives a zero-argument callable that invokes
    ``kernel_fn(*args, **kwargs)``; whatever ``do_bench`` returns is printed
    as a duration in milliseconds. Nothing is returned.
    """

    def run_kernel():
        # Closure binding the caller's arguments for the harness.
        return kernel_fn(*args, **kwargs)

    elapsed_ms = do_bench(run_kernel)
    print(f"⏱ Time: {elapsed_ms} ms")

def compare_implementations(kernel_fn, spec_fn, *args, **kwargs):
    """Time a kernel and its reference back to back and print a summary line.

    Each callable is benchmarked through ``do_bench`` with the same
    ``*args``/``**kwargs`` (kernel first, reference second). The printed
    speedup is the reference time divided by the kernel time, so values above
    1 mean the kernel was faster. Nothing is returned.
    """
    helion_ms = do_bench(lambda: kernel_fn(*args, **kwargs))
    reference_ms = do_bench(lambda: spec_fn(*args, **kwargs))
    speedup = reference_ms / helion_ms
    print(
        f"⏱ Helion Kernel Time: {helion_ms:.3f} ms, "
        f"PyTorch Reference Time: {reference_ms:.3f} ms, "
        f"Speedup: {speedup:.3f}x"
    )

In [None]:
@helion.kernel(config=helion.Config(block_sizes=[128, 128]))
def example_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Element-wise sum of two 2-D tensors, tiled over both dimensions.

    The kernel is pinned to a fixed config (``block_sizes=[128, 128]``).
    The result buffer is allocated with ``torch.empty_like(x)``, so it takes
    its shape, dtype and device from ``x``.
    """
    rows, cols = x.size()
    total = torch.empty_like(x)
    for row_tile, col_tile in hl.tile([rows, cols]):
        lhs = x[row_tile, col_tile]
        rhs = y[row_tile, col_tile]
        total[row_tile, col_tile] = lhs + rhs
    return total

# Create some sample data: two unseeded random 10x10 tensors on the GPU
# (much smaller than the 128x128 block size configured on the kernel).
x = torch.randn(10, 10, device="cuda")
y = torch.randn(10, 10, device="cuda")

# Run the kernel
result = example_add(x, y)

# Verify result against plain PyTorch addition
expected = x + y
torch.testing.assert_close(result, expected)
print("✅ Results Match ✅")
# Time the kernel on its own, then side by side with torch.add
benchmark_kernel(example_add, x, y)
compare_implementations(example_add, torch.add, x, y)

✅ Results Match ✅
⏱ Time: 0.006967028159056312 ms
⏱ Helion Kernel Time: 0.007 ms, PyTorch Reference Time: 0.006 ms, Speedup: 0.907x


In [4]:
@helion.kernel()
def example_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Element-wise sum of two 2-D tensors, tiled over both dimensions.

    Same computation as the fixed-config version, but the decorator is given
    no config here, so the tile sizes are not hard-coded in this definition.
    The result buffer mirrors ``x`` (``torch.empty_like``).
    """
    n_rows, n_cols = x.size()
    summed = torch.empty_like(x)
    for tile_r, tile_c in hl.tile([n_rows, n_cols]):
        x_block = x[tile_r, tile_c]
        y_block = y[tile_r, tile_c]
        summed[tile_r, tile_c] = x_block + y_block
    return summed

In [5]:
# Fresh unseeded 10x10 inputs on the GPU (rebinds the notebook-level x and y).
x = torch.randn(10, 10, device="cuda")
y = torch.randn(10, 10, device="cuda")

# Run the kernel. example_add was declared without a config, and the log
# printed under this cell shows the autotuning search running here.
result = example_add(x, y)

# Verify result against plain PyTorch addition
expected = x + y
torch.testing.assert_close(result, expected)
print("✅ Results Match ✅")
# Time the kernel on its own, then side by side with torch.add
benchmark_kernel(example_add, x, y)
compare_implementations(example_add, torch.add, x, y)

[0s] Autotune random seed: 499402173
[0s] Starting autotuning process, this may take a while...
[0s] Starting PatternSearch with initial_population=100, copies=5, max_generations=20


[25s] Initial random population of 100, 5 starting points: ok=100 min=0.0051 mid=0.0061 max=0.0072 best=Config(block_sizes=[1, 16], flatten_loops=[True], indexing='block_ptr', l2_groupings=[32], load_eviction_policies=['', ''], loop_orders=[[0, 1]], num_stages=2, num_warps=8, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[0], range_unroll_factors=[0], range_warp_specializes=[])
[25s] Generation 1 starting: 115 neighbors, 5 active search path(s)


[55s] Generation 1 complete: ok=120 min=0.0051 mid=0.0072 max=0.0072 best=Config(block_sizes=[2, 16], flatten_loops=[True], indexing='block_ptr', l2_groupings=[32], load_eviction_policies=['', ''], loop_orders=[[0, 1]], num_stages=2, num_warps=8, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[0], range_unroll_factors=[0], range_warp_specializes=[])
[55s] Generation 2 starting: 106 neighbors, 5 active search path(s)


[82s] Generation 2 complete: ok=111 min=0.0061 mid=0.0061 max=0.0072 best=Config(block_sizes=[2, 16], flatten_loops=[True], indexing='block_ptr', l2_groupings=[32], load_eviction_policies=['', ''], loop_orders=[[0, 1]], num_stages=2, num_warps=8, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[0], range_unroll_factors=[0], range_warp_specializes=[])
[82s] Autotuning complete in 83.0s after searching 321 configs.
One can hardcode the best config and skip autotuning with:
    @helion.kernel(config=helion.Config(block_sizes=[2, 16], flatten_loops=[True], indexing='block_ptr', l2_groupings=[32], load_eviction_policies=['', ''], loop_orders=[[0, 1]], num_stages=2, num_warps=8, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[0], range_unroll_factors=[0], range_warp_specializes=[]), static_shapes=True)



✅ Results Match ✅
⏱ Time: 0.006280533405434754 ms
⏱ Helion Kernel Time: 0.006 ms, PyTorch Reference Time: 0.006 ms, Speedup: 1.001x


## PUZZLE 1: CONSTANT ADD

In [10]:

def add_spec(x: Tensor) -> Tensor:
    """Reference for the constant-add puzzle: return ``x`` with 10.0 added to every element."""
    offset = 10.0
    return x + offset

# ---- ✨ Is this the best block size? ----
@helion.kernel(config=helion.Config(block_sizes=[1]))
def add_kernel(x: torch.Tensor) -> torch.Tensor:
    """Add the constant 10 to ``x``, tiling over dimension 0.

    This variant is pinned to a block size of 1. The result buffer is
    allocated with ``torch.empty_like(x)``.
    """
    # ---- ✨ Your Code Here ✨----
    length = x.size(0)
    # Output buffer that is filled tile by tile and returned
    shifted = torch.empty_like(x)
    # Use Helion to tile the computation
    for tile in hl.tile(length):
        shifted[tile] = x[tile] + 10

    return shifted

# Test the kernel on an unseeded random vector of 8192 elements (rebinds the
# notebook-level x), then time it alone and against the reference spec.
x = torch.randn(8192, device="cuda")
test_kernel(add_kernel, add_spec, x)
benchmark_kernel(add_kernel, x)
compare_implementations(add_kernel, add_spec, x)

✅ Results Match ✅
⏱ Time: 0.01255107654364613 ms
⏱ Helion Kernel Time: 0.012 ms, PyTorch Reference Time: 0.006 ms, Speedup: 0.507x


In [13]:

def add_spec(x: Tensor) -> Tensor:
    """Spec for the Helion kernel defined below: every element of ``x`` plus 10.0."""
    ten = 10.0
    result = x + ten
    return result

# ---- ✨ Is this the best block size? ----
@helion.kernel(config=helion.Config(block_sizes=[128]))
def add_kernel(x: torch.Tensor) -> torch.Tensor:
    """Add the constant 10 to ``x``, tiling over dimension 0.

    This variant is pinned to a block size of 128. The result buffer is
    allocated with ``torch.empty_like(x)``.
    """
    # ---- ✨ Your Code Here ✨----
    num_elements = x.size(0)
    # Buffer the kernel writes into and hands back to the caller
    plus_ten = torch.empty_like(x)
    # Use Helion to tile the computation
    for chunk in hl.tile(num_elements):
        plus_ten[chunk] = x[chunk] + 10

    return plus_ten

# Test the kernel on an unseeded random vector of 8192 elements (rebinds the
# notebook-level x), then time it alone and against the reference spec.
x = torch.randn(8192, device="cuda")
test_kernel(add_kernel, add_spec, x)
benchmark_kernel(add_kernel, x)
compare_implementations(add_kernel, add_spec, x)

✅ Results Match ✅
⏱ Time: 0.01174441568081739 ms
⏱ Helion Kernel Time: 0.006 ms, PyTorch Reference Time: 0.006 ms, Speedup: 1.004x


## PUZZLE 2: OUTER VECTOR ADD

In [15]:
def broadcast_add_spec(x: Tensor, y: Tensor) -> Tensor:
    """Reference outer sum: ``x`` is laid out as a row and ``y`` as a column before adding.

    For 1-D inputs the result has shape ``(len(y), len(x))`` with entry
    ``[i, j]`` equal to ``x[j] + y[i]``.
    """
    as_row = x[None, :]
    as_column = y[:, None]
    return as_row + as_column

# ---- ✨ Is this the best block size? ----
@helion.kernel(config=helion.Config(block_sizes=[32, 32]))
def broadcast_add_kernel(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Outer sum of two vectors: ``out[i, j] = y[i] + x[j]``.

    The output has ``y.size(0)`` rows and ``x.size(0)`` columns and is
    allocated with ``x.new_empty``. The kernel is pinned to 32x32 tiles.
    """
    # ---- ✨ Your Code Here ✨----
    num_cols = x.size(0)
    num_rows = y.size(0)
    outer_sum = x.new_empty(num_rows, num_cols)

    # Use Helion to tile the computation
    for row_tile, col_tile in hl.tile([num_rows, num_cols]):
        # Slice of y feeding these rows, slice of x feeding these columns
        y_block = y[row_tile]
        x_block = x[col_tile]
        # Column vector + row vector broadcasts to the full tile
        outer_sum[row_tile, col_tile] = y_block[:, None] + x_block[None, :]

    return outer_sum

# Test the kernel with unseeded random vectors of different lengths (1142 and
# 512); this rebinds the notebook-level x and y. Note that 1142 is not a
# multiple of the 32-wide tile configured on the kernel.
x = torch.randn(1142, device="cuda")
y = torch.randn(512, device="cuda")
test_kernel(broadcast_add_kernel, broadcast_add_spec, x, y)
benchmark_kernel(broadcast_add_kernel, x, y)
compare_implementations(broadcast_add_kernel, broadcast_add_spec, x, y)

✅ Results Match ✅
⏱ Time: 0.007596276778106888 ms
⏱ Helion Kernel Time: 0.008 ms, PyTorch Reference Time: 0.009 ms, Speedup: 1.047x


## PUZZLE 3: FUSED OUTER MULTIPLICATION

In [17]:
def mul_relu_block_spec(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Reference fused outer product + ReLU.

    ``x`` is laid out as a row and ``y`` as a column, the two are multiplied
    with broadcasting, and negative products are clamped to zero by ReLU.
    """
    as_row = x[None, :]
    as_column = y[:, None]
    outer_product = as_row * as_column
    return torch.relu(outer_product)

@helion.kernel(config=helion.Config(block_sizes=[32, 32]))
def mul_relu_block_kernel(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Outer product of two vectors followed by ReLU: ``out[i, j] = relu(x[j] * y[i])``.

    The output has ``y.size(0)`` rows and ``x.size(0)`` columns and is
    allocated with ``x.new_empty``. The kernel is pinned to 32x32 tiles.
    """
    num_cols = x.size(0)
    num_rows = y.size(0)
    activated = x.new_empty(num_rows, num_cols)

    for row_tile, col_tile in hl.tile([num_rows, num_cols]):
        y_block = y[row_tile]
        x_block = x[col_tile]
        # Row vector * column vector broadcasts to the full tile before ReLU
        product = x_block[None, :] * y_block[:, None]
        activated[row_tile, col_tile] = torch.relu(product)

    return activated

# Test the kernel with two unseeded random 512-element vectors (rebinds the
# notebook-level x and y), then time it against the reference spec.
x = torch.randn(512, device="cuda")
y = torch.randn(512, device="cuda")
test_kernel(mul_relu_block_kernel, mul_relu_block_spec, x, y)
compare_implementations(mul_relu_block_kernel, mul_relu_block_spec, x, y)

✅ Results Match ✅
⏱ Helion Kernel Time: 0.008 ms, PyTorch Reference Time: 0.011 ms, Speedup: 1.520x
