<a href="https://colab.research.google.com/github/AndreSlavescu/triton-exercises/blob/main/triton_tutorials.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install triton torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
# Vector Addition

import torch
import triton
import triton.language as tl
import triton.testing as tt

@triton.jit
def add_kernel(
    x,
    y,
    out,
    n_elems,
    BLOCK_SIZE: tl.constexpr
):
  """Write the elementwise sum of `x` and `y` into `out`.

  Each program instance handles one run of BLOCK_SIZE consecutive indices,
  selected by its position along launch axis 0. `x`, `y` and `out` are
  indexed as flat buffers (base pointer + element index), and only indices
  below `n_elems` are read or written.
  """
  # Indices owned by this program: [pid * BLOCK_SIZE, (pid + 1) * BLOCK_SIZE).
  lane = tl.arange(0, BLOCK_SIZE)
  idx = tl.program_id(axis=0) * BLOCK_SIZE + lane

  # The last program may extend past the end of the data; mask those lanes off.
  in_bounds = idx < n_elems
  lhs = tl.load(x + idx, mask=in_bounds)
  rhs = tl.load(y + idx, mask=in_bounds)

  tl.store(out + idx, lhs + rhs, mask=in_bounds)

def add(
    x: torch.Tensor,
    y: torch.Tensor
):
  """Add two CUDA tensors elementwise by launching `add_kernel`.

  Args:
    x: first operand; the result is allocated with `torch.empty_like(x)`.
    y: second operand; must have the same number of elements as `x`.

  Returns:
    A new tensor filled in by the kernel.

  Raises:
    AssertionError: if any tensor is not on a CUDA device or the element
      counts of `x` and `y` differ.
  """
  out = torch.empty_like(x)
  assert x.is_cuda
  assert y.is_cuda
  assert out.is_cuda
  assert x.numel() == y.numel()

  n_elems = x.numel()

  def grid(meta):
    # 1-D launch: one program per BLOCK_SIZE-wide slice, rounded up so the
    # final partial slice is covered too.
    return (triton.cdiv(n_elems, meta['BLOCK_SIZE']), )

  add_kernel[grid](x, y, out, n_elems, BLOCK_SIZE=1024)
  return out


if __name__ == "__main__":
  # Fixed seed so the random inputs (and the printed difference) are repeatable.
  torch.manual_seed(0)
  # 98432 is not a multiple of the 1024-wide block used by add(), so the
  # kernel's tail mask is exercised.
  size = 98432
  x = torch.rand(size, device='cuda')
  y = torch.rand(size, device='cuda')

  # PyTorch's own addition is the reference for the Triton result.
  out_torch = x + y
  out_triton = add(x, y)

  max_abs_diff = (out_torch - out_triton).abs().max()
  print(f'The maximum difference between torch and triton is: {max_abs_diff}')

  # `tt` is this cell's alias for triton.testing.
  mean_ms = tt.do_bench(lambda: add(x, y))
  print(f"mean ms: {mean_ms}")

The maximum difference between torch and triton is: 0.0
mean ms: 0.010818585753440857


In [None]:
# Prefix Sum

import torch
import triton
import triton.language as tl
import triton.testing as tt

import os
# Ask Triton to report autotuning results for the kernel below.
# NOTE(review): per the Triton README this prints the winning config once
# tuning finishes; `triton` is already imported above, so confirm the
# variable is read at launch time rather than at import time.
os.environ['TRITON_PRINT_AUTOTUNING'] = "1"

@triton.autotune(configs = [
    triton.Config({'BLOCK_SIZE': 128}, num_warps = 4),
    triton.Config({'BLOCK_SIZE': 1024}, num_warps = 8),
], key = ['n_elems'])
@triton.jit
def prefix_sum_kernel(
    x,
    out,
    n_elems,
    BLOCK_SIZE: tl.constexpr
):
  """Inclusive prefix sum of the flat buffer `x`, written to `out`.

  Each program instance owns one tile of BLOCK_SIZE consecutive indices,
  selected by its position along launch axis 0. The value written at index i
  is x[0] + ... + x[i], built from two parts:

    * carry: the total of every element in the tiles before this one,
    * an inclusive scan (`tl.cumsum`) of this program's own tile.

  Sums are accumulated in float32; `tl.store` converts the result to the
  element type of `out`.

  Every program re-reads all preceding tiles, so total reads grow
  quadratically with the number of tiles. That keeps the kernel a single
  self-contained launch; a multi-pass scan would do less work on large inputs.

  BLOCK_SIZE is supplied by the autotuner configs above, so callers must not
  pass it explicitly.
  """
  pid = tl.program_id(axis=0)
  block_start = pid * BLOCK_SIZE
  lanes = tl.arange(0, BLOCK_SIZE)

  # Lane-wise accumulator over every tile that precedes this one; it is
  # reduced to a single scalar after the loop.
  partial = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
  for prev_start in range(0, block_start, BLOCK_SIZE):
    prev_offsets = prev_start + lanes
    prev = tl.load(x + prev_offsets, mask=prev_offsets < n_elems, other=0.0)
    partial += prev.to(tl.float32)
  carry = tl.sum(partial, axis=0)

  offsets = block_start + lanes
  mask = offsets < n_elems
  # Out-of-range lanes load 0 so they cannot disturb the scan; they sit after
  # every valid lane and are masked off on store.
  x_tensor = tl.load(x + offsets, mask=mask, other=0.0)
  local_scan = tl.cumsum(x_tensor.to(tl.float32), axis=0)

  tl.store(out + offsets, local_scan + carry, mask=mask)

def prefix_sum(
    x: torch.Tensor
):
  """Launch `prefix_sum_kernel` over `x` and return the tensor it fills.

  Args:
    x: CUDA tensor; the kernel indexes it as a flat buffer of `x.numel()`
      elements.

  Returns:
    A new tensor allocated with `torch.empty_like(x)` and written by the
    kernel.

  Raises:
    AssertionError: if `x` (or the freshly allocated output) is not on a
      CUDA device.
  """
  out = torch.empty_like(x)
  assert x.is_cuda and out.is_cuda

  n_elems = x.numel()

  # One program per BLOCK_SIZE-wide tile, rounded up to cover the tail. The
  # autotuner injects the BLOCK_SIZE of the config being run into `meta`.
  grid = lambda meta: (triton.cdiv(n_elems, meta['BLOCK_SIZE']), )
  # BLOCK_SIZE is deliberately not passed: it is an autotuned meta-parameter,
  # and supplying it here is rejected as a conflicting meta-parameter.
  prefix_sum_kernel[grid](x, out, n_elems)

  return out


if __name__ == "__main__":
  # Fixed seed so the random input (and the printed difference) is repeatable.
  torch.manual_seed(0)
  size = 98432
  x = torch.rand(size, device='cuda')

  # torch.cumsum along dim 0 is the reference the Triton result is compared with.
  out_torch = torch.cumsum(x, dim=0)
  out_triton = prefix_sum(x)

  max_abs_diff = (out_torch - out_triton).abs().max()
  print(f'The maximum difference between torch and triton is: {max_abs_diff}')

  # `tt` is this cell's alias for triton.testing.
  mean_ms = tt.do_bench(lambda: prefix_sum(x))
  print(f"mean ms: {mean_ms}")

offsets:  int32[constexpr[1024]]
The maximum difference between torch and triton is: 49189.1015625
mean ms: 0.008661333471536636
