<a href="https://colab.research.google.com/github/DataLama/triton-tutorials/blob/main/tutorials/basic/3_matrix_multiplication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Print the installed torch and triton package metadata so the versions used for this run are on record.
!pip show torch
!pip show triton

Name: torch
Version: 2.2.1+cu121
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu12, nvidia-cuda-cupti-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-runtime-cu12, nvidia-cudnn-cu12, nvidia-cufft-cu12, nvidia-curand-cu12, nvidia-cusolver-cu12, nvidia-cusparse-cu12, nvidia-nccl-cu12, nvidia-nvtx-cu12, sympy, triton, typing-extensions
Required-by: fastai, torchaudio, torchdata, torchtext, torchvision
Name: triton
Version: 2.2.0
Summary: A language and compiler for custom Deep Learning operations
Home-page: https://github.com/openai/triton/
Author: Philippe Tillet
Author-email: phil@openai.com
License: 
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock
Required-by: torch


---

In [11]:
%%writefile main.py
from typing import Dict
import torch
import triton
import triton.language as tl

@triton.jit
def matmul_kernel(
    x_ptr: torch.Tensor,
    y_ptr: torch.Tensor,
    z_ptr: torch.Tensor,
    m: int,
    n: int,
    k: int,
    m_block_size: tl.constexpr,
    n_block_size: tl.constexpr,
    k_block_size: tl.constexpr,
):
  """Compute one (m_block_size, n_block_size) tile of z = x @ y.

  x is addressed as an (m, k) buffer, y as (k, n) and z as (m, n), all
  row-major contiguous (row strides k, n and n respectively). Each program
  owns one output tile and walks the k axis in slices of k_block_size,
  accumulating partial products in float32.

  Every load and store is masked, so m, n and k do not have to be multiples
  of the block sizes: out-of-range elements are read as 0 and never written.
  """
  # The launch grid is 1D. Unflatten the program id row-major into a tile
  # coordinate: e.g. with 4 tiles along n, pids 0..3 map to
  # (0, 0)..(0, 3), pids 4..7 map to (1, 0)..(1, 3), and so on.
  pid = tl.program_id(0)
  num_n_blocks = tl.cdiv(n, n_block_size)  # number of tiles along the n axis
  m_block = pid // num_n_blocks
  n_block = pid % num_n_blocks

  # Row indices (into x / z) and column indices (into y / z) covered by this
  # tile. k_offsets are the offsets inside a single k-slice; the tile product
  # is not computed in one shot but slice by slice along k.
  m_offsets = tl.arange(0, m_block_size) + m_block * m_block_size
  n_offsets = tl.arange(0, n_block_size) + n_block * n_block_size
  k_offsets = tl.arange(0, k_block_size)

  # Pointers to the first k-slice of x and y, and to the output tile of z.
  x_ptrs = x_ptr + m_offsets[:, None] * k + k_offsets[None, :]
  y_ptrs = y_ptr + k_offsets[:, None] * n + n_offsets[None, :]
  z_ptrs = z_ptr + m_offsets[:, None] * n + n_offsets[None, :]

  # Rows / columns of this tile that actually exist in the matrices.
  m_mask = m_offsets[:, None] < m
  n_mask = n_offsets[None, :] < n

  # float32 accumulator for the output tile.
  z = tl.zeros((m_block_size, n_block_size), dtype=tl.float32)

  for k_start in range(0, k, k_block_size):
    # Only the first k_remaining offsets of this slice are inside the k axis.
    k_remaining = k - k_start

    # Masked-out elements are loaded as 0 so they add nothing to the product.
    x_sub = tl.load(x_ptrs, mask=m_mask & (k_offsets[None, :] < k_remaining), other=0.0)
    y_sub = tl.load(y_ptrs, mask=(k_offsets[:, None] < k_remaining) & n_mask, other=0.0)

    # Accumulate the partial product of this k-slice.
    z += tl.dot(x_sub, y_sub, allow_tf32=False)

    # Advance to the next k-slice: k_block_size columns of x, k_block_size rows of y.
    x_ptrs += k_block_size
    y_ptrs += k_block_size * n

  # Write the tile back, skipping rows / columns that fall outside z.
  tl.store(z_ptrs, z, mask=m_mask & n_mask)

def matmul(x, y):
  """Return x @ y computed by `matmul_kernel`.

  Args:
    x: 2D tensor of shape (m, k).
    y: 2D tensor of shape (k, n), on the same device as x.

  Returns:
    A float32 tensor of shape (m, n) on x's device.

  Raises:
    ValueError: if either input is not 2D or the inner dimensions differ.
  """
  if x.dim() != 2 or y.dim() != 2:
    raise ValueError(f"matmul expects 2D tensors, got {x.dim()}D and {y.dim()}D")
  m, k = x.shape
  k_y, n = y.shape
  if k != k_y:
    raise ValueError(f"inner dimensions do not match: {tuple(x.shape)} @ {tuple(y.shape)}")

  # The kernel derives row strides from k and n, i.e. it assumes row-major
  # contiguous storage; make that true for transposed / sliced inputs.
  x = x.contiguous()
  y = y.contiguous()
  z = torch.empty((m, n), device=x.device, dtype=torch.float32)
  if z.numel() == 0:
    # Nothing to compute, and a launch grid of size 0 is not valid.
    return z

  # Fixed tile edge. It must be a power of two for tl.arange, and tl.dot
  # expects tile dimensions of at least 16 in the Triton release pinned by
  # this notebook; ragged edges are handled by the kernel's masks.
  block_size = 16

  def grid(meta):
    # One program per output tile, laid out along a single grid axis.
    # E.g. m = n = 64 with 16x16 tiles gives 4 * 4 = 16 programs.
    return (triton.cdiv(m, meta['m_block_size']) * triton.cdiv(n, meta['n_block_size']), )

  # Sizes are passed in the kernel's declared order (m, n, k); block sizes go
  # by keyword so they cannot be mixed up positionally.
  matmul_kernel[grid](
      x, y, z,
      m, n, k,
      m_block_size=block_size,
      n_block_size=block_size,
      k_block_size=block_size,
  )

  return z

def main():
  """Check the Triton `matmul` against `torch.matmul` on a random 16x16 input.

  Raises:
    AssertionError: if the two results differ by more than the tolerances.
  """
  # Fixed seed so a failure is reproducible from run to run.
  torch.manual_seed(0)
  x = torch.randn(16, 16, device="cuda")
  y = torch.randn(16, 16, device="cuda")

  actual = matmul(x, y)
  expected = torch.matmul(x, y)

  # The two implementations may sum the float32 products in a different
  # order, so results can differ in the last few bits. torch.allclose's
  # defaults (rtol=1e-5, atol=1e-8) reject such differences for outputs
  # close to zero, hence the explicit, looser tolerances here.
  torch.testing.assert_close(actual, expected, rtol=1e-4, atol=1e-4)

if __name__ == "__main__":
  main()

Overwriting main.py


In [13]:
# Run the script. With torch.allclose's default tolerances, the check was observed to fail intermittently.
!python main.py

경계 검사를 도입하여, 다양한 크기의 행렬에 대한 연산을 구현하면 좋을듯?

https://github.com/openai/triton/blob/main/python/tutorials/03-matrix-multiplication.py