In [3]:
import torch
import triton
import triton.language as tl

@triton.jit
def add_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    """Element-wise addition kernel: output[i] = x[i] + y[i] for i < n_elements.

    Each program instance along axis 0 processes one slice of BLOCK_SIZE
    consecutive element offsets. The three pointers are indexed with the
    same flat offsets, so the caller is responsible for passing buffers that
    are laid out identically.

    Args:
        x_ptr: Base pointer of the first operand.
        y_ptr: Base pointer of the second operand.
        output_ptr: Base pointer of the destination buffer.
        n_elements: Number of valid elements; offsets at or beyond this
            value are masked out of every load and store.
        BLOCK_SIZE: Compile-time number of elements handled per program.
    """
    # Flat element offsets covered by this program instance.
    idx = tl.program_id(axis=0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # The final block may extend past the end of the data; mask those lanes.
    in_bounds = idx < n_elements
    lhs = tl.load(x_ptr + idx, mask=in_bounds)
    rhs = tl.load(y_ptr + idx, mask=in_bounds)
    tl.store(output_ptr + idx, lhs + rhs, mask=in_bounds)

def add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Add two CUDA tensors element-wise using the Triton ``add_kernel``.

    The kernel addresses all three buffers with the same flat offsets, so
    the inputs are validated and made contiguous before launch.

    Args:
        x: First operand; must live on a CUDA device.
        y: Second operand; must have the same shape and device as ``x``.

    Returns:
        A new contiguous tensor with the shape and dtype of ``x`` holding
        ``x + y``.

    Raises:
        ValueError: If the shapes differ, if either tensor is not on a CUDA
            device, or if the tensors are on different devices.
    """
    # The element count is taken from x only; a smaller y would be read out
    # of bounds by the kernel, so reject mismatched shapes up front.
    if x.shape != y.shape:
        raise ValueError(
            f"add() requires tensors of identical shape, got {tuple(x.shape)} and {tuple(y.shape)}"
        )
    if not (x.is_cuda and y.is_cuda):
        raise ValueError("add() requires both tensors to be on a CUDA device")
    if x.device != y.device:
        raise ValueError(
            f"add() requires tensors on the same device, got {x.device} and {y.device}"
        )

    # Linear offsets only match logical element order for contiguous memory.
    # .contiguous() is a no-op for tensors that are already contiguous.
    x = x.contiguous()
    y = y.contiguous()
    output = torch.empty_like(x, memory_format=torch.contiguous_format)
    n_elements = output.numel()
    grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
    # Launch on the device that owns the data rather than whichever device
    # happens to be current (matters on multi-GPU machines).
    with torch.cuda.device(x.device):
        add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
    return output

# Smoke test: compare the Triton kernel against PyTorch's built-in addition
# on a tiny CUDA tensor (3 elements, so most of the 1024-wide block is masked).
a = torch.rand(3, device="cuda")
b = a + a
b_compiled = add(a, a)
print(b_compiled - b)
print("If you see tensor([0., 0., 0.], device='cuda:0'), then it works")
# Do not rely on eyeballing the printed difference: make a mismatch stop
# "Run All" with an explicit failure.
assert torch.equal(b_compiled, b), "Triton add() result differs from a + a"

tensor([0., 0., 0.], device='cuda:0')
If you see tensor([0., 0., 0.], device='cuda:0'), then it works


In [6]:
import torch

# Report the CUDA version PyTorch was built against and whether a CUDA
# runtime is usable from this process.
print(torch.version.cuda)
print(torch.cuda.is_available())

# Handle the "no CUDA" case first, then enumerate every visible device.
if not torch.cuda.is_available():
    print("CUDA is not available")
else:
    print("CUDA is available")
    print(f"Number of devices: {torch.cuda.device_count()}")
    # One line per device: "<index>: <device name>".
    for i in range(torch.cuda.device_count()):
        print(f"{i}: {torch.cuda.get_device_name(i)}")

12.8
True
CUDA is available
Number of devices: 2
0: NVIDIA GeForce RTX 5090
1: NVIDIA GeForce RTX 5070 Ti


In [7]:
# Summarize the PyTorch / CUDA environment used by this notebook.
print("PyTorch CUDA available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)
print("Torch Version:", torch.__version__)
# Device queries raise when no CUDA device is usable, so guard them the same
# way the device-listing cell does instead of letting this cell crash.
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))
    print("Compute Capability:", torch.cuda.get_device_capability(0))
else:
    print("GPU Name: n/a (CUDA is not available)")
    print("Compute Capability: n/a (CUDA is not available)")


PyTorch CUDA available: True
CUDA Version: 12.8
Torch Version: 2.8.0.dev20250405+cu128
GPU Name: NVIDIA GeForce RTX 5090
Compute Capability: (12, 0)
