In [6]:
import torch
# Report what the installed PyTorch build can see. ROCm builds reuse the
# torch.cuda namespace, so the same calls work for AMD GPUs.
print("HIP/ROCm version:", torch.version.hip)
print("Torch CUDA available (ROCm uses same API):", torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())

# Only query the device name when a GPU is actually present.
if torch.cuda.is_available():
    device_label = torch.cuda.get_device_name(0)
else:
    device_label = "CPU only"
print("Device name:", device_label)


HIP/ROCm version: 6.4.43482-0f2d60242
Torch CUDA available (ROCm uses same API): True
Device count: 1
Device name: AMD Radeon RX 7900 XTX


In [7]:
# GPU smoke test: multiply a 5000x5000 random matrix by itself on the GPU.
# Guarded like the other cells so the notebook still runs top-to-bottom on a
# machine without a usable CUDA/ROCm device.
if torch.cuda.is_available():
    x = torch.rand(5000, 5000, device="cuda")
    y = torch.mm(x, x)
    # GPU work is queued asynchronously; wait for it so that a failing kernel
    # raises here instead of in some later, unrelated cell.
    torch.cuda.synchronize()
    print("Matrix multiply OK:", y.shape)
else:
    print("Matrix multiply skipped: no CUDA/ROCm device available")

Matrix multiply OK: torch.Size([5000, 5000])


In [1]:
import torch, time

def benchmark_matmul(size=8000, dtype=torch.float32, device="cpu"):
    """Time one ``size`` x ``size`` matrix multiplication and print the result.

    Args:
        size: Edge length of the two square random operand matrices.
        dtype: Element type used for both operands.
        device: Where to run the multiply. Accepts anything ``torch.rand``
            accepts for ``device`` (e.g. ``"cpu"``, ``"cuda:0"``, or a
            ``torch.device``).

    Prints a header line, then the elapsed time of a single multiply that is
    preceded by one untimed warm-up multiply. Returns ``None``.
    """
    print(f"\nBenchmarking {size}x{size} matmul on {device} with {dtype}...")
    x = torch.rand(size, size, device=device, dtype=dtype)
    y = torch.rand(size, size, device=device, dtype=dtype)

    # Ask the tensor where it lives instead of string-matching `device`, so
    # torch.device objects work as well as plain strings.
    on_gpu = x.is_cuda

    # Warm-up (important for GPU JIT / caches)
    _ = torch.mm(x, y)
    if on_gpu:
        # Synchronize the device the operands are on, which is not
        # necessarily the current default device.
        torch.cuda.synchronize(x.device)

    # perf_counter is monotonic and high-resolution; time.time() can jump if
    # the system clock is adjusted mid-measurement.
    start = time.perf_counter()
    _ = torch.mm(x, y)
    if on_gpu:
        # GPU work is asynchronous: wait for it to finish before stopping
        # the clock, otherwise only the launch overhead is measured.
        torch.cuda.synchronize(x.device)
    elapsed = time.perf_counter() - start

    print(f"Time: {elapsed:.3f} seconds")

# Benchmark the CPU first, then the GPU when one is present.
# ROCm builds of PyTorch expose the GPU under the 'cuda' device name.
for target in ("cpu", "cuda"):
    if target == "cuda" and not torch.cuda.is_available():
        continue
    benchmark_matmul(size=4000, device=target)


Benchmarking 4000x4000 matmul on cpu with torch.float32...
Time: 0.145 seconds

Benchmarking 4000x4000 matmul on cuda with torch.float32...
Time: 0.005 seconds


In [None]:
import torch
import time
import matplotlib.pyplot as plt

# Edge lengths N of the square N x N operands, smallest first.
sizes = [
    2000,
    4000,
    8000,
]

# Element types to compare for each size.
dtypes = [
    torch.float32,
    torch.float16,
    torch.bfloat16,
]

def benchmark_matmul(size, dtype, device):
    """Return the seconds taken by one ``size`` x ``size`` matrix multiply.

    Args:
        size: Edge length of the two square random operand matrices.
        dtype: Element type used for both operands.
        device: Where to run the multiply. Accepts anything ``torch.rand``
            accepts for ``device`` (e.g. ``"cpu"``, ``"cuda:0"``, or a
            ``torch.device``).

    Returns:
        Elapsed time in seconds (float) of a single multiply, measured after
        one untimed warm-up multiply.
    """
    x = torch.rand(size, size, device=device, dtype=dtype)
    y = torch.rand(size, size, device=device, dtype=dtype)

    # Ask the tensor where it lives instead of string-matching `device`, so
    # torch.device objects work as well as plain strings.
    on_gpu = x.is_cuda

    _ = torch.mm(x, y)  # warm-up
    if on_gpu:
        # Synchronize the device the operands are on, which is not
        # necessarily the current default device.
        torch.cuda.synchronize(x.device)

    # perf_counter is monotonic and high-resolution; time.time() can jump if
    # the system clock is adjusted mid-measurement.
    start = time.perf_counter()
    _ = torch.mm(x, y)
    if on_gpu:
        # GPU work is asynchronous: wait for it to finish before stopping
        # the clock, otherwise only the launch overhead is measured.
        torch.cuda.synchronize(x.device)
    return time.perf_counter() - start

# Timing table: results[dtype]["cpu" | "gpu"] is a list with one entry per
# element of `sizes`, in the same order.
results = {}
for dtype in dtypes:
    results[dtype] = {"cpu": [], "gpu": []}

for dtype in dtypes:
    per_device = results[dtype]
    for size in sizes:
        per_device["cpu"].append(benchmark_matmul(size, dtype, "cpu"))
        # Record None when no GPU is present so both lists stay the same
        # length as `sizes`.
        if torch.cuda.is_available():
            gpu_elapsed = benchmark_matmul(size, dtype, "cuda")
        else:
            gpu_elapsed = None
        per_device["gpu"].append(gpu_elapsed)

# Plot results.
# Colour encodes the dtype; marker and line style encode the device. This keeps
# every series distinguishable, where colouring by device alone drew all CPU
# lines identically and all GPU lines identically.
plt.figure(figsize=(10, 6))
markers = {"cpu": "o", "gpu": "s"}
linestyles = {"cpu": "--", "gpu": "-"}

for idx, dtype in enumerate(dtypes):
    dtype_color = f"C{idx}"  # entry idx of matplotlib's default colour cycle
    for device_key, device_label in (("cpu", "CPU"), ("gpu", "GPU")):
        times = results[dtype][device_key]
        # Skip a series with no measurements (the GPU entries are None when
        # no GPU was available) so the legend has no data-less entries.
        if all(t is None for t in times):
            continue
        plt.plot(
            sizes,
            times,
            marker=markers[device_key],
            color=dtype_color,
            linestyle=linestyles[device_key],
            label=f"{dtype} {device_label}",
        )

# Name the GPU that was actually benchmarked rather than a fixed model.
if torch.cuda.is_available():
    plot_title = f"PyTorch MatMul Benchmark: CPU vs GPU ({torch.cuda.get_device_name(0)})"
else:
    plot_title = "PyTorch MatMul Benchmark: CPU only"

plt.xlabel("Matrix size (N x N)")
plt.ylabel("Time (seconds)")
plt.title(plot_title)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()