# Micro-benchmark: matrix multiply (GFLOPS)

In [1]:
# file: torch_gflops.py  | jalankan: python torch_gflops.py
import torch, time, math, os, platform
assert torch.cuda.is_available(), "GPU CUDA tidak terdeteksi oleh PyTorch."

torch.backends.cudnn.benchmark = True

def bench_mm(n=4096, iters=30, warmup=10):
    a = torch.randn((n,n), device="cuda", dtype=torch.float16)  # FP16 lebih cepat (jika mendukung)
    b = torch.randn((n,n), device="cuda", dtype=torch.float16)
    # Casting ke FP32 output untuk stabilitas (opsional)
    # a = a.float(); b = b.float()

    # Warmup
    for _ in range(warmup):
        c = a @ b
    torch.cuda.synchronize()

    t0 = time.time()
    for _ in range(iters):
        c = a @ b
    torch.cuda.synchronize()
    t1 = time.time()

    avg_s = (t1 - t0) / iters
    # FLOPs untuk gemm: 2*N^3
    tflops = (2 * (n**3)) / avg_s / 1e12
    return avg_s, tflops

if __name__ == "__main__":
    print("Device:", torch.cuda.get_device_name(0))
    for size in [2048, 3072, 4096, 6144]:
        avg_s, tflops = bench_mm(n=size)
        print(f"N={size:4d} | avg_time={avg_s*1000:7.2f} ms | throughput≈{tflops:6.2f} TFLOPS")


Device: NVIDIA GeForce RTX 4050 Laptop GPU
N=2048 | avg_time=   0.88 ms | throughput≈ 19.49 TFLOPS
N=3072 | avg_time=   2.71 ms | throughput≈ 21.37 TFLOPS
N=4096 | avg_time=   5.81 ms | throughput≈ 23.66 TFLOPS
N=6144 | avg_time=  19.45 ms | throughput≈ 23.85 TFLOPS


# Benchmark inferensi model (images/sec)

In [2]:
# file: torch_resnet50_bench.py | jalankan: python torch_resnet50_bench.py
import torch, time
import torchvision.models as models

assert torch.cuda.is_available(), "GPU CUDA tidak terdeteksi."
torch.backends.cudnn.benchmark = True

model = models.resnet50(weights=None).cuda().eval()   # tanpa weights untuk kecepatan init
batch_size = 128
warmup, iters = 10, 30

x = torch.randn(batch_size, 3, 224, 224, device="cuda")

# Warmup
with torch.inference_mode():
    for _ in range(warmup):
        y = model(x)
torch.cuda.synchronize()

# Timing
t0 = time.time()
with torch.inference_mode():
    for _ in range(iters):
        y = model(x)
torch.cuda.synchronize()
t1 = time.time()

avg_s = (t1 - t0) / iters
imgs_per_sec = batch_size / avg_s
print("GPU:", torch.cuda.get_device_name(0))
print(f"Batch={batch_size} | avg_time_per_batch={avg_s*1000:.2f} ms | throughput≈{imgs_per_sec:.1f} images/sec")


GPU: NVIDIA GeForce RTX 4050 Laptop GPU
Batch=128 | avg_time_per_batch=320.28 ms | throughput≈399.7 images/sec


# Tes memory bandwidth (opsional, cepat)

In [3]:
# file: torch_bandwidth.py | jalankan: python torch_bandwidth.py
import torch, time
x = torch.empty(2_000_000_000//4, dtype=torch.float32, device="cuda")  # ~2GB/4=0.5G elemen? (sesuaikan VRAM!)
torch.cuda.synchronize()
t0 = time.time(); y = x.clone(); torch.cuda.synchronize(); t1 = time.time()
gb = x.numel()*x.element_size()/1e9
bw = gb/(t1-t0)
print(f"Copy ~{gb:.2f} GB | time={t1-t0:.4f}s | bandwidth≈{bw:.2f} GB/s")


Copy ~2.00 GB | time=0.7256s | bandwidth≈2.76 GB/s


In [4]:
import torch 
import time

In [5]:
size = 20000

In [6]:
# Buat tensor acak besar
a_cpu = torch.randn(size, size)
b_cpu = torch.randn(size, size)

In [7]:

# --- CPU Timing ---
start_cpu = time.time()
result_cpu = torch.matmul(a_cpu, b_cpu)
end_cpu = time.time()

In [8]:
print(f"CPU time: {end_cpu - start_cpu:.4f} seconds")


CPU time: 20.5007 seconds


In [None]:
if torch.cuda.is_available():
    # Pindahkan tensor ke GPU
    a_gpu = a_cpu.to('cuda')
    b_gpu = b_cpu.to('cuda')

    # Warm-up (karena kadang operasi pertama lambat karena init CUDA context)
    _ = torch.matmul(a_gpu, b_gpu)

    # --- GPU Timing ---
    torch.cuda.synchronize()  # pastikan semua selesai sebelum mulai timing
    start_gpu = time.time()
    result_gpu = torch.matmul(a_gpu, b_gpu)
    torch.cuda.synchronize()
    end_gpu = time.time()

    print(f"GPU time: {end_gpu - start_gpu:.4f} seconds")

    # Optional: bandingkan hasilnya
    diff = torch.norm(result_gpu.cpu() - result_cpu)
    print(f"Difference (should be close to 0): {diff:.4f}")
else:
    print("CUDA is not available on this machine.")

In [None]:
print("CUDA:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("device:", torch.cuda.get_device_name())

CUDA: True
CUDA version: 11.8
device: NVIDIA GeForce RTX 4050 Laptop GPU
