In [None]:
import torch

In [3]:
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0))

PyTorch version: 2.8.0+cu128
CUDA available: True
GPU name: NVIDIA GeForce RTX 3090


In [4]:
import torch
import time

device = "cuda"

print("Running GPU test on:", torch.cuda.get_device_name(0))

# Allocate 6000x6000 matrices (roughly 1.1GB total)
a = torch.randn(6000, 6000, device=device)
b = torch.randn(6000, 6000, device=device)

torch.cuda.synchronize()
start = time.time()

c = torch.mm(a, b)   # Heavy GPU work here

torch.cuda.synchronize()
end = time.time()

print("Matrix multiply finished on:", c.device)
print(f"Time taken: {end - start:.4f} seconds")


Running GPU test on: NVIDIA GeForce RTX 3090
Matrix multiply finished on: cuda:0
Time taken: 0.1031 seconds


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim

device = "cuda"

model = nn.Sequential(
    nn.Linear(1024, 2048),
    nn.ReLU(),
    nn.Linear(2048, 512)
).to(device)

x = torch.randn(64, 1024, device=device)
y = torch.randn(64, 512, device=device)

opt = optim.Adam(model.parameters(), lr=1e-3)

for i in range(5):
    opt.zero_grad()
    pred = model(x)
    loss = nn.functional.mse_loss(pred, y)
    loss.backward()
    opt.step()

    print(f"Step {i}, Loss: {loss.item():.4f}")


Step 0, Loss: 1.0549
Step 1, Loss: 0.8061
Step 2, Loss: 0.6270
Step 3, Loss: 0.4621
Step 4, Loss: 0.3178


In [6]:
print("Allocated:", torch.cuda.memory_allocated() / 1024**2, "MB")
print("Reserved:", torch.cuda.memory_reserved() / 1024**2, "MB")
print("Max allocated:", torch.cuda.max_memory_allocated() / 1024**2, "MB")


Allocated: 478.9140625 MB
Reserved: 518.0 MB
Max allocated: 490.923828125 MB


In [7]:
print(1+1)

2


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import time

print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("No GPU detected!")

# ---------- helper for timing ----------

def time_op(fn, *, repeats=5, sync_cuda=False):
    """Run fn() several times and return average seconds."""
    # warmup
    fn()
    if sync_cuda:
        torch.cuda.synchronize()
    times = []
    for _ in range(repeats):
        if sync_cuda:
            torch.cuda.synchronize()
        t0 = time.perf_counter()
        fn()
        if sync_cuda:
            torch.cuda.synchronize()
        t1 = time.perf_counter()
        times.append(t1 - t0)
    return sum(times) / len(times)

# ---------- 1. Matrix multiplication benchmark ----------

def matmul_benchmark(device, size=4096):
    print(f"\n[MatMul] device={device}, size={size}x{size}")
    dev = torch.device(device)

    a = torch.randn(size, size, device=dev)
    b = torch.randn(size, size, device=dev)

    def op():
        torch.mm(a, b)

    avg_time = time_op(op, repeats=5, sync_cuda=(device=="cuda"))
    print(f"Average time: {avg_time:.4f} s")

# ---------- 2. Tiny training loop benchmark ----------

def training_benchmark(device, steps=200):
    print(f"\n[Training] device={device}, steps={steps}")
    dev = torch.device(device)

    model = nn.Sequential(
        nn.Linear(1024, 2048),
        nn.ReLU(),
        nn.Linear(2048, 1024),
    ).to(dev)

    x = torch.randn(1024, 1024, device=dev)
    y = torch.randn(1024, 1024, device=dev)

    opt = optim.Adam(model.parameters(), lr=1e-3)

    def train_step():
        opt.zero_grad(set_to_none=True)
        pred = model(x)
        loss = nn.functional.mse_loss(pred, y)
        loss.backward()
        opt.step()

    # timing whole loop
    def loop():
        for _ in range(steps):
            train_step()

    avg_time = time_op(loop, repeats=1, sync_cuda=(device=="cuda"))
    print(f"Total time for {steps} steps: {avg_time:.4f} s")
    print(f"Time per step: {avg_time/steps:.6f} s")

# ---------- run benchmarks ----------

# CPU
matmul_benchmark("cpu")
training_benchmark("cpu", steps=200)

# GPU (only if available)
if torch.cuda.is_available():
    matmul_benchmark("cuda")
    training_benchmark("cuda", steps=200)


PyTorch: 2.8.0+cu128
CUDA available: True
GPU: NVIDIA GeForce RTX 3090

[MatMul] device=cpu, size=4096x4096
Average time: 0.5187 s

[Training] device=cpu, steps=200
Total time for 200 steps: 33.5966 s
Time per step: 0.167983 s

[MatMul] device=cuda, size=4096x4096
Average time: 0.0056 s

[Training] device=cuda, steps=200
Total time for 200 steps: 0.3184 s
Time per step: 0.001592 s
