In [15]:
import torch
import random

# GELU forward micro-benchmark: times the activation on one large CUDA tensor
# with CUDA events and prints the mean per-call time in milliseconds.
if not torch.cuda.is_available():
    raise ValueError("CUDA is not available")

# Seed the Python, CPU-side torch and CUDA generators so the random input
# tensor is reproducible from run to run.
seed = 241
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # seeds every CUDA device

# Input shape: N rows by D columns.
N = 2**20
D = 256

device = torch.device("cuda")
x = torch.randn(N, D, device=device)
act = torch.nn.GELU()

# A single pair of timing-enabled events, re-recorded on every iteration.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

# Untimed warm-up passes, run before any measurement is taken.
for warmup_step in range(10):
    y = act(x)

# Timed passes: one elapsed-time sample (in ms) per forward call.
timings = []
for bench_step in range(500):
    start_event.record()
    y = act(x)
    end_event.record()

    # Wait for the device to finish so both events have completed before
    # their elapsed time is queried.
    torch.cuda.synchronize()

    elapsed_ms = start_event.elapsed_time(end_event)
    timings.append(elapsed_ms)

# Arithmetic mean over all timed passes.
average_time = sum(timings) / len(timings)

print(f"Average time: {average_time} ms")

Average time: 0.6034145275354386 ms


In [17]:
# Effective memory throughput of the benchmarked op, derived from the mean
# time per call. Each element of x is counted twice: once as a read of the
# input and once as a write of the output.
bytes_per_element = x.element_size()  # size in bytes of one element of x
num_elements = 2 * x.numel()  # element accesses (read + write), not the element count
time_in_seconds = average_time / 1000  # average_time is in milliseconds
total_bytes = bytes_per_element * num_elements
throughput_gb_s = total_bytes / time_in_seconds / 1e9  # decimal GB (1e9 bytes) per second

# Assemble the report first, then emit it with a single print call.
report_lines = [
    f"Elements: {x.numel():,}",
    f"Bytes per element: {bytes_per_element}",
    f"Total data processed: {total_bytes/1e9:.2f} GB",
    f"Memory throughput: {throughput_gb_s:.2f} GB/s",
]
print("\n".join(report_lines))

Elements: 268,435,456
Bytes per element: 4
Total data processed: 2.15 GB
Memory throughput: 3558.89 GB/s
