In [1]:
import torch
import time

# Micro-benchmark: average time of an element-wise multiply (x * x) on each
# available device. Tensor creation and one warm-up multiply are excluded from
# the measured interval; each timed iteration allocates a fresh result tensor.

# Check available devices: CPU always, plus Apple's MPS backend when present
devices = ['cpu']
if torch.backends.mps.is_available():
    devices.append('mps')

# Tensor dimensions and test parameters
size = (10000, 10000)  # Large tensor for measurable computation
repeats = 10  # Number of repeats for stable timing

for device in devices:
    # Create tensor on target device
    x = torch.randn(size, device=device)

    # Warm-up to avoid initialization overhead
    _ = x * x
    if device == 'mps':
        # MPS operations are asynchronous: drain the queue so the warm-up
        # work is not attributed to the timed section below.
        torch.mps.synchronize()

    # Timing loop. perf_counter() is a monotonic, high-resolution clock, so
    # the measured interval cannot be skewed by system clock adjustments
    # the way time.time() can.
    start_time = time.perf_counter()
    for _ in range(repeats):
        y = x * x  # Element-wise multiplication
    if device == 'mps':
        # Wait for queued MPS operations to complete before reading the clock;
        # otherwise only the kernel launch overhead would be measured.
        torch.mps.synchronize()
    total_time_ms = (time.perf_counter() - start_time) * 1000  # Convert to milliseconds

    # Calculate average time per operation
    avg_time_ms = total_time_ms / repeats

    # Results
    print(f"Device: {device.upper()}")
    print(f"Total time for {repeats} ops: {total_time_ms:.2f} ms")
    print(f"Average time per op: {avg_time_ms:.2f} ms\n")

Device: CPU
Total time for 10 ops: 311.56 ms
Average time per op: 31.16 ms

Device: MPS
Total time for 10 ops: 205.69 ms
Average time per op: 20.57 ms

