# QuantumConv2D Performance Benchmarks

This notebook benchmarks the performance of three implementations of a Quantum Convolutional Layer:
1. **Sequential**: Standard loop-based implementation on CPU
2. **Batched (CPU)**: Vectorized implementation running on CPU
3. **Batched (GPU)**: Vectorized implementation running on GPU (falls back to CPU when CUDA is unavailable)

In [1]:
import torch
import time

from qml.layers import QuantumConv2D, BatchedQuantumConv2D, BatchedGPUQuantumConv2D
from qml.ansatz.standard import StandardQCNNAnsatz

In [2]:
# Benchmark configuration: compute device and input dimensions
# Prefer CUDA when PyTorch reports it as available; otherwise stay on CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Target device for GPU model: {device}")

# Dimensions of the synthetic input batch: (batch_size, channels, height, width)
batch_size = 16
channels, height, width = 1, 28, 28

# Handed to each layer constructor as its `n_qubits` argument
n_qubits = 4

print(f"Batch Size: {batch_size}, Image Size: {height}x{width}")

Target device for GPU model: cuda
Batch Size: 16, Image Size: 28x28


In [3]:
# Build the synthetic benchmark input
# Seeding first makes the random batch identical on every run.
torch.manual_seed(42)
input_shape = (batch_size, channels, height, width)
x = torch.randn(input_shape)

# Separate copy of the batch, placed on the target device for the GPU layer
x_gpu = x.clone().to(device)

print(f"Input shape: {x.shape}")

Input shape: torch.Size([16, 1, 28, 28])


In [4]:
# Build the three layers under test
ansatz = StandardQCNNAnsatz()

print("Initializing layers...")

# All three implementations are constructed with the same arguments.
layer_kwargs = dict(n_qubits=n_qubits, ansatz=ansatz, stride=1, encoding="ry")

# 1. Sequential reference implementation
layer_sequential = QuantumConv2D(**layer_kwargs)

# 2. Batched implementation, left on CPU
layer_batched_cpu = BatchedQuantumConv2D(**layer_kwargs)

# 3. Batched GPU implementation, moved to the target device
layer_batched_gpu = BatchedGPUQuantumConv2D(**layer_kwargs).to(device)

# Overwrite the batched layers' parameters with the sequential layer's values
# so that all three layers compute with the same weights and their outputs
# can be compared directly.
reference_weights = layer_sequential.q_params.data
with torch.no_grad():
    layer_batched_cpu.q_params.data = reference_weights.clone()
    layer_batched_gpu.q_params.data = reference_weights.to(device)

print("Layer initialization complete and weights synchronized.")

Initializing layers...
Using lightning.qubit device with 'ry' encoding, StandardQCNNAnsatz, measurement=PauliZ
Using lightning.qubit device with 'ry' encoding, StandardQCNNAnsatz, measurement=PauliZ
Using default.qubit device with 'ry' encoding, StandardQCNNAnsatz, measurement=PauliZ
Layer initialization complete and weights synchronized.


In [5]:
# Test Sequential Implementation
print("Running Sequential (Slow) Version (CPU)...")

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted while the benchmark runs.
start_time = time.perf_counter()

# Forward pass only; gradient tracking is disabled for the timed call.
with torch.no_grad():
    out_sequential = layer_sequential(x)

end_time = time.perf_counter()
time_sequential = end_time - start_time
print(f"Sequential Time: {time_sequential:.4f} seconds")

Running Sequential (Slow) Version (CPU)...
Sequential Time: 55.1463 seconds


In [6]:
# Test Batched CPU Implementation
print("Running Batched Version (CPU)...")

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted while the benchmark runs.
start_time = time.perf_counter()

# Forward pass only; gradient tracking is disabled for the timed call.
with torch.no_grad():
    out_batched_cpu = layer_batched_cpu(x)

end_time = time.perf_counter()
time_batched_cpu = end_time - start_time
print(f"Batched CPU Time: {time_batched_cpu:.4f} seconds")

Running Batched Version (CPU)...
Batched CPU Time: 31.6674 seconds


In [7]:
# Test Batched GPU Implementation
# Note: This is the most critical benchmark for large scale training
# NOTE(review): this times a single cold run, so any one-time GPU start-up
# cost is included in the figure — consider an untimed warm-up pass.
print(f"Running Batched Version ({device})...")

# Drain any pending CUDA work so it is not attributed to the timed region.
if device.type == "cuda":
    torch.cuda.synchronize()

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted while the benchmark runs.
start_time = time.perf_counter()

# Forward pass only; gradient tracking is disabled for the timed call.
with torch.no_grad():
    out_batched_gpu = layer_batched_gpu(x_gpu)

# CUDA kernels launch asynchronously; wait for them to finish so the timer
# captures the actual compute time rather than just the launch time.
if device.type == "cuda":
    torch.cuda.synchronize()

end_time = time.perf_counter()
time_batched_gpu = end_time - start_time
print(f"Batched GPU Time: {time_batched_gpu:.4f} seconds")

Running Batched Version (cuda)...
Batched GPU Time: 0.5908 seconds


In [8]:
# Compare Results and Verify Correctness

# Ratios of the wall-clock times measured by the three timing cells.
speedup_batched = time_sequential / time_batched_cpu
speedup_gpu = time_sequential / time_batched_gpu
speedup_batched_gpu = time_batched_cpu / time_batched_gpu

print(f"Speedup Batched vs Sequential: {speedup_batched:.2f}x")
print(f"Speedup GPU vs Sequential:     {speedup_gpu:.2f}x")
print(f"Speedup GPU vs Batched CPU:    {speedup_batched_gpu:.2f}x")

# Tensor subtraction broadcasts, so outputs with different (but compatible)
# shapes would still produce a difference and could pass validation by
# accident. Require matching shapes before comparing values.
assert out_batched_cpu.shape == out_sequential.shape, (
    f"Batched CPU output shape {tuple(out_batched_cpu.shape)} != "
    f"sequential output shape {tuple(out_sequential.shape)}"
)
assert out_batched_gpu.shape == out_sequential.shape, (
    f"Batched GPU output shape {tuple(out_batched_gpu.shape)} != "
    f"sequential output shape {tuple(out_sequential.shape)}"
)

# Check correctness (Numerical drift is expected, but should be small)
# Largest absolute elementwise deviation from the sequential reference; the
# GPU output is copied back to CPU before the comparison.
diff_batched = torch.abs(out_sequential - out_batched_cpu).max().item()
diff_gpu = torch.abs(out_sequential - out_batched_gpu.cpu()).max().item()

print(f"\nMax Diff Batched (CPU): {diff_batched}")
print(f"Max Diff GPU:           {diff_gpu}")

# Validation Threshold: maximum absolute deviation accepted as "identical"
tolerance = 1e-4
if diff_batched < tolerance and diff_gpu < tolerance:
    print(">> VALIDATION PASSED: All results are identical (within tolerance).")
else:
    print(">> VALIDATION FAILED: Results differ significantly!")

Speedup Batched vs Sequential: 1.74x
Speedup GPU vs Sequential:     93.34x
Speedup GPU vs Batched CPU:    53.60x

Max Diff Batched (CPU): 0.0
Max Diff GPU:           3.5762786865234375e-07
>> VALIDATION PASSED: All results are identical (within tolerance).
