Here we benchmark the UNet component of the Stable Diffusion model, with inference performed using ONNX Runtime on a GPU.

In [None]:
import os
import time
import psutil
import statistics
import torch
import onnxruntime as ort
from tinydiffusion.utils.logger import LoggerConfig
from tinydiffusion.utils.constants import PROMPT
from transformers import CLIPTokenizer, CLIPTextModel

In [None]:
# Notebook-wide logger, taken from the project's LoggerConfig; every later cell logs through it.
LOGGER = LoggerConfig().logger

In [None]:
# Log the execution providers reported by ONNX Runtime.
reported_providers = ort.get_available_providers()
LOGGER.info(str(reported_providers))

The output above shows that the TensorRT, CUDA and CPU execution providers are available for inference.

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# To force a CPU run, override the result with: device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

LOGGER.info(f"Using device: {device}")

In [None]:
# Location of the exported UNet, expressed relative to the current working directory.
_unet_path_parts = ("..", "checkpoints", "onnx", "unet.onnx")
UNET_ONNX_PATH = os.path.join(os.getcwd(), *_unet_path_parts)

# Only warn here; the path is still handed to ONNX Runtime later on.
unet_onnx_found = os.path.exists(UNET_ONNX_PATH)
if not unet_onnx_found:
    LOGGER.warning(f"ONNX model not found at {UNET_ONNX_PATH}")

In [None]:
# ROOT_DIR is the parent of the current working directory.
_working_dir = os.getcwd()
ROOT_DIR = os.path.dirname(_working_dir)

LOGGER.info(f"Root directory: {ROOT_DIR}")

In [None]:
from tinydiffusion.utils.constants import ModelType

# The tokenizer and the text encoder are both loaded from the same model id.
clip_model_id = ModelType.LAION_CLIP_VIT.value
tokenizer_model_id = clip_model_id
text_encoder_model_id = clip_model_id

# Each model gets its own cache folder under ROOT_DIR/checkpoints.
checkpoints_dir = os.path.join(ROOT_DIR, "checkpoints")
tokenizer_model_cache_dir = os.path.join(checkpoints_dir, "cliptokenizer")
text_encoder_model_cache_dir = os.path.join(checkpoints_dir, "cliptextencoder")

In [None]:
available_providers = ort.get_available_providers()

# Default choice: the CUDA provider when ONNX Runtime reports it, otherwise the CPU provider.
if "CUDAExecutionProvider" in available_providers:
    provider = "CUDAExecutionProvider"
else:
    provider = "CPUExecutionProvider"

# To benchmark a different provider, override the choice with one of:
# provider = "CPUExecutionProvider"
# provider = "TensorrtExecutionProvider"

LOGGER.info(f"Using ONNX Runtime provider: {provider}")

In [None]:
# Create the ONNX Runtime inference session for the model file at UNET_ONNX_PATH,
# requesting only the provider selected in the previous cell.
session = ort.InferenceSession(UNET_ONNX_PATH, providers=[provider])

Load text encoder & tokenizer from HuggingFace. This matches what HF's stable diffusion model uses.

In [None]:
# Load the CLIP tokenizer, using its own cache directory.
tokenizer = CLIPTokenizer.from_pretrained(
    tokenizer_model_id,
    cache_dir=tokenizer_model_cache_dir,
)

# Load the CLIP text encoder, then move it to the selected device.
text_encoder = CLIPTextModel.from_pretrained(
    text_encoder_model_id,
    cache_dir=text_encoder_model_cache_dir,
)
text_encoder = text_encoder.to(device)

Benchmarking

In [None]:
# Metrics
inference_time = []
cpu_mem_usage = []
gpu_mem_usage = []

In [None]:
# Latent geometry: a 64x64 latent grid is used for 512x512 images.
batch_size = 1
height = 64
width = 64

# Half precision on the GPU, single precision on the CPU.
latents_dtype = torch.float16 if device == "cuda" else torch.float32

# 4 is the UNet's in_channels - see baseline_generation.ipynb
latents_shape = (batch_size, 4, height, width)

# Random stand-in for the image latents.
dummy_latents = torch.randn(latents_shape, device=device, dtype=latents_dtype)

# Any timestep will do for a latency benchmark; 10 is arbitrary.
dummy_timestep = torch.tensor([10], dtype=torch.int64, device=device)

# Encode the prompt into token embeddings without tracking gradients.
with torch.no_grad():
    tokenized_prompt = tokenizer(PROMPT, return_tensors="pt")
    input_ids = tokenized_prompt.input_ids.to(device)
    encoder_output = text_encoder(input_ids)
    text_embeddings = encoder_output[0]

LOGGER.info(f"Text embeddings shape: {text_embeddings.shape}")

# The embeddings must have the shape used with Stable Diffusion - see baseline_generation.ipynb
expected_embeddings_shape = torch.Size([1, 34, 1024])
assert text_embeddings.shape == expected_embeddings_shape, f"Unexpected text embeddings shape: {text_embeddings.shape}"

In [None]:
# Benchmark a single UNet denoising step through ONNX Runtime and collect
# latency and memory statistics over `num_samples` runs.
prompt = PROMPT
num_samples = 10

process = psutil.Process(os.getpid())

results = []

# The metric lists are created in an earlier cell. Empty them here so that
# re-running this cell does not mix in samples from a previous run.
inference_time.clear()
cpu_mem_usage.clear()
gpu_mem_usage.clear()


def _as_numpy(value, dtype):
    """Return `value` as a NumPy array of `dtype`.

    Accepts a torch tensor or an array that has already been converted, so the
    cell stays re-runnable after the inputs have been rebound below.
    """
    if isinstance(value, torch.Tensor):
        value = value.cpu().numpy()
    return value.astype(dtype)


dummy_latents = _as_numpy(dummy_latents, "float16")  # to match Stable Diffusion UNet's dtype
dummy_timestep = _as_numpy(dummy_timestep, "int64")
text_embeddings = _as_numpy(text_embeddings, "float16")

# The feed dict is identical for every run, so build it once outside the loop.
ort_inputs = {
    "latents": dummy_latents,
    "timestep": dummy_timestep,
    "text_embeddings": text_embeddings,
}

# NOTE(review): no warm-up run is performed, so the first sample may include
# one-time provider initialisation cost - consider discarding it.
for _ in range(num_samples):
    # perf_counter is monotonic and high resolution; time.time() is a wall
    # clock that can jump and may be too coarse for short intervals.
    start_time = time.perf_counter()

    # run ONNX inference - dummy inputs and for 1 denoising step of the UNet
    outputs = session.run(None, ort_inputs)

    elapsed = time.perf_counter() - start_time
    inference_time.append(elapsed)

    # Resident set size of this Python process, in MB.
    cpu_mem_usage.append(process.memory_info().rss / (1024**2))

    if device == "cuda":
        # torch.cuda.memory_allocated only counts PyTorch's own allocator and
        # therefore misses the memory held by ONNX Runtime. mem_get_info
        # reports device-wide free/total bytes, so "used" covers the ONNX
        # Runtime session too (along with anything else on GPU 0).
        free_bytes, total_bytes = torch.cuda.mem_get_info(0)
        gpu_mem_usage.append((total_bytes - free_bytes) / (1024**2))  # MB

    LOGGER.info(f"ONNXRuntime Inference time: {elapsed:.2f}s")

# Compute each statistic once and reuse it for both logging and the result row.
avg_inference_time = statistics.mean(inference_time)
std_inference_time = statistics.stdev(inference_time)
avg_cpu_mem_usage = statistics.mean(cpu_mem_usage)
std_cpu_mem_usage = statistics.stdev(cpu_mem_usage)
if device == "cuda":
    avg_gpu_mem_usage = statistics.mean(gpu_mem_usage)
    std_gpu_mem_usage = statistics.stdev(gpu_mem_usage)
else:
    avg_gpu_mem_usage = 0
    std_gpu_mem_usage = 0

LOGGER.info(f"\nAverage inference time: {avg_inference_time:.2f}s ± {std_inference_time:.2f}s")
LOGGER.info(f"\nAverage CPU memory usage: {avg_cpu_mem_usage:.2f}MB ± {std_cpu_mem_usage:.2f}MB")
if device == "cuda":
    LOGGER.info(f"\nAverage GPU memory usage: {avg_gpu_mem_usage:.2f}MB ± {std_gpu_mem_usage:.2f}MB")

# Label the row after the provider that actually ran, so CPU or TensorRT
# re-runs are not saved under the GPU label. The CUDA label is unchanged.
provider_labels = {
    "CUDAExecutionProvider": "GPU",
    "CPUExecutionProvider": "CPU",
    "TensorrtExecutionProvider": "TensorRT",
}
run_label = provider_labels.get(provider, provider)

# store results
results.append({
    "desc": f"stable_diffusion_ONNX_UNet_{run_label}",
    "avg_inference_time": avg_inference_time,
    "std_inference_time": std_inference_time,
    "avg_cpu_mem_usage": avg_cpu_mem_usage,
    "std_cpu_mem_usage": std_cpu_mem_usage,
    "avg_gpu_mem_usage": avg_gpu_mem_usage,
    "std_gpu_mem_usage": std_gpu_mem_usage,
})

Ooof, that's pretty bad compared to running the original Stable Diffusion UNet directly. That is probably because the PyTorch UNet has a bunch of optimizations which are not being used by the exported ONNX model (if they were even exported at all).

**Change the execution provider and device and rerun this notebook on CPU. Also try the TensorRT provider + CUDA.**

See `Analysis.md`. ONNX Runtime with CPU provider has the worst latency.

Save benchmark details as CSV

In [None]:
from tinydiffusion.utils.csv_utils import save_results_to_csv

# Hand the collected benchmark rows to the project's CSV helper.
save_results_to_csv(results)