We choose a LoRA / distilled model because it is lighter, faster, and needs less VRAM. That makes it easier to experiment with and a good baseline for quantization/optimization later.

In [None]:
import os
import torch
from diffusers import StableDiffusionPipeline
import time
import statistics
import psutil

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Load a lightweight/distilled Stable Diffusion model (LoRA or small variant)

In [None]:
# The project root is taken to be the parent of the current working directory.
ROOT_DIR, _ = os.path.split(os.getcwd())
print(f"Root directory: {ROOT_DIR}")

In [None]:
# Hugging Face Hub id of the checkpoint to benchmark. This is a regular
# checkpoint, not a LoRA checkpoint.
# NOTE(review): the original note says this model is smaller than full
# SD 1.5 - confirm against the model cards before relying on it.
model_id = "stabilityai/stable-diffusion-2-base"

# Downloaded weights are cached under <ROOT_DIR>/checkpoints/stablediffusion.
model_cache_dir = os.path.join(ROOT_DIR, "checkpoints", "stablediffusion")

Below we load the fp16 variant (as opposed to downloading the fp32 variant and then converting to fp16). [Ref](https://huggingface.co/docs/diffusers/en/using-diffusers/loading#:~:text=There%20are%20two%20important%20arguments%20for%20loading%20variants%3A)

In [None]:
# Load pipeline
# Half precision is only requested on CUDA; any other device gets float32.
if device == "cuda":
    weights_dtype = torch.float16
else:
    weights_dtype = torch.float32

load_kwargs = {
    "cache_dir": model_cache_dir,
    "variant": "fp16",  # load the fp16 variant of the weights
    "torch_dtype": weights_dtype,
}
pipe = StableDiffusionPipeline.from_pretrained(model_id, **load_kwargs)

# Move the loaded pipeline onto the selected device.
pipe = pipe.to(device)

In [None]:
# Enable attention slicing on the pipeline to lower VRAM usage.
# NOTE(review): per the diffusers docs, attention slicing trades a small
# amount of speed for lower memory - it is not expected to make inference
# faster. Confirm the effect on the timings measured below.
pipe.enable_attention_slicing()

In [None]:
# Metrics: one entry is appended to each list per generated sample.
inference_time, cpu_mem_usage, gpu_mem_usage = [], [], []

In [None]:
def _mean_std(values):
    """Return ``(mean, sample standard deviation)`` of ``values``.

    ``statistics.mean`` needs at least one value and ``statistics.stdev`` at
    least two. Instead of raising, an empty input yields ``(nan, nan)`` and a
    single value yields a spread of ``0.0``.
    """
    if not values:
        return float("nan"), float("nan")
    spread = statistics.stdev(values) if len(values) > 1 else 0.0
    return statistics.mean(values), spread


prompt = (
    "A whale falling through a starry sky beside a floating bowl of petunias, painted in a surreal "
    "cosmic landscape, whimsical and dreamlike, detailed digital art."
)
num_samples = 10

process = psutil.Process(os.getpid())

GEN_IMG_SAVE_PATH = os.path.join(ROOT_DIR, "results", "generated_images")
os.makedirs(GEN_IMG_SAVE_PATH, exist_ok=True)

print(f"Generating images. Will be saved to: {GEN_IMG_SAVE_PATH}")

# Start from empty metric lists so that re-running this cell does not mix
# measurements from several runs into one average.
for metric in (inference_time, cpu_mem_usage, gpu_mem_usage):
    metric.clear()

for i in range(num_samples):
    if device == "cuda":
        # Reset the peak counter so the reading below covers this sample only.
        torch.cuda.reset_peak_memory_stats()

    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    start_time = time.perf_counter()
    image = pipe(prompt, guidance_scale=7.5, num_inference_steps=50).images[0]
    elapsed = time.perf_counter() - start_time
    inference_time.append(elapsed)

    # Memory usage - START
    cpu_mem_usage.append(process.memory_info().rss / (1024**2))  # MB

    if device == "cuda":
        # Peak allocation during the call; the amount still allocated after
        # the call returns would miss the inference-time activations.
        gpu_mem_usage.append(torch.cuda.max_memory_allocated() / (1024**2))  # MB
    # Memory usage - END

    # Save first, then report, so the log line is only printed on success.
    sample_name = f"sample_{i}.png"
    image.save(os.path.join(GEN_IMG_SAVE_PATH, sample_name))
    print(f"Saved {sample_name} | Inference time: {elapsed:.2f}s")

time_mean, time_std = _mean_std(inference_time)
print(f"\nAverage inference time: {time_mean:.2f}s ± {time_std:.2f}s")

cpu_mean, cpu_std = _mean_std(cpu_mem_usage)
print(f"\nAverage CPU memory usage: {cpu_mean:.2f}MB ± {cpu_std:.2f}MB")

if device == "cuda":
    gpu_mean, gpu_std = _mean_std(gpu_mem_usage)
    print(f"\nAverage peak GPU memory usage: {gpu_mean:.2f}MB ± {gpu_std:.2f}MB")