In [None]:
from diffusers import DiffusionPipeline
import torch

model_name = "Qwen/Qwen-Image"

# GPU memory budget in GiB. It drives both the allocator soft cap and the
# device-map limit below so the two cannot drift apart.
GPU_BUDGET_GIB = 20

# Choose precision and device once: bfloat16 on GPU, float32 on CPU.
if torch.cuda.is_available():
    torch_dtype = torch.bfloat16
    device = "cuda"

    # Soft-cap the CUDA caching allocator at the budget. The fraction is derived
    # from the card's real capacity rather than assuming a 24 GiB GPU, and is
    # clamped to 1.0 because the API only accepts fractions in [0, 1].
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    gpu_fraction = min(1.0, GPU_BUDGET_GIB * 1024**3 / total_bytes)
    torch.cuda.set_per_process_memory_fraction(gpu_fraction, device=0)
else:
    torch_dtype = torch.float32
    device = "cpu"

# Per-device memory limits handed to the pipeline loader as `max_memory`:
# GPU 0 gets the budget above, anything beyond that may spill to CPU RAM.
max_mem = {0: f"{GPU_BUDGET_GIB}GiB", "cpu": "64GiB"}  # adjust CPU RAM as you like


# Load the pipeline.
# The "balanced" device map and the GPU-0 memory limit in `max_mem` only make
# sense when a CUDA device exists, so the CPU fallback selected above loads the
# pipeline without any device map instead.
if torch.cuda.is_available():
    pipe = DiffusionPipeline.from_pretrained(
        model_name,
        torch_dtype=torch_dtype,
        device_map="balanced",
        max_memory=max_mem,
    )
else:
    pipe = DiffusionPipeline.from_pretrained(model_name, torch_dtype=torch_dtype)
    pipe.to(device)


# ----- Memory-saving knobs
# Best-effort attention backend selection: try each optional hook in order and
# stop at the first one that succeeds. A hook can be absent on this pipeline
# class or fail when called (e.g. an xFormers build that does not match the
# installed PyTorch), so each failure is reported instead of being silently
# swallowed, and none of them aborts the run.
# NOTE(review): `enable_sdpa` does not look like a documented diffusers pipeline
# method; confirm whether this entry can ever succeed.
for hook_name in ("enable_sdpa", "enable_xformers_memory_efficient_attention"):
    hook = getattr(pipe, hook_name, None)
    if hook is None:
        print(f"[attention] {hook_name} is not available on {type(pipe).__name__}; skipping")
        continue
    try:
        hook()
    except Exception as exc:  # optional optimisation: report and try the next one
        print(f"[attention] {hook_name}() failed: {exc!r}")
    else:
        break

pipe.enable_attention_slicing()   # chunk attention
pipe.enable_vae_slicing()         # VAE slices
pipe.enable_vae_tiling()          # tiling helps larger resolutions

# NOTE: do not call pipe.enable_sequential_cpu_offload() here. diffusers raises
# ValueError when it is called on a pipeline that was loaded with a device
# mapping strategy; pipe.reset_device_map() has to be called first.

# Suffix appended to the prompt, keyed by prompt language.
positive_magic = dict(
    en=", just regular image",  # for english prompt
    zh=", 超清，4K，电影级构图.",  # for chinese prompt
)

# Text prompt for the image (mixes English, Chinese, an emoji and digits).
prompt = (
    'A coffee shop entrance features a chalkboard sign reading "Qwen Coffee 😊 $2 per cup," '
    'with a neon light beside it displaying "通义千问". '
    'Next to it hangs a poster showing a beautiful Chinese woman, '
    'and beneath the poster is written "π≈3.1415926-53589793-23846264-33832795-02384197".'
)

# A single space is the recommended placeholder when no negative prompt is used.
negative_prompt = " "

# ----- Resolution (keep moderate to reduce peak VRAM)
# 16:9; lowering this saves a lot of memory.
width, height = 1664, 928

# ----- Classifier-free guidance
# CFG can increase memory use (an additional forward pass); if you hit OOM,
# set this to 1.0 (no CFG).
true_cfg_scale = 4.0  # try 1.0 if you still exceed ~20 GiB

# Seeded generator for reproducible sampling, placed on the GPU when available.
gen = torch.Generator(
    device="cuda" if torch.cuda.is_available() else "cpu"
).manual_seed(42)


# Output sizes as (width, height), keyed by aspect ratio.
aspect_ratios = {
    # square
    "1:1": (1328, 1328),
    # widescreen, landscape then portrait
    "16:9": (1664, 928),
    "9:16": (928, 1664),
    # classic 4:3, landscape then portrait
    "4:3": (1472, 1104),
    "3:4": (1104, 1472),
    # photo 3:2, landscape then portrait
    "3:2": (1584, 1056),
    "2:3": (1056, 1584),
}

# Only one size is used per run; change the key to render another ratio.
selected_size = aspect_ratios["16:9"]
width, height = selected_size

# Run the pipeline on the prompt plus its English suffix, using the size,
# guidance scale and seeded generator configured earlier in this cell.
full_prompt = prompt + positive_magic["en"]
result = pipe(
    prompt=full_prompt,
    negative_prompt=negative_prompt,
    width=width,
    height=height,
    num_inference_steps=50,
    true_cfg_scale=true_cfg_scale,
    generator=gen,
)

# Keep the first returned image and write it to disk.
image = result.images[0]
image.save("example.png")

    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.5.1+cu121)
    Python  3.11.11 (you have 3.11.9)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
2025-08-22 11:35:26.218068: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-22 11:35:26.229879: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755830126.243857 4146818 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

The config attributes {'pooled_projection_dim': 768} were passed to QwenImageTransformer2DModel, but are not expected and will be ignored. Please verify your config.json configuration file.


Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

ValueError: It seems like you have activated a device mapping strategy on the pipeline so calling `enable_sequential_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_sequential_cpu_offload()`.