In [1]:
%%bash
set -e

echo "==[0] Check GPU (nvidia-smi) =="
nvidia-smi || (echo "ERROR: GPU not enabled. Runtime -> Change runtime type -> GPU" && exit 1)

echo "==[1] Install virtualenv (system) =="
python -m pip -q install --upgrade pip virtualenv

echo "==[2] Recreate clean venv via virtualenv =="
rm -rf /content/step5_venv
python -m virtualenv -p python3 /content/step5_venv
source /content/step5_venv/bin/activate
python -V
pip -q install --upgrade pip

echo "==[3] Install PyTorch GPU (cu121 pinned) =="
pip -q install --index-url https://download.pytorch.org/whl/cu121 \
  torch==2.5.1+cu121 torchvision==0.20.1+cu121 torchaudio==2.5.1+cu121

echo "==[4] Install diffusers stack (pinned) =="
pip -q install \
  diffusers==0.36.0 transformers==4.57.3 accelerate==1.12.0 safetensors==0.7.0

echo "==[5] Install xformers (NO-DEPS to avoid torch being replaced) =="
pip -q install xformers==0.0.29.post1 --no-deps

echo "==[6] Install ipykernel + register kernel =="
pip -q install ipykernel
python -m ipykernel install --user --name step5_venv --display-name "Python (step5_venv)"

echo
echo "✅ Done. Now switch kernel to: Runtime -> Change runtime type -> Kernel -> Python (step5_venv)"


==[0] Check GPU (nvidia-smi) ==
Tue Jan 13 03:58:23 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   65C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                

In [2]:
# Cell — Step5 Final Sanity + Hard Switches (baseline vs xformers vs compile-ready)
import os, sys, json, time, platform, statistics
import torch
from diffusers import StableDiffusionPipeline

print("=== ENV (must be consistent across runs) ===")
ENV = {
    "python": platform.python_version(),
    "executable": sys.executable,
    "torch": torch.__version__,
    "cuda_available": torch.cuda.is_available(),
    "cuda_version": torch.version.cuda,
    "gpu": torch.cuda.get_device_name(0) if torch.cuda.is_available() else None,
}
try:
    import diffusers
    ENV["diffusers"] = diffusers.__version__
except Exception as e:
    ENV["diffusers"] = f"NA ({e})"
try:
    import xformers
    ENV["xformers"] = xformers.__version__
except Exception as e:
    ENV["xformers"] = f"NA ({e})"

print(json.dumps(ENV, indent=2))
assert torch.cuda.is_available(), "CUDA must be available (T4). Stop."

# Fixed config (DoD)
CFG = {
    "model_id": "runwayml/stable-diffusion-v1-5",
    "prompt": "a cinematic photo of a corgi astronaut, ultra detailed, sharp focus",
    "negative_prompt": "blurry, low quality, artifacts",
    "seed": 42,
    "steps": 20,
    "guidance_scale": 7.5,
    "height": 512,
    "width": 512,
    "batch_size": 1,
    "warmup": 2,
    "runs": 8,
}
print("=== CFG ===")
print(json.dumps(CFG, indent=2))

DEVICE = "cuda"
DTYPE = torch.float16

def seed_everything(seed: int):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def make_pipe(enable_xf: bool):
    pipe = StableDiffusionPipeline.from_pretrained(
        CFG["model_id"],
        torch_dtype=DTYPE,
        safety_checker=None,
    ).to(DEVICE)
    pipe.set_progress_bar_config(disable=True)

    # HARD control: explicitly enable/disable xformers
    if enable_xf:
        pipe.enable_xformers_memory_efficient_attention()
        xf_state = "ENABLED"
    else:
        # this call exists in diffusers for turning it off
        try:
            pipe.disable_xformers_memory_efficient_attention()
            xf_state = "DISABLED"
        except Exception:
            xf_state = "DISABLED (no-op)"
    return pipe, xf_state

@torch.inference_mode()
def run_once(pipe):
    seed_everything(CFG["seed"])
    g = torch.Generator(device=DEVICE).manual_seed(CFG["seed"])
    img = pipe(
        prompt=[CFG["prompt"]] * CFG["batch_size"],
        negative_prompt=[CFG["negative_prompt"]] * CFG["batch_size"],
        num_inference_steps=CFG["steps"],
        guidance_scale=CFG["guidance_scale"],
        height=CFG["height"],
        width=CFG["width"],
        generator=g,
    ).images[0]
    return img

def bench(enable_xf: bool):
    pipe, xf_state = make_pipe(enable_xf)
    # warmup
    for _ in range(CFG["warmup"]):
        _ = run_once(pipe)
    torch.cuda.synchronize()

    times = []
    for _ in range(CFG["runs"]):
        t0 = time.perf_counter()
        _ = run_once(pipe)
        torch.cuda.synchronize()
        times.append(time.perf_counter() - t0)

    return {
        "xformers": xf_state,
        "mean_s": statistics.mean(times),
        "std_s": statistics.pstdev(times),
        "n": len(times),
    }

print("\n✅ Sanity OK. Next step: run two benches (baseline disable vs xformers enable).")


Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.
Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.


=== ENV (must be consistent across runs) ===
{
  "python": "3.12.12",
  "executable": "/usr/bin/python3",
  "torch": "2.9.0+cu126",
  "cuda_available": true,
  "cuda_version": "12.6",
  "gpu": "Tesla T4",
  "diffusers": "0.36.0",
  "xformers": "NA (No module named 'xformers')"
}
=== CFG ===
{
  "model_id": "runwayml/stable-diffusion-v1-5",
  "prompt": "a cinematic photo of a corgi astronaut, ultra detailed, sharp focus",
  "negative_prompt": "blurry, low quality, artifacts",
  "seed": 42,
  "steps": 20,
  "guidance_scale": 7.5,
  "height": 512,
  "width": 512,
  "batch_size": 1,
  "warmup": 2,
  "runs": 8
}

✅ Sanity OK. Next step: run two benches (baseline disable vs xformers enable).


In [4]:
# Cell — Run baseline vs xformers (safe, never crash)
import importlib, time, statistics
import torch

def _xformers_status():
    """
    Returns (import_ok, diffusers_flag, reason)
    - import_ok: can import xformers
    - diffusers_flag: diffusers thinks xformers is available
    """
    import_ok = False
    reason = None
    try:
        import xformers  # noqa: F401
        import_ok = True
    except Exception as e:
        return False, False, f"import failed: {e}"

    try:
        from diffusers.utils.import_utils import is_xformers_available
        flag = bool(is_xformers_available())
        if not flag:
            reason = "diffusers_flag=False (wheel/torch/cuda mismatch or wrong env)"
        return import_ok, flag, reason
    except Exception as e:
        return import_ok, False, f"diffusers check failed: {e}"

def bench(enable_xf: bool, runs=6, warmup=2):
    from diffusers import StableDiffusionPipeline
    import gc

    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device == "cuda" else torch.float32

    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=dtype,   # diffusers will warn it's deprecated; ok for now
        safety_checker=None,
    ).to(device)

    import_ok, diff_flag, reason = _xformers_status()
    xf_used = False
    xf_note = "disabled"

    if enable_xf:
        if import_ok and diff_flag:
            try:
                pipe.enable_xformers_memory_efficient_attention()
                xf_used = True
                xf_note = "enabled"
            except Exception as e:
                xf_used = False
                xf_note = f"NOT usable (enable failed: {e})"
        else:
            xf_used = False
            xf_note = f"NOT usable (import_ok={import_ok}, diffusers_flag={diff_flag}, reason={reason})"

    # Warmup + timing
    g = torch.Generator(device=device).manual_seed(42)
    prompt = "a cinematic photo of a corgi astronaut, ultra detailed, sharp focus"
    neg = "blurry, low quality, artifacts"

    def _run_one():
        if device == "cuda":
            torch.cuda.synchronize()
        t0 = time.time()
        _ = pipe(prompt=prompt, negative_prompt=neg, num_inference_steps=20,
                 guidance_scale=7.5, height=512, width=512, generator=g)
        if device == "cuda":
            torch.cuda.synchronize()
        return time.time() - t0

    times = []
    for _ in range(warmup):
        _run_one()
    for _ in range(runs):
        times.append(_run_one())

    # cleanup a bit
    del pipe
    gc.collect()
    if device == "cuda":
        torch.cuda.empty_cache()

    return {
        "mean_s": float(statistics.mean(times)),
        "std_s": float(statistics.pstdev(times)) if len(times) > 1 else 0.0,
        "n": len(times),
        "xformers": xf_note,
        "xformers_used": xf_used
    }

# --- run two modes safely ---
baseline = bench(enable_xf=False, runs=6, warmup=2)
xfast = bench(enable_xf=True, runs=6, warmup=2)

base = baseline["mean_s"]
xf = xfast["mean_s"]
speedup = base / xf if xf else None

print("\n=== RESULTS ===")
print("baseline:", baseline)
print("xformers:", xfast)
print(f"speedup: {speedup:.2f}x" if speedup else "speedup: NA")

print("\n=== README TABLE ===")
print("| Mode | Mean (s/img) | Std | N | Speedup vs Baseline | Notes |")
print("|---|---:|---:|---:|---:|---|")
print(f"| baseline | {base:.3f} | {baseline['std_s']:.3f} | {baseline['n']} | 1.00x | {baseline['xformers']} |")
print(f"| xformers | {xf:.3f} | {xfast['std_s']:.3f} | {xfast['n']} | {speedup:.2f}x | {xfast['xformers']} |")


Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]


=== RESULTS ===
baseline: {'mean_s': 3.80859367052714, 'std_s': 0.6303337786531851, 'n': 6, 'xformers': 'disabled', 'xformers_used': False}
xformers: {'mean_s': 3.154710133870443, 'std_s': 0.02672731235213691, 'n': 6, 'xformers': "NOT usable (import_ok=False, diffusers_flag=False, reason=import failed: No module named 'xformers')", 'xformers_used': False}
speedup: 1.21x

=== README TABLE ===
| Mode | Mean (s/img) | Std | N | Speedup vs Baseline | Notes |
|---|---:|---:|---:|---:|---|
| baseline | 3.809 | 0.630 | 6 | 1.00x | disabled |
| xformers | 3.155 | 0.027 | 6 | 1.21x | NOT usable (import_ok=False, diffusers_flag=False, reason=import failed: No module named 'xformers') |


✅ Step 5 — Inference Speedup Result Summary (torch.compile + xFormers)

Environment sanity

GPU: Tesla T4

Torch: 2.9.0+cu126

CUDA: 12.6 (runtime reports 12.6; driver shows 12.4, acceptable on Colab)

Diffusers: 0.36.0

xFormers: Not installed in current kernel (import fails)

Benchmark config (fixed)

model: runwayml/stable-diffusion-v1-5

steps=20, size=512×512, batch=1, seed=42

warmup=2, timed runs=6 (bench cell used runs=6)

Measured speed

Mode	Mean (s/img)	Std	N	Speedup vs Baseline	Notes
baseline	(baseline mean)	(std)	6	1.00x	xformers disabled
xformers	(xformers mean)	(std)	6	(speedup)	xformers NOT usable (import failed / not in current kernel)

Interpretation

当前对比中，“xformers 模式”实际上并未启用 xFormers kernel，因此该行结果属于 fallback run（等价于未启用 xFormers）。

因此，speedup 数值不具有代表性，不能写成“xFormers 带来 X 倍加速”。

下一步需要在 Colab 中切换到 Python (step5_venv) 内核（或确保 xformers 安装在当前 kernel），再重复相同配置 benchmark，才能得到有效的 xFormers 加速对比结论。

torch.compile status

本轮未进行 torch.compile 稳定性测试（或因环境未对齐暂缓）。

建议在 kernel 对齐后，对 pipe.unet 尝试 torch.compile()，若失败则记录 error 并回退到 eager 模式。