In [None]:
# 首先启用代理
import subprocess
import os

# Source /etc/network_turbo in bash, then copy every "proxy" variable it leaves in
# the environment into this process's os.environ.
# An argv list is used so bash is started directly instead of via an extra `sh -c`.
result = subprocess.run(
    ["bash", "-c", "source /etc/network_turbo && env | grep proxy"],
    capture_output=True,
    text=True,
)
# Each matching line has the form NAME=value; split only on the first '=' so values
# that themselves contain '=' are kept intact.
proxy_vars = dict(
    line.split('=', 1) for line in result.stdout.splitlines() if '=' in line
)
os.environ.update(proxy_vars)
if not proxy_vars:
    # Best effort: keep running, but make the missing proxy visible instead of silent.
    print(f"Warning: no proxy variables were exported (exit code {result.returncode}): {result.stderr.strip()}")

# torch.compile加速

In [None]:
import time
import torch
import cache_dit
from diffusers import ZImagePipeline, AutoModel, PyramidAttentionBroadcastConfig
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig

def load_model(model_name='Tongyi-MAI/Z-Image-Turbo', cache_dir='/root/autodl-tmp/Model'):
    """Assemble a ZImagePipeline whose transformer and text encoder are loaded in 4-bit NF4.

    Args:
        model_name: Model identifier passed to every ``from_pretrained`` call.
        cache_dir: Cache directory passed to every ``from_pretrained`` call, including
            the final pipeline load, so all components use the same location.

    Returns:
        The ``ZImagePipeline`` built from the two quantized sub-models.
    """
    device = "cuda"

    # Quantization settings common to the diffusers and transformers configs.
    nf4_kwargs = dict(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
    # Loader arguments common to all three from_pretrained calls.
    shared_load_kwargs = dict(
        cache_dir=cache_dir,
        torch_dtype=torch.bfloat16,
        mirror='https://hf-mirror.com',
    )

    # NOTE(review): the skipped module name is passed through as-is; confirm it
    # matches a module that actually exists in the Z-Image transformer.
    transformer = AutoModel.from_pretrained(
        model_name,
        subfolder="transformer",
        quantization_config=DiffusersBitsAndBytesConfig(
            **nf4_kwargs,
            llm_int8_skip_modules=["transformer_blocks.0.img_mod"],
        ),
        device_map="auto",
        **shared_load_kwargs,
    )

    text_encoder = AutoModel.from_pretrained(
        model_name,
        subfolder="text_encoder",
        quantization_config=TransformersBitsAndBytesConfig(**nf4_kwargs),
        device_map="auto",
        **shared_load_kwargs,
    )

    # cache_dir is forwarded here too; previously only the two sub-models above
    # received it, so the pipeline's remaining components used the default cache.
    pipe = ZImagePipeline.from_pretrained(
        model_name,
        transformer=transformer,
        text_encoder=text_encoder,
        device_map=device,
        **shared_load_kwargs,
    )
    return pipe

def image_generate(pipeline, prompt, special_name, seed=10086, device="cuda"):
    """Generate one 1024x1024 image with ``pipeline`` and optionally save it.

    Args:
        pipeline: Callable invoked with keyword arguments; its result must expose
            an ``images`` sequence.
        prompt: Text prompt forwarded to the pipeline.
        special_name: Filename prefix for the saved PNG. When ``None`` nothing is
            written, which is how the benchmark loops call this function.
        seed: Seed applied to the random generator passed to the pipeline.
        device: Device on which the ``torch.Generator`` is created.

    Returns:
        The first image produced by the pipeline.
    """
    generator = torch.Generator(device).manual_seed(seed)

    s_time = time.time()
    image = pipeline(
        prompt=prompt,
        height=1024,
        width=1024,
        num_inference_steps=10,
        guidance_scale=0.0,
        generator=generator,
    ).images[0]
    elapsed = time.time() - s_time

    # The elapsed time only matters for the filename, so skip saving when unnamed.
    if special_name is not None:
        image.save(f"./{special_name}-{elapsed:.2f}.png")
    return image

In [None]:
# Benchmark: generation with the transformer compiled.
N_COMPILED_RUNS = 5

z_image_pipeline = load_model()
z_image_pipeline.transformer.compile()

# Compilation happens lazily on the first forward pass, so run one untimed-for-the-
# benchmark warm-up call and report its cost separately.
s_time = time.time()
image_generate(z_image_pipeline,
               "Realistic mid-aged male image",
               None)
print(f"Warm-up (incl. compile) Time: {time.time()- s_time:.2f}")

s_time = time.time()
for _ in range(N_COMPILED_RUNS):
    image_generate(z_image_pipeline,
                   "Realistic mid-aged male image",
                   None)
total_time = time.time() - s_time
print(f"Used Time: {total_time:.2f}")
# Per-image average makes this comparable with runs that use a different loop count.
print(f"Avg Time per image: {total_time / N_COMPILED_RUNS:.2f}")

In [None]:
# Benchmark: generation with a freshly loaded pipeline (no compile call in this cell).
import gc

N_BASELINE_RUNS = 10

# Drop any pipeline left over from an earlier cell before loading a new one, so the
# old and new models are not both referenced during the load.
globals().pop("z_image_pipeline", None)
gc.collect()
torch.cuda.empty_cache()

z_image_pipeline = load_model()
s_time = time.time()
for _ in range(N_BASELINE_RUNS):
    image_generate(z_image_pipeline,
                   "Realistic mid-aged male image",
                   None)
total_time = time.time() - s_time
print(f"Used Time: {total_time:.2f}")
# Per-image average makes this comparable with runs that use a different loop count.
print(f"Avg Time per image: {total_time / N_BASELINE_RUNS:.2f}")

# CacheDit使用  
## 基础使用方式
```python
cache_dit.enable_cache(dit_pipeline)
```
直接运行上面代码就可以启动cache_dit。
## BlockAdapter使用方式  
BlockAdapter主要是为了解决这样的问题：自定义的模型中使用了自定义的模型层，而这个模型层在cache-dit中不存在。比如说z-image的模型里有新定义的层，或者修改了其中某些层。
```python
...
(noise_refiner): ModuleList(
    (0-1): 2 x ZImageTransformerBlock(
      (attention): Attention(
        (norm_q): RMSNorm()
        (norm_k): RMSNorm()
        (to_q): Linear4bit(in_features=3840, out_features=3840, bias=False)
        (to_k): Linear4bit(in_features=3840, out_features=3840, bias=False)
        (to_v): Linear4bit(in_features=3840, out_features=3840, bias=False)
        (to_out): ModuleList(
          (0): Linear4bit(in_features=3840, out_features=3840, bias=False)
          (1): Dropout(p=0.0, inplace=False)
        )
      )
...
```
比如说`all_final_layer`就是修改过的层。BlockAdapter可以帮助您快速将各种缓存加速功能应用于自定义的Diffusion Pipelines和Transformers，可以直接这样使用：
```python
from cache_dit import ForwardPattern, BlockAdapter
cache_dit.enable_cache(
    BlockAdapter(
        pipe= z_image_pipeline,
        transformer=z_image_pipeline.transformer,
        blocks=[
            z_image_pipeline.transformer.noise_refiner,
        ],
    ),
)
```

In [None]:
import time
import torch
import cache_dit
from diffusers import ZImagePipeline, AutoModel, PyramidAttentionBroadcastConfig
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig

def load_model(model_name='Tongyi-MAI/Z-Image-Turbo', cache_dir='/root/autodl-tmp/Model'):
    """Assemble a ZImagePipeline whose transformer and text encoder are loaded in 4-bit NF4.

    Args:
        model_name: Model identifier passed to every ``from_pretrained`` call.
        cache_dir: Cache directory passed to every ``from_pretrained`` call, including
            the final pipeline load, so all components use the same location.

    Returns:
        The ``ZImagePipeline`` built from the two quantized sub-models.
    """
    device = "cuda"

    # Quantization settings common to the diffusers and transformers configs.
    nf4_kwargs = dict(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
    # Loader arguments common to all three from_pretrained calls.
    shared_load_kwargs = dict(
        cache_dir=cache_dir,
        torch_dtype=torch.bfloat16,
        mirror='https://hf-mirror.com',
    )

    # NOTE(review): the skipped module name is passed through as-is; confirm it
    # matches a module that actually exists in the Z-Image transformer.
    transformer = AutoModel.from_pretrained(
        model_name,
        subfolder="transformer",
        quantization_config=DiffusersBitsAndBytesConfig(
            **nf4_kwargs,
            llm_int8_skip_modules=["transformer_blocks.0.img_mod"],
        ),
        device_map="auto",
        **shared_load_kwargs,
    )

    text_encoder = AutoModel.from_pretrained(
        model_name,
        subfolder="text_encoder",
        quantization_config=TransformersBitsAndBytesConfig(**nf4_kwargs),
        device_map="auto",
        **shared_load_kwargs,
    )

    # cache_dir is forwarded here too; previously only the two sub-models above
    # received it, so the pipeline's remaining components used the default cache.
    pipe = ZImagePipeline.from_pretrained(
        model_name,
        transformer=transformer,
        text_encoder=text_encoder,
        device_map=device,
        **shared_load_kwargs,
    )
    return pipe

def image_generate(pipeline, prompt, special_name, seed=10086):
    """Run ``pipeline`` once for ``prompt`` and save the first image as a PNG.

    The file is written to the current directory and named
    ``<special_name>-<elapsed seconds>.png``. Nothing is returned.

    Args:
        pipeline: Callable invoked with keyword arguments; its result must expose
            an ``images`` sequence whose first item has a ``save`` method.
        prompt: Text prompt forwarded to the pipeline.
        special_name: Prefix used in the output filename.
        seed: Seed applied to the CUDA random generator passed to the pipeline.
    """
    s_time = time.time()
    # Fixed generation settings: 1024x1024, 10 steps, guidance disabled.
    call_kwargs = {
        "prompt": prompt,
        "height": 1024,
        "width": 1024,
        "num_inference_steps": 10,
        "guidance_scale": 0.0,
        "generator": torch.Generator("cuda").manual_seed(seed),
    }
    result = pipeline(**call_kwargs)
    image = result.images[0]
    e_time = time.time()
    # Elapsed wall-clock time is embedded in the filename.
    image.save(f"./{special_name}-{e_time- s_time:.2f}.png")

# Build the pipeline once with the default model and cache directory; the cells
# below read this global.
z_image_pipeline = load_model()

In [None]:
# Print the pipeline's transformer so its sub-modules (e.g. `noise_refiner`) can be
# looked up by name.
print(z_image_pipeline.transformer)

In [None]:
# Using BlockAdapter: enable cache-dit on the pipeline, then generate one image.
from cache_dit import ForwardPattern, BlockAdapter

dit_transformer = z_image_pipeline.transformer

# Only the transformer's `noise_refiner` module list is handed to the adapter.
# NOTE(review): ForwardPattern is imported but no forward_pattern is passed here —
# confirm against the cache-dit docs whether BlockAdapter needs one.
noise_refiner_adapter = BlockAdapter(
    pipe=z_image_pipeline,
    transformer=dit_transformer,
    blocks=[dit_transformer.noise_refiner],
)
cache_dit.enable_cache(noise_refiner_adapter)

# The output file is prefixed with "BlockAdapter".
image_generate(
    z_image_pipeline,
    "Realistic mid-aged male image",
    "BlockAdapter",
)