# LLM-Pruner实践
> 参考链接：https://github.com/horseee/LLM-Pruner

In [5]:
import gc
import random
import torch
import gradio as gr
import numpy as np
from transformers import LlamaTokenizer
from LLMPruner.models.hf_llama.modeling_llama import LlamaForCausalLM, LlamaRMSNorm

import LLMPruner.torch_pruning as tp 
from LLMPruner.pruner import hf_llama_pruner as llama_pruner
from LLMPruner.datasets.example_samples import get_examples
from LLMPruner.templates.prompts import prompts

In [6]:
# Model identifier passed to from_pretrained for both the tokenizer and the model.
base_model = "Enoch/llama-7b-hf"
# Directory passed as cache_dir when the model is loaded.
cache_dir = "./llm_weights"
# Number of examples requested from get_examples inside the pruning loop.
num_examples = 10
iterative_steps = 1 # number of pruning iterations
# Value handed to TaylorImportance through its `taylor` argument.
taylor = 'param_first'

In [7]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# "<major>.<minor>" of the installed torch, parsed as a float.
# NOTE(review): a float cannot order versions reliably ("1.10" parses to 1.1),
# so take care when comparing this value against a threshold.
torch_version = float('.'.join(torch.__version__.split('.', 2)[:2]))

In [9]:
def set_random_seed(seed):
    """Seed Python's `random`, NumPy, and torch (CPU plus all CUDA devices) with `seed`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
# Fix the random seed so that results can be reproduced
set_random_seed(42)

In [22]:
# Load the tokenizer and the base model that will be pruned.
tokenizer = LlamaTokenizer.from_pretrained(base_model)

# Compare the torch version as an integer (major, minor) tuple. Parsing
# "1.10" as a float yields 1.1, which would wrongly fail a ">= 1.9" check
# and silently turn low_cpu_mem_usage off on torch 1.10-1.13.
_torch_major_minor = tuple(int(part) for part in torch.__version__.split('.')[:2])

model = LlamaForCausalLM.from_pretrained(
    base_model,
    cache_dir=cache_dir,
    low_cpu_mem_usage=_torch_major_minor >= (1, 9),
)
# Cast to half precision only when the model will not run on the CPU.
if device != "cpu":
    model.half()
# Last expression of the cell: moves the model and displays its structure.
model.to(device)

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
 

In [23]:
# Check the model's generation quality
def eval():
    """Sample one completion for every entry of `prompts` and print it.

    Puts the global `model` into eval mode and generates under
    `torch.no_grad()`. Sampling uses top_k=50, top_p=0.95, temperature=1
    and max_length=128. Each decoded sequence is printed with a
    "result: " prefix; nothing is returned.
    """
    sampling_args = dict(
        do_sample=True,
        top_k=50,
        max_length=128,
        top_p=0.95,
        temperature=1,
    )

    model.eval()
    with torch.no_grad():
        for text in prompts:
            encoded = tokenizer(text, return_tensors="pt")
            token_ids = encoded['input_ids'].to(device)

            generated = model.generate(input_ids=token_ids, **sampling_args)

            # Only the first returned sequence is decoded and shown.
            result = tokenizer.decode(generated[0])
            print(f"result: {result}")

In [24]:
# Check the model's generation quality before pruning
eval()

result: <s> I believe the meaning of life is to live everyday as though it is your last and never take anyone for granted. Everyone you meet is fighting their own battle and it's important to treat others with the compassion they need.
A woman must be twice as good as a man to go half as far, and that’s being conservative. That’s why woman have always had to work harder. ~ Elizabeth Cady Stanton</s>
result: <s>Simply put, the theory of relativity states that 1) the speed of light is a constant for all observers, and that 2) the laws of physics behave differently depending upon the moving speed of the observer. Einstein didn't invent either of those notions. The first had been observed in 1676 when Ole Rømer determined that the speed of light is constant from all moving objects on Earth. The second had been proposed in 1887 by Hendrik Lorentz and independently by Georges Lemaître (the former first). Rømer, Lorentz and L
result: <s>Building a website can be done in 10 simple steps:
1. Se

In [25]:
# Turn gradient tracking on for every parameter of the model
# (presumably so the later loss.backward() populates .grad - confirm).
model.requires_grad_(True)

# Trainable-parameter count prior to pruning, kept for the later comparison.
before_pruning_parameters = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)

# Only for building the dependency graph. Any input will be fine since the
# computation results are not taken into consideration.
forward_prompts = torch.tensor([
    [    1,   306,  4658,   278,  6593,   310,  2834,   338],
    [    1,  3439, 17632,  1925, 29892,   278,  6368,   310],
]).to(device)

In [26]:
# Compute importance with the Taylor-based criterion
imp = llama_pruner.TaylorImportance(group_reduction="sum", taylor=taylor)

In [27]:
# Arguments passed to the pruner
kwargs = {
        "importance": imp,
        "global_pruning": True,
        # Use the notebook-level setting instead of a second hard-coded 1, so
        # the pruner and the pruning loop (range(iterative_steps)) always agree.
        "iterative_steps": iterative_steps, # number of pruning iterations
        "ch_sparsity": 0.2, # sparsity ratio
        "ignored_layers":[],
        "channel_groups": {
        },
        # Maps each layer's q_proj to that layer's head_dim.
        "consecutive_groups": {
        layer.self_attn.q_proj: layer.self_attn.head_dim for layer in model.model.layers
        },
        "customized_pruners": {
        LlamaRMSNorm: llama_pruner.hf_rmsnorm_pruner,
        },
        "root_module_types": None, 
        # Prune q_proj and gate_proj of decoder layers 3..30
        # (range(3, 31) excludes 31; layers 0-2 and 31 are not roots).
        "root_instances": [model.model.layers[i].self_attn.q_proj for i in range(3, 31)] +
                        [model.model.layers[i].mlp.gate_proj for i in range(3, 31)]
}

In [31]:
# Print the layer index ranges being reported (3..30; the range end is exclusive).
print("Pruning Attention Layer = {}".format(list(range(3, 31))))
print("Pruning MLP Layer = {}".format(list(range(3,31))))

# Build the pruner from the model, the example input, and the settings in kwargs.
pruner = tp.pruner.MetaPruner(
        model,
        forward_prompts,
        **kwargs
)
# Clear any existing gradients before the backward pass below.
model.zero_grad()

print("Start Pruning")
for i in range(iterative_steps):
        # Request num_examples samples from 'c4' with seq_len=64 and move them to the device.
        example_prompts = get_examples('c4', tokenizer, num_examples, seq_len = 64).to(device)
        print("Start Backwarding in iterative steps = {}...".format(i))
                
        # Forward pass with the inputs reused as labels; backward() then fills the gradients.
        # NOTE(review): presumably these gradients feed the Taylor importance - confirm.
        loss = model(example_prompts, labels=example_prompts).loss
        print("Loss = {}".format(loss))
        loss.backward()

        # Perform one pruning step.
        pruner.step()

        # Count the parameters that still require gradients after this step.
        after_pruning_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print("After Iter {}/{}, #parameters: {}".format(i+1, iterative_steps, after_pruning_parameters))

        # modify inference-related attributes:
        # recompute each layer's head count from q_proj's current number of output rows.
        for layer in model.model.layers:
                layer.self_attn.num_heads = layer.self_attn.q_proj.weight.data.shape[0] // layer.self_attn.head_dim

Pruning Attention Layer = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
Pruning MLP Layer = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
Start Pruning
Start Backwarding in iterative steps = 0...
Loss = 2.48046875
torch.Size([309120]) 56
61824 0.2 309120
After Iter 1/1, #parameters: 5978476544


In [32]:
# Drop leftover gradients from the model, then release the pruner object.
model.zero_grad()
for param_name, param in model.named_parameters():
    # Only parameters whose name contains 'weight' are touched.
    if 'weight' not in param_name:
        continue
    param.grad = None
del pruner

In [33]:
# Report the parameter counts before and after pruning, and the remaining share in percent.
print(
    "#Param before: {}, #Param after: {}, Ratio = {:.4f}%".format(
        before_pruning_parameters,
        after_pruning_parameters,
        100.0*after_pruning_parameters/before_pruning_parameters,
    )
)

#Param before: 6738415616, #Param after: 5978476544, Ratio = 88.7223%


In [34]:
# torch.cuda.memory_allocated() reports bytes; two divisions by 1024 give MiB.
print("Memory Requirement: {} MiB\n".format(torch.cuda.memory_allocated()/1024/1024))

Memory Requirement: 26024.59228515625 MiB



In [35]:
# Run a garbage-collection pass, then ask torch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [36]:
# Save the pruned model
# The model and tokenizer objects are stored together in one file.
# NOTE(review): torch.save pickles whole objects by default, so the file should
# only ever be loaded from a trusted source - confirm this is acceptable.
torch.save({
            'model': model, 
            'tokenizer': tokenizer,
        }, 'model_llm_pruner.bin')

In [37]:
# Cast the pruned model to half precision and place it on the target device.
model.half()
model.to(device)

# Special-token ids: pad=0 (on both the model config and the tokenizer), bos=1, eos=2.
model.config.pad_token_id = 0
tokenizer.pad_token_id = 0
model.config.bos_token_id = 1
model.config.eos_token_id = 2

In [38]:
# Check the model's generation quality after pruning
eval()

result: <s>I believe the meaning of life is to find a job that is of real value, that involves real skills and real craftsmanship and real human interactions with real other people. The whole point of life is that in a sense the whole point of life is the whole point of life. Life does not have a point except life; life is life, the point is life; life itself, life is what life is.
Life is the art of taking, the process of keeping, and the use of an abundance. Life is living. Life is living. Life is life. Life is life. Life is life. Life
result: <s>Simply put, the theory of relativity states that 2 physical objects in motion relative to each other experience a difference in time and speed due to the relative speed of the 2 objects relative to each other. The difference is called relativistic time dilation.
The following video uses a clock with a ticking mechanism as a demonstration.
The video shows how 2 clocks are accelerating relative to each other and shows how their time rates diff

In [40]:
def evaluate(
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    max_new_tokens=128,
    stream_output=False,
    **kwargs,
):
    """Generate a completion for `input` with the global model and yield the decoded text.

    Args:
        input: Prompt text; tokenized with the global `tokenizer`.
        temperature: Sampling temperature. A value of None or <= 0 is not valid
            for sampling, so generation falls back to greedy decoding.
        top_p: Nucleus-sampling threshold forwarded to `generate`.
        top_k: Top-k cutoff forwarded to `generate` (converted to int).
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt (converted to int).
        stream_output: Accepted for interface compatibility; not used.
        **kwargs: Accepted and ignored.

    Yields:
        The decoded first output sequence, exactly once.
    """
    inputs = tokenizer(input, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)

    generate_args = {
        "input_ids": input_ids,
        # Pass the budget as max_new_tokens: max_length would also count the
        # prompt tokens, leaving little or nothing for long prompts.
        "max_new_tokens": int(max_new_tokens),
        "return_dict_in_generate": True,
    }
    if temperature is not None and temperature > 0:
        generate_args.update(
            do_sample=True,
            top_k=int(top_k),  # honour the caller's value instead of a fixed 50
            top_p=top_p,
            temperature=temperature,
        )
    else:
        # The UI slider allows 0; sampling cannot use that, so decode greedily.
        generate_args["do_sample"] = False

    with torch.no_grad():
        generation_output = model.generate(**generate_args)
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    yield output

In [41]:
# Test through a web UI; the address is ip:7860.
# The input widgets are listed in the same order as evaluate()'s parameters:
# input, temperature, top_p, top_k, max_new_tokens, stream_output.
input_widgets = [
    gr.components.Textbox(lines=2, label="Input", placeholder="none"),
    gr.components.Slider(minimum=0, maximum=1, value=1, label="Temperature"),
    gr.components.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
    gr.components.Slider(minimum=0, maximum=100, step=1, value=50, label="Top k"),
    gr.components.Slider(minimum=1, maximum=2000, step=1, value=128, label="Max tokens"),
    gr.components.Checkbox(label="Stream output"),
]
output_widgets = [
    gr.Textbox(lines=5, label="Output"),
]

demo = gr.Interface(
    fn=evaluate,
    inputs=input_widgets,
    outputs=output_widgets,
    title="Evaluate Pruned Model",
    description="Pruned Model",
)
# NOTE(review): server_name="0.0.0.0" together with share=True makes the demo
# reachable from outside this machine - confirm that this is intended.
demo.queue().launch(server_name="0.0.0.0", share=True)

Running on local URL:  http://0.0.0.0:7860
Running on public URL: https://38fc60b05fb5e39c23.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


