In [1]:
from common import init_timer_registry
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from ds2_v1 import DeepseekV2DecoderLayerTimed as v1_layer
import transformers.models.deepseek_v2.modeling_deepseek_v2 as ds2
# Swap the stock decoder layer class for the timed variant. This has to happen
# before the model is instantiated so the layers are built from the patched class
# (the module repr below shows DeepseekV2DecoderLayerTimed in the layer list).
ds2.DeepseekV2DecoderLayer = v1_layer

model_name = "deepseek-ai/DeepSeek-V2-Lite"

# Fetch only the config first: the timer registry is sized from the number of
# hidden layers, which is known without loading any weights.
cfg = AutoConfig.from_pretrained(model_name)
init_timer_registry(num_layers=cfg.num_hidden_layers, keep_history=True)

# Tokenizer and weights. dtype="auto" defers the dtype choice to the library
# (see the transformers from_pretrained docs); the whole model goes on cuda:0.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", device_map="cuda:0")

# Last expression of the cell: puts the model in eval mode and echoes its module tree.
model.eval()

`rope_scaling`'s factor field must be a float >= 1, got 40
`rope_scaling`'s beta_fast field must be a float, got 32
`rope_scaling`'s beta_slow field must be a float, got 1
`rope_scaling`'s factor field must be a float >= 1, got 40
`rope_scaling`'s beta_fast field must be a float, got 32
`rope_scaling`'s beta_slow field must be a float, got 1


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

DeepseekV2ForCausalLM(
  (model): DeepseekV2Model(
    (embed_tokens): Embedding(102400, 2048)
    (layers): ModuleList(
      (0): DeepseekV2DecoderLayerTimed(
        (self_attn): DeepseekV2Attention(
          (q_proj): Linear(in_features=2048, out_features=3072, bias=False)
          (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False)
          (kv_a_layernorm): DeepseekV2RMSNorm((512,), eps=1e-06)
          (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): DeepseekV2MLP(
          (gate_proj): Linear(in_features=2048, out_features=10944, bias=False)
          (up_proj): Linear(in_features=2048, out_features=10944, bias=False)
          (down_proj): Linear(in_features=10944, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): DeepseekV2RMSNorm((2048,), eps=1e-06)
        (post_attention_l

In [2]:
# Prompt batch used by the warm-up and the timed run in the next cell.
text_list = ["explain the qwen"]

# Pad on the left side. With a single prompt no padding is actually inserted
# (the attention mask echoed below is all ones).
tokenizer.padding_side = "left"
input_001 = tokenizer(
    text_list,
    padding=True,
    truncation=True,
    return_tensors="pt",
).to(model.device)

# Echo the encoded batch (input_ids + attention_mask) as the cell output.
input_001

{'input_ids': tensor([[100000,  55377,    254,   4662,  20881]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1]], device='cuda:0')}

In [3]:
import common
import torch
# Warm-up pass on the same prompts before measuring.
# NOTE(review): the trailing 10 is presumably the number of warm-up iterations —
# confirm against common.warmup_model.
common.warmup_model(model, tokenizer, text_list, 10)

# Re-initialise the timer registry right before the measured run
# (presumably so that warm-up timings are not included — confirm in common).
init_timer_registry(model.config.num_hidden_layers, keep_history=True)

# Measured run: one generate() call producing up to 128 new tokens. The summary
# printed below reports both (PF) and (DEC) columns for it.
# NOTE(review): no seed or do_sample is set here; if the model's generation config
# enables sampling, the generated length can vary between runs — confirm.
with torch.no_grad():
    _ = model.generate(max_new_tokens=128, **input_001)

# Block until all queued CUDA work has finished before the timers are reported.
torch.cuda.synchronize()

# Print the results
common.print_timers_summary()

=== Per-layer (ms) ===
layer	attn(PF)		mlp(PF)		gating(PF)		softmax(PF)		expert(PF)		norm(PF)		router(PF)		dispatch(PF)		compute(PF)		aggregate(PF)	||	attn(DEC)		mlp(DEC)		gating(DEC)		softmax(DEC)		expert(DEC)		norm(DEC)		router(DEC)		dispatch(DEC)		compute(DEC)		aggregate(DEC)
L00	0.437		0.127		0.000		0.000		0.000		0.071		0.000		0.000		0.000		0.000	||	50.131		15.317		0.000		0.000		0.000		8.372		0.000		0.000		0.000		0.000
L01	0.360		3.123		0.000		0.000		2.945		0.063		0.079		13.769		175.776		8.186	||	41.158		225.238		0.000		0.000		211.435		7.790		9.062		0.000		0.000		0.000
L02	0.381		3.084		0.000		0.000		2.960		0.067		0.085		13.115		174.854		7.972	||	44.554		222.859		0.000		0.000		209.244		7.956		8.990		0.000		0.000		0.000
L03	0.379		3.179		0.000		0.000		3.061		0.069		0.079		13.163		175.215		7.989	||	44.418		223.309		0.000		0.000		209.665		7.998		9.024		0.000		0.000		0.000
L04	0.389		2.783		0.000		0.000		2.668		0.066		0.077		12.988		175.777		7.972	||	44.452		223.904		0.000		0.000		210.