## qwen MOE trace and analysis module

In [None]:
# # import datasets
# from datasets import load_dataset
# data_id = "wikimedia/wikipedia"
# sub_set_id = "20231101.zh-classical"
# split = "train"
# dataset = load_dataset(data_id, sub_set_id, split=split)

In [1]:
from qwen_v1 import Qwen3MoeDecoderLayerTimed as v1Timed
from qwen_v2 import Qwen3MoeDecoderLayerTimed as v2Timed

version = 'v2'

def modify_qwen3_moe_block(type__: str):
    from transformers.models.qwen3_moe import modeling_qwen3_moe as qmoe
    if type__ == 'v1':
        qmoe.Qwen3MoeDecoderLayer = v1Timed
    elif type__ == 'v2':
        qmoe.Qwen3MoeDecoderLayer = v2Timed
    else:
        pass


modify_qwen3_moe_block("1")

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig


# model_name = "Qwen/Qwen3-30B-A3B-Instruct-2507"
model_name = "Qwen/Qwen3-30B-A3B"

# load the tokenizer and the model
cfg = AutoConfig.from_pretrained(model_name)
if version == 'v2':
    from common import init_timer_registry
    init_timer_registry(
        num_layers=cfg.num_hidden_layers, keep_history=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype="auto",
    device_map="cuda:0"
)
model.eval()

Loading checkpoint shards:   0%|          | 0/16 [00:00<?, ?it/s]

Qwen3MoeForCausalLM(
  (model): Qwen3MoeModel(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-47): 48 x Qwen3MoeDecoderLayer(
        (self_attn): Qwen3MoeAttention(
          (q_proj): Linear(in_features=2048, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2048, bias=False)
          (q_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MoeSparseMoeBlock(
          (gate): Linear(in_features=2048, out_features=128, bias=False)
          (experts): ModuleList(
            (0-127): 128 x Qwen3MoeMLP(
              (gate_proj): Linear(in_features=2048, out_features=768, bias=False)
              (up_proj): Linear(in_features=2048, out_features=768, bias=False)
              (down_proj): Linear

In [5]:
text_list = ["explain the qwen"]
tokenizer.padding_side = "left"
input_001 = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True).to(model.device)

input_001


{'input_ids': tensor([[94344,   279,  2804, 16948]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1]], device='cuda:0')}

In [3]:
from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments
args = PyTorchBenchmarkArguments(models=[model_name], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
benchmark = PyTorchBenchmark(args)
benchmark.run()

ImportError: cannot import name 'PyTorchBenchmark' from 'transformers' (/home/lkx/data/qwen_moe/.venv/lib/python3.13/site-packages/transformers/__init__.py)

## for v1 to test

In [None]:
from torch.profiler import profile, ProfilerActivity
import torch
import qwen_v1
# warm up
with torch.no_grad():
    model_output = model(**input_001, use_cache=True)
qwen_v1.reset_timers()
next_token = torch.argmax(model_output.logits[:, -1, :], dim=-1, keepdim=True)
past_kv = model_output.past_key_values

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=False, with_stack=True, profile_memory=False
) as prof:
    with torch.no_grad():
        _ = model(next_token, past_key_values=past_kv, use_cache=True)
qwen_v1.show_res()
prof.export_chrome_trace("trace/trace_qwen3_moe_v1.json")

## for v2 to test

In [4]:
import common
common._TREG

TimerRegistry(num_layers=48, keep_history=True, timers={})

In [6]:
import common
import torch
# warm up

common.warmup_model(model, tokenizer, text_list, 10)

init_timer_registry(model.config.num_hidden_layers, keep_history=True)
with torch.no_grad():
    _ = model(**input_001)  # prefill
    _ = model.generate(**input_001, max_new_tokens=128)  # decode

torch.cuda.synchronize()

# 打印结果
common.print_timers_summary()

=== Per-layer (ms) ===
layer	attn(PF)	mlp(PF)	gating(PF)	softmax(PF)	expert(PF)	||	attn(DEC)	mlp(DEC)	gating(DEC)	softmax(DEC)	expert(DEC)
L00	0.886		8.584		0.087		0.138		8.015		||	56.022		168.423		5.368		8.552		142.410
L01	0.814		6.696		0.082		0.135		6.293		||	47.158		163.952		4.880		8.239		139.683
L02	0.786		7.320		0.081		0.134		6.918		||	46.961		163.946		4.800		8.293		139.797
L03	0.796		7.339		0.081		0.136		6.930		||	47.408		163.375		4.827		8.249		139.285
L04	0.781		7.328		0.083		0.134		6.928		||	46.920		163.220		4.754		8.260		139.233
L05	0.776		6.855		0.079		0.134		6.458		||	46.717		162.910		4.803		8.252		138.886
L06	0.776		7.071		0.081		0.131		6.678		||	46.775		162.331		4.723		8.241		138.373
L07	0.781		6.807		0.090		0.144		6.391		||	46.746		162.504		4.777		8.234		138.524
L08	0.781		7.340		0.078		0.141		6.933		||	46.668		162.604		4.720		8.224		138.677
L09	0.837		6.519		0.083		0.143		6.099		||	46.669		163.626		4.753		8.201		139.692
L10	0.773		7.579		0.085		0.140		7.168		||	47.033		1