## qwen MOE trace and analysis module

In [None]:
# # import datasets
# from datasets import load_dataset
# data_id = "wikimedia/wikipedia"
# sub_set_id = "20231101.zh-classical"
# split = "train"
# dataset = load_dataset(data_id, sub_set_id, split=split)

In [1]:
from qwen_v1 import Qwen3MoeDecoderLayerTimed as v1Timed
from qwen_v2 import Qwen3MoeDecoderLayerTimed as v2Timed

version = 'v2'

def modify_qwen3_moe_block(type__: str):
    from transformers.models.qwen3_moe import modeling_qwen3_moe as qmoe
    if type__ == 'v1':
        qmoe.Qwen3MoeDecoderLayer = v1Timed
    elif type__ == 'v2':
        qmoe.Qwen3MoeDecoderLayer = v2Timed
    else:
        pass


modify_qwen3_moe_block("v2")

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig


# model_name = "Qwen/Qwen3-30B-A3B-Instruct-2507"
model_name = "Qwen/Qwen3-30B-A3B"

# load the tokenizer and the model
cfg = AutoConfig.from_pretrained(model_name)
if version == 'v2':
    from common import init_timer_registry
    init_timer_registry(
        num_layers=cfg.num_hidden_layers, keep_history=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype="auto",
    device_map="cuda:0"
)
model.eval()

Loading checkpoint shards:   0%|          | 0/16 [00:00<?, ?it/s]

Qwen3MoeForCausalLM(
  (model): Qwen3MoeModel(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-47): 48 x Qwen3MoeDecoderLayerTimed(
        (self_attn): Qwen3MoeAttention(
          (q_proj): Linear(in_features=2048, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2048, bias=False)
          (q_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MoeSparseMoeBlockV2(
          (gate): Linear(in_features=2048, out_features=128, bias=False)
          (experts): ModuleList(
            (0-127): 128 x Qwen3MoeMLP(
              (gate_proj): Linear(in_features=2048, out_features=768, bias=False)
              (up_proj): Linear(in_features=2048, out_features=768, bias=False)
              (down_proj):

In [3]:
text_list = ["explain the qwen"]
tokenizer.padding_side = "left"
input_001 = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True).to(model.device)

input_001


{'input_ids': tensor([[94344,   279,  2804, 16948]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1]], device='cuda:0')}

In [3]:
from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments
args = PyTorchBenchmarkArguments(models=[model_name], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
benchmark = PyTorchBenchmark(args)
benchmark.run()

ImportError: cannot import name 'PyTorchBenchmark' from 'transformers' (/home/lkx/data/qwen_moe/.venv/lib/python3.13/site-packages/transformers/__init__.py)

## for v1 to test

In [None]:
from torch.profiler import profile, ProfilerActivity
import torch
import qwen_v1
# warm up
with torch.no_grad():
    model_output = model(**input_001, use_cache=True)
qwen_v1.reset_timers()
next_token = torch.argmax(model_output.logits[:, -1, :], dim=-1, keepdim=True)
past_kv = model_output.past_key_values

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=False, with_stack=True, profile_memory=False
) as prof:
    with torch.no_grad():
        _ = model(next_token, past_key_values=past_kv, use_cache=True)
qwen_v1.show_res()
prof.export_chrome_trace("trace/trace_qwen3_moe_v1.json")

## for v2 to test

In [4]:
import common
common._TREG

TimerRegistry(num_layers=48, keep_history=True, timers={})

In [7]:
import common
import torch
# warm up

common.warmup_model(model, tokenizer, text_list, 10)

init_timer_registry(model.config.num_hidden_layers, keep_history=True)
with torch.no_grad():
    _ = model.generate(**input_001, max_new_tokens=128)  # decode

torch.cuda.synchronize()

# 打印结果
common.print_timers_summary()

=== Per-layer (ms) ===
layer	attn(PF)	mlp(PF)	gating(PF)	softmax(PF)	expert(PF)	norm(PF)	||	attn(DEC)	mlp(DEC)	gating(DEC)	softmax(DEC)	expert(DEC)	norm(DEC)
L00	0.787		6.956		0.073		0.126		6.378		0.112		||	51.640		169.326		5.388		8.234		144.154		8.048
L01	0.682		5.353		0.063		0.118		5.017		0.108		||	46.690		165.724		5.019		8.048		141.543		7.813
L02	0.673		5.904		0.065		0.118		5.570		0.118		||	46.662		164.684		4.871		8.065		141.029		7.757
L03	0.699		6.057		0.066		0.118		5.729		0.116		||	46.116		163.993		4.832		7.933		140.575		7.679
L04	0.702		5.676		0.065		0.118		5.347		0.105		||	45.916		164.344		4.809		8.003		140.856		7.681
L05	0.710		5.500		0.062		0.124		5.170		0.109		||	46.420		163.145		4.944		7.958		139.608		7.704
L06	0.681		5.712		0.077		0.116		5.374		0.105		||	46.766		163.645		4.796		7.976		140.163		7.719
L07	0.667		5.398		0.069		0.116		5.065		0.124		||	46.010		162.725		4.778		7.965		139.331		7.671
L08	0.659		5.875		0.061		0.117		5.554		0.102		||	46.039		164.119		4.803		7.948		14