In [None]:
#Loading the Model
from transformers import AutoProcessor, Llama4ForConditionalGeneration
import torch
from transformers import AutoConfig

# Single source of truth for the checkpoint so the config, processor and
# weights can never point at different repositories.
MODEL_ID = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

# Load the checkpoint's config with router outputs requested.
# NOTE(review): Llama 4 configs appear to nest text-model settings under
# `text_config`; confirm that this top-level flag actually reaches the MoE
# text model — TODO verify.
config = AutoConfig.from_pretrained(
    MODEL_ID,
    output_router_logits=True,         # enable router-softmax output
)

# The processor supplies the chat template / tokenization used for prompts.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Load the weights in bfloat16 with SDPA attention; device placement is
# delegated to `device_map="auto"`.
model = Llama4ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    config=config,
    attn_implementation="sdpa",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

In [1]:
# Forward hooks that record each router module's output, keyed by module name
# Captured outputs: module name -> list of CPU tensors, one entry per forward call.
router_probs = {}


def make_hook(layer_name):
    """Build a forward hook that stores a module's output under ``layer_name``.

    The returned callable has the ``(module, inputs, output)`` signature used
    by forward hooks. On each call it appends a detached CPU copy of the
    output to ``router_probs[layer_name]``.

    Args:
        layer_name: Key under which captured outputs are accumulated.

    Returns:
        The hook function. Outputs that are neither a tensor nor an object
        with a ``router_probs`` attribute are reported via ``print`` and
        not stored.
    """
    def hook_fn(module, inputs, output):
        # Length hint taken from dim 1 of the first positional input. Guard
        # against an empty inputs tuple and against tensors with fewer than
        # two dims, both of which would otherwise raise IndexError.
        first = inputs[0] if inputs else None
        if isinstance(first, torch.Tensor) and first.dim() > 1:
            seq_len = first.shape[1]
        else:
            seq_len = 0

        if isinstance(output, torch.Tensor):
            # NOTE(review): the hint comes from input dim 1 but trims output
            # dim 0; confirm these axes really correspond for the hooked
            # modules. The trimming itself is unchanged from before.
            if output.dim() > 0 and 0 < seq_len < output.shape[0]:
                out = output[:seq_len]
            else:
                out = output
            out = out.detach().cpu()
        elif hasattr(output, 'router_probs'):
            # Container-style outputs: keep only the router_probs tensor.
            out = output.router_probs.detach().cpu()
        else:
            print(f"[{layer_name}] Unknown output type: {type(output)}")
            return

        router_probs.setdefault(layer_name, []).append(out)

    return hook_fn

# Requires `model` from the model-loading cell; run that cell first.
# Remove hooks left over from an earlier run of this cell so each router
# module is hooked exactly once. Without this, re-running the cell stacks
# duplicate hooks and every forward pass is recorded multiple times.
for _stale_handle in globals().get("router_hook_handles", []):
    _stale_handle.remove()
router_hook_handles = []

# Attach a logging hook to every submodule whose qualified name mentions
# "router" (case-insensitive) and print the names that matched.
for name, module in model.named_modules():
    if "router" in name.lower():
        router_hook_handles.append(module.register_forward_hook(make_hook(name)))
        print(name)

NameError: name 'model' is not defined

In [2]:
# Single timed inference pass: prompt processing, generation, decoding
import time
# Requires `processor` and `model` from the model-loading cell.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "翻译成英文：你好，我叫巴拉特" },
        ]
    },
]

# Template applied without `return_tensors`. Not used below; kept (outside
# the timed region) in case later cells read `tokenized`.
tokenized = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
)

# perf_counter is monotonic, so intervals cannot be skewed by wall-clock
# adjustments the way time.time() differences can.
start_total = time.perf_counter()

# Time the processing part (chat template -> tensors -> model device)
start_process = time.perf_counter()
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
process_time = time.perf_counter() - start_process
num_input_tokens = inputs["input_ids"].shape[1]
print(f"Number of input tokens: {num_input_tokens}")

# Time the generation part
start_generate = time.perf_counter()
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    return_dict_in_generate=True,
)
generate_time = time.perf_counter() - start_generate

# Time the decoding part; drop the prompt tokens so only new tokens are decoded
start_decode = time.perf_counter()
generated_tokens = outputs.sequences[0, num_input_tokens:]
decoded_text = processor.decode(generated_tokens, skip_special_tokens=True)
decode_time = time.perf_counter() - start_decode

# Calculate total time
total_time = time.perf_counter() - start_total

# Print results
print(f"Number of output tokens: {len(generated_tokens)}")
print(f"Processing time: {process_time:.4f} seconds")
print(f"Generation time: {generate_time:.4f} seconds")
print(f"Decoding time: {decode_time:.4f} seconds")
print(f"Total time: {total_time:.4f} seconds")
print("\nGenerated text:")
print(decoded_text)

NameError: name 'processor' is not defined