In [9]:
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.benchmarks.datasets import add_dataset_parser, get_samples
from vllm.v1.metrics.reader import Counter, Vector

In [18]:
from huggingface_hub import notebook_login, HfApi

# Show the interactive Hugging Face login widget so the access token is entered
# at runtime instead of being hardcoded in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
# Sanity-check the login by asking the Hub which account the current token
# belongs to. Any failure (including a missing 'name' field) is reported with
# print rather than raised, so the cell itself never errors out.
try:
    api = HfApi()
    user_info = api.whoami()
    print(f"Token validated successfully! Logged in as: {user_info['name']}")
except Exception as e:
    print(f"Token validation failed. Error: {e}")

Token validated successfully! Logged in as: Chamath


In [49]:
# Notebook-wide settings. main() reads these as module-level globals, so this
# cell must be executed before main() is called.
MODEL_DIR = "meta-llama/Llama-3.1-8B-Instruct"  # The model to use
output_len = 256                               # Maximum number of tokens to generate
print_output = True                            # Whether to print the generated text

# Speculative Decoding Configuration
method = "eagle"                               # Choices: "ngram", "eagle", "eagle3"
num_spec_tokens = 2                            # Number of speculative tokens
# When EAGLE_DIR is None, main() substitutes a default draft model matching
# the chosen method ("eagle" or "eagle3").
EAGLE_DIR = None                               # Speculative model directory (leave None for default)
# Ngram-specific (only used if method="ngram")
# Forwarded unchanged as the "prompt_lookup_max"/"prompt_lookup_min" entries of
# the ngram speculative config.
prompt_lookup_max = 5
prompt_lookup_min = 2

# vLLM/Hardware Configuration
tp = 1                                         # Tensor parallelism size
enforce_eager = False                          # Enforce eager execution
enable_chunked_prefill = False                 # Enable chunked prefill

# Sampling Parameters
temp = 0.6                                     # Temperature (0 for deterministic)
top_p = 1.0  # Top-p (nucleus) sampling threshold
top_k = -1  # Top-k sampling cutoff (-1 presumably means "disabled" -- confirm against SamplingParams)

In [46]:
class Sample:
    """Minimal stand-in for a dataset sample.

    Wraps a piece of prompt text and exposes it through the ``prompt``
    attribute; nothing else is provided by this mock.
    """

    def __init__(self, prompt_text: str) -> None:
        # Store the text as-is; no validation or copying is performed.
        self.prompt = prompt_text
        
def get_samples(args, tokenizer):
    """Return a small, fixed list of prompts wrapped in ``Sample`` objects.

    This is a notebook-local mock of a dataset loader: both ``args`` and
    ``tokenizer`` are accepted only to keep the call signature and are
    never read. A notice is printed each time the function is called.

    Returns:
        list: one ``Sample`` per hard-coded prompt, in definition order.
    """
    print("Using manually defined prompts for notebook execution.")
    samples = []
    for text in (
        "Explain the concept of quantum entanglement in simple terms.",
        "Write a short, dramatic opening for a novel about a time traveler.",
        "What are the three main types of rocks and how are they formed?",
    ):
        samples.append(Sample(text))
    return samples

def add_dataset_parser(parser):
    """No-op placeholder for dataset command-line argument registration.

    ``parser`` is accepted for signature compatibility and left untouched.
    """
    return None

In [50]:
def main():
    """Run one speculative-decoding generation pass with vLLM and print stats.

    All settings are read from the notebook-level configuration variables
    (``MODEL_DIR``, ``EAGLE_DIR``, ``method``, ``num_spec_tokens``,
    ``prompt_lookup_max``, ``prompt_lookup_min``, ``tp``, ``enforce_eager``,
    ``enable_chunked_prefill``, ``temp``, ``top_p``, ``top_k``,
    ``output_len``, ``print_output``), so the configuration cell must have
    been executed first.

    Steps:
        1. Tokenize the prompts returned by ``get_samples``.
        2. Build the speculative config for the selected ``method``.
        3. Generate completions and, if ``print_output`` is set, print them.
        4. Sum the ``vllm:spec_decode_*`` metrics and print the totals, the
           mean acceptance length, and the per-position acceptance rates.

    Returns:
        None. Every result is written to stdout.

    Raises:
        ValueError: if ``method`` is not "eagle", "eagle3" or "ngram".
    """
    model_dir = MODEL_DIR
    eagle_dir = EAGLE_DIR
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    # This notebook has no argparse namespace. The get_samples mock defined in
    # the notebook ignores its first argument, so None is passed as a
    # placeholder. A real dataset loader would need a populated namespace here.
    prompts = get_samples(None, tokenizer)
    # add_special_tokens is False to avoid adding bos twice when using chat templates
    # NOTE(review): the mock prompts are plain strings and no chat template is
    # applied in this function -- confirm that encoding without special tokens
    # is what is wanted for them.
    prompt_ids = [
        tokenizer.encode(prompt.prompt, add_special_tokens=False) for prompt in prompts
    ]

    if method in ("eagle", "eagle3"):
        # Draft models used when EAGLE_DIR is left as None.
        default_draft_models = {
            "eagle": "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
            "eagle3": "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
        }
        if eagle_dir is None:
            eagle_dir = default_draft_models[method]
        speculative_config = {
            "method": method,
            "model": eagle_dir,
            "num_speculative_tokens": num_spec_tokens,
        }
    elif method == "ngram":
        speculative_config = {
            "method": "ngram",
            "num_speculative_tokens": num_spec_tokens,
            "prompt_lookup_max": prompt_lookup_max,
            "prompt_lookup_min": prompt_lookup_min,
        }
    else:
        raise ValueError(f"unknown method: {method}")

    llm = LLM(
        model=model_dir,
        trust_remote_code=True,
        tensor_parallel_size=tp,
        enable_chunked_prefill=enable_chunked_prefill,
        enforce_eager=enforce_eager,
        gpu_memory_utilization=0.8,
        speculative_config=speculative_config,
        # Stats logging must stay enabled: the metrics read below depend on it.
        disable_log_stats=False,
    )

    # Honour every sampling knob from the configuration cell, not only the
    # temperature; with the shipped values (top_p=1.0, top_k=-1) neither
    # filter restricts sampling.
    sampling_params = SamplingParams(
        temperature=temp,
        top_p=top_p,
        top_k=top_k,
        max_tokens=output_len,
    )
    # NOTE(review): the `prompt_token_ids=` keyword is a legacy calling
    # convention -- confirm it is still accepted by the installed vLLM version.
    outputs = llm.generate(prompt_token_ids=prompt_ids, sampling_params=sampling_params)

    # print the generated text
    if print_output:
        # Outputs are paired with the prompts by position (presumably returned
        # in input order -- confirm). Because only token IDs are submitted,
        # output.prompt may be empty; fall back to the source text then.
        for sample, output in zip(prompts, outputs):
            shown_prompt = output.prompt if output.prompt is not None else sample.prompt
            print("-" * 50)
            print(f"prompt: {shown_prompt}")
            print(f"generated text: {output.outputs[0].text}")
            print("-" * 50)

    try:
        metrics = llm.get_metrics()
    except AssertionError:
        print("Metrics are not supported in the V0 engine.")
        return

    total_num_output_tokens = sum(
        len(output.outputs[0].token_ids) for output in outputs
    )
    num_drafts = 0
    num_draft_tokens = 0
    num_accepted_tokens = 0
    # One accumulator slot per speculative position.
    acceptance_counts = [0] * num_spec_tokens
    for metric in metrics:
        if metric.name == "vllm:spec_decode_num_drafts":
            assert isinstance(metric, Counter)
            num_drafts += metric.value
        elif metric.name == "vllm:spec_decode_num_draft_tokens":
            assert isinstance(metric, Counter)
            num_draft_tokens += metric.value
        elif metric.name == "vllm:spec_decode_num_accepted_tokens":
            assert isinstance(metric, Counter)
            num_accepted_tokens += metric.value
        elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
            assert isinstance(metric, Vector)
            for pos, accepted_at_pos in enumerate(metric.values):
                acceptance_counts[pos] += accepted_at_pos

    print("-" * 50)
    print(f"total_num_output_tokens: {total_num_output_tokens}")
    print(f"num_drafts: {num_drafts}")
    print(f"num_draft_tokens: {num_draft_tokens}")
    print(f"num_accepted_tokens: {num_accepted_tokens}")
    # Computed as 1 + accepted/drafts; report 1 when no drafts were recorded
    # to avoid dividing by zero.
    if num_drafts > 0:
        acceptance_length = 1 + (num_accepted_tokens / num_drafts)
    else:
        acceptance_length = 1
    print(f"mean acceptance length: {acceptance_length:.2f}")
    print("-" * 50)

    # print acceptance at each token position
    for i, accepted_count in enumerate(acceptance_counts):
        acceptance_rate = accepted_count / num_drafts if num_drafts > 0 else 0
        print(f"acceptance at token {i}: {acceptance_rate:.2f}")


In [None]:
# Run the benchmark: main() loads the model, generates completions and prints
# the speculative-decoding statistics. Requires the configuration and mock
# dataset cells to have been executed first.
main()