In [3]:
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

from enum import Enum
from pydantic import BaseModel

In [4]:
MAX_TOKENS = 50

#### Guided decoding by Choice (list of possible options)

In [5]:
guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice)

prompt_choice = "Classify this sentiment: vLLM is wonderful!"

#### Guided decoding by Regex

In [6]:
guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
sampling_params_regex = SamplingParams(
    guided_decoding=guided_decoding_params_regex,
    stop=["\n"],
    max_tokens=MAX_TOKENS,
)

prompt_regex = (
    "Generate an email address for Alan Turing, who works in Enigma."
    "End in .com and new line. Example result:"
    "alan.turing@enigma.com\n"
)

#### Guided decoding by JSON using Pydantic schema

In [7]:
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"

class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType

json_schema = CarDescription.model_json_schema()

guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
sampling_params_json = SamplingParams(
    guided_decoding=guided_decoding_params_json,
    max_tokens=MAX_TOKENS,
)

prompt_json = (
    "Generate a JSON with the brand, model and car_type of"
    "the most iconic car from the 90's"
)

#### Guided decoding by Grammar

In [8]:
simplified_sql_grammar = """
root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition
column ::= "col_1 " | "col_2 "
table ::= "table_1 " | "table_2 "
condition ::= column "= " number
number ::= "1 " | "2 "
"""

guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar)
sampling_params_grammar = SamplingParams(
    guided_decoding=guided_decoding_params_grammar,
    max_tokens=MAX_TOKENS,
)

prompt_grammar = (
    "Generate an SQL query to show the 'username' and 'email'from the 'users' table."
)

### Generation

In [9]:
def format_output(title: str, output: str):
    print(f"{'-' * 50}\n{title}: {output}\n{'-' * 50}")

def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM):
    outputs = llm.generate(prompt, sampling_params=sampling_params)
    return outputs[0].outputs[0].text

In [10]:
def main():
    llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)

    choice_output = generate_output(prompt_choice, sampling_params_choice, llm)
    format_output("Guided decoding by Choice", choice_output)

    regex_output = generate_output(prompt_regex, sampling_params_regex, llm)
    format_output("Guided decoding by Regex", regex_output)

    json_output = generate_output(prompt_json, sampling_params_json, llm)
    format_output("Guided decoding by JSON", json_output)

    grammar_output = generate_output(prompt_grammar, sampling_params_grammar, llm)
    format_output("Guided decoding by Grammar", grammar_output)

In [11]:
main()

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

INFO 09-05 21:13:52 [config.py:853] This model supports multiple tasks: {'embed', 'score', 'generate', 'reward', 'classify'}. Defaulting to 'generate'.


tokenizer_config.json: 0.00B [00:00, ?B/s]

INFO 09-05 21:13:52 [config.py:1467] Using max model len 100
INFO 09-05 21:13:59 [config.py:2267] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 09-05 21:13:59 [config.py:4566] full_cuda_graph is not supported with cascade attention. Disabling cascade attention.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

INFO 09-05 21:14:04 [__init__.py:244] Automatically detected platform rocm.
INFO 09-05 21:14:13 [core.py:459] Waiting for init message from front-end.
INFO 09-05 21:14:13 [core.py:69] Initializing a V1 LLM engine (v0.9.2.dev364+gb432b7a28) with config: model='Qwen/Qwen2.5-3B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=100, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoi

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:01<00:01,  1.38s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02<00:00,  1.08s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02<00:00,  1.13s/it]



INFO 09-05 21:14:22 [default_loader.py:272] Loading weights took 2.31 seconds
INFO 09-05 21:14:22 [gpu_model_runner.py:1782] Model loading took 5.9004 GiB and 8.246731 seconds
INFO 09-05 21:14:27 [backends.py:509] Using cache directory: /root/.cache/vllm/torch_compile_cache/fd65d2905e/rank_0_0/backbone for vLLM's torch.compile
INFO 09-05 21:14:27 [backends.py:520] Dynamo bytecode transform time: 5.32 s
INFO 09-05 21:14:47 [backends.py:181] Cache the graph of shape None for later use
INFO 09-05 21:14:47 [backends.py:193] Compiling a graph for general shape takes 17.25 s
INFO 09-05 21:14:51 [monitor.py:34] torch.compile takes 22.58 s in total
INFO 09-05 21:15:05 [gpu_worker.py:232] Available KV cache memory: 160.23 GiB
INFO 09-05 21:15:05 [kv_cache_utils.py:716] GPU KV cache size: 4,666,992 tokens
INFO 09-05 21:15:05 [kv_cache_utils.py:720] Maximum concurrency for 100 tokens per request: 41669.57x
INFO 09-05 21:15:05 [rocm.py:224] Using Triton Attention backend on V1 engine.


Capturing CUDA graphs: 100%|██████████| 67/67 [00:15<00:00,  4.29it/s]


INFO 09-05 21:15:20 [gpu_model_runner.py:2306] Graph capturing finished in 16 secs, took 0.27 GiB
INFO 09-05 21:15:21 [core.py:172] init engine (profile, create kv cache, warmup model) took 58.61 seconds


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0% 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

--------------------------------------------------
Guided decoding by Choice: Positive
--------------------------------------------------


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0% 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

--------------------------------------------------
Guided decoding by Regex: alan_turing@endenigma.com
--------------------------------------------------


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0% 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

--------------------------------------------------
Guided decoding by JSON: {"brand": "Lamborghini", "model": "Diablo", "car_type": "Coupe"}
--------------------------------------------------


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0% 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

--------------------------------------------------
Guided decoding by Grammar: SELECT col_1  from table_1  where col_1 = 1 
--------------------------------------------------


