In [1]:
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm import LLM, SamplingParams

# Sample prompts: short, open-ended sentence prefixes that main() hands to
# the model in a single generate() call.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object. It is shared by every prompt above and is
# passed to generate() in main() together with the prompt list.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)


def main():
    """Generate a completion for each sample prompt and print the results.

    Builds an LLM for the ``facebook/opt-125m`` model, runs the module-level
    ``prompts`` through it with the module-level ``sampling_params``, and
    writes every prompt/completion pair to stdout, separated by a rule.
    """
    engine = LLM(model="facebook/opt-125m")

    # generate() yields one result object per prompt; each carries the
    # original prompt plus a list of completions (only the first is shown).
    results = engine.generate(prompts, sampling_params)

    divider = "-" * 60
    print("\nGenerated Outputs:\n" + divider)
    for result in results:
        prompt_text, completion = result.prompt, result.outputs[0].text
        print(f"Prompt:    {prompt_text!r}")
        print(f"Output:    {completion!r}")
        print(divider)


# Run the example only when executed as a script (or notebook cell), not
# when this module is imported.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


INFO 06-16 23:08:41 [__init__.py:244] Automatically detected platform cuda.


2025-06-16 23:08:45,176	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 06-16 23:08:56 [config.py:823] This model supports multiple tasks: {'embed', 'classify', 'reward', 'generate', 'score'}. Defaulting to 'generate'.
INFO 06-16 23:08:56 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 06-16 23:08:58 [core.py:455] Waiting for init message from front-end.
INFO 06-16 23:08:58 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='facebook/opt-125m', speculative_config=None, tokenizer='facebook/opt-125m', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additio

Loading pt checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading pt checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.56it/s]
Loading pt checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.55it/s]



INFO 06-16 23:09:05 [default_loader.py:272] Loading weights took 0.43 seconds
INFO 06-16 23:09:06 [gpu_model_runner.py:1624] Model loading took 0.2389 GiB and 3.693791 seconds
INFO 06-16 23:09:10 [backends.py:462] Using cache directory: /home/j44zhu/.cache/vllm/torch_compile_cache/de10bd737c/rank_0_0 for vLLM's torch.compile
INFO 06-16 23:09:10 [backends.py:472] Dynamo bytecode transform time: 3.72 s
INFO 06-16 23:09:12 [backends.py:161] Cache the graph of shape None for later use
INFO 06-16 23:09:18 [backends.py:173] Compiling a graph for general shape takes 8.16 s
INFO 06-16 23:09:20 [monitor.py:34] torch.compile takes 11.88 s in total
INFO 06-16 23:09:21 [gpu_worker.py:227] Available KV cache memory: 70.44 GiB
INFO 06-16 23:09:22 [kv_cache_utils.py:715] GPU KV cache size: 2,051,664 tokens
INFO 06-16 23:09:22 [kv_cache_utils.py:719] Maximum concurrency for 2,048 tokens per request: 1001.79x
INFO 06-16 23:09:40 [gpu_model_runner.py:2048] Graph capturing finished in 18 secs, took 0.53 GiB

Adding requests: 100%|██████████| 4/4 [00:00<00:00, 580.73it/s]
Processed prompts: 100%|██████████| 4/4 [00:00<00:00, 34.73it/s, est. speed input: 225.94 toks/s, output: 556.11 toks/s]



Generated Outputs:
------------------------------------------------------------
Prompt:    'Hello, my name is'
Output:    ' Joel, my dad is my friend and we are in a relationship. I am'
------------------------------------------------------------
Prompt:    'The president of the United States is'
Output:    ' reportedly being investigated by the FBI over the allegations that he broke campaign finance laws in'
------------------------------------------------------------
Prompt:    'The capital of France is'
Output:    ' all set to open a new gateway to the most notorious terror camps in Europe,'
------------------------------------------------------------
Prompt:    'The future of AI is'
Output:    ' in their hands. We cannot prevent the future.\nAgreed.  '
------------------------------------------------------------


In [None]:
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser


def create_parser():
    """Build the command-line parser for this example.

    The parser carries every engine argument registered by
    ``EngineArgs.add_cli_args`` (with ``model`` defaulting to
    ``meta-llama/Llama-3.2-1B-Instruct``) plus a "Sampling parameters"
    group with ``--max-tokens``, ``--temperature``, ``--top-p`` and
    ``--top-k``. The sampling flags have no explicit default.

    Returns:
        The configured ``FlexibleArgumentParser``.
    """
    cli = FlexibleArgumentParser()

    # Engine options first, then override the default model.
    EngineArgs.add_cli_args(cli)
    cli.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")

    # Sampling options, declared as (flag, value type) pairs.
    sampling_flags = (
        ("--max-tokens", int),
        ("--temperature", float),
        ("--top-p", float),
        ("--top-k", int),
    )
    group = cli.add_argument_group("Sampling parameters")
    for flag, value_type in sampling_flags:
        group.add_argument(flag, type=value_type)

    return cli


def main(args: dict) -> None:
    """Run offline generation using parsed command-line arguments.

    Args:
        args: Mapping of option name to value, as produced by
            ``vars(parser.parse_args())``. The keys ``max_tokens``,
            ``temperature``, ``top_p`` and ``top_k`` are treated as sampling
            overrides; every other entry is forwarded to ``LLM(...)`` as a
            keyword argument. A sampling key that is missing or ``None``
            leaves the corresponding default untouched.

    The caller's dictionary is not modified.
    """
    # Work on a shallow copy so popping the sampling options does not
    # mutate the dict owned by the caller.
    engine_kwargs = dict(args)

    # Pop arguments not used by LLM. A missing key is treated like an unset
    # flag (None) instead of raising KeyError.
    sampling_overrides = {
        name: engine_kwargs.pop(name, None)
        for name in ("max_tokens", "temperature", "top_p", "top_k")
    }

    # Create an LLM
    llm = LLM(**engine_kwargs)

    # Create a sampling params object: start from the defaults reported by
    # the LLM and apply only the values that were explicitly provided.
    sampling_params = llm.get_default_sampling_params()
    for name, value in sampling_overrides.items():
        if value is not None:
            setattr(sampling_params, name, value)

    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    separator = "-" * 50
    print(separator)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print(separator)


if __name__ == "__main__":
    # Parse the command line into a plain dict so main() can separate the
    # sampling options from the keyword arguments it forwards to LLM.
    parser = create_parser()
    args: dict = vars(parser.parse_args())
    main(args)