In [1]:
from vllm import LLM, SamplingParams

In [2]:
# Model identifier handed to vLLM.
model_name = "meta-llama/Llama-3.1-8B"

# Maximum sequence length (in tokens) requested from the engine. Kept as a named constant so
# it can be tuned in one place; in the logged run below the engine reports max_seq_len=22000
# and 3.86GiB of GPU memory left over for the KV cache.
MAX_MODEL_LEN = 22000

llm = LLM(model=model_name, max_model_len=MAX_MODEL_LEN)

INFO 12-31 18:37:03 config.py:510] This model supports multiple tasks: {'reward', 'classify', 'score', 'generate', 'embed'}. Defaulting to 'generate'.
INFO 12-31 18:37:03 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='meta-llama/Llama-3.1-8B', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=22000, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Llam

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 12-31 18:37:07 model_runner.py:1099] Loading model weights took 14.9888 GB
INFO 12-31 18:37:09 worker.py:241] Memory profiling takes 2.28 seconds
INFO 12-31 18:37:09 worker.py:241] the current vLLM instance can use total_gpu_memory (23.51GiB) x gpu_memory_utilization (0.90) = 21.16GiB
INFO 12-31 18:37:09 worker.py:241] model weights take 14.99GiB; non_torch_memory takes 0.04GiB; PyTorch activation peak memory takes 2.28GiB; the rest of the memory reserved for KV Cache is 3.86GiB.
INFO 12-31 18:37:09 gpu_executor.py:76] # GPU blocks: 1975, # CPU blocks: 2048
INFO 12-31 18:37:09 gpu_executor.py:80] Maximum concurrency for 22000 tokens per request: 1.44x
INFO 12-31 18:37:10 model_runner.py:1415] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utiliza

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:08<00:00,  4.25it/s]

INFO 12-31 18:37:18 model_runner.py:1535] Graph capturing finished in 8 secs, took 0.25 GiB
INFO 12-31 18:37:18 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 11.46 seconds





In [3]:
# Question used as the single user turn.
prompt = "What is the capital of France?"

# Chat-style message list: one user message whose content is a list of typed parts.
messages = [
    dict(
        role="user",
        content=[dict(type="text", text=prompt)],
    ),
]

In [4]:
# Echo the message list to confirm its structure.
messages

[{'role': 'user',
  'content': [{'type': 'text', 'text': 'What is the capital of France?'}]}]

I ran into the following error:
```
ValueError: As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one.
```

and resolved it by installing the latest `transformers` from source:

```
python3 -m pip install git+https://github.com/huggingface/transformers
```

In [11]:
# Greedy decoding (temperature=0). max_tokens is spelled out instead of being left to the
# library default so the completion length is visible in the notebook; 16 matches the
# 16-token, finish_reason=length completions shown in the outputs below.
sampling_params = SamplingParams(temperature=0, max_tokens=16)

In [12]:
# Plain-text prompts for the model to continue.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Generate for the whole list in a single call, reusing the shared sampling settings.
outputs = llm.generate(prompts, sampling_params=sampling_params)

Processed prompts: 100%|██████████| 4/4 [00:00<00:00, 11.17it/s, est. speed input: 72.59 toks/s, output: 178.69 toks/s]


In [13]:
# Inspect the raw RequestOutput objects returned by llm.generate.
outputs

[RequestOutput(request_id=4, prompt='Hello, my name is', prompt_token_ids=[128000, 9906, 11, 856, 836, 374], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text=" and I'm writing you today to learn more about the 2019 Ford F", token_ids=(323, 358, 2846, 4477, 499, 3432, 311, 4048, 810, 922, 279, 220, 679, 24, 14337, 435), cumulative_logprob=None, logprobs=None, finish_reason=length, stop_reason=None)], finished=True, metrics=RequestMetrics(arrival_time=1735666987.8465743, last_token_time=1735666987.8465743, first_scheduled_time=1735666987.8487513, first_token_time=1735666987.8968427, time_in_queue=0.002177000045776367, finished_time=1735666988.1878974, scheduler_time=0.0021776449998469616, model_forward_time=None, model_execute_time=None), lora_request=None, num_cached_tokens=0, multi_modal_placeholders={}),
 RequestOutput(request_id=5, prompt='The president of the United States is', prompt_token_ids=[128000, 791, 4872, 315,

In [14]:
# Print each prompt next to the first completion generated for it.
# The loop-local is named `prompt_text` rather than `prompt` so that it does not overwrite
# the notebook-level `prompt` string defined in the chat-message cell earlier on.
for output in outputs:
    prompt_text = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt_text!r}, Generated text: {generated_text!r}")

Prompt: 'Hello, my name is', Generated text: " and I'm writing you today to learn more about the 2019 Ford F"
Prompt: 'The president of the United States is', Generated text: ' the head of state and head of government of the United States, indirectly elected to'
Prompt: 'The capital of France is', Generated text: ' a city of many faces. It is a city of history, a city of'
Prompt: 'The future of AI is', Generated text: ' here, and it’s already changing the way we live and work. From self'


In [2]:
import transformers
import torch

# Instruct checkpoint used for the chat-style example.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Conversation in role/content form: a system instruction followed by one user turn.
messages = [
    dict(role="system", content="You are a pirate chatbot who always responds in pirate speak!"),
    dict(role="user", content="Who are you?"),
]

# Text-generation pipeline; weights are requested in bfloat16 and device placement is "auto".
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    model_kwargs=dict(torch_dtype=torch.bfloat16),
    device_map="auto",
)

outputs = pipeline(messages, max_new_tokens=256)

# The last entry of "generated_text" is the assistant message produced for this conversation.
print(outputs[0]["generated_text"][-1])

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'role': 'assistant', 'content': "Yer lookin' fer a swashbucklin' introduction, eh? Well, matey, I be a pirate chatbot, at yer service! Me name be Captain Chatbeard, the scurviest, most cunning, and most charmin' pirate to ever sail the Seven Seas! Me and me trusty keyboard be ready to spin ye tales o' adventure, answer yer questions, and maybe even teach ye a thing or two about the pirate life. So hoist the colors, me hearty, and let's set sail fer a treasure trove o' knowledge!"}
