In [8]:
import json
import logging
import os
import random
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

import torch
from PIL import Image
from smolagents import CodeAgent, DuckDuckGoSearchTool, Model, Tool, ChatMessage
from smolagents.models import remove_stop_sequences
from smolagents.models import ChatMessageToolCall, ChatMessageToolCallDefinition


In [4]:
logger = logging.getLogger(__name__)


class VLLMModel(Model):
    """smolagents ``Model`` that generates text with a local vLLM engine.

    The engine is created once in ``__init__``; every call runs
    ``LLM.chat`` on the given messages and wraps the last completion in a
    ``ChatMessage``.

    Attributes:
        model_id: Checkpoint name handed to ``vllm.LLM``.
        sampling_kwargs: Normalised keyword arguments used to build
            ``SamplingParams`` (always contains ``max_tokens``).
        sampling_params: Default ``SamplingParams`` built from
            ``sampling_kwargs``; used when a call needs no override.
        chat_kw_args: Extra keyword arguments forwarded to ``LLM.chat``.
        model: The ``vllm.LLM`` engine instance.
    """

    def __init__(
        self,
        model_id: str,
        sampling_kwargs: dict = None,
        init_kwargs: dict = None,
        chat_kw_args: dict = None,
        **kwargs
    ):
        """Load the checkpoint and prepare the default sampling parameters.

        Args:
            model_id: Checkpoint to load. ``None`` falls back to
                ``"HuggingFaceTB/SmolLM2-1.7B-Instruct"`` with a warning.
            sampling_kwargs: Keyword arguments for ``vllm.SamplingParams``.
                ``max_new_tokens`` is accepted as an alias of ``max_tokens``.
            init_kwargs: Extra keyword arguments for the ``vllm.LLM``
                constructor.
            chat_kw_args: Extra keyword arguments for ``LLM.chat``.
            **kwargs: Forwarded to the ``Model`` base class and kept on
                ``self.kwargs``.
        """
        super().__init__(**kwargs)
        default_model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
        if model_id is None:
            model_id = default_model_id
            logger.warning(f"`model_id`not provided, using this default model: '{model_id}'")
        self.model_id = model_id
        # Imported lazily so that merely defining the class does not need vLLM.
        from vllm import LLM, SamplingParams

        # Work on copies: the caller's dicts must not be modified below.
        init_kwargs = dict(init_kwargs or {})
        sampling_kwargs = dict(sampling_kwargs or {})

        default_max_tokens = 5000
        # `SamplingParams` only knows `max_tokens`, so the `max_new_tokens`
        # alias is removed from the dict and translated.
        max_new_tokens = (
            sampling_kwargs.pop("max_new_tokens", None)
            or sampling_kwargs.get("max_tokens")
            or kwargs.get("max_new_tokens")
            or kwargs.get("max_tokens")
        )
        if not max_new_tokens:
            max_new_tokens = default_max_tokens
            kwargs["max_new_tokens"] = default_max_tokens
            logger.warning(
                f"`max_new_tokens` not provided, using this default value for `max_new_tokens`: {default_max_tokens}"
            )
        # The resolved limit has to reach the sampling parameters; otherwise
        # the default announced in the warning above is never applied.
        sampling_kwargs["max_tokens"] = max_new_tokens

        self.kwargs = kwargs
        self.sampling_kwargs = sampling_kwargs
        self.chat_kw_args = dict(chat_kw_args or {})
        self.sampling_params = SamplingParams(**sampling_kwargs)
        self.model = LLM(model=model_id, **init_kwargs)
        self._is_vlm = False

    def _sampling_params_for_call(self, stop_sequences, max_tokens):
        """Return the sampling parameters to use for one generation call.

        The default ``self.sampling_params`` is reused unless the call adds
        stop sequences or asks for a different token limit; in that case a
        new ``SamplingParams`` is built from ``self.sampling_kwargs`` plus the
        overrides.
        """
        overrides = {}
        if stop_sequences:
            configured_stop = self.sampling_kwargs.get("stop") or []
            if isinstance(configured_stop, str):
                configured_stop = [configured_stop]
            # Keep configured stops, append the per-call ones, drop duplicates
            # while preserving order.
            overrides["stop"] = list(dict.fromkeys([*configured_stop, *stop_sequences]))
        if max_tokens and max_tokens != self.sampling_kwargs.get("max_tokens"):
            overrides["max_tokens"] = max_tokens
        if not overrides:
            return self.sampling_params
        from vllm import SamplingParams

        return SamplingParams(**{**self.sampling_kwargs, **overrides})

    @staticmethod
    def _parse_tool_call(output: str) -> "ChatMessageToolCall":
        """Extract a JSON tool call from generated text.

        Text after the first ``"Action:"`` marker (if present) is searched for
        the span between the first ``{`` and the last ``}``, which is decoded
        as JSON and read for its ``name`` and ``arguments`` entries.

        Raises:
            ValueError: If no ``{...}`` span exists or it is not valid JSON.
        """
        if "Action:" in output:
            output = output.split("Action:", 1)[1].strip()
        try:
            start_index = output.index("{")
            end_index = output.rindex("}")
        except ValueError as e:
            raise ValueError("No json blob found in output!") from e
        json_blob = output[start_index : end_index + 1]

        try:
            parsed_output = json.loads(json_blob)
        except json.JSONDecodeError as e:
            raise ValueError(f"Tool call '{json_blob}' has an invalid JSON structure: {e}") from e
        return ChatMessageToolCall(
            id="".join(random.choices("0123456789", k=5)),
            type="function",
            function=ChatMessageToolCallDefinition(
                name=parsed_output.get("name"),
                arguments=parsed_output.get("arguments"),
            ),
        )

    def __call__(
        self,
        messages: List[Dict[str, str]],
        stop_sequences: Optional[List[str]] = None,
        grammar: Optional[str] = None,
        tools_to_call_from: Optional[List[Tool]] = None,
        images: Optional[List[Image.Image]] = None,
        **kwargs,
    ) -> ChatMessage:
        """Generate one assistant message for ``messages``.

        Args:
            messages: Chat history passed unchanged to ``LLM.chat``.
            stop_sequences: Strings at which generation must stop. They are
                given to vLLM as ``stop`` and also stripped from the text via
                ``remove_stop_sequences``.
            grammar: Accepted for interface compatibility; not used.
            tools_to_call_from: When not ``None``, the generated text is
                parsed as a JSON tool call instead of being returned as
                content.
            images: Accepted for interface compatibility; not used.
            **kwargs: ``max_new_tokens`` / ``max_tokens`` override the token
                limit for this call only.

        Returns:
            A ``ChatMessage`` with role ``"assistant"``. ``raw`` holds the
            generated token ids as a tensor under ``"output"`` and the
            effective token limit under ``"completion_kwargs"``.

        Raises:
            ValueError: In tool-calling mode, when the output holds no
                parseable JSON blob.
        """
        max_new_tokens = (
            kwargs.get("max_new_tokens")
            or kwargs.get("max_tokens")
            or self.sampling_kwargs.get("max_tokens")
        )
        completion_kwargs = {}
        if max_new_tokens:
            completion_kwargs["max_new_tokens"] = max_new_tokens

        sampling_params = self._sampling_params_for_call(stop_sequences, max_new_tokens)
        # `use_tqdm=False` is only a default; `chat_kw_args` may override it.
        chat_kwargs = {"use_tqdm": False, **self.chat_kw_args}
        out = self.model.chat(messages, sampling_params=sampling_params, **chat_kwargs)

        completion = out[-1].outputs[-1]
        output = completion.text
        if stop_sequences is not None:
            output = remove_stop_sequences(output, stop_sequences)
        raw = {
            "output": torch.tensor(completion.token_ids),
            "completion_kwargs": completion_kwargs,
        }
        if tools_to_call_from is None:
            return ChatMessage(
                role="assistant",
                content=output,
                raw=raw,
            )
        return ChatMessage(
            role="assistant",
            content="",
            tool_calls=[self._parse_tool_call(output)],
            raw=raw,
        )


# Checkpoint to serve through vLLM. The author also left "facebook/opt-125m"
# as a commented-out alternative; assign it to `model_name` to try it.
model_name = 'meta-llama/Llama-3.2-1B-Instruct'
model = VLLMModel(
    model_name,
    sampling_kwargs=dict(max_tokens=6000, temperature=0.8, top_p=0.95),
)

INFO 02-25 10:43:53 config.py:560] This model supports multiple tasks: {'embed', 'score', 'reward', 'generate', 'classify'}. Defaulting to 'generate'.
INFO 02-25 10:43:53 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.4.dev53+geb24dc4a.d20250223+cpu) with config: model='meta-llama/Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cpu, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_ti

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:01<00:00,  1.04s/it]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:01<00:00,  1.04s/it]


INFO 02-25 10:43:56 executor_base.py:111] # cpu blocks: 8192, # CPU blocks: 0
INFO 02-25 10:43:56 executor_base.py:116] Maximum concurrency for 131072 tokens per request: 1.00x





INFO 02-25 10:43:56 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 0.51 seconds


In [5]:
# Demo dialogue, oldest turn first; the final user turn is the actual request.
conversation = [
    dict(role="system", content="You are a helpful assistant"),
    dict(role="user", content="Hello"),
    dict(role="assistant", content="Hello! How can I assist you today?"),
    dict(role="user", content="Write an essay about the importance of higher education."),
]
response = model(conversation)
response

INFO 02-25 10:43:57 chat_utils.py:332] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.


ChatMessage(role='assistant', content="The Importance of Higher Education: Unlocking the Potential of the Next Generation\n\nHigher education has long been regarded as the key to unlocking the potential of individuals, societies, and economies. It is the bridge between theoretical knowledge and practical application, fostering personal growth, skill-building, and social mobility. In today's fast-paced, technology-driven world, having a higher education is no longer a luxury but a necessity for success. The benefits of higher education extend far beyond the classroom, impacting individuals, communities, and the world at large.\n\nOne of the most significant advantages of higher education is the acquisition of knowledge and skills that enable individuals to adapt to an ever-changing job market. In today's economy, the pace of technological advancements, globalization, and shifting industries require workers to possess a broad range of skills, including critical thinking, creativity, prob

In [7]:

# Code-writing agent driven by the vLLM-backed model, with web search as its
# only tool.
agent = CodeAgent(
    model=model,
    tools=[DuckDuckGoSearchTool()],
)

agent.run(
    "How many seconds would it take for a leopard at full speed to run through Pont des Arts?"
)

6210000.0