In [1]:
import json
import logging
import os
import random
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

import torch
from PIL import Image
from smolagents import CodeAgent, DuckDuckGoSearchTool, Model, Tool, ChatMessage
from smolagents.models import remove_stop_sequences
from smolagents.models import ChatMessageToolCall, ChatMessageToolCallDefinition


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
logger = logging.getLogger(__name__)


class VLLMModel(Model):
    """smolagents ``Model`` that generates chat completions with a local vLLM engine.

    The instance owns a ``vllm.LLM`` engine (``self.model``) and a default
    ``vllm.SamplingParams`` object (``self.sampling_params``).  Calling the
    instance runs ``LLM.chat`` on a list of chat messages and wraps the last
    completion in a ``ChatMessage``.
    """

    def __init__(
        self,
        model_id: Optional[str] = None,
        sampling_kwargs: Optional[dict] = None,
        init_kwargs: Optional[dict] = None,
        chat_kw_args: Optional[dict] = None,
        **kwargs
    ):
        """Load the vLLM engine and prepare the default sampling parameters.

        Args:
            model_id: Model identifier passed to ``vllm.LLM(model=...)``.
                Falls back to ``"HuggingFaceTB/SmolLM2-1.7B-Instruct"`` (with a
                warning) when ``None``.
            sampling_kwargs: Keyword arguments for ``vllm.SamplingParams``.
                ``max_new_tokens`` is accepted as an alias of ``max_tokens``.
            init_kwargs: Extra keyword arguments for the ``vllm.LLM`` constructor.
            chat_kw_args: Accepted for interface compatibility; currently unused.
            **kwargs: Forwarded to ``Model.__init__`` and kept on ``self.kwargs``.
        """
        super().__init__(**kwargs)
        if model_id is None:
            model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
            logger.warning(f"`model_id`not provided, using this default model: '{model_id}'")
        self.model_id = model_id

        # Imported lazily so the notebook's import cell does not require vLLM.
        from vllm import LLM, SamplingParams

        # Work on copies: the caller's dictionaries are never mutated.
        init_kwargs = dict(init_kwargs or {})
        sampling_kwargs = dict(sampling_kwargs or {})

        # SamplingParams only has `max_tokens`, so fold the `max_new_tokens`
        # alias into it instead of forwarding an unknown keyword.
        alias_budget = sampling_kwargs.pop("max_new_tokens", None)
        native_budget = sampling_kwargs.pop("max_tokens", None)
        max_tokens = (
            alias_budget
            or native_budget
            or kwargs.get("max_new_tokens")
            or kwargs.get("max_tokens")
        )
        if not max_tokens:
            default_max_tokens = 5000
            max_tokens = default_max_tokens
            kwargs["max_new_tokens"] = default_max_tokens
            logger.warning(
                f"`max_new_tokens` not provided, using this default value for `max_new_tokens`: {default_max_tokens}"
            )
        # The resolved budget must reach vLLM, not just `self.kwargs`.
        sampling_kwargs["max_tokens"] = max_tokens

        self.kwargs = kwargs
        # Kept so per-call overrides can rebuild SamplingParams from scratch.
        self._sampling_kwargs = sampling_kwargs
        self.sampling_params = SamplingParams(**sampling_kwargs)
        # Use the constructor argument rather than a notebook-level global.
        self.model = LLM(model=model_id, **init_kwargs)
        self._is_vlm = False

    def _sampling_params_for_call(self, max_tokens, stop_sequences):
        """Return the sampling parameters for one call, applying per-call overrides."""
        if not max_tokens and not stop_sequences:
            return self.sampling_params

        from vllm import SamplingParams

        params = dict(self._sampling_kwargs)
        if max_tokens:
            params["max_tokens"] = max_tokens
        if stop_sequences:
            # Combine with any stop strings configured at construction time.
            configured_stop = params.get("stop") or []
            if isinstance(configured_stop, str):
                configured_stop = [configured_stop]
            params["stop"] = [*configured_stop, *stop_sequences]
        return SamplingParams(**params)

    @staticmethod
    def _parse_tool_call(output: str) -> tuple:
        """Extract ``(name, arguments)`` from the JSON object found in ``output``.

        Raises:
            ValueError: If no ``{...}`` span exists or the span is not valid JSON.
        """
        if "Action:" in output:
            output = output.split("Action:", 1)[1].strip()
        try:
            blob = output[output.index("{") : output.rindex("}") + 1]
        except ValueError as e:
            raise ValueError("No json blob found in output!") from e
        try:
            parsed_output = json.loads(blob)
        except json.JSONDecodeError as e:
            raise ValueError(f"Tool call '{blob}' has an invalid JSON structure: {e}") from e
        return parsed_output.get("name"), parsed_output.get("arguments")

    def __call__(
        self,
        messages: List[Dict[str, str]],
        stop_sequences: Optional[List[str]] = None,
        grammar: Optional[str] = None,
        tools_to_call_from: Optional[List[Tool]] = None,
        images: Optional[List[Image.Image]] = None,
        **kwargs,
    ) -> ChatMessage:
        """Generate the assistant reply for ``messages``.

        Args:
            messages: Chat messages handed unchanged to ``LLM.chat``.
            stop_sequences: Strings at which generation should stop; passed to
                vLLM as ``stop`` and also stripped from the end of the text.
            grammar: Accepted for interface compatibility; not used here.
            tools_to_call_from: When not ``None``, the reply is parsed as a
                JSON tool call and returned in ``tool_calls``.
            images: Accepted for interface compatibility; not used here.
            **kwargs: ``max_new_tokens`` / ``max_tokens`` override the token
                budget for this call only.

        Returns:
            A ``ChatMessage`` with role ``"assistant"``.  ``raw`` holds the
            generated token ids as a tensor under ``'output'`` and the token
            budget under ``"completion_kwargs"``.

        Raises:
            ValueError: In tool-calling mode, when no valid JSON tool call is
                found in the generated text.
        """
        call_max_tokens = kwargs.get("max_new_tokens") or kwargs.get("max_tokens")
        reported_max_tokens = (
            call_max_tokens
            or self.kwargs.get("max_new_tokens")
            or self.kwargs.get("max_tokens")
        )
        completion_kwargs = {}
        if reported_max_tokens:
            completion_kwargs["max_new_tokens"] = reported_max_tokens

        # NOTE(review): messages are not normalised (roles, list-style content)
        # before reaching vLLM -- confirm the chat template copes with agent messages.
        out = self.model.chat(
            messages,
            sampling_params=self._sampling_params_for_call(call_max_tokens, stop_sequences),
            use_tqdm=False,
        )
        completion = out[-1].outputs[-1]
        output = completion.text
        if stop_sequences is not None:
            output = remove_stop_sequences(output, stop_sequences)
        raw = {'output': torch.tensor(completion.token_ids),
               "completion_kwargs": completion_kwargs}

        if tools_to_call_from is None:
            return ChatMessage(
                role="assistant",
                content=output,
                raw=raw,
            )

        tool_name, tool_arguments = self._parse_tool_call(output)
        return ChatMessage(
            role="assistant",
            content="",
            tool_calls=[
                ChatMessageToolCall(
                    id="".join(random.choices("0123456789", k=5)),
                    type="function",
                    function=ChatMessageToolCallDefinition(name=tool_name, arguments=tool_arguments),
                )
            ],
            raw=raw,
        )


# Checkpoint id handed to VLLMModel; swap in the commented line to try another one.
model_name = 'meta-llama/Llama-3.2-1B-Instruct'
#model_name = "facebook/opt-125m"
model = VLLMModel(
    model_name,
    sampling_kwargs=dict(max_tokens=6000, temperature=0.8, top_p=0.95),
)

INFO 02-24 15:38:37 __init__.py:207] Automatically detected platform cpu.
INFO 02-24 15:38:44 config.py:560] This model supports multiple tasks: {'classify', 'score', 'reward', 'generate', 'embed'}. Defaulting to 'generate'.
INFO 02-24 15:38:44 importing.py:16] Triton not installed or not compatible; certain GPU-related functions will not be available.
INFO 02-24 15:38:44 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.4.dev53+geb24dc4a.d20250223+cpu) with config: model='meta-llama/Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cpu, decoding_config=D

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.00it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.99it/s]


INFO 02-24 15:38:48 executor_base.py:111] # cpu blocks: 8192, # CPU blocks: 0
INFO 02-24 15:38:48 executor_base.py:116] Maximum concurrency for 131072 tokens per request: 1.00x





INFO 02-24 15:38:48 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 0.54 seconds


In [3]:
# Short multi-turn chat used as a smoke test for the model wrapper.
conversation = [
    {"role": speaker, "content": text}
    for speaker, text in (
        ("system", "You are a helpful assistant"),
        ("user", "Hello"),
        ("assistant", "Hello! How can I assist you today?"),
        ("user", "Write an essay about the importance of higher education."),
    )
]
response = model(conversation)
response

INFO 02-24 15:38:49 chat_utils.py:332] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.


ChatMessage(role='assistant', content="The Importance of Higher Education: Unlocking the Potential of the Next Generation\n\nHigher education has long been regarded as the key to unlocking the potential of individuals, societies, and economies. It is a cornerstone of modern civilization, providing individuals with the skills, knowledge, and expertise necessary to succeed in an increasingly complex and interconnected world. The importance of higher education cannot be overstated, and it is essential for fostering a more informed, productive, and prosperous society.\n\nOne of the primary reasons higher education is so crucial is that it equips individuals with the knowledge and skills necessary to compete in the modern job market. In today's global economy, the standard of living and economic growth are directly tied to the level of education and training. Higher education institutions provide students with access to a vast array of courses, programs, and research opportunities that can 

In [5]:

# Code-writing agent driven by the local model, with web search as its only tool.
agent = CodeAgent(
    model=model,
    tools=[DuckDuckGoSearchTool()],
)

agent.run(
    "How many seconds would it take for a leopard at full speed to run through Pont des Arts?"
)

KeyboardInterrupt: 