# 🚀 Llama3-8b-Instruct React Agent Advanced


In [1]:
from vllm import LLM, SamplingParams
from llama_index.core.agent import ReActAgent
# Load the Meta-Llama-3-8B-Instruct checkpoint from a local directory with
# vLLM. tensor_parallel_size=2 asks vLLM to split the model over two devices.
llm = LLM(
    model="/data/hf/Meta-Llama-3-8B-Instruct",
    trust_remote_code=True,
    tensor_parallel_size=2,
)
# Tokenizer belonging to the loaded engine; later cells use it to render chat
# templates and to look up stop-token ids.
tokenizer = llm.get_tokenizer()

  from .autonotebook import tqdm as notebook_tqdm
2024-05-02 16:49:30,840	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 05-02 16:49:32 config.py:407] Custom all-reduce kernels are temporarily disabled due to stability issues. We will re-enable them once the issues are resolved.


2024-05-02 16:49:34,363	INFO worker.py:1724 -- Started a local Ray instance.


INFO 05-02 16:49:35 llm_engine.py:79] Initializing an LLM engine with config: model='/data/hf/Meta-Llama-3-8B-Instruct', tokenizer='/data/hf/Meta-Llama-3-8B-Instruct', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-02 16:49:46 llm_engine.py:337] # GPU blocks: 12465, # CPU blocks: 4096
INFO 05-02 16:49:47 model_runner.py:666] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-02 16:49:47 model_runner.py:670] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
[36m(RayWorkerVllm pid=1497972)[0m INFO 05-02 16:49:47 model_runner.py:666] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
[36m(RayWorkerVllm pid=1497972)[0m INFO 05-02 16:49:47 model_runner.py:670] CUDA graphs can take additional 1~3 GiB memory p

In [2]:
# Render a two-message conversation (system + user) through the model's chat
# template. tokenize=False returns the prompt as text rather than token ids,
# and add_generation_prompt=True appends the header that opens the assistant
# turn, so the printed string shows exactly what the model would be fed.
prompt = tokenizer.apply_chat_template([{"role": "system", "content": "You are a helpful Assistant named eric simon"}, {"role": "user", "content": "who are you?"}], 
                              tokenize=False, add_generation_prompt=True)
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful Assistant named eric simon<|eot_id|><|start_header_id|>user<|end_header_id|>

who are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [3]:
from llama_index.core.tools import BaseTool, FunctionTool
from llama_index.core.agent import ReActAgent
from typing import Dict, List, Any, Sequence, Callable
import re
import json
from rich import print


# Default system prompt for the ReAct loop.
#
# - ``{{tools_desc}}`` is a literal placeholder (this is not an f-string); it
#   is filled in with ``str.replace`` by ``ReActAgent.format_system_prompt``.
#   That method also replaces ``{{tools_name}}``, which this template does not
#   currently contain.
# - The "Thought:", "Action:", "Final Answer:" markers and the fenced json
#   block are what ``ReActOutputParser`` searches for, and "Observation:" is
#   the stop string passed to generation in ``ReActAgent.run`` -- keep these
#   markers in sync with that code when editing the prompt.
REACT_PROMPT = '''You are an AI agent capable of using a variety of tools to answer question.

You have access to the following tools:
{{tools_desc}}

### Reponse Format

Response using the follow format:

Thought: think step by steps, how to solve the question, put your thought process here.
Action:
```json
{
    "tool": $TOOL_NAME,
    "args": $TOOL_ARGS
}
```
Observation: tool output
...(this Thought/Action/Observation can repeat N times until you get enough information to answer the question)
Thought: I now know the final answer 
Final Answer: make sure output the final answer here

$TOOL_NAME is the name of the tool. $TOOL_ARGS is a dictionary input matching the requirement of the tool.
'''


# copy from llama-index
# get function tools description
def get_react_tool_descriptions(tools: Sequence[BaseTool]) -> List[str]:
    """Build one text description per tool for inclusion in the system prompt.

    Args:
        tools: Tools whose ``metadata`` exposes ``name``, ``description`` and
            ``fn_schema_str``.

    Returns:
        A list, in input order, of three-line strings (name, description,
        argument schema), each ending with a newline.
    """
    return [
        "".join(
            [
                f"> Tool Name: {tool.metadata.name}\n",
                f"Tool Description: {tool.metadata.description}\n",
                f"Tool Args: {tool.metadata.fn_schema_str}\n",
            ]
        )
        for tool in tools
    ]


def search_weather(city_name: str) -> str:
    """check real-time weather and temperature of a city"""
    # NOTE(review): the docstring above is presumably picked up by
    # FunctionTool.from_defaults as the tool description shown to the model,
    # so it is kept verbatim -- confirm before rewording it.
    # Demo stub: city_name is ignored and a fixed report is returned.
    canned_report = "temperature 27, weather rain"
    return canned_report


# Preview the prompt-ready description generated for a single wrapped tool.
print(get_react_tool_descriptions([FunctionTool.from_defaults(search_weather)])[0])

In [4]:
class ReActOutputParser:
    """Turn raw ReAct-formatted model text into a structured step dict.

    Two kinds of step are recognised:

    * a final answer (``Thought: ... Final Answer: ...``), and
    * a tool call (``Thought: ... Action: <fenced json block>``).

    Every parsing failure is reported as ``ValueError``.
    """

    def parse(self, output: str):
        """Dispatch ``output`` to the matching extractor.

        "Final Answer:" is checked first, so text containing both markers is
        treated as a final answer.

        Args:
            output: Raw text generated by the model for one round.

        Returns:
            The dict produced by ``extract_final_answer`` or
            ``extract_action``.

        Raises:
            ValueError: If neither marker is present, or extraction fails.
        """
        if "Final Answer:" in output:
            return self.extract_final_answer(output)

        if "Action:" in output:
            return self.extract_action(output)

        raise ValueError(f"Could not parse output: {output}")

    def extract_action(self, input_text: str):
        """Extract the thought and the JSON tool call from ``input_text``.

        Args:
            input_text: Model output containing ``Thought:`` followed by
                ``Action:`` and a fenced json block.

        Returns:
            ``{"step": "reasoning_acting", "content": {"thought": str,
            "action": <parsed JSON>}, "raw_message": input_text}``.

        Raises:
            ValueError: If the Thought/Action structure or the JSON block is
                missing, or the JSON cannot be parsed.
        """
        pattern = r"Thought:([\s\S]*?)Action:([\s\S]*)"
        match = re.search(pattern, input_text)
        if not match:
            raise ValueError(f"Could not extract Thought/Action from input text: {input_text}")

        thought = match.group(1).strip()
        action = match.group(2).strip()

        # Raw string so "\s"/"\S" reach the regex engine instead of being
        # treated as (invalid) string escapes. The closing fence is optional
        # ("|$") because generation may be cut off before it is emitted.
        json_block_pattern = r"```json([\s\S]*?)(```|$)"
        match = re.search(json_block_pattern, action)
        if not match:
            raise ValueError(f"Could not extract Action JSON block from input text: {action}")
        action_text = match.group(1).strip()

        # Imported lazily so the class can be defined (and the final-answer
        # path used) without the third-party parser being installed.
        import dirtyjson
        try:
            action_json = dirtyjson.loads(action_text)
        except Exception as exc:
            # Catch Exception rather than everything so KeyboardInterrupt /
            # SystemExit still propagate, and keep the parser error as cause.
            raise ValueError(f"Unable parse JSON from {action_text}") from exc

        return {
            "step": "reasoning_acting",
            "content": {"thought": thought, "action": action_json},
            "raw_message": input_text,
        }

    def extract_final_answer(self, response):
        """Extract the closing thought and the final answer from ``response``.

        Args:
            response: Model output containing ``Thought:`` followed by
                ``Final Answer:``.

        Returns:
            ``{"step": "answer", "content": {"thought": str, "answer": str},
            "raw_message": response}``.

        Raises:
            ValueError: If the Thought/Final Answer structure is not found.
        """
        pattern = r"\s*Thought:([\s\S]*?)Final Answer:([\s\S]*?)(?:$)"
        match = re.search(pattern, response, re.DOTALL)
        if not match:
            raise ValueError(
                f"Could not extract final answer from input text: {response}"
            )

        thought = match.group(1).strip()
        answer = match.group(2).strip()
        return {
            "step": "answer",
            "content": {"thought": thought, "answer": answer},
            "raw_message": response,
        }


In [5]:
class ReActAgent:
    """Minimal ReAct (reason + act) loop on top of a vLLM engine.

    Each round the model is asked for a Thought plus either an Action (a tool
    call, which is executed and fed back as an Observation) or a Final Answer.

    NOTE(review): this class has the same name as the ``ReActAgent`` imported
    from ``llama_index.core.agent`` earlier in the file and therefore shadows
    that import once defined.
    """

    def __init__(self, llm, tools: List[Callable], system_prompt: str=None, max_rounds: int=10, output_parser=None):
        """Store the engine and wrap the tool callables.

        Args:
            llm: Engine exposing ``get_tokenizer()`` and ``generate(...)``.
            tools: Plain callables; each is wrapped with
                ``FunctionTool.from_defaults`` and indexed by its tool name.
            system_prompt: Prompt template; falls back to ``REACT_PROMPT``.
            max_rounds: Upper bound on generation rounds per ``run`` call.
            output_parser: Object with a ``parse(str)`` method; defaults to
                ``ReActOutputParser()``.
        """
        self.llm = llm
        self.tokenizer = llm.get_tokenizer()
        wrapped_tools = [FunctionTool.from_defaults(fn=t) for t in tools]
        self.tools = {f.metadata.get_name(): f for f in wrapped_tools}
        self.system_prompt = system_prompt or REACT_PROMPT
        self.max_rounds = max_rounds
        self.output_parser = output_parser or ReActOutputParser()
        # Inspect this agent's own tokenizer (not a module-level global) so
        # the check matches the model that was actually passed in.
        self.is_llama3 = "<|eot_id|>" in self.tokenizer.vocab

    def format_system_prompt(self):
        """Return the system prompt with the tool placeholders filled in.

        ``{{tools_desc}}`` becomes the newline-joined tool descriptions and
        ``{{tools_name}}`` the comma-separated tool names; a placeholder that
        is absent from the template is simply left untouched.
        """
        tools_name = ", ".join([tool.metadata.get_name() for tool in self.tools.values()])
        tools_desc = "\n".join(get_react_tool_descriptions(self.tools.values()))
        return self.system_prompt.replace("{{tools_desc}}", tools_desc).replace("{{tools_name}}", tools_name)

    def generate(self, messages, stop=None, max_tokens=1024):
        """Run one greedy (temperature 0) completion for ``messages``.

        Args:
            messages: Chat messages (``{"role", "content"}`` dicts) rendered
                through the tokenizer's chat template.
            stop: Stop string(s) forwarded to ``SamplingParams``.
            max_tokens: Maximum number of tokens to generate.

        Returns:
            The generated text of the first (only) completion.
        """
        prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        stop_token_ids = [self.tokenizer.eos_token_id]
        if self.is_llama3:
            # Also stop on "<|eot_id|>" when the tokenizer has that token.
            stop_token_ids.append(self.tokenizer.convert_tokens_to_ids("<|eot_id|>"))
        sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, stop_token_ids=stop_token_ids, temperature=0.)
        request_outputs = self.llm.generate([prompt], sampling_params=sampling_params, use_tqdm=False)
        return request_outputs[0].outputs[0].text

    def _execute_action(self, action):
        """Call the tool named in ``action`` and return its output content.

        Raises ``ValueError`` for an unknown or missing tool name; a missing
        or empty ``"args"`` entry is treated as "no arguments".
        """
        tool_name = action.get("tool") if isinstance(action, dict) else None
        if tool_name not in self.tools:
            raise ValueError(f"Unknown tool {tool_name!r}; available tools: {', '.join(self.tools)}")
        args = action.get("args") or {}
        return self.tools[tool_name](**args).content

    def run(self, query, num_rounds=None, verbose=False):
        """Answer ``query`` by iterating Thought/Action/Observation rounds.

        Args:
            query: The user question.
            num_rounds: Optional per-call round limit; capped at
                ``self.max_rounds``. A falsy value means ``self.max_rounds``.
            verbose: When true, print each step as it happens.

        Returns:
            ``{"response": str, "trajectory": list, "successful": bool}``.
            ``successful`` is False when parsing fails, a tool call fails
            (including an unknown tool), or the round limit is reached; in the
            first two cases ``response`` carries the error text.
        """
        # Initialization
        num_rounds = min(num_rounds, self.max_rounds) if num_rounds else self.max_rounds
        messages = [
            {"role": "system", "content": self.format_system_prompt()},
            {"role": "user", "content": f"{query}"},
        ]
        trajectory = [{"step": "query", "content": query, "raw_message": query}]

        if verbose:
            print("[green3]Query[/green3]")
            print(query)

        for _ in range(num_rounds):
            # Stop before the model invents its own observation; the real one
            # is appended below after the tool has actually been run.
            model_output = self.generate(messages, stop="Observation:")
            messages.append({"role": "assistant", "content": model_output})
            try:
                step_state = self.output_parser.parse(model_output)
            except Exception as e:
                return {"response": str(e), "trajectory": trajectory, "successful": False}
            if verbose:
                print(f"[green3]{step_state['step']}[/green3]")
                print(step_state["raw_message"])
            trajectory.append(step_state)

            if step_state["step"] == "reasoning_acting":
                try:
                    # Tool lookup happens inside the try so a hallucinated
                    # tool name is reported like any other tool failure
                    # instead of escaping as a KeyError.
                    observation = self._execute_action(step_state["content"]["action"])
                except Exception as e:
                    return {"response": str(e), "trajectory": trajectory, "successful": False}
                if verbose:
                    print("[green3]observation[/green3]")
                    print(observation)
                trajectory.append({"step": "observation", "content": observation, "raw_message": observation})
                messages.append({"role": "user", "content": f"Observation: {observation}"})
            elif step_state["step"] == "answer":
                return {"response": step_state["content"]["answer"], "trajectory": trajectory, "successful": True}
        return {"response": "Maximum number of iterations exceeded", "trajectory": trajectory, "successful": False}

In [6]:

def multiply(a: int, b: int) -> int:
    """Multiple two integers and returns the result integer"""
    # NOTE(review): the docstring is presumably surfaced to the model as the
    # tool description by FunctionTool.from_defaults, so it is left verbatim.
    product = a * b
    return product


def add(a: int, b: int) -> int:
    """Add two integers and returns the result integer"""
    # NOTE(review): the docstring is presumably surfaced to the model as the
    # tool description by FunctionTool.from_defaults, so it is left verbatim.
    total = a + b
    return total


def subtract(a: int, b: int) -> int:
    """Subtract two integers and returns the result integer"""
    # NOTE(review): the docstring is presumably surfaced to the model as the
    # tool description by FunctionTool.from_defaults, so it is left verbatim.
    difference = a - b
    return difference


def divide(a: int, b: int) -> float:
    """Divides two integers and returns the result as a float"""
    # True division always yields a float (e.g. 7 / 2 == 3.5, 4 / 2 == 2.0),
    # so the return annotation and docstring say float rather than int.
    # NOTE(review): the docstring is presumably surfaced to the model as the
    # tool description by FunctionTool.from_defaults -- keep it short and
    # accurate.
    # b == 0 raises ZeroDivisionError; it is deliberately not caught here.
    return a / b


# Plain Python callables; ReActAgent.__init__ wraps each one with
# FunctionTool.from_defaults and indexes it by tool name.
tools = [multiply, add, subtract, divide]

In [7]:
# Build the agent with the default prompt/parser and run a multi-step
# arithmetic question; verbose=True prints each step as it happens.
agent = ReActAgent(llm, tools)
response = agent.run("What is (121 + 2) * 5 *100?", verbose=True)

In [9]:
# "response" holds the final answer on success, or the error / round-limit
# message when the run was not successful.
print(response["response"])