In [1]:
import json
from typing import Sequence, List

from llama_index.core.llms import ChatMessage
from llama_index.core.tools import BaseTool, FunctionTool
from llama_index.core.agent import ReActAgent

import nest_asyncio

nest_asyncio.apply()

In [2]:
def multiply(a: int, b: int) -> int:
    """Multiple two integers and returns the result integer"""
    return a * b


def add(a: int, b: int) -> int:
    """Add two integers and returns the result integer"""
    return a + b


def subtract(a: int, b: int) -> int:
    """Subtract two integers and returns the result integer"""
    return a - b


def divide(a: int, b: int) -> int:
    """Divides two integers and returns the result integer"""
    return a / b


multiply_tool = FunctionTool.from_defaults(fn=multiply)
add_tool = FunctionTool.from_defaults(fn=add)
subtract_tool = FunctionTool.from_defaults(fn=subtract)
divide_tool = FunctionTool.from_defaults(fn=divide)

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.2-1B",
)

stopping_ids = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

In [4]:
import os
# generate_kwargs parameters are taken from https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct

import torch
from llama_index.llms.huggingface import HuggingFaceLLM

# Optional quantization to 4bit
# import torch
# from transformers import BitsAndBytesConfig

# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
# )
hf_token = os.environ.get("HUGGING_FACE_TOKEN")
llm = HuggingFaceLLM(
    model_name="meta-llama/Meta-Llama-3.1-8B-Instruct",
    model_kwargs={
        "token": hf_token,
        "torch_dtype": torch.bfloat16,  # comment this line and uncomment below to use 4bit
        # "quantization_config": quantization_config
    },
    generate_kwargs={
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.9,
    },
    tokenizer_name="meta-llama/Meta-Llama-3.1-8B-Instruct",
    tokenizer_kwargs={"token": hf_token},
    stopping_ids=stopping_ids,
)




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
agent = ReActAgent.from_tools(
    [multiply_tool, add_tool, subtract_tool, divide_tool],
    llm=llm,
    verbose=True,
)

agent.chat("What is 100 * 100?")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


> Running step 513f1ca7-a082-41d5-97d5-b8772d0d23f5. Step input: What is 100 * 100?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: multiply
Action Input: {'a': 100, 'b': 100}
[0m[1;3;34mObservation: 10000
[0m> Running step 4b558bdf-6249-4559-b268-91c7e7a6dd3d. Step input: None
[1;3;38;5;200mThought: I can answer without using any more tools. I'll use the user's language to answer
Answer: 10000
[0m

AgentChatResponse(response='10000', sources=[ToolOutput(content='10000', tool_name='multiply', raw_input={'args': (), 'kwargs': {'a': 100, 'b': 100}}, raw_output=10000, is_error=False)], source_nodes=[], is_dummy_stream=False, metadata=None)

In [6]:
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)

from llama_index.core.tools import QueryEngineTool, ToolMetadata

In [9]:
docs = SimpleDirectoryReader(input_dir="./pdfs/").load_data()

In [10]:
docs_indices = VectorStoreIndex.from_documents(docs)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
query_engine = docs_indices.as_query_engine(similarity_top_k=3)

In [12]:
query_engine_tools = [
    QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name="lyft_10k",
            description=(
                "Provides information about an AI paper regarding LoRA technique for efficient fine-tuning of LLMs. "
                "Use a detailed plain text question as input to the tool."
            ),
        ),
    ),
]

In [13]:
agent = ReActAgent.from_tools(
    query_engine_tools,
    llm=llm,
    verbose=True,
)

In [14]:
response = agent.chat("What is the LoRA technique for efficient fine-tuning of LLMs?")
print(str(response))


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


> Running step 955cd52c-0103-4109-964f-42726e310c0b. Step input: What is the LoRA technique for efficient fine-tuning of LLMs?
[1;3;38;5;200mThought: The current language of the user is English. I need to use a tool to help me answer the question.
Action: lyft_10k
Action Input: {'input': 'What is the LoRA technique for efficient fine-tuning of LLMs?'}
[0m

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;34mObservation: The LoRA technique allows for efficient fine-tuning of Large Language Models (LLMs) by optimizing rank decomposition matrices of the dense layers' change during adaptation instead of directly updating the pre-trained weights. This approach enables training some dense layers indirectly while keeping the pre-trained weights frozen. By using low-rank matrices, LoRA reduces storage and computational requirements, making it both storage- and compute-efficient. Additionally, LoRA allows for sharing a pre-trained model to build multiple small LoRA modules for different tasks, facilitating efficient task-switching by replacing specific matrices. It also simplifies training, lowers hardware barriers, and introduces no additional inference latency compared to fully fine-tuned models when deployed.
[0m> Running step 777d8b01-ae16-4980-ac56-69dc62f05718. Step input: None
[1;3;38;5;200mThought: I can answer without using any more tools. I'll use the user's language to answer