In [16]:
import json
from typing import Sequence, List

from llama_index.core.llms import ChatMessage
from llama_index.core.tools import BaseTool, FunctionTool
from llama_index.core.agent import ReActAgent

import nest_asyncio

nest_asyncio.apply()

In [17]:
def multiply(a: int, b: int) -> int:
    """Multiple two integers and returns the result integer"""
    return a * b


def add(a: int, b: int) -> int:
    """Add two integers and returns the result integer"""
    return a + b


def subtract(a: int, b: int) -> int:
    """Subtract two integers and returns the result integer"""
    return a - b


def divide(a: int, b: int) -> int:
    """Divides two integers and returns the result integer"""
    return a / b


multiply_tool = FunctionTool.from_defaults(fn=multiply)
add_tool = FunctionTool.from_defaults(fn=add)
subtract_tool = FunctionTool.from_defaults(fn=subtract)
divide_tool = FunctionTool.from_defaults(fn=divide)

In [18]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.2-1B",
)

stopping_ids = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

In [15]:
import os
# generate_kwargs parameters are taken from https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct

import torch
from llama_index.llms.huggingface import HuggingFaceLLM

# Optional quantization to 4bit
# import torch
# from transformers import BitsAndBytesConfig

# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
# )
hf_token = os.environ.get("HUGGING_FACE_TOKEN")
llm = HuggingFaceLLM(
    model_name="meta-llama/Meta-Llama-3.1-8B-Instruct",
    model_kwargs={
        "token": hf_token,
        "torch_dtype": torch.bfloat16,  # comment this line and uncomment below to use 4bit
        # "quantization_config": quantization_config
    },
    generate_kwargs={
        "do_sample": True,
        "temperature": 0.2,
        "top_p": 0.9,
    },
    tokenizer_name="meta-llama/Meta-Llama-3.1-8B-Instruct",
    tokenizer_kwargs={"token": hf_token},
    stopping_ids=stopping_ids,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [19]:
agent = ReActAgent.from_tools(
    [multiply_tool, add_tool, subtract_tool, divide_tool],
    llm=llm,
    verbose=True,
)

agent.chat("What is 100 * 100?")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


> Running step a4580108-e7dd-43d1-b8c8-cc8413d84829. Step input: What is 100 * 100?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: multiply
Action Input: {'a': 100, 'b': 100}
[0m[1;3;34mObservation: 10000
[0m> Running step e9020238-de43-4224-9551-94a3c55c8cd3. Step input: None
[1;3;38;5;200mThought: I can answer without using any more tools. I'll use the user's language to answer
Answer: 10000
[0m

AgentChatResponse(response='10000', sources=[ToolOutput(content='10000', tool_name='multiply', raw_input={'args': (), 'kwargs': {'a': 100, 'b': 100}}, raw_output=10000, is_error=False)], source_nodes=[], is_dummy_stream=False, metadata=None)

In [20]:
agent.chat("What is 3 * 2 + 100?")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


> Running step d5cebe6f-6ef0-4d7b-9977-175025c353e0. Step input: What is 3 * 2 + 100?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: multiply
Action Input: {'a': 3, 'b': 2}
[0m[1;3;34mObservation: 6
[0m> Running step 0dce18a4-7e6d-4922-ad1e-23da69865298. Step input: None


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;38;5;200mThought: I need to use another tool to help me answer the question.
Action: add
Action Input: {'a': 6, 'b': 100}
[0m[1;3;34mObservation: 106
[0m> Running step ac672043-0776-435d-88d6-ecc6758fae4d. Step input: None
[1;3;38;5;200mThought: I can answer without using any more tools. I'll use the user's language to answer
Answer: 106
[0m

AgentChatResponse(response='106', sources=[ToolOutput(content='6', tool_name='multiply', raw_input={'args': (), 'kwargs': {'a': 3, 'b': 2}}, raw_output=6, is_error=False), ToolOutput(content='106', tool_name='add', raw_input={'args': (), 'kwargs': {'a': 6, 'b': 100}}, raw_output=106, is_error=False)], source_nodes=[], is_dummy_stream=False, metadata=None)

In [21]:
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)

from llama_index.core.tools import QueryEngineTool, ToolMetadata

In [22]:
docs = SimpleDirectoryReader(input_dir="./pdfs/").load_data()

In [23]:
docs_indices = VectorStoreIndex.from_documents(docs)

In [24]:
query_engine = docs_indices.as_query_engine(similarity_top_k=3)

In [28]:
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import os
from datetime import datetime

# Email credentials and SMTP settings
smtp_server = "smtp.gmail.com"
port = 587  # For starttls
sender_email = "lucasenkrateia@gmail.com"  # Enter your email
password = os.environ["GMAIL_PWD"]

def send_email(to, subject, body):
    """
    Send an email using the configured sender email and SMTP settings
    
    Args:
        to (str): Email address of the recipient
        subject (str): Subject line of the email
        body (str): Body text of the email
    """
    # Create message
    message = MIMEMultipart()
    message["From"] = sender_email
    message["To"] = to
    message["Subject"] = subject
    message.attach(MIMEText(body, "plain"))

    # Send email
    try:
        server = smtplib.SMTP(smtp_server, port)
        server.starttls()  # Secure the connection
        server.login(sender_email, password)
        server.sendmail(sender_email, to, message.as_string())
        print("Email sent successfully!")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        server.quit()

In [29]:
email_tool = FunctionTool.from_defaults(fn=send_email)

In [30]:
query_engine_tools = [
    QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name="lora_paper",
            description=(
                "Provides information about an AI paper regarding LoRA technique for efficient fine-tuning of LLMs. "
                "Use a detailed plain text question as input to the tool."
            ),
        ),
    ),
    email_tool
]

In [31]:
agent = ReActAgent.from_tools(
    query_engine_tools,
    llm=llm,
    verbose=True,
)

In [27]:
response = agent.chat("What is the LoRA technique for efficient fine-tuning of LLMs?")
print(str(response))


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


> Running step 5f99ef85-c519-44cd-b90e-ae2c5b4d1bd0. Step input: What is the LoRA technique for efficient fine-tuning of LLMs?
[1;3;38;5;200mThought: The current language of the user is English. I need to use a tool to help me answer the question.
Action: lora_paper
Action Input: {'input': 'LoRA technique for efficient fine-tuning of LLMs'}
[0m

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;34mObservation: The LoRA technique allows for efficient fine-tuning of Language Model Models (LLMs) by not requiring the accumulated gradient update to weight matrices to have full-rank during adaptation. This means that by setting the LoRA rank to the rank of the pre-trained weight matrices, LoRA roughly recovers the expressiveness of full fine-tuning. Additionally, when deployed in production, LoRA introduces no additional inference latency as the computation and storage of W=W0+BA can be done explicitly.
[0m> Running step 0d3fba52-2b6a-4703-be1a-56d250add0a4. Step input: None
[1;3;38;5;200mThought: I can answer without using any more tools. I'll use the user's language to answer.
Answer: The LoRA technique allows for efficient fine-tuning of Language Model Models (LLMs) by not requiring the accumulated gradient update to weight matrices to have full-rank during adaptation. This means that by setting the LoRA rank to the rank of the pre-trained weight matrices, LoRA roughly r

In [34]:
response = agent.chat("1. Write a simple summary of the LoRA technique in bullet points, and then 2. send that to lucasbnsoares@hotmail.com")
print(str(response))


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


> Running step 481ac884-e930-40fc-aed4-b106deddf521. Step input: 1. Write a simple summary of the LoRA technique in bullet points, and then 2. send that to lucasbnsoares@hotmail.com


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;38;5;200mThought: I need to write a simple summary of the LoRA technique in bullet points.
Action: None
Action Input: {'to': 'lucasbnsoares@hotmail.com', 'subject': 'Summary of LoRA Technique', 'body': '• LoRA is a method for efficient fine-tuning of Language Model Models (LLMs).\n• It allows for the recovery of the expressiveness of full fine-tuning by setting the LoRA rank to the rank of the pre-trained weight matrices.\n• It introduces no additional inference latency and provides a more efficient way to fine-tune LLMs compared to other methods.'}
[0m[1;3;34mObservation: Error: No such tool named `None`.
[0m> Running step 2f452ab7-4b65-4ed3-a0af-3733400e3185. Step input: None
[1;3;38;5;200mThought: I need to use the send_email tool to send the summary to lucasbnsoares@hotmail.com.
Action: send_email
Action Input: {'to': 'lucasbnsoares@hotmail.com', 'subject': 'Summary of LoRA Technique', 'body': '• LoRA is a method for efficient fine-tuning of Language Model Models (LLMs).\

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Email sent successfully!
[1;3;34mObservation: None
[0m> Running step 82de452d-1b19-4e34-ae57-44d2374cb38e. Step input: None
[1;3;38;5;200mThought: I need to use the lora_paper tool to get information about the LoRA technique.
Action: lora_paper
Action Input: {'input': 'LoRA technique for efficient fine-tuning of LLMs'}
[0m

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;34mObservation: The LoRA technique allows for efficient fine-tuning of Language Model Models (LLMs) by not requiring the accumulated gradient update to weight matrices to have full-rank during adaptation. This means that by applying LoRA to all weight matrices and training all biases, it roughly recovers the expressiveness of full fine-tuning by setting the LoRA rank to the rank of the pre-trained weight matrices. As the number of trainable parameters increases, training LoRA roughly converges to training the original model, providing a more efficient way to fine-tune LLMs compared to other methods like adapter-based or prefix-based approaches. Additionally, when deployed in production, LoRA introduces no additional inference latency, allowing for seamless computation and storage of the updated weights for inference.
[0m> Running step 6bd15ae4-9139-4918-b9e3-753a18724275. Step input: None
[1;3;38;5;200mThought: I can answer without using any more tools. I'll use the user's lang

In [35]:
response = agent.chat("""
I need you to think step by step and perform these actions: 
1. Retrieve information from the lora_paper tool, 
2. write a simple summary of the LoRA technique in bullet points, 
3. send that to lucasbnsoares@hotmail.com
""")
print(str(response))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


> Running step 51585cf4-fc5e-4510-a952-c7c3af9e2f82. Step input: 
I need you to think step by step and perform these actions: 
1. Retrieve information from the lora_paper tool, 
2. write a simple summary of the LoRA technique in bullet points, 
3. send that to lucasbnsoares@hotmail.com

[1;3;38;5;200mThought: I need to follow the steps you provided. First, I'll use the lora_paper tool to retrieve information about the LoRA technique.
Action: lora_paper
Action Input: {'input': 'LoRA technique for efficient fine-tuning of LLMs'}
[0m

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;34mObservation: The LoRA technique allows for efficient fine-tuning of Language Model Models (LLMs) by not requiring the accumulated gradient update to weight matrices to have full-rank during adaptation. This means that by setting the LoRA rank to the rank of the pre-trained weight matrices, LoRA roughly recovers the expressiveness of full fine-tuning. Additionally, when deployed in production, LoRA introduces no additional inference latency as it allows for explicit computation and storage of the modified weight matrices for efficient inference.
[0m> Running step f64765a8-68cf-4fcc-bc41-b0782a8f7f0a. Step input: None


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;38;5;200mThought: Now that I have the information about the LoRA technique, I'll write a simple summary in bullet points.
Action: None
Action Input: {'to': 'lucasbnsoares@hotmail.com', 'subject': 'Summary of LoRA Technique', 'body': 'The LoRA technique allows for efficient fine-tuning of Language Model Models (LLMs) by not requiring the accumulated gradient update to weight matrices to have full-rank during adaptation. This means that by setting the LoRA rank to the rank of the pre-trained weight matrices, LoRA roughly recovers the expressiveness of full fine-tuning. Additionally, when deployed in production, LoRA introduces no additional inference latency as it allows for explicit computation and storage of the modified weight matrices for efficient inference.'}
[0m[1;3;34mObservation: Error: No such tool named `None`.
[0m> Running step 89576c60-dc1c-40a2-9694-41c4dc9e676c. Step input: None
[1;3;38;5;200mThought: I need to use the send_email tool to send the email.
Action: s

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Email sent successfully!
[1;3;34mObservation: None
[0m> Running step 231a4142-71bf-46bf-95ac-7ce94962cbb9. Step input: None
[1;3;38;5;200mThought: It seems that the send_email tool did not return any output. This might be because the email was sent successfully, but the tool does not return any output in that case. I'll assume that the email was sent successfully.

Thought: I can answer without using any more tools. I'll use the user's language to answer
Answer: The LoRA technique allows for efficient fine-tuning of Language Model Models (LLMs) by not requiring the accumulated gradient update to weight matrices to have full-rank during adaptation. This means that by setting the LoRA rank to the rank of the pre-trained weight matrices, LoRA roughly recovers the expressiveness of full fine-tuning. Additionally, when deployed in production, LoRA introduces no additional inference latency as it allows for explicit computation and storage of the modified weight matrices for efficient inf

![](./assets-resources/2024-12-02-18-53-16.png)