Import Necessary Modules

In [None]:
from operator import itemgetter
from helpers import get_llm, get_retriever
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import Runnable, RunnablePassthrough, RunnableParallel, RunnableLambda, chain
from langchain_core.tools import tool
from typing import Annotated
import requests
from langchain.callbacks.tracers import ConsoleCallbackHandler
from helpers import crawl
from langchain_core.documents import Document
from langchain.chains.query_constructor.base import (
    StructuredQueryOutputParser,
    get_query_constructor_prompt,
)
from langchain.chains.query_constructor.base import AttributeInfo

In [None]:
# Obtain the "gpt-4o" model object from the project helper; it is reused by the
# extraction and query-constructor chains below.
llm = get_llm("gpt-4o")

In [None]:
# Crawl the Brillar Bank site from its landing page, handing the four blog
# pages to the crawler as its ignore list.
documents = crawl(
    start_url="https://win066.wixsite.com/brillar-bank/",
    ignore_list=[
        f"https://win066.wixsite.com/brillar-bank/brillar-bank-blog-{blog_no}"
        for blog_no in range(1, 5)
    ],
)

documents

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk the crawled documents (chunk_size=2000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
docs = text_splitter.split_documents(documents)
docs

In [4]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("./Hong Leong Annual Report 2022.pdf")
pages = []
async for page in loader.alazy_load():
    pages.append(page)

loader = PyPDFLoader("./782292_JUMBO_Annual_Report_2023.pdf")
async for page in loader.alazy_load():
    pages.append(page)

pages

[Document(metadata={'source': './Hong Leong Annual Report 2022.pdf', 'page': 0}, page_content='HONG LEONG ASIA LTD.\nANNUAL REPORT 2022\nRELIABLE\nRESILIENT\nRESPONSIBLE'),
 Document(metadata={'source': './Hong Leong Annual Report 2022.pdf', 'page': 1}, page_content='CONTENTS\n01 \nCorporate Profile\n02 \nCorporate Highlights\n03 \nFinancial Highlights\n04 \nWhat We Do\n07 \nOur Vision\n08 \nChairman’s Message\n12 \nBoard of Directors17 \nManagement Team\n19 \nCEO’s Review\n25 \nInvestor Communications\n28 \nSustainability \nBoard Statement\n33 \nOperating Entities\n34 \nCorporate Directory \n35 \nCorporate  Governance Report74 \nFinancial Report\n225 \nAnalysis of Shareholdings\n227 \n5-Year Financial Summary\n228 \nNotice of  Annual General Meeting\n236 \nAdditional Information  on Directors Seeking  Re-election / Appointment at the 62nd Annual General Meeting\nProxy FormOperating NetworkCORPORATE  \nPROFILE ABOUT US\n1 F ormerly known as diesel engines\n2 https:/ /www.hongleong.com.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk the loaded PDF pages (chunk_size=2000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
docs = text_splitter.split_documents(pages)
docs

In [None]:
# Prefix every chunk with its source so the QA prompt can cite it.
# The guard keeps the cell idempotent: re-running it in the notebook would
# otherwise stack a second "SOURCE:...PASSAGE:" header onto each chunk.
for doc in docs:
    header = f'SOURCE:{doc.metadata["source"]}\nPASSAGE:'
    if not doc.page_content.startswith(header):
        doc.page_content = f"{header}{doc.page_content}"

docs

In [None]:
# Prompt asking the model for a single "key: value" metadata line per chunk.
prompt = """Analyze the data and extract the metadata. 
You should return in comma separated list. 
You should extract the following data: product (if applicable).

Example Response:
product: IPhone
\n\n
Data: {data}"""


prompt_template = ChatPromptTemplate.from_template(prompt)

extraction_chain = prompt_template | llm | StrOutputParser()

# Run the extraction on every chunk and merge the result into its metadata.
for doc in docs:
    result = extraction_chain.invoke({"data": doc.page_content})
    # Split on the first colon only: a value that itself contains ": " (or a
    # reply with no colon at all) must not abort the whole loop with a
    # ValueError after earlier LLM calls have already been paid for.
    key, separator, value = result.strip().partition(":")
    if not separator or not key.strip():
        # The reply did not follow the "key: value" shape; leave metadata as is.
        continue
    doc.metadata.update({key.strip(): value.strip()})

docs

In [None]:
from helpers import ingest_data

# Hand the annotated chunks to the project ingestion helper.
ingest_data(
    documents=docs,
    embedding_model="BAAI/bge-m3",
    index_name="metadata_test_bge",
    vector_db="chromadb",
)

In [None]:
# Describe the corpus and its single filterable attribute for the
# query-constructor prompt.
document_content_description = "Fixed Deposit Data"
metadata_field_info = [
    AttributeInfo(
        name="product",
        description="The product that the institution offers",
        type="string",
    ),
]
prompt = get_query_constructor_prompt(document_content_description, metadata_field_info)

# prompt -> LLM -> structured-query parser
output_parser = StructuredQueryOutputParser.from_components()
query_constructor = prompt | llm | output_parser

In [None]:
# Smoke-test the query constructor on a fixed-deposit question; the cell
# displays whatever the output parser returns.
query_constructor.invoke("what are the interest rates for senior fixed deposit?")

In [None]:
# Small hand-written movie corpus used to exercise metadata filtering.
# NOTE(review): five of the six entries list "Christopher Nolan" as director —
# presumably deliberate so a director filter returns several hits; confirm.
# Not every entry carries every metadata key ("genre" and "rating" are
# missing on some).
docs = [
    Document(
        page_content="A bunch of scientists bring back dinosaurs and mayhem breaks loose",
        metadata={"year": 1993, "rating": 7.7, "director": "Christopher Nolan", "genre": "science fiction"},
    ),
    Document(
        page_content="Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
        metadata={"year": 2010, "director": "Christopher Nolan", "rating": 8.2},
    ),
    Document(
        page_content="A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea",
        metadata={"year": 2006, "director": "Christopher Nolan", "rating": 8.6},
    ),
    Document(
        page_content="A bunch of normal-sized women are supremely wholesome and some men pine after them",
        metadata={"year": 2019, "director": "Christopher Nolan", "rating": 8.3},
    ),
    Document(
        page_content="Toys come alive and have a blast doing so",
        metadata={"year": 1995, "genre": "animated", "director": "Christopher Nolan"},
    ),
    Document(
        page_content="Three men walk into the Zone, three men walk out of the Zone",
        metadata={
            "year": 1979,
            "director": "Andrei Tarkovsky",
            "genre": "thriller",
            "rating": 9.9,
        },
    ),
]

In [None]:
from helpers import ingest_data

# Hand the movie documents to the project ingestion helper.
ingest_data(
    documents=docs,
    embedding_model="BAAI/bge-m3",
    index_name="metadata_test_movies",
    vector_db="qdrant",
)

In [None]:
# Filterable attributes of the movie corpus, as shown to the query constructor.
metadata_field_info = [
    AttributeInfo(
        name="genre",
        description="The genre of the movie. One of ['science fiction', 'comedy', 'drama', 'thriller', 'romance', 'action', 'animated']",
        type="string",
    ),
    AttributeInfo(name="year", description="The year the movie was released", type="integer"),
    AttributeInfo(name="director", description="The name of the movie director", type="string"),
    AttributeInfo(name="rating", description="A 1-10 rating for the movie", type="float"),
]

document_content_description = "Brief summary of a movie"
prompt = get_query_constructor_prompt(
    document_content_description,
    metadata_field_info,
    enable_limit=True,
)

# prompt -> LLM -> structured-query parser
output_parser = StructuredQueryOutputParser.from_components()
query_constructor = prompt | llm | output_parser


In [None]:
# Smoke-test the movie query constructor; the cell displays whatever the
# output parser returns.
query_constructor.invoke("Has Christopher Nolan directed any movies about women?")

In [None]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from helpers import get_vector_store_instance
from langchain_community.query_constructors.qdrant import QdrantTranslator

# Open the "metadata_test_movies" index and wrap it in a self-query retriever
# driven by the query constructor built above.
# NOTE(review): the ingestion cell for this index passes
# embedding_model="BAAI/bge-m3", while this cell asks for
# "text-embedding-3-large" with dimension=256. Unless the helper reconciles
# the two, query vectors will not match the stored vectors — confirm.
vector_store = get_vector_store_instance(embedding_model="text-embedding-3-large", dimension=256, index_name="metadata_test_movies", vector_db="qdrant")
retriever = SelfQueryRetriever(
    query_constructor=query_constructor,
    vectorstore=vector_store,
    structured_query_translator=QdrantTranslator(metadata_key="metadata"),
    verbose=True,
)

In [None]:
from pprint import pp

# Run the self-query retriever with a console tracer attached so each step of
# the run is printed; `result` holds what the retriever returned.
result = retriever.invoke("Give me the movies that are directed by Christopher Nolan.", config={"callbacks": [ConsoleCallbackHandler()]})
result

#### Define Functions and Schema
In LangChain, a tool's schema can be defined together with the function that implements it.

In [None]:
@tool("get_weather")
def get_weather(
    location: Annotated[str, "Location for the weather forecast, e.g. London, UK"]
):
    """Forecast the weather for the provided location."""
    # NOTE: the docstring above is what the model sees as the tool description,
    # so it is kept verbatim; implementation notes live in comments.
    #
    # Returns the decoded JSON body of weatherapi.com's current-conditions
    # endpoint for `location`.
    import os

    # SECURITY: an API key is committed in this notebook. Prefer supplying it
    # through the WEATHERAPI_KEY environment variable and rotate the committed
    # key; the literal only remains as a fallback so existing runs keep working.
    api_key = os.environ.get("WEATHERAPI_KEY", "777c42660156447db5842748240110")

    # `params=` URL-encodes the location (spaces, commas, '&'), which the
    # previous hand-built query string did not; `timeout` stops a stalled
    # request from hanging the whole chain.
    result = requests.get(
        "https://api.weatherapi.com/v1/current.json",
        params={"key": api_key, "q": location},
        timeout=10,
    )

    return result.json()


@tool("get_interest_rate")
def get_interest_rate(
    amount: Annotated[int, "Amount of deposit"],
    interest_rate: Annotated[float, "Interest rate percentage"],
    term: Annotated[int, "Maturity period in month"],
):
    """Interest calculation for fixed deposit."""
    # Simple interest: principal x annual rate (as a fraction) x term in years.
    rate_fraction = interest_rate / 100
    term_in_years = term / 12
    interest = amount * rate_fraction * term_in_years

    # Terms longer than 36 months get a flat 111 added on top.
    return interest + 111 if term > 36 else interest

Create Tool Collection

In [None]:
# Tools offered to the model; `tool_call` looks calls up in this list by name.
tools = [get_weather, get_interest_rate]

Initialize Retriever

In [None]:
# Retriever used by the RAG chain below (rebinds the `retriever` name that the
# self-query experiment used earlier in this notebook).
# NOTE(review): the ingestion cell above writes to index "metadata_test_bge"
# with "BAAI/bge-m3"; this reads index "metadata_test" with
# "text-embedding-3-large" — confirm that index exists and was built with the
# matching embedding model.
retriever = get_retriever(
    index_name="metadata_test",
    embedding_model="text-embedding-3-large",
    dimension=256,
    vector_db="chromadb",
    top_k=5,
)

Initialize LLM

In [None]:
# Plain model for question rewriting, plus a variant with the tool list bound
# so it can emit tool calls.
llm = get_llm("gpt-4o")
llm_with_tools = llm.bind_tools(tools)

Create Instructions for RAG Chain

In [None]:
# System prompt for the answering step; `{context}` is filled with the
# formatted retrieved passages.
# NOTE(review): the instruction asks for a "[SOURCES]:" prefix while the
# example shows "SOURCES:" — confirm which form downstream code expects.
qa_instructions = """Use tool calls if necessary. Answer the user question given the following context. Each passage has a SOURCE which is the source of the document. After your answer, leave a blank line and then give the source of the passages you answered from. Put them in a comma separated list, prefixed with [SOURCES]:. If the sources looks like file path, just extract the file name. If no sources are related, response [NO SOURCES].

    Example:

    Question: What is the meaning of life?
    Response:
    The meaning of life is 42.

    SOURCES: www.google.com

    \n\n{context}."""

In [None]:
# Prompt + chain that rewrite a follow-up question into a standalone one using
# the chat history; the chain returns the rewritten question as a string.
contextualize_instructions = """Convert the latest user question into a standalone question given the chat history. Don't answer the question, return the question and nothing else (no descriptive text)."""
contextualize_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_instructions),
        ("placeholder", "{chat_history}"),
        ("human", "{question}"),
    ]
)
contextualize_question = contextualize_prompt | llm | StrOutputParser()

# Shorter alternative system prompt without the source-citation rules, kept
# for reference:
# qa_instructions = (
#     """Use tool calls if necessary. Answer the user question given the following context.
#     \n\n{context}."""
# )
# Answering prompt: system instructions (with `{context}`) plus the question.
qa_prompt = ChatPromptTemplate.from_messages(
    [("system", qa_instructions), ("human", "{question}")]
)

Create a RAG Chain Combined with Tool Calling

In [None]:
@chain
def tool_call(input_: dict) -> Runnable:
    """Execute any tool calls requested by the model and ask it to answer.

    Args:
        input_: Mapping with ``"llm_result"`` (the model's reply to the QA
            prompt) and ``"question"`` (the user question).

    Returns:
        ``llm_result`` unchanged when it requests no tools. Otherwise a
        runnable that re-asks the question with the model's tool-call message
        and every tool response appended (a Runnable returned from a
        ``@chain`` function is invoked by LangChain with the same input).

    Raises:
        ValueError: If the model requests a tool that is not in ``tools``.
    """
    llm_result = input_.get("llm_result")
    if not llm_result.tool_calls:
        return llm_result

    followup_instruction = """Answer the question using the tool response."""
    followup_prompt = ChatPromptTemplate.from_messages(
        [("system", followup_instruction), ("human", "{question}")]
    )
    # The tool-call message has to precede the tool responses that answer it.
    followup_prompt.messages.append(llm_result)

    # Loop variable is deliberately not named `tool_call`, which would shadow
    # this function's own name.
    for requested_call in llm_result.tool_calls:
        selected_tool = next(
            (
                candidate
                for candidate in tools
                if candidate.name == requested_call["name"]
            ),
            None,
        )
        if selected_tool is None:
            # A bare StopIteration from next() would be hard to diagnose;
            # report which tool the model asked for instead.
            raise ValueError(
                f"Model requested unknown tool {requested_call['name']!r}"
            )
        tool_response = selected_tool.invoke(requested_call)
        followup_prompt.messages.append(tool_response)

    return {"question": itemgetter("question")} | followup_prompt | llm_with_tools

@chain
def contextualize_if_needed(input_: dict) -> Runnable:
    """Choose how to obtain the question for the rest of the chain.

    With a non-empty ``"chat_history"`` the question-rewriting chain is
    returned; otherwise a runnable that simply extracts ``"question"``.
    """
    has_history = bool(input_.get("chat_history"))
    if not has_history:
        return RunnablePassthrough() | itemgetter("question")
    return contextualize_question

# Pull the "question" value out of the chain input and feed it to the retriever.
retrieve_docs_chain = itemgetter("question") | retriever

def format_docs(docs):
    """Return the documents' page contents joined by blank lines."""
    passages = [f"{entry.page_content}" for entry in docs]
    return "\n\n".join(passages)

# Step 1: build the QA prompt. The dict keeps the question and turns the
# retrieved documents under "context" into one string; the parallel step then
# renders `qa_prompt` while carrying the question forward.
formatted_prompt = {
        "question": itemgetter("question") | RunnablePassthrough(),
        "context": lambda x: format_docs(x["context"]),
    } | RunnableParallel(prompt=qa_prompt, question=itemgetter('question'))


# Step 2: send the rendered prompt to the tool-enabled model, again keeping the
# question alongside the model reply for `tool_call`.
llm_result_chain = formatted_prompt | RunnableParallel(llm_result=itemgetter('prompt') | llm_with_tools, question=itemgetter("question"))

# Step 3: run requested tools (if any) and reduce the final message to text.
output_chain = llm_result_chain | tool_call | StrOutputParser()

# Full pipeline: each assign adds a key to the input dict, in order —
# "question" (possibly rewritten), then "context" (retrieved documents), then
# "answer" (final text).
final_chain = (
    RunnablePassthrough.assign(question=contextualize_if_needed)
    .assign(context=retrieve_docs_chain)
    .assign(answer=output_chain)
)

# final_chain.get_graph().print_ascii()

Invoke the Chain

In [None]:
# Ask the chain a question with an empty chat history and display the result.
question = "What is the return of equity for jumbo?"
result = final_chain.invoke({"question": question, "chat_history": []})
result

In [None]:
# Ask the chain a question with an empty chat history and display the result.
question = "What is the return on equity for Hong Leong?"
result = final_chain.invoke({"question": question, "chat_history": []})
result

In [None]:
# Ask the chain a question with an empty chat history and display the result.
question = "The attendance of Mr. Tan Cher Liang for the number of Board and Board commitee mettings held for FY2023 in jumbo."
result = final_chain.invoke({"question": question, "chat_history": []})
result

In [None]:
# Ask the chain a question with an empty chat history and display the result.
question = "The attendance of Ms. Sim Yu Juan Rachel for the number of Board and Board commitee mettings held for FY2023 in jumbo."
result = final_chain.invoke({"question": question, "chat_history": []})
result

In [None]:
# Ask the chain a question with an empty chat history and display the result.
question = "Who attended by the invitation at the number of Board and Board commitee mettings held for FY2023 in jumbo?"
result = final_chain.invoke({"question": question, "chat_history": []})
result

Get Sources

In [None]:
# Second-pass prompt: given an answer ({input}) and the retrieved passages
# ({context}), ask the model which sources the answer was drawn from.
# NOTE(review): the instruction asks for a "[SOURCES]:" prefix while the
# example shows "SOURCES:" — confirm which form downstream code expects.
new_llm = llm
prompt = """Each passage has a SOURCE which is the source of the document. After looking at the input, leave a blank line and then give the source of the passages the input is seemed to be generated from. Put them in a comma separated list, prefixed with [SOURCES]:. If the sources looks like file path, just extract the file name. If no sources are related, response [NO SOURCES].

    Example:

    Input: The meaning of life is 42.
    SOURCES: www.google.com
    
Input: {input}
Documents: {context}
"""
prompt_template = ChatPromptTemplate.from_template(prompt)

def get_chat_history(input_):
    """Render (role, content) pairs as one "role: content" line each."""
    rendered_turns = (f"{speaker}: {message}" for speaker, message in input_)
    return "\n".join(rendered_turns)

def get_context(input_):
    """Render documents as SOURCE/PASSAGE blocks separated by blank lines."""
    rendered_blocks = []
    for passage in input_:
        origin = passage.metadata["source"]
        rendered_blocks.append(f"SOURCE:{origin}\nPASSAGE:{passage.page_content}")
    return "\n\n".join(rendered_blocks)

# Chain that attributes sources to an answer: pass the answer through as
# "input", render the retrieved documents with `get_context`, fill the
# sources prompt, call the model and return its text.
result_chain = (
    {
        "input": itemgetter("input"),
        "context": itemgetter("context") | RunnableLambda(get_context),
    }
    | prompt_template
    | new_llm
    | StrOutputParser()
)

# Use the documents retrieved for the last `final_chain` run: `final_chain`
# stores them under "context" in its result. The previous code referenced a
# bare `context` name that is never defined in this notebook.
result_chain.invoke(
    {
        "context": result["context"],
        "input": result["answer"],
    }
)