In [3]:
from dotenv import load_dotenv
import os

## [Overview](markdown/overview.md)

### [Simple RAG query](markdown/simple-rag-query.md)

In [24]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import OllamaEmbeddings, ChatOllama

#### INDEXING ####

# Load Documents
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

embedding_function = OllamaEmbeddings(model='mxbai-embed-large')

current_dir = os.getcwd() 
persistent_directory = os.path.join(current_dir, "db", "chroma_db")

# Embed
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=embedding_function, persist_directory=persistent_directory)

vectorstore.persist()


retriever = vectorstore.as_retriever()

#### RETRIEVAL and GENERATION ####

# Prompt
prompt = hub.pull("rlm/rag-prompt")

# LLM
#llm = ChatOllama(model="qwen2.5:32b-instruct-q4_K_M")
llm = ChatOllama(model="llama3.1")

# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Question
rag_chain.invoke("What is Task Decomposition?")

'Task Decomposition is a technique that breaks down complex tasks into smaller, simpler steps. It involves instructing a model to "think step by step" to utilize more test-time computation and transform big tasks into multiple manageable tasks. This technique enhances model performance on complex tasks.'

## Second Example
### In this instance we are using an already indexed vector db
### The 'chroma_db' persisted db contains chunks from a text file of the odyssey 
### We ask Who is Odysseus' wife?

In [None]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import OllamaEmbeddings, ChatOllama

embedding_function = OllamaEmbeddings(model='mxbai-embed-large')

# Get the current directory
current_dir = os.getcwd()

# Get the directory above the current directory
parent_dir = os.path.dirname(current_dir)

persistent_directory = os.path.join(parent_dir, "db", "chroma_db")

db = Chroma(persist_directory=persistent_directory,embedding_function=embedding_function)

bookretriever = db.as_retriever()

#### RETRIEVAL and GENERATION ####

# Prompt
prompt = hub.pull("rlm/rag-prompt")

# LLM
llm = ChatOllama(model="llama3.1")

# Chain
rag_chain = (
    # {"context": retriever | format_docs, "question": RunnablePassthrough()}
    {"context": bookretriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Question
rag_chain.invoke("Who is Odysseus' wife?")

"Penelope is Odysseus' wife. She is described as a very admirable woman with an excellent nature. When Agamemnon visits, he praises Penelope's goodness, saying that she would not murder her husband."

## Part 3: Retrieval
In the following code sample we can restrict the number of documents returned by setting the {"k": 1} arguement - in this case just one

In [8]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

docs = retriever.get_relevant_documents("What is Task Decomposition?")

len(docs)

1

## Generation
### Simple example without a retriever 
A Chat prompt is given and piped into the llm with the previosly created docs

In [10]:
from langchain.prompts import ChatPromptTemplate
# Prompt
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'), additional_kwargs={})]


In [11]:
# LLM
llm = ChatOllama(model="llama3.1")

In [12]:
# Chain
chain = prompt | llm

In [13]:
# Run
chain.invoke({"context":docs,"question":"What is Task Decomposition?"})

AIMessage(content='Task decomposition refers to the process of breaking down complex tasks into smaller, more manageable subtasks that can be executed independently or in a specific order to achieve the overall goal.\n\nIn the context of LLM-powered autonomous agents, task decomposition is crucial for enabling these agents to tackle complex problems and interact with external components effectively. By decomposing tasks into smaller units, these agents can better navigate the complexity of their environment, make more informed decisions, and adapt to unexpected situations more efficiently.\n\nTask decomposition involves various steps, including:\n\n1.  **Task Analysis**: Identifying the key elements required for a specific task.\n2.  **Subtask Definition**: Breaking down the main task into smaller, specific subtasks that can be executed independently or in sequence.\n3.  **Resource Allocation**: Assigning resources to each subtask based on its complexity and requirements.\n4.  **Task E

### Using a prompt from langchain hub

In [14]:
from langchain import hub
prompt_hub_rag = hub.pull("rlm/rag-prompt")

In [15]:
print(prompt_hub_rag)

input_variables=['context', 'question'] input_types={} partial_variables={} metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


In [16]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What is Task Decomposition?")

'Task Decomposition is a technique that involves breaking down complicated tasks into multiple manageable tasks, making it easier for the agent to plan ahead and perform the task efficiently. According to the text, this is achieved through the "Chain of Thought" (CoT) prompting technique, which instructs the model to think step by step and utilize more test-time computation to decompose hard tasks into smaller and simpler steps.'

## [Query Transformations](markdown/query-transformations.md)

## [Part 5: Multi Query](markdown/multi-query.md)

In [25]:
from langchain.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama

llm = ChatOllama(model="qwen2.5:32b-instruct-q4_K_M")

# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_queries = (
    prompt_perspectives 
    | llm
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)


In [26]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return
    return [loads(doc) for doc in unique_docs]

# Retrieve
question = "What is task decomposition for LLM agents?"
retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question":question})
len(docs)

1

In [27]:
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatOllama(model="qwen2.5:32b-instruct-q4_K_M")

final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})

'Task decomposition in the context of Large Language Model (LLM) agents refers to breaking down a complex task into smaller, more manageable sub-tasks. This process helps the agent to approach complicated tasks systematically by using a "chain of thought" (CoT) prompting technique. The CoT method instructs the model to think step by step, decomposing harder tasks into simpler components that are easier to handle individually. This not only aids in executing complex tasks more effectively but also provides insights into the reasoning and decision-making process of the LLM agent.'

## Second multi query example

In [30]:
from langchain.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama


llm = ChatOllama(model="llama3.1")

# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser

generate_queries = (
    prompt_perspectives 
    | llm
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return
    return [loads(doc) for doc in unique_docs]

# Retrieve
question = "How did Odysseus get home?"
retrieval_chain = generate_queries | bookretriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question":question})
len(docs)

from operator import itemgetter

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatOllama(model="llama3.1")

final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})


'According to the text, it seems that the gods have decreed that Odysseus will return home in a perilous voyage of 20 days on a raft. The specific details of how he got home are not provided in the snippets you shared, but this passage from document #32449cf9-fefa-4e5e-ba09-7207b3620fc2 suggests that he was to be convoyed neither by gods nor men and would have to make his way back on a raft.'

## [Part 6: RAG-Fusion](markdown/rank-fusion.md)

In [35]:
# RAG-Fusion: Related
template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)
question = "What is task decomposition for LLM agents?"

In [36]:
from langchain_core.output_parsers import StrOutputParser

generate_queries = (
    prompt_rag_fusion 
    | llm
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [37]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    """ Reciprocal_rank_fusion that takes multiple lists of ranked documents 
        and an optional parameter k used in the RRF formula """
    
    # Initialize a dictionary to hold fused scores for each unique document
    fused_scores = {}

    # Iterate through each list of ranked documents
    for docs in results:
        # Iterate through each document in the list, with its rank (position in the list)
        for rank, doc in enumerate(docs):
            # Convert the document to a string format to use as a key (assumes documents can be serialized to JSON)
            doc_str = dumps(doc)
            # If the document is not yet in the fused_scores dictionary, add it with an initial score of 0
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            # Retrieve the current score of the document, if any
            previous_score = fused_scores[doc_str]
            # Update the score of the document using the RRF formula: 1 / (rank + k)
            fused_scores[doc_str] += 1 / (rank + k)

    # Sort the documents based on their fused scores in descending order to get the final reranked results
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

    # Return the reranked results as a list of tuples, each containing the document and its fused score
    return reranked_results

retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
docs = retrieval_chain_rag_fusion.invoke({"question": question})
len(docs)

3

In [38]:

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    {"context": retrieval_chain_rag_fusion, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})

'Task decomposition, specifically Chain of Thought (CoT), has become a standard prompting technique for enhancing model performance on complex tasks. It instructs the model to "think step by step" to utilize more test-time computation and decompose hard tasks into smaller and simpler steps. This process transforms big tasks into multiple manageable tasks and sheds light into an interpretation of the model\'s thinking process.'

### Second Rank Fusion Example

In [39]:
question = "Who was the wife of Odysseus?"
retrieval_chain_rag_fusion = generate_queries | bookretriever.map() | reciprocal_rank_fusion
docs = retrieval_chain_rag_fusion.invoke({"question": question})
len(docs)

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    {"context": retrieval_chain_rag_fusion, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})

'According to the text, the answer is Penelope. She is mentioned in several passages as the wife of Odysseus (also referred to as Ulysses) and is concerned about her future with the suitors who are vying for her hand in marriage while Odysseus is away at war.'

## [Part 7: Decomposition](markdown/decomposition.md)

In [40]:
# Decomposition
template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation. \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""
prompt_decomposition = ChatPromptTemplate.from_template(template)

In [41]:
llm = ChatOllama(model="llama3.1")
# Chain
generate_queries_decomposition = ( prompt_decomposition | llm | StrOutputParser() | (lambda x: x.split("\n")))

# Run
question = "What are the main components of an LLM-powered autonomous agent system?"
questions = generate_queries_decomposition.invoke({"question":question})

In [42]:
print(questions)

['Based on the input question, here are three potential sub-questions with corresponding search queries:', '', '**1. What are the core technologies that enable LLMs in an autonomous agent system?**', '', 'Search query: `"LLM-powered autonomous agent system core technologies"` or `"components of a conversational AI system"`', '', 'This sub-question aims to identify the essential technologies, such as large language model architectures (e.g., transformer), inference engines, and knowledge graph databases, that power LLMs in an autonomous agent system.', '', '**2. What is the role of natural language understanding (NLU) in an LLM-powered autonomous agent system?**', '', 'Search query: `"LLM-powered autonomous agent system NLU"` or `"natural language understanding in conversational AI"`', '', 'This sub-question delves into the specific components involved in NLU, such as intent detection, entity recognition, and context understanding, which enable an LLM to comprehend user inputs.', '', "*

Answer recursively

In [43]:
# Prompt
template = """Here is the question you need to answer:

\n --- \n {question} \n --- \n

Here is any available background question + answer pairs:

\n --- \n {q_a_pairs} \n --- \n

Here is additional context relevant to the question: 

\n --- \n {context} \n --- \n

Use the above context and any background question + answer pairs to answer the question: \n {question}
"""

decomposition_prompt = ChatPromptTemplate.from_template(template)

In [44]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

def format_qa_pair(question, answer):
    """Format Q and A pair"""
    
    formatted_string = ""
    formatted_string += f"Question: {question}\nAnswer: {answer}\n\n"
    return formatted_string.strip()

llm = ChatOllama(model="llama3.1")
q_a_pairs = ""
for q in questions:
    
    rag_chain = (
    {"context": itemgetter("question") | retriever, 
     "question": itemgetter("question"),
     "q_a_pairs": itemgetter("q_a_pairs")} 
    | decomposition_prompt
    | llm
    | StrOutputParser())

    answer = rag_chain.invoke({"question":q,"q_a_pairs":q_a_pairs})
    q_a_pair = format_qa_pair(q,answer)
    q_a_pairs = q_a_pairs + "\n---\n"+  q_a_pair

In [45]:
print(answer)

Based on the provided context and additional information, I will answer the sub-question.

**What are the critical components of a Knowledge Management System (KMS) that provide the Language Model (LLM) with access to relevant information and context?**

The critical components of a KMS that provide the LLM with access to relevant information and context include:

1. **Data Ingestion**: The process of collecting, processing, and storing data from various sources, making it available for the LLM to access.
2. **Entity Linking**: The ability to identify and link entities (e.g., people, places, organizations) in unstructured text to their corresponding knowledge representations, enabling the LLM to understand relationships between entities.
3. **Knowledge Graph Construction**: The process of creating a structured representation of knowledge by linking entities, concepts, and relationships, providing the LLM with a comprehensive understanding of the subject matter.

These components enable