In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema import Document
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnableParallel
from dotenv import load_dotenv
import os

# Load environment variables from ./app/.env (resolved against the current
# working directory) before any OpenAI client is constructed.
app_dir = os.path.join(os.getcwd(), "app")
load_dotenv(os.path.join(app_dir, ".env"))

# Read every .txt file below ./data into documents.
loader = DirectoryLoader(path="./data", glob="**/*.txt")
docs = loader.load()

# Split the documents into small, slightly overlapping character-based chunks.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=20,
    chunk_size=120,
)
chunks = text_splitter.split_documents(docs)

# Embedding model for the vector store and chat model for answering.
embedding_function = OpenAIEmbeddings()
model = ChatOpenAI()

# Index the chunks in Chroma and expose the store as a retriever.
db = Chroma.from_documents(documents=chunks, embedding=embedding_function)
retriever = db.as_retriever()

In [None]:
def format_docs(docs: list[Document]):
    """Return the text of all documents as one string, one document per line."""
    texts = [document.page_content for document in docs]
    return "\n".join(texts)

In [None]:
# Prompt that restricts the model to the supplied context.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# Answer branch: flatten the retrieved documents to text, fill the prompt,
# call the chat model and return its reply as a plain string.
rag_chain_from_docs = (
    RunnablePassthrough.assign(
        context=lambda state: format_docs(state["context"])
    )
    | prompt
    | model
    | StrOutputParser()
)

# Entry point: retrieve documents for the question, keep the question itself,
# then add the generated answer under "answer".
rag_chain_with_source = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

In [None]:
# Run the baseline chain; the returned dict holds "context", "question" and "answer".
result = rag_chain_with_source.invoke(input="Who is the owner of the restaurant")

In [7]:
result

{'context': [Document(page_content='Creating Chef Amico’s Restaurant', metadata={'source': 'data\\founder.txt'}),
  Document(page_content="into Chef Amico. Her mission was to uncover the secret behind the restaurant's growing fame. She was greeted by Amico", metadata={'source': 'data\\restaurant.txt'}),
  Document(page_content='One evening, as the sun cast a golden glow over the city, a renowned food critic, Elena Rossi, stepped into Chef Amico.', metadata={'source': 'data\\restaurant.txt'}),
  Document(page_content='and relish life’s simple pleasures. His restaurant was a haven where strangers became friends over plates of arancini', metadata={'source': 'data\\founder.txt'})],
 'question': 'Who is the owner of the restaurant',
 'answer': 'Chef Amico'}

### Why is that approach bad?

The retriever always returns the top-k documents and all of them are passed to the model,
no matter how relevant each document actually is to the question.

In [9]:
from sentence_transformers import CrossEncoder

# Load the cross-encoder that the following cells use to score (question, passage) pairs.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [10]:
# Take the documents the baseline chain retrieved and extract their raw text.
# NOTE: this rebinds `docs`, which until now held the files loaded from ./data.
docs = result["context"]
contents = [doc.page_content for doc in docs]

In [11]:
contents

['Creating Chef Amico’s Restaurant',
 "into Chef Amico. Her mission was to uncover the secret behind the restaurant's growing fame. She was greeted by Amico",
 'One evening, as the sun cast a golden glow over the city, a renowned food critic, Elena Rossi, stepped into Chef Amico.',
 'and relish life’s simple pleasures. His restaurant was a haven where strangers became friends over plates of arancini']

In [12]:
# Pair the question that was actually asked with each retrieved passage.
# Reading it from `result` keeps the pairs in sync with the chain invocation
# instead of repeating the question literal a second time.
pairs = [[result["question"], text] for text in contents]

In [13]:
pairs

[['Who is the owner of the restaurant', 'Creating Chef Amico’s Restaurant'],
 ['Who is the owner of the restaurant',
  "into Chef Amico. Her mission was to uncover the secret behind the restaurant's growing fame. She was greeted by Amico"],
 ['Who is the owner of the restaurant',
  'One evening, as the sun cast a golden glow over the city, a renowned food critic, Elena Rossi, stepped into Chef Amico.'],
 ['Who is the owner of the restaurant',
  'and relish life’s simple pleasures. His restaurant was a haven where strangers became friends over plates of arancini']]

In [14]:
# Score every (question, passage) pair with the cross-encoder. The following
# cells sort these scores in descending order, i.e. treat higher as more relevant.
scores = cross_encoder.predict(pairs)
scores

array([-3.4635909, -3.4789827, -4.978613 , -5.3002462], dtype=float32)

In [15]:
# Attach each score to its document and order best-first.
# Sort on the score only: without a key, two equal scores would make Python
# compare the Document objects themselves, which raises a TypeError.
scored_docs = zip(scores, docs)
sorted_docs = sorted(scored_docs, key=lambda pair: pair[0], reverse=True)
sorted_docs

[(-3.4635909,
  Document(page_content='Creating Chef Amico’s Restaurant', metadata={'source': 'data\\founder.txt'})),
 (-3.4789827,
  Document(page_content="into Chef Amico. Her mission was to uncover the secret behind the restaurant's growing fame. She was greeted by Amico", metadata={'source': 'data\\restaurant.txt'})),
 (-4.978613,
  Document(page_content='One evening, as the sun cast a golden glow over the city, a renowned food critic, Elena Rossi, stepped into Chef Amico.', metadata={'source': 'data\\restaurant.txt'})),
 (-5.3002462,
  Document(page_content='and relish life’s simple pleasures. His restaurant was a haven where strangers became friends over plates of arancini', metadata={'source': 'data\\founder.txt'}))]

In [16]:
# Keep only the documents of the two best-scoring pairs.
reranked_docs = [pair[1] for pair in sorted_docs[:2]]
reranked_docs

[Document(page_content='Creating Chef Amico’s Restaurant', metadata={'source': 'data\\founder.txt'}),
 Document(page_content="into Chef Amico. Her mission was to uncover the secret behind the restaurant's growing fame. She was greeted by Amico", metadata={'source': 'data\\restaurant.txt'})]

### Integrate that in LCEL

In [17]:
# Rebind `retriever` to fetch 10 candidates per query (the earlier result
# contained 4), so the reranker has a larger pool to order.
retriever = db.as_retriever(search_kwargs={"k": 10})

In [19]:
from sentence_transformers import CrossEncoder
from langchain_core.runnables import RunnableLambda


# Cross-encoder instances keyed by model name, so the model is loaded at most
# once per kernel session instead of on every chain invocation.
_CROSS_ENCODERS = {}


def _load_cross_encoder(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"):
    """Return the CrossEncoder for ``model_name``, creating it on first use."""
    if model_name not in _CROSS_ENCODERS:
        _CROSS_ENCODERS[model_name] = CrossEncoder(model_name)
    return _CROSS_ENCODERS[model_name]


def rerank_documents(input_data):
    """Order retrieved documents by cross-encoder relevance to the question.

    Args:
        input_data: Mapping with the keys ``"question"`` (the query) and
            ``"context"`` (the retrieved documents, each exposing
            ``page_content``).

    Returns:
        A list with the same documents, highest-scoring first. An empty list
        is returned when there are no documents to rank.
    """
    query = input_data["question"]
    docs = input_data["context"]
    if not docs:
        # Nothing to score; avoid loading the model for an empty batch.
        return []

    pairs = [(query, doc.page_content) for doc in docs]
    scores = _load_cross_encoder().predict(pairs)

    # Sort on the score only so ties never fall back to comparing documents.
    ranked = sorted(zip(scores, docs), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked]


# Prompt that restricts the model to the supplied context.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
model = ChatOpenAI()

# Answer branch: rerank the retrieved documents, then flatten them to plain
# text with format_docs before filling the prompt. Without that step the
# prompt would receive the repr of a list of Document objects instead of
# their text, unlike the baseline chain.
rag_chain_from_docs = (
    RunnablePassthrough.assign(
        context=RunnableLambda(rerank_documents) | format_docs
    )
    | prompt
    | model
    | StrOutputParser()
)

# Entry point: retrieve documents for the question, keep the question itself,
# then add the generated answer under "answer".
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [20]:
# Run the reranking chain. "context" in the result is the retriever output as
# produced by the parallel step; the reranked order is only used inside the
# answer branch.
result = rag_chain_with_source.invoke(input="Who is the owner of the restaurant")
result

{'context': [Document(page_content='Creating Chef Amico’s Restaurant', metadata={'source': 'data\\founder.txt'}),
  Document(page_content="into Chef Amico. Her mission was to uncover the secret behind the restaurant's growing fame. She was greeted by Amico", metadata={'source': 'data\\restaurant.txt'}),
  Document(page_content='One evening, as the sun cast a golden glow over the city, a renowned food critic, Elena Rossi, stepped into Chef Amico.', metadata={'source': 'data\\restaurant.txt'}),
  Document(page_content='and relish life’s simple pleasures. His restaurant was a haven where strangers became friends over plates of arancini', metadata={'source': 'data\\founder.txt'}),
  Document(page_content='the restaurant quickly gained fame for its authentic flavors and Amico’s innovative twists on traditional recipes.', metadata={'source': 'data\\founder.txt'}),
  Document(page_content='young chefs, shares his knowledge at culinary workshops, and supports local farmers and producers.', met

### LLM-based Document Compressor

In [21]:
from langchain.prompts import PromptTemplate

# Prompt asking the model to judge whether one document is suited to answer
# one question; it takes the variables "document" and "question" and asks for
# a concluding 'True' or 'False'.
DOCUMENT_EVALUATOR_PROMPT = PromptTemplate(
    input_variables=["document", "question"],
    template="""You are an AI language model assistant. Your task is to evaluate the provided document to determine if it is suited to answer the given user question. Assess the document for its relevance to the question, the completeness of information, and the accuracy of the content.

    Original question: {question}
    Document for Evaluation: {document}
    Evaluation Result: <<'True' if the document is suited to answer the question, 'False' if it is not>>

    Note: Conclude with a 'True' or 'False' based on your analysis of the document's relevance, completeness, and accuracy in relation to the question.""",
)

In [22]:
from langchain.schema import Document

# Three hand-written sample documents; only the first one mentions the owner.
documents = [
    Document(page_content="The owner is Guivanni"),
    Document(page_content="Pizza Salami costs 10$"),
    Document(page_content="We close the restaurant at 10p.m each day"),
]

# Evaluation chain: fill the evaluator prompt, call the chat model, return
# the reply as a plain string.
model = ChatOpenAI()
compression_chain = DOCUMENT_EVALUATOR_PROMPT | model | StrOutputParser()

In [24]:
# Evaluate the second sample document against the question. Pass the
# document's text rather than the Document object, so the prompt contains the
# passage itself and matches what evaluate_documents sends to the same prompt.
compression_chain.invoke(
    {
        "question": "Who is the owner of the restaurant",
        "document": documents[1].page_content,
    }
)

'False'

### Now let's make that dynamic

In [25]:
def evaluate_documents(input: dict):
    """Keep only the documents an LLM judges suited to answer the question.

    Each document is evaluated on its own with the evaluator prompt; the
    verdict for every document is printed as it is produced.

    Args:
        input: Mapping with the keys ``"documents"`` (objects exposing
            ``page_content``) and ``"question"``. A missing or ``None``
            ``"documents"`` entry is treated as an empty list.

    Returns:
        The documents whose verdict was 'True', in their original order.
    """
    documents = input.get("documents") or []
    question = input.get("question")
    if not documents:
        # Nothing to judge: skip building the prompt and the model client.
        return []

    DOCUMENT_EVALUATOR_PROMPT = PromptTemplate(
        input_variables=["document", "question"],
        template="""You are an AI language model assistant. Your task is to evaluate the provided document to determine if it is suited to answer the given user question. Assess the document for its relevance to the question, the completeness of information, and the accuracy of the content.

        Original question: {question}
        Document for Evaluation: {document}
        Evaluation Result: <<'True' if the document is suited to answer the question, 'False' if it is not>>

        Note: Conclude with a 'True' or 'False' based on your analysis of the document's relevance, completeness, and accuracy in relation to the question.""",
    )
    model = ChatOpenAI()
    compression_chain = DOCUMENT_EVALUATOR_PROMPT | model | StrOutputParser()

    filtered_documents = []
    for document in documents:
        evaluation_result = compression_chain.invoke(
            {"document": document.page_content, "question": question}
        )
        result = _is_positive_verdict(evaluation_result)
        print(result)
        if result:
            filtered_documents.append(document)

    return filtered_documents


def _is_positive_verdict(evaluation_result):
    """Return True when the last 'True'/'False' word in the reply is 'True'.

    The prompt asks the model to conclude with its verdict, so the reply may
    carry surrounding text, quotes, punctuation or different casing. A reply
    without any verdict word counts as False.
    """
    import re

    verdicts = re.findall(r"\b(true|false)\b", evaluation_result, flags=re.IGNORECASE)
    return bool(verdicts) and verdicts[-1].lower() == "true"

In [26]:
# Sample payload in the shape evaluate_documents expects: the documents to
# judge plus the question they are judged against.
_input = {
    "documents": [
        Document(page_content="The owner is Guivanni"),
        Document(page_content="Pizza Salami costs 10$"),
        Document(page_content="We close the restaurant at 10p.m each day"),
    ],
    "question": "Who is the owner of the restaurant?",
}

# Prints one verdict per document (from inside the function), then the
# documents that were kept.
results = evaluate_documents(_input)
print(results)

True
False
False
[Document(page_content='The owner is Guivanni')]
