In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os

# Load environment variables from app/.env, resolved relative to the current
# working directory (presumably the OpenAI credentials — confirm).
app_dir = os.path.join(os.getcwd(), "app")
load_dotenv(os.path.join(app_dir, ".env"))

# Read every .txt file below ./data (recursive glob) into documents.
loader = DirectoryLoader("./data", glob="**/*.txt")
docs = loader.load()

# Split the loaded documents into small overlapping pieces, measured in
# characters (length_function=len).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=120,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
chunks = text_splitter.split_documents(docs)

embedding_function = OpenAIEmbeddings()
model = ChatOpenAI()

# Index the chunks rather than the unsplit documents: the splitter output was
# previously computed but never used, so whole files were embedded instead.
db = Chroma.from_documents(chunks, embedding_function)
retriever = db.as_retriever()

In [None]:
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda
import re

# Example user question.
# NOTE(review): `query` is not referenced by any other cell visible here —
# confirm whether the later `invoke` calls were meant to use it.
query = "Who owns the restaurant?"


# Prompt that asks the model to produce five alternative phrasings of the
# user's question, each wrapped in << >> delimiters, for multi-query retrieval.
# The single template variable is `question`.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search.
    Provide these alternative question like this:
    <<question1>>
    <<question2>>
    Only provide the query, no numbering.
    Original question: {question}""",
)


def split_and_clean_text(input_text):
    """Extract the ``<<...>>``-delimited items from a model response.

    Only the text *inside* each ``<<`` ... ``>>`` pair is returned, so labels
    or other chatter placed outside the delimiters (for example
    ``Hypothetical Answer 1:``) do not end up in the result. An opening
    ``<<`` that is never closed captures everything up to the end of the text.

    If the response contains no ``<<`` at all, the text is split on any stray
    ``>>`` markers instead, so an undelimited answer is still returned rather
    than silently dropped.

    Args:
        input_text: Raw string produced by the language model.

    Returns:
        A list of non-empty items with surrounding whitespace removed, in the
        order they appear in ``input_text``.
    """
    # Non-greedy capture up to the closing ">>", or to end-of-text when the
    # final item was left unterminated; DOTALL lets an item span lines.
    delimited = re.findall(r"<<(.*?)(?:>>|\Z)", input_text, flags=re.DOTALL)
    candidates = delimited if delimited else re.split(r"<<|>>", input_text)
    return [item.strip() for item in candidates if item.strip()]

In [None]:
# Question-rewriting pipeline: prompt -> chat model -> plain string -> list of
# rewritten questions.
model = ChatOpenAI()
rephrase_chain = QUERY_PROMPT.pipe(
    model,
    StrOutputParser(),
    RunnableLambda(split_and_clean_text),
)

In [None]:
# Generate the alternative phrasings; the prompt's only variable is `question`.
list_of_questions = rephrase_chain.invoke(
    {"question": "Who is the owner of the restaurant"}
)

In [None]:
# Run one retrieval per generated question, giving one result list per question.
# `invoke` is the retriever's Runnable entry point and replaces the deprecated
# `get_relevant_documents`.
docs = [retriever.invoke(question) for question in list_of_questions]

In [None]:
def flatten_and_unique_documents(documents):
    """Flatten nested retrieval results and drop repeated documents.

    Args:
        documents: An iterable of iterables of objects exposing a
            ``page_content`` attribute.

    Returns:
        A flat list holding the first document seen for each distinct
        ``page_content`` value, in order of first appearance.
    """
    # dicts keep insertion order, so the first occurrence of each content wins
    # and the original ordering is preserved.
    first_by_content = {}
    for batch in documents:
        for document in batch:
            first_by_content.setdefault(document.page_content, document)
    return list(first_by_content.values())

In [None]:
# Show the de-duplicated documents retrieved across all generated questions.
flatten_and_unique_documents(docs)

In [None]:
# Prompt that asks the model for five hypothetical answers to the user's
# question. The requested format wraps each answer in << >> delimiters and
# places a "Hypothetical Answer N:" label in front of each pair. The single
# template variable is `question`.
HYDE_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five hypothetical answers to the user's query. These answers should offer diverse perspectives or interpretations, aiding in a comprehensive understanding of the query. Present the hypothetical answers as follows:

    Hypothetical Answer 1: <<Answer considering a specific perspective>>
    Hypothetical Answer 2: <<Answer from a different angle>>
    Hypothetical Answer 3: <<Answer exploring an alternative possibility>>
    Hypothetical Answer 4: <<Answer providing a contrasting viewpoint>>
    Hypothetical Answer 5: <<Answer that includes a unique insight>>

    Note: Present only the hypothetical answers, without numbering, to provide a range of potential interpretations or solutions related to the query.
    Original question: {question}""",
)

In [None]:
# Hypothetical-answer pipeline: prompt -> chat model -> plain string -> list of
# extracted items.
hyde_chain = HYDE_PROMPT.pipe(
    model,
    StrOutputParser(),
    RunnableLambda(split_and_clean_text),
)

In [None]:
# Generate the hypothetical answers (stored under the same name the retrieval
# cell reads) and display them as the cell output.
list_of_questions = hyde_chain.invoke(
    {"question": "Who is the owner of the restaurant"}
)
list_of_questions

In [None]:
# Use each generated item as a retrieval query, then show the de-duplicated
# union of the results. `invoke` replaces the deprecated
# `get_relevant_documents`.
docs = [retriever.invoke(item) for item in list_of_questions]
flatten_and_unique_documents(documents=docs)