# Retrieval Augmented Generation using LangChain
1. Setup

In [41]:
import os
from dotenv import load_dotenv
from openai import OpenAI
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Load settings from a local .env file into the process environment
# (presumably the OpenAI API key used by the clients below — confirm).
load_dotenv()

# Raw OpenAI SDK client. NOTE(review): no other cell shown here references it.
openai_client = OpenAI()
# Chat model shared by the query-expansion step and the final answer chain.
# temperature=0 asks for the least random sampling.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

2. Load Document (My mum's book)

In [42]:
from langchain_community.document_loaders import PyPDFLoader

# Location of the book that serves as the knowledge base.
file_path = "data/book_abt_chairs.pdf"

# Stop early when the book is missing. `sys` is never imported in this
# notebook, so the former `sys.exit(1)` raised NameError instead of exiting;
# raising SystemExit directly is what sys.exit does and needs no import.
if not os.path.exists(file_path):
    print("The PDF file does not exist!!")
    raise SystemExit(1)

# Parse the PDF into Document objects (presumably one per page — confirm
# against the PyPDFLoader documentation).
loader = PyPDFLoader(file_path)
pages = loader.load()

3. Load the embedding model and store the PDF pages as vectors in memory (RAM)

In [43]:
from langchain_core.vectorstores import InMemoryVectorStore

# Embedding model used to vectorise the pages (no model name is given, so the
# library default applies).
embeddings = OpenAIEmbeddings()
# Embed every loaded page and keep the vectors in an in-process store.
vector_store = InMemoryVectorStore.from_documents(pages, embedding=embeddings)

# Retriever view over the store. NOTE(review): the cells shown here query
# `vector_store` directly and never use `retriever`.
retriever = vector_store.as_retriever()

4. Method for combining the retrieved information so it's easier to pass on to the LLM and more token-efficient

In [44]:
def format_doc(docs):
    """Merge the text of the retrieved documents into one context string.

    Documents whose ``page_content`` is empty are left out; the remaining
    texts are separated by a blank line.
    """
    non_empty_texts = []
    for document in docs:
        text = document.page_content
        if text:
            non_empty_texts.append(text)
    return "\n\n".join(non_empty_texts)

5. Query expansion. I added this method last and adjusted the rest of the code accordingly: the user's question might not match how the information is worded in the document, and expansion also helps when the user is not 100% sure what they are looking for.

In [45]:
def expand_query(question: str, llm_model) -> list[str]:
    """Ask the LLM for alternative phrasings of ``question``.

    The (Bulgarian) prompt requests five varied rephrasings. The reply is
    split on newlines and every non-blank line is returned stripped, so the
    actual number of queries depends on how the model formats its answer.
    """
    rephrase_prompt = ChatPromptTemplate.from_template(
        "Преформулирай следния въпрос в 5 разнообразни и семантично различни заявки:\n\n{question}"
    )
    pipeline = RunnablePassthrough() | rephrase_prompt | llm_model | StrOutputParser()
    raw_reply = pipeline.invoke(question)

    queries = []
    for line in raw_reply.split("\n"):
        cleaned = line.strip()
        if cleaned:  # drop blank lines between the generated queries
            queries.append(cleaned)
    return queries


6. Run a similarity search for each generated query and remove any duplicate texts.

In [46]:
from typing import List
from langchain_core.documents import Document

def retrieve_documents(question: str, k: int = 2, max_docs: int = 2) -> list[Document]:
    """Retrieve a small, de-duplicated set of pages relevant to ``question``.

    The question is rephrased with ``expand_query``, each rephrasing is run
    through ``vector_store.similarity_search``, and the hits are merged.

    Args:
        question: The user's question.
        k: Number of hits requested from the vector store per rephrased query.
        max_docs: Upper bound on the number of documents returned
            (default 2, the cap that used to be hard-coded).

    Returns:
        At most ``max_docs`` documents with distinct ``page_content``.
    """
    from itertools import zip_longest

    hits_per_query = [
        vector_store.similarity_search(q, k=k)
        for q in expand_query(question, llm)
    ]

    # Merge rank by rank (every query's best hit first, then every second
    # hit, ...). Concatenating query by query and then truncating let the
    # first rephrasing fill the whole result, so the others never mattered.
    seen = set()
    unique_docs = []
    for same_rank_hits in zip_longest(*hits_per_query):
        for doc in same_rank_hits:
            # None is zip_longest padding for queries that returned fewer hits.
            if doc is None or doc.page_content in seen:
                continue
            seen.add(doc.page_content)
            unique_docs.append(doc)

    return unique_docs[:max_docs]

7. Retrieve the info, format the docs into a single string, build the system message, and send the OpenAI request

In [47]:
from langchain_core.prompts import ChatPromptTemplate

# Chat prompt for the final answer. The system message (in Bulgarian) tells the
# model to answer ONLY from the supplied {context} and to reply with a fixed
# "cannot answer from the document" sentence when the answer is not there;
# the user message carries the raw {question}.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Отговори на въпроса САМО на базата на тази информация:\n\n{context}\n\n"
               "Ако отговорът не е намерен, кажи 'не мога да отговоря на базата на информацията от документа.'"),
    ("user", "{question}")
])

In [48]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

# End-to-end pipeline: the incoming question is fanned out into the two prompt
# variables, then the filled prompt goes to the chat model and the reply is
# reduced to plain text.
rag_chain = (
    {
        # question -> retrieved documents -> single context string
        "context": RunnableLambda(retrieve_documents) | RunnableLambda(format_doc),
        # the question itself is forwarded unchanged
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [49]:
#Wrapping this for reusability
def answer_question(question: str) -> str:
    """Run the RAG chain on ``question`` and return the model's answer text."""
    answer = rag_chain.invoke(question)
    return answer

In [50]:
# Smoke test: ask (in Bulgarian) whether the book has a topic on ergonomics
# and print the answer with an "Отговор:" ("Answer:") prefix.
question = "Има ли тема за ергономия?"
response = answer_question(question)
print("Отговор:", response)

Отговор: Да, в документа се обсъжда темата за ергономия, като се акцентира на добрата ергономия и създаването на среда, която позволява на тялото да се придвижва по различни начини. Специално се споменава концепцията за "динамично седене" и как ергономията е важна за проектирането на съвременни столове.
