In [None]:
# --- Standard library ---
import os
import re
from typing import TypedDict, Annotated, Literal, Sequence

# --- LangChain Core ---
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage, BaseMessage
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# --- LangChain Community ---
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS

# --- LangChain OpenAI ---
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# --- LangChain Text Splitters ---
from langchain_text_splitters import RecursiveCharacterTextSplitter

# --- Other third-party ---
from dotenv import load_dotenv

In [None]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies OPENAI_API_KEY for the OpenAI
# embedding and chat clients used later in the notebook — confirm.
load_dotenv()

# **Load Research Papers**

In [None]:
# Research papers to index; paths are relative to the notebook's working directory.
pdf_paths = [
    "Research_Papers/Attentoion_is_all_you_need.pdf",
    "Research_Papers/BERT.pdf",
    "Research_Papers/RoBerta.pdf",
    "Research_Papers/ALBERT.pdf",
    "Research_Papers/DistilBERT.pdf",
    "Research_Papers/RAG.pdf",
]

# Load each PDF in turn and pool everything the loaders return into one flat list.
docs = []
for path in pdf_paths:
    docs += PyMuPDFLoader(file_path=path).load()

print(f"Loaded {len(docs)} pages from {len(pdf_paths)} PDF files")

# **Split Documents into Chunks**

In [None]:
# Split the loaded documents into overlapping pieces: chunk_size=1000 with
# chunk_overlap=200 so neighbouring chunks share some text.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = splitter.split_documents(docs)
print(f"Created {len(chunks)} chunks")

In [None]:
# chunks[0].page_content

# **Create Vector Store**

In [None]:
# Embed every chunk with the text-embedding-3-small model and build a FAISS
# vector store from the results.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectore_store = FAISS.from_documents(documents=chunks, embedding=embeddings)
print("Vector store created successfully")

In [None]:
# index_id = vectore_store.index_to_docstore_id[5]
# vectore_store.get_by_ids([index_id])

# **Create Retriever**

In [None]:
# Wrap the vector store as a retriever that runs a similarity search with k=4.
retriver = vectore_store.as_retriever(
    search_kwargs={"k": 4},
    search_type="similarity",
)

In [None]:
# Echo the retriever object so the notebook displays its repr.
retriver

In [None]:
# output = retriver.invoke("what is BERT?")
# output[0].page_content

# **Initialize LLM**

In [None]:
# Chat model used to generate answers (gpt-4o-mini at temperature 0.2).
llm = ChatOpenAI(
    temperature=0.2,
    model="gpt-4o-mini",
)

# **Create Prompt Template**

In [None]:
# Prompt that tells the model to answer only from the supplied context and gives
# it a fixed fallback sentence for when the context is insufficient.
# Placeholders: {context} and {question}.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""You are a helpful assistant.
    Answer ONLY from the provided research paper context.
    If the context is insufficient, just say "Nothing about this is mentioned in the provided context."

    Context: {context}
    
    Question: {question}
    
    Answer:""",
)

In [None]:
# question = "what is BERT?"
# retrieved_docs = retriver.invoke(question)
# retrieved_docs[0].page_content

In [None]:
# context_text = " ".join(doc.page_content for doc in retrieved_docs)
# context_text

In [None]:
# final_prompt = prompt.invoke({"context": context_text, "question": question})
# final_prompt

In [None]:
# answer = llm.invoke(final_prompt)
# answer.content

# **Build RAG Chain**

In [None]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [None]:
def format_docs(retrives_docs):
    """Merge retrieved documents into a single context string.

    Args:
        retrives_docs: Iterable of objects exposing a ``page_content``
            string attribute.

    Returns:
        The ``page_content`` values joined by single spaces, in input order.
        An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in retrives_docs]
    return " ".join(page_texts)

In [None]:
# Send the incoming input down two branches: "context" runs the retriever and
# formats its results into one string, "question" passes the input through
# unchanged.
parallel_chain = RunnableParallel(
    context=retriver | RunnableLambda(format_docs),
    question=RunnablePassthrough(),
)

In [None]:
# parallel_chain.invoke("what is BERT?")

In [None]:
# Output parser placed at the end of the chain to turn the model's response
# into a plain string.
parser = StrOutputParser()

In [None]:
# End-to-end RAG pipeline: build {context, question}, fill the prompt, call the
# chat model, then parse the response into a string.
main_chain = (
    parallel_chain
    | prompt
    | llm
    | parser
)

# **Run Traditional RAG**

In [None]:
# Ask the chain about BERT variants; the trailing expression makes the
# notebook display the returned answer.
user_query = "What are different types of BERT?"
main_chain.invoke(user_query)

In [None]:
# Ask the chain about the attention mechanism; the trailing expression makes
# the notebook display the returned answer.
user_query = "Explain the attention mechanism in transformers"
main_chain.invoke(user_query)

In [None]:
# Ask the chain about RAG itself; the trailing expression makes the notebook
# display the returned answer.
user_query = "What is RAG and how does it work?"
main_chain.invoke(user_query)