In [25]:
# Import required libraries
import pandas as pd
import os 
from langchain_community.document_loaders import PyPDFLoader
import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

In [16]:
# Inputs a reader may want to tune for this cell.
PDF_PATH = "test_pdf.pdf"
CHUNK_SIZE = 200     # upper bound on chunk length handed to the splitter
CHUNK_OVERLAP = 20   # amount of text shared between neighbouring chunks

# Read the PDF from disk; `pdf` holds whatever documents the loader returns.
loader = PyPDFLoader(PDF_PATH)
pdf = loader.load()

# Cut the loaded documents into small, overlapping pieces for embedding.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
split_doc = splitter.split_documents(pdf)

In [18]:
# Embed the chunks and store them in a persistent Chroma collection.
# The API key is read from the "gemini_key" environment variable;
# os.getenv returns None when that variable is not set.
embedding = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=os.getenv("gemini_key"),
)

# Give every chunk a deterministic id (source + position). Without explicit
# ids each run of this cell stores the chunks again under fresh random ids,
# so the on-disk collection in "my_chroma_db" accumulates duplicates every
# time the cell is re-executed. With stable ids a re-run overwrites the
# existing entries instead.
# NOTE(review): entries written by earlier runs under random ids are not
# removed by this change - delete "my_chroma_db" once to start clean.
# Assumes the loader records a "source" entry in each chunk's metadata;
# "unknown" is used when it does not.
chunk_ids = [
    f"{doc.metadata.get('source', 'unknown')}:{i}"
    for i, doc in enumerate(split_doc)
]

vectordb = Chroma.from_documents(
    documents=split_doc,
    embedding=embedding,
    ids=chunk_ids,
    collection_name="myCollection",
    persist_directory="my_chroma_db",
)

In [30]:
# Retriever over the vector store; k=1 asks for a single match per query.
retriever = vectordb.as_retriever(search_kwargs={"k": 1})

# Chat model wrapper, authenticated with the key from the "gemini_key"
# environment variable (None when the variable is unset).
gemini_api_key = os.getenv("gemini_key")
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=gemini_api_key)

# Wire the retriever and the model together into a question-answering chain.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

# Ask one question and print the chain's full response.
question = "What is the main architecture proposed in this paper?"
result = qa.invoke(question)
print(result)


{'query': 'What is the main architecture proposed in this paper?', 'result': 'The main architecture proposed in this paper is the Transformer architecture.'}
