## Install all packages


In [None]:
# One-time environment setup: uncomment and run the line below to install the
# packages this notebook imports. It is left commented out so that a plain
# "Run All" does not reinstall or upgrade anything.
# !pip install -q --upgrade langchain pypdf chromadb google-generativeai langchain-google-genai python-dotenv

In [3]:
import google.generativeai as genai
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [4]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment so that
# the API key can be read with os.getenv in the next cell.
load_dotenv()

True

In [5]:
# Read the Gemini API key from the environment (load_dotenv() was called in the
# previous cell, so a key stored in .env is visible here).
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

# Fail fast with a clear message: without this check a missing key is passed on
# as None and the problem only shows up at the first API call.
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

genai.configure(api_key=GOOGLE_API_KEY)

In [6]:
# Directory that is scanned for PDF files (relative to the working directory).
PDF_DIR = "./sample_data/"

# Build the directory loader, then read the PDFs it finds. `data` is a list of
# Document objects whose text lives in `.page_content`.
loader = PyPDFDirectoryLoader(PDF_DIR)
data = loader.load()

In [7]:
# Preview only the first Document: echoing the whole list would dump the full
# text of every loaded page into the cell output.
data[:1]

[Document(page_content='Retrieval-Augmented Generation for\nKnowledge-Intensive NLP Tasks\nPatrick Lewis†‡, Ethan Perez?,\nAleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,\nMike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†\n†Facebook AI Research;‡University College London;?New York University;\nplewis@fb.com\nAbstract\nLarge pre-trained language models have been shown to store factual knowledge\nin their parameters, and achieve state-of-the-art results when ﬁne-tuned on down-\nstream NLP tasks. However, their ability to access and precisely manipulate\nknowledge is still limited, and hence on knowledge-intensive tasks, their perfor-\nmance lags behind task-speciﬁc architectures. Additionally, providing provenance\nfor their decisions and updating their world knowledge remain open research prob-\nlems. Pre-trained models with a differentiable access mechanism to explicit non-\nparametric memory can overcome this issue

In [8]:
# Chunking parameters handed to the splitter.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Merge the text of every loaded Document into one string, with a blank line
# between consecutive documents.
page_texts = [str(page.page_content) for page in data]
content = "\n\n".join(page_texts)

In [9]:
# Normalise the merged text to lower case before it is chunked and embedded.
# NOTE(review): the question typed later is sent to the retriever without being
# lowercased — confirm that this mismatch is intended.
content = content.lower()

In [10]:
# Split the merged text into overlapping chunks (a list of strings).
texts = text_splitter.split_text(content)

# Fail fast when nothing was produced (no PDFs were loaded, or they contained no
# extractable text) so the problem is reported here rather than further
# downstream when the chunks are embedded and indexed.
if not texts:
    raise ValueError(
        "No text chunks were produced; check that the PDF directory contains "
        "PDFs with extractable text."
    )

In [25]:
# Preview only the first few chunks; printing every chunk floods the notebook
# output and bloats the saved file.
PREVIEW_CHUNKS = 3

print(f"{len(texts)} chunks in total; showing the first {min(PREVIEW_CHUNKS, len(texts))}:\n")
for chunk in texts[:PREVIEW_CHUNKS]:
    print(chunk)
    print("\n\n")  # visual gap between consecutive chunks

retrieval-augmented generation for
knowledge-intensive nlp tasks
patrick lewis†‡, ethan perez?,
aleksandra piktus†, fabio petroni†, vladimir karpukhin†, naman goyal†, heinrich küttler†,
mike lewis†, wen-tau yih†, tim rocktäschel†‡, sebastian riedel†‡, douwe kiela†
†facebook ai research;‡university college london;?new york university;
plewis@fb.com
abstract
large pre-trained language models have been shown to store factual knowledge
in their parameters, and achieve state-of-the-art results when ﬁne-tuned on down-
stream nlp tasks. however, their ability to access and precisely manipulate
knowledge is still limited, and hence on knowledge-intensive tasks, their perfor-
mance lags behind task-speciﬁc architectures. additionally, providing provenance
for their decisions and updating their world knowledge remain open research prob-
lems. pre-trained models with a differentiable access mechanism to explicit non-



lems. pre-trained models with a differentiable access mechanism to explicit n

In [27]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [28]:
# Identifier of the Google embedding model used to vectorise the text chunks.
EMBEDDING_MODEL = "models/embedding-001"
embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL)

## Create embeddings for each text chunk and store them in a Chroma vector store

In [29]:
# Embed every chunk and index it in a Chroma collection.
chroma_index = Chroma.from_texts(texts, embeddings)

# NOTE: despite its name, `vector_store` holds the retriever wrapped around the
# Chroma index, not the index itself.
vector_store = chroma_index.as_retriever()

## Create a prompt template

In [30]:
# Instruction text sent to the chat model. `{context}` is filled with the
# retrieved chunks and `{question}` with the user's question. The context
# placeholder is followed directly by a newline: a stray "?" used to be appended
# to the retrieved context in every prompt.
prompt_template = """
  Please answer the question in as much detail as possible based on the provided context.
  Ensure to include all relevant details. If the answer is not available in the provided context,
  kindly respond with "The answer is not available in the context." Please avoid providing incorrect answers.
\n\n
  Context:\n {context}\n
  Question: \n{question}\n

  Answer:
"""

# Wrap the raw template string in a PromptTemplate that expects exactly the two
# placeholders used in the text: `context` and `question`.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

## Load the model

In [31]:
from langchain_google_genai import ChatGoogleGenerativeAI
# Settings for the Gemini chat model that writes the final answer.
llm_settings = {
    "model": "gemini-pro",
    "temperature": 0.3,
}
model = ChatGoogleGenerativeAI(**llm_settings)


In [32]:
# Build a question-answering chain of type "stuff" around the chat model, using
# the custom prompt defined earlier in the notebook.
chain = load_qa_chain(
    llm=model,
    chain_type="stuff",
    prompt=prompt,
)


In [33]:
# Ask the user for a question. Surrounding whitespace is dropped and blank input
# is rejected before any embedding or model call is made.
question = input("Enter your question: ").strip()
if not question:
    raise ValueError("Please enter a non-empty question.")

# `vector_store` holds the retriever built from the Chroma index; fetch the
# chunks it considers relevant to the question. `invoke` replaces the
# deprecated `get_relevant_documents(...)` call.
docs = vector_store.invoke(question)

# Run the QA chain over the retrieved chunks. `chain.invoke(...)` replaces the
# deprecated direct call `chain(...)`; only the chain outputs are returned.
response = chain.invoke(
    {"input_documents": docs, "question": question},
    return_only_outputs=True,
)
response


{'output_text': 'The answer is not available in the context.'}