In [1]:
import os

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface.llms import HuggingFaceEndpoint

In [2]:
# Embedding model served through the Hugging Face Inference API.
model = "sentence-transformers/all-MiniLM-L6-v2"

# SECURITY: the API token must never be stored in the notebook. It is read from
# the HUGGINGFACEHUB_API_TOKEN environment variable; a missing variable raises
# KeyError here instead of failing later with an opaque authentication error.
# NOTE(review): a token was previously committed in this cell -- revoke it on
# the Hugging Face settings page and issue a new one.
hf_embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ["HUGGINGFACEHUB_API_TOKEN"],
    model_name=model,
)

In [6]:
# Load the source PDF into LangChain documents.
loader = PyPDFLoader("panchatantra.pdf")
docs = loader.load()

# Break the loaded documents into chunks of at most 1000 characters, with a
# 200-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
documents = text_splitter.split_documents(docs)


In [None]:
# Open (or create) the persisted Chroma collection, then ingest the chunks only
# when the collection is still empty.
#
# Why not Chroma.from_documents(...) directly: it adds the chunks to the
# persisted collection on every execution, so re-running this cell stores every
# chunk again. The similarity-search output recorded in this notebook returns
# the same chunk twice, which is consistent with that. Guarding on emptiness
# makes the cell idempotent and avoids re-embedding an already indexed corpus.
# To rebuild the index from scratch, delete the ./chromadb directory first.
db = Chroma(
    collection_name="panchtantra_embeddings",
    embedding_function=hf_embeddings,
    persist_directory='./chromadb',
)
if not db.get(limit=1)["ids"]:
    db.add_documents(documents)

In [87]:
# Retrieval sanity check: fetch the chunks closest to a sample question and
# display them as the cell output.
query = "What is greed?"
result = db.similarity_search(query)
result

[Document(metadata={'author': 'Administrator', 'creationdate': 'D:20070709165412', 'creator': 'PScript5.dll Version 5.2.2', 'moddate': 'D:20070709165412', 'page': 6, 'page_label': '7', 'producer': 'AFPL Ghostscript 8.53', 'source': 'panchtantra.pdf', 'title': 'Microsoft Word - panchtantra.doc', 'total_pages': 23}, page_content='MORAL: Excess of greed is harmful. \n \n \n \nThe Bird With Two Heads'),
 Document(metadata={'author': 'Administrator', 'creationdate': 'D:20070709165412', 'creator': 'PScript5.dll Version 5.2.2', 'moddate': 'D:20070709165412', 'page': 6, 'page_label': '7', 'producer': 'AFPL Ghostscript 8.53', 'source': 'panchtantra.pdf', 'title': 'Microsoft Word - panchtantra.doc', 'total_pages': 23}, page_content='MORAL: Excess of greed is harmful. \n \n \n \nThe Bird With Two Heads'),
 Document(metadata={'author': 'Pandit Vishnu Sharma', 'creationdate': '2012-12-21T16:23:36+02:00', 'creator': 'Adobe Acrobat Pro 10.1.4', 'keywords': 'Panchatantra\r\nPandit Vishnu Sharma\r\nTra

In [88]:
# Prompt used for answer generation. `{input}` receives the user's question
# (the key passed to `retrieval_chain.invoke`); `{context}` is expected to be
# filled with the retrieved document text by the documents chain.
PROMPT_TEMPLATE = """
Answer ONLY using the context below. If unsure, say "I don't know".

CONTEXT:
{context}

QUESTION: 
{input}

ANSWER (no markdown):
"""

prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

In [89]:
# Text-generation model served through a Hugging Face endpoint.
#
# SECURITY: the API token is read from the HUGGINGFACEHUB_API_TOKEN environment
# variable instead of being embedded in the notebook; a missing variable raises
# KeyError immediately.
# NOTE(review): a token was previously committed in this cell -- revoke it and
# issue a new one.
llm = HuggingFaceEndpoint(
    repo_id="google/gemma-2-2b-it",
    task="text-generation",
    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
)


In [90]:
# Assemble the RAG pipeline: a retriever over the vector store feeds a
# documents chain that passes the retrieved text and the prompt to the LLM.
retriever = db.as_retriever()
document_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever, document_chain)


In [92]:
# Ask a question through the full chain and display only the generated answer.
question = {"input": "Who is Vishnu Sharma?"}
response = retrieval_chain.invoke(question)
response["answer"]




'Vishnu Sharma is the author of the Panchatantra, a collection of fables. He was a pandit, a learned scholar in ancient India. His works were translated by G. L. Chandiramani.'