In [1]:
# Importing necessary libraries
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.vectorstores import VectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

In [2]:
#RAG prompt template
RAG_PROMPT_TEMPLATE = """
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
If the answer cannot be deduced from the context, do not give an answer.</s>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
"""
prompt = PromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

In [3]:
#RAG model initialization
MODEL = "phi3"
model = Ollama(model=MODEL)

In [4]:
#PDF loader
loader = PyPDFLoader("tn.pdf")
docs = loader.load()
print(docs)
special = u"\uf076"

for doc in docs:
    doc.page_content = doc.page_content.replace('\n', '')
    doc.page_content = doc.page_content.replace(special,'')
    doc.page_content = doc.page_content.replace('  ',' ')
# Transform data
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
texts = text_splitter.split_documents(docs)

# Embedding the input chunk
embeddings = HuggingFaceEmbeddings(model_name='bert-base-uncased')
docsearch = FAISS.from_documents(texts, embeddings)
retriever = docsearch.as_retriever()

[Document(page_content=' \nHIGH LIGHTS FOR BUDGET ESTIMATES 202 4-25 \nTamil Development  \n\uf076 The twin epics of Tamil literature, Silappathikaram and \nManimegalai, will be translated into 25 Indian and \nforeign languages at a cost of Rs. 2 crore.  \n\uf076 In order to translate literary works and  spread the \neuphonious  notes  of Tamil language across the world, \nan allocation of Rs.2 crore will be made in the coming \nyear. \n\uf076 In order to ensure that Tamil flourishes in the rapidly \nadvancing technological landscape , an allocation of \nRs.5 crore will be made to enable startups  \nto develop  Natural Language Processing and Large \nLanguage Models  based on machine learning and \nartificia l intelligence.  \n\uf076 In order to enable future generations to appreciate \nthe richness of Tamil language and the glorious history \nof Tamil people, a project to digitize rare books and \ndocuments will be undertaken at a cost of Rs.2 crore.  \n \n ', metadata={'source': 'tn.

  from .autonotebook import tqdm as notebook_tqdm
No sentence-transformers model found with name bert-base-uncased. Creating a new one with MEAN pooling.


In [5]:
from operator import itemgetter

chain = (
    {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
    }
    | prompt
    | model
)

### Give your questions in below cell to test the model

In [7]:
questions = [
    "What is the corpus of the 'Research and Business Development Fund for Technical Textiles and Man Made Fibre'?"
]

for question in questions:
    print(f"Question: {question}")
    print(f"Answer: {chain.invoke({'question': question})}")
    print()

Question: What is the corpus of the 'Research and Business Development Fund for Technical Textiles and Man Made Fibre'?
Answer: The corpus of the 'Research and Business Development Fund for Technical Textiles and Man Made Fibre' will be Rs.25 crore.

