In [1]:
# !pip install -qU langchain langchain_community langchain-huggingface
# !pip install -qU sentence-transformers faiss-cpu

In [2]:
import os
import re

from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

In [3]:
def extract_and_vectordb(pdf_file):
    """Load a PDF, split it into chunks, and index the chunks in a FAISS store.

    Args:
        pdf_file: Path of the PDF file to load.

    Returns:
        A FAISS vector store holding one entry per chunk, embedded with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.

    Raises:
        ValueError: If the loader produced no chunks for ``pdf_file``.
    """
    loader = PyPDFLoader(pdf_file)
    documents = loader.load_and_split()
    if not documents:
        # Fail early with a readable message instead of an obscure error
        # raised while building an index from zero embeddings.
        raise ValueError(f"No text could be extracted from {pdf_file!r}")

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # Index the Document objects directly so whatever metadata the loader
    # attached to each chunk stays with it; passing only page_content strings
    # to from_texts would drop that metadata.
    vec_db = FAISS.from_documents(documents, embeddings)
    return vec_db

In [4]:
def vec_retriever(vector_db, k=5):
    """Wrap a vector store in a retriever that returns the top ``k`` matches.

    Args:
        vector_db: Object exposing ``as_retriever(search_kwargs=...)``.
        k: Number of chunks to request per query. Defaults to 5, the value
            that was previously hard-coded.

    Returns:
        Whatever ``vector_db.as_retriever`` returns for ``search_kwargs={"k": k}``.
    """
    return vector_db.as_retriever(search_kwargs={"k": k})

In [5]:
def llm_model(model_name, api_key, temperature, max_new_tokens=512):
    """Create a ``HuggingFaceHub`` LLM client for a hosted model.

    Args:
        model_name: Hugging Face Hub repository id, passed as ``repo_id``.
        api_key: Hugging Face API token.
        temperature: Sampling temperature forwarded in ``model_kwargs``.
        max_new_tokens: Upper bound on generated tokens forwarded in
            ``model_kwargs``. Defaults to 512.

    Returns:
        The configured ``HuggingFaceHub`` instance.
    """
    # The text-generation parameter that limits output length is
    # "max_new_tokens"; the previous "max_tokens" key is not a text-generation
    # parameter, so the 512-token limit was presumably never applied.
    # NOTE(review): the notebook output shows a warning raised at this call,
    # presumably the HuggingFaceHub deprecation -- confirm and consider
    # migrating to HuggingFaceEndpoint.
    return HuggingFaceHub(
        repo_id=model_name,
        model_kwargs={"temperature": temperature, "max_new_tokens": max_new_tokens},
        huggingfacehub_api_token=api_key,
    )

In [6]:
# Prompt used for every question: a fixed system instruction followed by a
# human turn with {context} and {question} placeholders.
_SYSTEM_TEMPLATE = (
    "You are a helpful assistant who provides concise and accurate answers "
    "based on the given context."
)
_HUMAN_TEMPLATE = (
    "Given the following context:\n{context}\n\n"
    "Answer the question:\n{question}"
)

system_message = SystemMessagePromptTemplate.from_template(_SYSTEM_TEMPLATE)
human_message = HumanMessagePromptTemplate.from_template(_HUMAN_TEMPLATE)
chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

In [7]:
def initialize_chain(model_name, api_key, temperature, chat_prompt, pdf_path):
    """Build a ``RetrievalQA`` chain that answers questions about one PDF.

    Args:
        model_name: Model identifier passed to ``llm_model``.
        api_key: API token passed to ``llm_model``.
        temperature: Sampling temperature passed to ``llm_model``.
        chat_prompt: Prompt handed to the chain via ``chain_type_kwargs``.
        pdf_path: PDF file indexed by ``extract_and_vectordb``.

    Returns:
        The chain produced by ``RetrievalQA.from_chain_type`` with the
        ``"stuff"`` chain type.
    """
    # Create the LLM first, then index the PDF and wrap it in a retriever.
    language_model = llm_model(model_name, api_key, temperature)
    pdf_retriever = vec_retriever(extract_and_vectordb(pdf_path))

    chain_options = {
        "llm": language_model,
        "retriever": pdf_retriever,
        "chain_type": "stuff",
        "chain_type_kwargs": {"prompt": chat_prompt},
    }
    return RetrievalQA.from_chain_type(**chain_options)

In [8]:
def get_answer(qa_chain, query):
    """Send ``query`` to ``qa_chain`` and return the chain's response unchanged.

    Args:
        qa_chain: Object exposing ``invoke``; called with ``{"query": query}``.
        query: The question to ask.

    Returns:
        Whatever ``qa_chain.invoke`` returns.
    """
    payload = {"query": query}
    return qa_chain.invoke(payload)

In [9]:
def postprocess(response, query):
    """Extract the answer text from a chain response.

    When the model echoes the prompt, the answer is whatever follows the last
    occurrence of ``query`` in ``response["result"]``. When the query is not
    found (the model returned only its answer) or ``query`` is empty, the
    whole result is returned instead of discarding it.

    Args:
        response: Mapping with a ``"result"`` string.
        query: The question that was asked.

    Returns:
        The answer text with surrounding whitespace stripped.

    Raises:
        KeyError: If ``response`` has no ``"result"`` key.
    """
    result = response["result"]
    # rfind("") matches at the very end of the string, which would strip the
    # entire result, so treat an empty query as "not found".
    query_index = result.rfind(query) if query else -1
    if query_index == -1:
        # No echoed prompt to cut away: the result already is the answer.
        return result.strip()
    return result[query_index + len(query):].strip()

In [None]:
# Read the token from the environment so it is never typed into (or saved
# with) the notebook. Falls back to an empty string when the variable is unset.
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN", "")
api_key = HUGGINGFACEHUB_API_TOKEN

model_name = "mistralai/Mistral-7B-Instruct-v0.3"  # Hugging Face Hub repo id of the LLM
pdf_path = "ai_doc.pdf"  # PDF file to index and query

In [11]:
# Build the QA chain once; the question cells below reuse it.
qa_chain = initialize_chain(
    model_name=model_name,
    api_key=api_key,
    temperature=0.1,
    chat_prompt=chat_prompt,
    pdf_path=pdf_path,
)

  return HuggingFaceHub(





In [12]:
# Ask for the document's main topic and print the post-processed answer.
query = "what is the main topic of the pdf?"
answer = get_answer(qa_chain, query)
print(postprocess(answer, query))

The main topic of the PDF is the impact of Artificial Intelligence (AI) on Data Analytics.


In [13]:
# Ask for the document's section list and print the post-processed answer.
query = "what are the pdf sections?"
answer = get_answer(qa_chain, query)
print(postprocess(answer, query))

The sections in the provided PDF are:
1. Title
2. Abstract
3. Introduction
4. Body (with subsections)
   a. Machine Learning in Data Analytics
   b. Deep Learning and Neural Networks
   c. AI-Driven Predictive Analytics
   d. Automation of Data Processing
5. Conclusion
6. Summary


In [14]:
# Ask for the first sentence of the Summary and print the post-processed answer.
query = "what is the first sentence of the Summary?"
answer = get_answer(qa_chain, query)
print(postprocess(answer, query))

The first sentence of the Summary is: "This paper discussed the transformative role of AI in data analytics."
