In [1]:
#installing necessary libraries and packages
!pip install langchain langchain_community faiss-cpu unstructured[all_docs] langchain_huggingface



In [2]:
!apt-get install poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [3]:
#import necessary libraries
import os
import re
from langchain_community.document_loaders import DirectoryLoader,UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEndpoint
from langchain import PromptTemplate
from langchain.chains import RetrievalQA

In [4]:
# Pull the Hugging Face token out of Colab's secret store and publish it
# through the environment variable that the Hugging Face client reads.
from google.colab import userdata

API_KEY = userdata.get('HF_TOKEN')
os.environ.update({"HUGGINGFACEHUB_API_TOKEN": API_KEY})

In [5]:
# Parse every PDF sitting directly in ./data/ (the first-aid manual) into LangChain documents
loader = DirectoryLoader(
    path="./data/",
    glob="./*.pdf",
    loader_cls=UnstructuredFileLoader,
    show_progress=True,
)
text = loader.load()

100%|██████████| 1/1 [00:47<00:00, 47.44s/it]


In [6]:
# Show only the opening of the first loaded document; echoing `text` itself
# would dump the entire parsed manual into the cell output.
text[0].page_content[:500] if text else None



In [7]:
# Break the loaded document(s) into overlapping chunks:
# up to 1000 characters each, with 400 characters shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=400,
    chunk_size=1000,
)
text_doc = text_splitter.split_documents(text)

In [8]:
print(f"length of text: {len(text)}")
print(f"length of text_doc before stripping: {len(text_doc)}")

# Trim the leading chunks: keep everything from the *second* chunk that
# mentions "PREFACE" onward. If fewer than two chunks mention it, text_doc is left unchanged.
preface_positions = (
    position
    for position, chunk in enumerate(text_doc)
    if "PREFACE" in chunk.page_content
)
next(preface_positions, None)  # skip the first mention
second_preface = next(preface_positions, None)
if second_preface is not None:
    text_doc = text_doc[second_preface:]

print(f"length of text_doc after stripping: {len(text_doc)}")

length of text: 1
length of text_doc before stripping: 827
length of text_doc after stripping: 741


In [None]:
# Embed every chunk with the PubMedBERT sentence-embedding model and store the vectors in a FAISS index
embedding_model = HuggingFaceEmbeddings(model_name="NeuML/pubmedbert-base-embeddings")
db = FAISS.from_documents(text_doc, embedding_model)

  db=FAISS.from_documents(text_doc,HuggingFaceEmbeddings(model_name="NeuML/pubmedbert-base-embeddings"))


In [None]:
# Create the retriever. The number of hits is controlled through `search_kwargs`;
# the previous `kwargs=1` is not a recognised option, so the intended
# "return only the single best chunk" setting was never applied.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 1})

# Sanity-check retrieval with a simple question.
# `.invoke` is the current retriever entry point (`get_relevant_documents` is deprecated).
query = "What is first aid?"
retriever.invoke(query)[0].page_content

In [None]:
# Create the LLM client backed by the Hugging Face Inference endpoint.
# `max_new_tokens` is the parameter that caps the generated length; `max_length`
# is not a HuggingFaceEndpoint field, so the intended 50-token limit did not take effect.
llm_model = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-v0.1",
    max_new_tokens=50,
    temperature=0.5,
)

# Prompt text: tells the model to answer only from the retrieved context,
# to admit when it does not know, and to keep the answer short.
# {context} and {question} are the two template variables.
prompt= """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.And rephrase the answer into not more than 50 tokens.

Context: {context}
Question: {question}
"""

# Wrap the raw string in a PromptTemplate so the chain can fill the variables
prompt_template = PromptTemplate(
    template=prompt,
    input_variables=["context", "question"],
)

# Assemble the retrieval-QA chain: retrieved chunks are "stuffed" into the
# prompt's {context} slot and passed to the LLM together with the question.
qa_prompt_kwargs = {"prompt": prompt_template}
chain = RetrievalQA.from_chain_type(
    llm=llm_model,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=qa_prompt_kwargs,
)

In [None]:
def clean_response(output):
    """Tidy raw LLM output into a short, de-duplicated answer string.

    Processing order:
      1. Discard everything from the first "Question:" onward.
      2. Remove numbered-list markers that follow a blank line (e.g. "1.").
      3. Collapse newlines and runs of whitespace into single spaces, so
         words on adjacent lines are not glued together.
      4. Remove PDF-extraction artifacts ("(cid:1)" variants) and the
         "TITLE OF CHAPTER." placeholder.
      5. Drop a leading "Answer" / "Answer:" label.
      6. Split into sentences, drop exact repeats (keeping the first
         occurrence), and make sure every sentence ends with punctuation.

    Args:
        output (str): Raw text returned by the QA chain.

    Returns:
        str: The cleaned answer, sentences separated by single spaces.
             Empty string if nothing remains after cleaning.
    """
    # Keep only the part before a "Question:" marker.
    output = output.split("Question:")[0]

    # Remove numbered section markers; substitute a space so neighbours stay separated.
    output = re.sub(r"\n\n\d+\.", " ", output)

    # Normalise all whitespace (including newlines) to single spaces.
    output = " ".join(output.split())

    # Longest patterns first: removing "(cid:1)" before ":(cid:1)" would leave
    # a stray ":" / ";" behind and the longer patterns could never match.
    for artifact in (":(cid:1)", ";(cid:1)", "(cid:1)", "TITLE OF CHAPTER."):
        output = output.replace(artifact, "")
    # Artifact removal can leave double spaces or leading/trailing blanks.
    output = " ".join(output.split())

    # Remove the "Answer" label (with or without a colon) only when it is a
    # whole word, so a word that merely begins with "Answer" is left intact.
    output = re.sub(r"^Answer\b\s*:?\s*", "", output)

    # Split on whitespace that follows sentence-ending punctuation; this keeps
    # decimals and dotted abbreviations (no whitespace after the dot) intact.
    seen_sentences = set()
    cleaned_sentences = []
    for sentence in re.split(r"(?<=[.!?])\s+", output):
        sentence = sentence.strip()
        if not sentence:
            continue
        # A truncated final sentence may lack punctuation; close it with a period.
        if sentence[-1] not in ".!?":
            sentence += "."
        if sentence not in seen_sentences:
            seen_sentences.add(sentence)
            cleaned_sentences.append(sentence)

    return " ".join(cleaned_sentences)


In [None]:
# Interactive question/answer loop. Type "thank you" (any casing) to exit.
while True:
  # Strip surrounding whitespace so "thank you " still exits and blank input is detectable.
  query=input("Enter your query: ").strip()
  if not query:
    # Nothing was typed: ask again instead of sending an empty question to the chain.
    continue
  if query.lower()=="thank you":
    print("Welcome")
    break
  answer=chain.invoke({"query": query})
  output=clean_response(answer["result"])
  print(f"Answer: {output}")
  print()