In [1]:

import os
from dotenv import load_dotenv

# Pull variables from the local .env file into the environment, then read the
# Hugging Face token from it (None when the variable is not set).
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")


In [2]:
# Step1 : Load Raw Pdf 

from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
# For Raw pdf loading 

# Directory passed to load_pdf_files(), which picks up the "*.pdf" files in it.
DATA_PATH = "data/"

def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``. Files are
            selected with the glob ``*.pdf`` and read with ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for that directory.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

# Load everything under DATA_PATH and report how many entries came back
# (printed as the number of PDF pages).
documents = load_pdf_files(data=DATA_PATH)
print("Length of pdf pages " , len(documents))

# All the pages are loaded.

# Step2 : Create Chunks 

from langchain.text_splitter import RecursiveCharacterTextSplitter
# For creating Chunks
def create_chunks(extracted_data, chunk_size=500, chunk_overlap=50):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 50.

    Returns:
        The chunks returned by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

# Chunk the loaded documents and print how many chunks were produced.
text_chunks = create_chunks(extracted_data=documents)
print(len(text_chunks))


# Step3 : Create vector Embeddings

from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

def get_embedding_model():
    """Build the embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
# The sentence-embedding model above is used for clustering or semantic search.
# Instantiate it once here so the FAISS step can reuse the same object.

embedding_model = get_embedding_model()

# Step4 : Store Embeddings in FAISS

from langchain_community.vectorstores import FAISS
# Folder where the FAISS index is written.
DB_FAISS_PATH = "vectorstore/faiss_db"
# Embed every chunk with the embedding model and build the FAISS store from them.
db = FAISS.from_documents(documents=text_chunks, embedding=embedding_model)
# Persist the store so later cells can reload it from DB_FAISS_PATH.
db.save_local(folder_path=DB_FAISS_PATH)

Length of pdf pages  1299
10977



  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


#### Pipeline so far: load the PDF pages (documents) → split into chunks → create vector embeddings → store in FAISS

### Connect Memory with LLM 

In [3]:
# Step 1 : Set Up LLM

from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceEmbeddings

# Hugging Face repo id of the model that load_llm() is called with.
HUGGINGFACE_REPO_ID="mistralai/Mistral-7B-Instruct-v0.3"

def load_llm(huggingface_repo_id):
    """Create the Hugging Face endpoint LLM for the given model repository.

    Args:
        huggingface_repo_id: Repo id of the model to query, e.g.
            ``"mistralai/Mistral-7B-Instruct-v0.3"``.

    Returns:
        A ``HuggingFaceEndpoint`` configured with temperature 0.5, a limit of
        512 newly generated tokens, and ``HF_TOKEN`` as the API token.
    """
    # The token and the length limit go through the endpoint's dedicated
    # fields. Entries in ``model_kwargs`` are forwarded as generation
    # parameters, so a token placed there is not used for authentication and
    # ``max_length`` is not a text-generation parameter (``max_new_tokens`` is).
    return HuggingFaceEndpoint(
        repo_id=huggingface_repo_id,
        temperature=0.5,
        max_new_tokens=512,
        huggingfacehub_api_token=HF_TOKEN,
    )

In [4]:
# Step 2 : Connect LLM with FAISS and create Chain

# Prompt text for the QA chain. {context} and {question} are the two
# placeholders that set_custom_prompt() declares as input variables.
CUSTOM_PROMPT_TEMPLATE = """
                Use the pieces of information provided in the context to answer user's question.
                If you dont know the answer, just say that you dont know, dont try to make up an answer. 
                Dont provide anything out of the given context

                Context: {context}
                Question: {question}

                Start the answer directly. No small talk please.
                """

def set_custom_prompt(custom_prompt_template):
    """Wrap a template string in a ``PromptTemplate``.

    Args:
        custom_prompt_template: Template text containing the ``{context}``
            and ``{question}`` placeholders.

    Returns:
        A ``PromptTemplate`` whose input variables are ``context`` and
        ``question``.
    """
    return PromptTemplate(
        input_variables=["context", "question"],
        template=custom_prompt_template,
    )


# Load Database 

# Reload the saved index from the same folder, using the same embedding model
# name that was used when the index was built.
DB_FAISS_PATH = "vectorstore/faiss_db"
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# NOTE(review): allow_dangerous_deserialization=True opts in to deserializing
# whatever is stored at DB_FAISS_PATH (LangChain documents this as pickle-based
# — confirm). Only load an index this notebook wrote itself, never an untrusted file.
db = FAISS.load_local(DB_FAISS_PATH,embedding_model,allow_dangerous_deserialization=True)

# Create QA Chain

# Build the retrieval QA chain: the LLM answers from the retrieved chunks
# ("stuff" chain type) using the custom prompt, and the source documents are
# returned alongside the answer.
# The search_kwargs key must be exactly "k". With a trailing space ("k ") the
# retriever never receives the requested limit of 3 documents.
qa_chain = RetrievalQA.from_chain_type(
    llm=load_llm(HUGGINGFACE_REPO_ID),
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": set_custom_prompt(CUSTOM_PROMPT_TEMPLATE)},
)






Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [7]:
# Now invoke the chain with single query 

# Read one question from the user, run it through the QA chain, then print the
# answer followed by the source documents the chain returned.
user_query = input("Enter your query : ")
response = qa_chain.invoke({"query": user_query})

print("RESULT :", response["result"])
print("source documents : ", response["source_documents"])



RESULT :     Cancer can be cured through various treatments such as surgery, chemotherapy, radiation therapy, and in some cases, the use of cancer vaccines. The best chance for a surgical cure is usually with the first operation. However, it's important to note that the success of these treatments can vary greatly depending on the type and stage of the cancer, as well as the individual's overall health. It's always best to consult with a healthcare professional for personalized advice.
source documents :  [Document(id='01d129eb-5274-453c-95d3-e511ffb3f584', metadata={'source': 'data\\The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'page': 26}, page_content='curative for some stomach, genital/urinary, thyroid,\nbreast, skin, and central nervous system cancers. The best\nchance for a surgical cure is usually with the first opera-\nGALE ENCYCLOPEDIA OF MEDICINE 2638\nCancer therapy, definitive'), Document(id='1017822f-ed18-4c72-a043-28c7a41b5bf3', metadata={'source': 'data\\The_GALE_ENCYCL