In [42]:
from langchain.document_loaders import PyPDFLoader , DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [43]:
# Extract the data from the pdf file
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan; only files matching ``*.pdf``
            are picked up, each one parsed with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory
        (presumably one document per PDF page — confirm against the loader).
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [44]:
# Load every PDF from the project's Data directory.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs
# where that folder exists — consider a configurable data directory instead.
extracted_data = load_pdf_file(data ='f:/Complete ML/All_Projects/MLProject6_chatbot/Data/')

In [45]:
# now chunking the data
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, overlapping chunks.

    Args:
        extracted_data: Documents to split (e.g. the output of ``load_pdf_file``).
        chunk_size: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value previously hard-coded.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 20, the value previously hard-coded.

    Returns:
        The chunks produced by ``split_documents`` for ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [46]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print(f"Number of text chunks created: {len(text_chunks)}")

Number of text chunks created: 8709


In [47]:
# now using an embedding model to create the vector embedding 
def download_hugging_face_embeddings():
    """Create the sentence-embedding model used throughout the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    # The notebook never imports HuggingFaceEmbeddings, so on a fresh kernel the
    # original helper raised NameError. Importing it here keeps the helper
    # self-contained regardless of which cells have been run.
    from langchain.embeddings import HuggingFaceEmbeddings

    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings


# This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional 
# dense vector space and can be used for tasks like clustering or semantic search.

In [48]:
# Build the embedding model once; the same object is reused for the sanity check,
# for indexing the chunks and for querying the index.
embeddings = download_hugging_face_embeddings()

In [49]:
# Sanity check: embed a short string and print the vector length.
# The Pinecone index in this notebook is created with dimension=384, so 384 is expected here.
query_result = embeddings.embed_query("Hello World")
print("length" , len(query_result))

length 384


In [80]:
from dotenv import load_dotenv
import os
# Load settings from a .env file so the API keys can be read with os.getenv
# instead of being hard-coded in the notebook.
load_dotenv()

True

In [81]:
# Read the service credentials from the process environment.
# A variable that is not set yields None rather than raising.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

In [52]:
# Initialise the Pinecone client and make sure the target index exists.
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "chatbot"

# create_index fails when an index with this name already exists, so only
# create it on the first run; re-running the cell is then harmless.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Dimension of the embeddings (all-MiniLM-L6-v2 vectors have length 384)
        metric="cosine",  # Similarity metric
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1',
        ),
    )

# Display the index description whether it was just created or already existed.
pc.describe_index(index_name)

{
    "name": "chatbot",
    "metric": "cosine",
    "host": "chatbot-er3ppo8.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [83]:
import os

# Export the keys into the process environment for libraries that look them up
# there. Assigning None to os.environ raises an unhelpful
# "TypeError: str expected, not NoneType", so fail with a message that names
# the missing variable instead.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("GROQ_API_KEY", GROQ_API_KEY),
):
    if _key_value is None:
        raise RuntimeError(
            f"{_key_name} is not set; define it in the environment or in the .env file "
            "before running this cell."
        )
    os.environ[_key_name] = _key_value

In [54]:
# Embed each chunk and upsert the embeddings into the Pinecone index.
# NOTE(review): re-running this cell presumably uploads every chunk again
# (the stored ids look auto-generated) — confirm before re-executing; the
# next cell connects to the already-populated index instead.
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,  # chunks produced by text_split
    index_name=index_name,  # name of the Pinecone index to write to
    embedding=embeddings,   # embedding model used to vectorise each chunk
)

In [55]:
# Connect to the existing index instead of embedding and uploading the chunks again.
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,  # name of the existing Pinecone index to read from
    embedding=embeddings,   # embedding model used to vectorise queries
)

In [56]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x290427e66e0>

In [None]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})
# The retriever above returns the top 3 chunks most similar to the query.

In [58]:
# Try the retriever on its own (no LLM involved yet) with a sample question.
query = "What is Hyperbaric oxygen therapy ? "
results = retriever.invoke(query)

In [59]:
# Inspect the retrieved documents (page content plus source/page metadata).
results

[Document(id='bd5b2369-c128-405b-b9cf-adcaa2c46ced', metadata={'creationdate': '2004-12-18T17:52:16-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T18:38:56-06:00', 'page': 807.0, 'page_label': '808', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'f:\\Complete ML\\All_Projects\\MLProject6_chatbot\\Data\\book.pdf', 'total_pages': 940.0}, page_content='blood from carrying sufficient oxygen to the brain\nand other organs. As a result, the person may lose\nconsciousness, stop breathing, and die without\nartificial respiration (assisted breathing) and other\nmeans of elevating the blood oxygen level.\nHyperbaric oxygen therapy —Pure oxygen is\nadministered to the patient in a special chamber at\nthree times the normal atmospheric pressure. The\npatient gets more oxygen faster to overcome\nsevere asphyxiation.\nPulmonary—Pertaining to the lungs.'),
 Document(id='3397cd3a-ba46-4c35-92c1-15e5c5723138', metadata={'creationdate': '2004-12-18T17:52:16-05:00', 'creator': 'PyPDF', 'moddate':

In [84]:
# Retrieval alone only returns raw chunks; a chat model is needed to turn them
# into a direct answer.
# NOTE(review): confirm the model id below is still served by Groq — hosted
# model ids can be retired over time.
from langchain_groq import ChatGroq

llm = ChatGroq(
    api_key=GROQ_API_KEY,
    model="llama3-8b-8192",  # or "mixtral-8x7b-32768", "gemma-7b-it"
)


In [85]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions for the answering model. Adjacent string literals are
# concatenated into one string; "{context}" is a template placeholder
# (presumably filled with the retrieved chunks by the documents chain — confirm).
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


# Two-message chat template: the system instructions above, followed by the
# user's question supplied under the "input" key when the chain is invoked.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [86]:
# Combine the LLM and the prompt into a question-answering chain, then put the
# retriever in front of it to form the full retrieval-augmented chain.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [88]:
# Ask a question end-to-end; the generated text is stored under the "answer" key.
response = rag_chain.invoke({"input": "what is Prognosis?"})
print(response["answer"])

Prognosis refers to the long-term outlook or survival after therapy. It varies depending on the type of tumor or cancer, with overall survival rates around 50% for patients treated for colon and rectal cancer, dependent on the stage of the cancer.


In [90]:
# Second end-to-end question. In the recorded run the model answered that the
# retrieved context did not contain a definition.
response = rag_chain.invoke({"input": "What is Acne?"})
print(response["answer"])

I don't know the answer to that question based on the provided context. The provided text does mention Acne as a key term, but it does not provide a definition of Acne.
