In [79]:
#Used to find the current project working directory
%pwd


'/home/arun/ZERO/MYWORK/AI_MEDICAL_CHATBOT'

In [80]:
#Set the working directory to the project root (absolute, machine-specific path)
import os
os.chdir("/home/arun/ZERO/MYWORK/AI_MEDICAL_CHATBOT")

In [81]:
#This is to make sure that we are working in the project folder directory
%pwd

'/home/arun/ZERO/MYWORK/AI_MEDICAL_CHATBOT'

In [None]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [83]:
#Extract the data from the pdf file
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    ``data`` is the directory path handed to ``DirectoryLoader``; files
    matching the glob ``*.pdf`` are read with ``PyPDFLoader``. Returns
    whatever ``DirectoryLoader.load()`` produces for those files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [84]:
extracted_data = load_pdf_file(data = "Data/")

In [85]:
# extracted_data

In [86]:
#Split the data into chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (e.g. the output of
            ``load_pdf_file``).
        chunk_size: Chunk size passed to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value this notebook was run with.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 20.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)


In [87]:
text_chunks = text_split(extracted_data)

print("Length of the chunks",len(text_chunks))

Length of the chunks 5859


In [None]:
#Downloading embedding models from hugging face
from langchain_community.embeddings import HuggingFaceEmbeddings
def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Build the Hugging Face embedding model used to vectorise text.

    Args:
        model_name: Hugging Face model identifier passed to
            ``HuggingFaceEmbeddings``. The default model produced
            384-dimensional vectors in this notebook's test cell; if you
            change the model, the Pinecone index ``dimension`` must be
            changed to match.

    Returns:
        The ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [89]:
embeddings = download_hugging_face_embeddings()

In [90]:
#Testing
query_result = embeddings.embed_query("Hi, How are you?")
print(len(query_result))
#Here 384 is the dimension of the vector

384


In [106]:
from dotenv import load_dotenv
load_dotenv()

True

In [105]:
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')


In [107]:
GROQ_API_KEY = os.environ.get('GROQ_API_KEY')

In [94]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "medicalbottt"

# Creating an index that already exists fails, so only create it when it is
# missing. This keeps the cell safe to re-run (Restart Kernel -> Run All).
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding vector length (384 above)
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region="us-east-1"
        )
    )

# Show the index description whether it was just created or already existed.
pc.describe_index(index_name)


{
    "name": "medicalbottt",
    "metric": "cosine",
    "host": "medicalbottt-9a9rsfu.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [None]:
import os
# Sanity check only: `==` compares the environment value with PINECONE_API_KEY
# and displays the boolean result; nothing is assigned. Indexing os.environ
# raises KeyError if the variable is not set.
# NOTE(review): if the intent was to export the key, this should be `=` - confirm.
os.environ["PINECONE_API_KEY"] == PINECONE_API_KEY


True

In [109]:
# Sanity check only: `==` compares the environment value with GROQ_API_KEY
# and displays the boolean result; nothing is assigned. Indexing os.environ
# raises KeyError if the variable is not set.
# NOTE(review): if the intent was to export the key, this should be `=` - confirm.
os.environ["GROQ_API_KEY"] == GROQ_API_KEY

True

In [96]:
#Embed each chunk and upsert it into the Pinecone index
# NOTE(review): this cell embeds and upserts all of text_chunks every time it
# runs. Re-running it presumably adds duplicate vectors to the index - confirm,
# and skip this cell once the index is populated.
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings
)

In [97]:
#Load existing index
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [98]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={'k':5})

In [99]:
retrieved_docs = retriever.invoke("What is acne?")

In [100]:
retrieved_docs

[Document(id='bd3f4689-07fb-445a-a265-376381b89a11', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data/Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='91ccf369-ca5b-4f4e-b0fb-2e96437adee1', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data/Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 

In [None]:
# Import necessary libraries
from langchain_groq import ChatGroq
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
import os

# Initialize Groq LLM (LLaMA3-70B)
# NOTE(review): hosted model ids get retired over time - confirm that
# "llama3-70b-8192" is still served by Groq before relying on this cell.
llm = ChatGroq(
    api_key=GROQ_API_KEY,
    model_name="llama3-70b-8192", 
    temperature=0.4,
    max_tokens=500,
)

# Define the system prompt
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Create chat prompt template
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

# Create document QA chain with Groq LLM
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Build the full RAG pipeline
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Run a sample query
response = rag_chain.invoke({"input": "what is Acne?"})

# Output the answer
print(response["answer"])


Acne is a common skin disease characterized by pimples on the face, chest, and back, occurring when the pores of the skin become clogged with oil, dead skin cells, and bacteria.
