In [1]:
print("OK!")

OK!


In [2]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\user\\OneDrive\\Desktop\\Medical-Chatbot'

In [None]:
# Extract data from the PDF
def load_pdf(data):
    from langchain.document_loaders import PyPDFLoader
    import os

    all_docs = []
    for filename in os.listdir(data):
        if filename.endswith(".pdf"):
            loader = PyPDFLoader(os.path.join(data, filename))
            pages = loader.load()
            # Skip the first 14 pages (adjust this number if needed)
            all_docs.extend(pages[14:])

    return all_docs

extracted_data = load_pdf("data/")


In [6]:

import re


def clean_text(text):
    lines = text.split("\n")
    clean_lines = [
        line for line in lines
        if not line.strip().isdigit()  # Remove page numbers
        and not re.match(r"^GALE ENCYCLOPEDIA OF MEDICINE", line.strip())
    ]
    return "\n".join(clean_lines)

def text_split(extracted_data):
    from langchain.schema import Document
    cleaned_docs = []
    for doc in extracted_data:
        cleaned_content = clean_text(doc.page_content)
        cleaned_docs.append(Document(page_content=cleaned_content))

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(cleaned_docs)

    return text_chunks


In [7]:

text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 7207


In [8]:
for i, chunk in enumerate(text_chunks[:3]):
    print(f"\nChunk {i+1}:\n{'-'*30}")
    print(chunk.page_content[:500])  # First 500 chars



Chunk 1:
------------------------------
Abdominal aorta ultrasound see Abdominal
ultrasound
Abdominal aortic aneurysm see Aortic
aneurysm
Abdominal hernia see Hernia
Abdominal thrust see Heimlich maneuver
Abdominal ultrasound
Definition
Ultrasound technology allows doctors to “see”
inside a patient without resorting to surgery. A transmit-
ter sends high frequency sound waves into the body,
where they bounce off the different tissues and organs to
produce a distinctive pattern of echoes. A receiver

Chunk 2:
------------------------------
“hears” the returning echo pattern and forwards it to a
computer, which translates the data into an image on a
television screen. Because ultrasound can distinguish
subtle variations between soft, fluid-filled tissues, it is
particularly useful in providing diagnostic images of the
abdomen. Ultrasound can also be used in treatment.
Purpose
The potential medical applications of ultrasound
were first recognized in the 1940s as an outgrowth of the

Chun

In [9]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [10]:
embeddings = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [11]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [12]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [13]:
from dotenv import load_dotenv
load_dotenv()

True

In [14]:
PINECONE_API_KEY=os.getenv('PINECONE_API_Key')
print(f"Key length: {len(PINECONE_API_KEY)} characters") 

Key length: 75 characters


In [None]:
GOOGLE_API_KEY=os.getenv('GOOGLE_API_KEY')
print("API key configured:", bool(os.getenv("GOOGLE_API_KEY")))
print(f"Key length: {len(GOOGLE_API_KEY)} characters") 

API key configured: True
Key length: 39 characters


In [16]:

from langchain_pinecone import PineconeVectorStore

docsearch=PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name='medical-chatbot',
    embedding=embeddings,
)

In [17]:
retriever=docsearch.as_retriever(search_type="similarity",search_kwargs={"k":3})

In [18]:
retriever_docs=retriever.invoke("what is Acne")

In [19]:
retriever_docs

[Document(id='7cf5c925-d946-4abf-becf-946d153b482d', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='72daf663-e280-4008-be9a-adda2cbab451', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='92474467-1056-4ce8-b387-ae79bcbe1860', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="models/gemini-1.5-flash-latest",  # Most widely available
    # model="gemini-1.5-pro-latest",  # If you have access
    # model="gemini-pro",  # Legacy name
    temperature=0.4,
    max_output_tokens=500,
    google_api_key=GOOGLE_API_KEY
)


In [31]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt=(
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question.If you don't know the answer,say that you "
    "don't know,Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt=ChatPromptTemplate.from_messages(
    [
        ("system",system_prompt),
        ("human","{input}"),
    ]
)

In [32]:
question_answer_chain=create_stuff_documents_chain(llm,prompt)
rag_chain=create_retrieval_chain(retriever,question_answer_chain)

In [41]:
response=rag_chain.invoke({"input":"What is Acne?"})
print(response["answer"])

I am sorry, but this document only contains metadata and does not provide a definition of acne.  I do not know what acne is.


In [25]:
import google.generativeai as genai
genai.configure(api_key=GOOGLE_API_KEY)

print("Available models:")
for m in genai.list_models():
    if "gemini" in m.name:
        print(f"- {m.name} (supports: {m.supported_generation_methods})")

Available models:
- models/gemini-1.0-pro-vision-latest (supports: ['generateContent', 'countTokens'])
- models/gemini-pro-vision (supports: ['generateContent', 'countTokens'])
- models/gemini-1.5-pro-latest (supports: ['generateContent', 'countTokens'])
- models/gemini-1.5-pro-001 (supports: ['generateContent', 'countTokens', 'createCachedContent'])
- models/gemini-1.5-pro-002 (supports: ['generateContent', 'countTokens', 'createCachedContent'])
- models/gemini-1.5-pro (supports: ['generateContent', 'countTokens'])
- models/gemini-1.5-flash-latest (supports: ['generateContent', 'countTokens'])
- models/gemini-1.5-flash-001 (supports: ['generateContent', 'countTokens', 'createCachedContent'])
- models/gemini-1.5-flash-001-tuning (supports: ['generateContent', 'countTokens', 'createTunedModel'])
- models/gemini-1.5-flash (supports: ['generateContent', 'countTokens'])
- models/gemini-1.5-flash-002 (supports: ['generateContent', 'countTokens', 'createCachedContent'])
- models/gemini-1.5-f

In [24]:
# Test your retriever directly
test_docs = retriever.invoke("What is acne?")
print(f"Retrieved {len(test_docs)} documents")
for i, doc in enumerate(test_docs):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:300] + "...")  # Show first 300 characters

Retrieved 3 documents

Document 1:
GALE ENCYCLOPEDIA OF MEDICINE 226
Acne
GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26...

Document 2:
GALE ENCYCLOPEDIA OF MEDICINE 226
Acne
GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26...

Document 3:
GALE ENCYCLOPEDIA OF MEDICINE 226
Acne
GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26...


In [34]:
retriever_docs = retriever.invoke("What is Acne?")
for i, doc in enumerate(retriever_docs):
    print(f"\nChunk {i+1}:\n{'-'*30}\n{doc.page_content}")



Chunk 1:
------------------------------
GALE ENCYCLOPEDIA OF MEDICINE 226
Acne
GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26

Chunk 2:
------------------------------
GALE ENCYCLOPEDIA OF MEDICINE 226
Acne
GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26

Chunk 3:
------------------------------
GALE ENCYCLOPEDIA OF MEDICINE 226
Acne
GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26


In [37]:
import re

def clean_text(text):
    # Removing metadata patterns such as page numbers and "GEM" information
    text = re.sub(r"^GALE ENCYCLOPEDIA OF MEDICINE.*", "", text)  # Remove any line starting with GALE ENCYCLOPEDIA OF MEDICINE
    text = re.sub(r"^.*GEM - \d{4}.*", "", text)  # Remove GEM metadata
    text = re.sub(r"^.*Page \d{1,3}.*", "", text)  # Remove page information
    text = re.sub(r"\d{1,3}.*", "", text)  # Remove any page number references
    text = re.sub(r"\s{2,}", " ", text)  # Replace multiple spaces with a single space

    # Also remove anything that does not appear relevant like medical references that don’t contribute to the main text
    return text.strip()



In [38]:
test_docs = retriever.invoke("What is acne? Define acne and describe its causes.")

for i, doc in enumerate(test_docs):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:500])  # Show first 500 characters to verify the content.



Document 1:
Acne
Acne vulgaris affecting a woman’s face. Acne is the general
name given to a skin disorder in which the sebaceous
glands become inflamed.(Photograph by Biophoto Associ-
ates, Photo Researchers, Inc. Reproduced by permission.)

Document 2:
Acne
Acne vulgaris affecting a woman’s face. Acne is the general
name given to a skin disorder in which the sebaceous
glands become inflamed.(Photograph by Biophoto Associ-
ates, Photo Researchers, Inc. Reproduced by permission.)

Document 3:
GALE ENCYCLOPEDIA OF MEDICINE 2 25
Acne
Acne vulgaris affecting a woman’s face. Acne is the general
name given to a skin disorder in which the sebaceous
glands become inflamed.(Photograph by Biophoto Associ-
ates, Photo Researchers, Inc. Reproduced by permission.)
GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25


In [39]:
import re

def clean_text(text):
    # Remove metadata patterns
    text = re.sub(r"^GALE ENCYCLOPEDIA OF MEDICINE.*", "", text)  # Remove lines starting with GALE ENCYCLOPEDIA OF MEDICINE
    text = re.sub(r"^GEM - \d{4}.*", "", text)  # Remove GEM metadata
    text = re.sub(r"^Page \d{1,3}.*", "", text)  # Remove page number information
    text = re.sub(r"\d{1,3}.*", "", text)  # Remove any page number references
    text = re.sub(r"Photograph.*", "", text)  # Remove any photograph references
    text = re.sub(r"\s{2,}", " ", text)  # Replace multiple spaces with a single space

    return text.strip()

# After retrieval, clean the documents
def remove_metadata_from_documents(documents):
    cleaned_documents = []
    for doc in documents:
        cleaned_content = clean_text(doc.page_content)
        cleaned_documents.append(cleaned_content)
    return cleaned_documents

# Apply cleaning to the retrieved documents
test_docs_cleaned = remove_metadata_from_documents(test_docs)

# Print the cleaned documents
for i, doc in enumerate(test_docs_cleaned):
    print(f"\nCleaned Document {i+1}:")
    print(doc[:500])  # Show first 500 characters of the cleaned document



Cleaned Document 1:
Acne
Acne vulgaris affecting a woman’s face. Acne is the general
name given to a skin disorder in which the sebaceous
glands become inflamed.(
ates, Photo Researchers, Inc. Reproduced by permission.)

Cleaned Document 2:
Acne
Acne vulgaris affecting a woman’s face. Acne is the general
name given to a skin disorder in which the sebaceous
glands become inflamed.(
ates, Photo Researchers, Inc. Reproduced by permission.)

Cleaned Document 3:
Acne
Acne vulgaris affecting a woman’s face. Acne is the general
name given to a skin disorder in which the sebaceous
glands become inflamed.(
ates, Photo Researchers, Inc. Reproduced by permission.)
GEM -


In [40]:
import re

def clean_text(text):
    # Remove any metadata or unwanted lines
    text = re.sub(r"^GALE ENCYCLOPEDIA OF MEDICINE.*", "", text)  # Remove lines starting with GALE ENCYCLOPEDIA OF MEDICINE
    text = re.sub(r"^GEM - .+", "", text)  # Remove GEM metadata entirely
    text = re.sub(r"^Page \d{1,3}.*", "", text)  # Remove page number information
    text = re.sub(r"\d{1,3}.*", "", text)  # Remove any numeric references (like page numbers or codes)
    text = re.sub(r"Photograph.*", "", text)  # Remove photograph references
    text = re.sub(r"Reproduced by permission.*", "", text)  # Remove permission notice
    text = re.sub(r"\s{2,}", " ", text)  # Replace multiple spaces with a single space

    # Strip any trailing or leading spaces
    return text.strip()

# Apply cleaning to the retrieved documents
def remove_metadata_from_documents(documents):
    cleaned_documents = []
    for doc in documents:
        cleaned_content = clean_text(doc.page_content)
        cleaned_documents.append(cleaned_content)
    return cleaned_documents

# Apply cleaning to test documents
test_docs_cleaned = remove_metadata_from_documents(test_docs)

# Print the cleaned documents
for i, doc in enumerate(test_docs_cleaned):
    print(f"\nCleaned Document {i+1}:")
    print(doc[:500])  # Show first 500 characters of the cleaned document



Cleaned Document 1:
Acne
Acne vulgaris affecting a woman’s face. Acne is the general
name given to a skin disorder in which the sebaceous
glands become inflamed.(
ates, Photo Researchers, Inc.

Cleaned Document 2:
Acne
Acne vulgaris affecting a woman’s face. Acne is the general
name given to a skin disorder in which the sebaceous
glands become inflamed.(
ates, Photo Researchers, Inc.

Cleaned Document 3:
Acne
Acne vulgaris affecting a woman’s face. Acne is the general
name given to a skin disorder in which the sebaceous
glands become inflamed.(
ates, Photo Researchers, Inc. GEM -
