In [2]:
%pwd

'c:\\Users\\amrha\\Downloads\\Health_Care_ChatBot\\research'

In [4]:
import os
os.chdir('../')

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter

In [6]:
def load_pdf(data):
    loader = DirectoryLoader(data, glob="**/*.pdf", loader_cls=PyPDFLoader)
    documents=loader.load()
    return documents

In [7]:
extracted_data = load_pdf('data/')

In [10]:
len(extracted_data)

637

## preproccessing

In [11]:
from typing import List
from langchain.schema import Document
def filter_to_minimal_docs(docs:List[Document]) -> List[Document]:
    """Given a list of Documents objects, return a new list of Document objects containing only 'source' in metadata and the original page contant."""
    minimal_docs :List[Document] = []

    for doc in docs:
        src= doc.metadata.get('source')
        minimal_docs.append(Document(page_content=doc.page_content, metadata={'source': src}))
    return minimal_docs

In [12]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# spitting the documents into smaller chunks
def text_splitter(minimal_docs):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    texts_chunk=text_splitter.split_documents(minimal_docs)
    return texts_chunk

In [19]:
texts_chunk = text_splitter(minimal_docs)
print(len(texts_chunk))

5859


In [21]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Download and return the HuggingFace embeddings model.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name
    )
    return embeddings

embedding = download_embeddings()

In [22]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [27]:
vector=embedding.embed_query("Hello world")
print(len(vector))

384


In [29]:
import os
print(os.getcwd())  # Show current working directory
print(os.path.isfile('.env'))  # Check if .env exists in this directory

c:\Users\amrha\Downloads\Health_Care_ChatBot
True


In [172]:
from dotenv import load_dotenv
import os
result = load_dotenv()  # take environment variables from .env.
print(f".env loaded: {result}")
print(f"Current working directory: {os.getcwd()}")
print(f".env exists: {os.path.isfile('.env')}")

.env loaded: True
Current working directory: c:\Users\amrha\Downloads\Health_Care_ChatBot
.env exists: True


In [173]:
from dotenv import load_dotenv
load_dotenv()

True

In [175]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [176]:
from pinecone import Pinecone

In [177]:
pinecone_api_key = PINECONE_API_KEY
pc=Pinecone(api_key=pinecone_api_key)

In [178]:
pc

<pinecone.pinecone.Pinecone at 0x25fe4681490>

In [219]:
from pinecone import ServerlessSpec
index_name = "medical-chatbot"
if index_name not in pc.list_indexes():
    pc.create_index(
        name=index_name,
        dimension=384,  # Dimension of the embedding model
        metric="cosine",  # Similarity metric
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )


PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2025-04', 'x-cloud-trace-context': 'd81c4a595918a989ece76e888b99c87a', 'date': 'Thu, 28 Aug 2025 18:02:20 GMT', 'server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


In [92]:
index=pc.Index(index_name)

In [None]:
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_documents(documents=texts_chunk,embedding=embedding, index_name=index_name)

In [None]:
# load existing index

from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_existing_index(embedding=embedding, index_name=index_name)

In [218]:
# ADD More Data to the existing index
dswish=Document(page_content="Diabetes is a chronic condition that affects how your body turns food into energy. Most of the food you eat is broken down into sugar (also called glucose) and released into your bloodstream. When your blood sugar goes up, it signals your pancreas to release insulin. Insulin acts like a key to let the blood sugar into your body's cells for use as energy.",
                 metadata={"source": "diabetes.txt"})
docsearch.add_documents(documents=[dswish])

['1c8222e3-e114-4788-8af5-b090fc417091']

In [180]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [181]:
retriever_docs=retriever.get_relevant_documents("What is diabetes?")

In [182]:
retriever_docs

[Document(id='fa6f63da-1410-4a3e-a933-328a76affe5a', metadata={'source': 'diabetes.txt'}, page_content='Diabetes is a chronic condition that affects how your body turns food into energy. Most of the food you eat is broken down into sugar (also called glucose) and released into your bloodstream. When your blood sugar goes up, it signals your pancreas to release insulin. Insulin acts like a key to let the blood sugar into your body’s cells for use as energy.'),
 Document(id='deb0499c-e740-43c1-989c-2b23ab683b67', metadata={'source': 'data\\Medical_book.pdf'}, page_content='begin to fall. A person with diabetes mellitus either does\nnot make enough insulin, or makes insulin that does not\nwork properly. The result is blood sugar that remains\nhigh, a condition called hyperglycemia.\nDiabetes must be diagnosed as early as possible. If\nleft untreated, it can damage or cause failure of the eyes,\nkidneys, nerves, heart, blood vessels, and other body\norgans. Hypoglycemia, or low blood sugar

In [183]:
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
# from langchain_openai import ChatOpenAI
# chatModel = ChatOpenAI(model="gpt-4o")

In [185]:
import google.generativeai as genai
import os

# ضيف الـ API key
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# اختار الموديل (فيه اختيارات زي gemini-1.5-flash أو gemini-1.5-pro)
model = genai.GenerativeModel("gemini-1.5-flash")

# جرب تسأل سؤال
response = model.generate_content("What is Acromegaly and gigantism?")
print(response.text)

Acromegaly and gigantism are both caused by excessive growth hormone (GH) production, but they differ primarily in *when* the excessive GH occurs:

**Gigantism:**

* **Onset:** Occurs before puberty, while the growth plates in long bones are still open.
* **Characteristics:**  Leads to significant increases in overall height, often resulting in individuals who are exceptionally tall.  Other features may include disproportionately large hands and feet.  Internal organs also tend to be enlarged.
* **Cause:** Usually caused by a pituitary adenoma (a benign tumor) that secretes excessive GH *before* the growth plates close.  In rare cases, it can be caused by other conditions stimulating GH production.

**Acromegaly:**

* **Onset:** Occurs after puberty, after the growth plates have closed.
* **Characteristics:** Does *not* cause increased height. Instead, it causes the thickening of bones and soft tissues.  This leads to characteristic features such as:
    * Enlarged hands and feet (ofte

In [187]:
import google.generativeai as genai
chatModel = genai.GenerativeModel("gemini-1.5-flash")

In [189]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate

In [190]:
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [196]:
# !pip install langchain-google-genai

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
chatModel = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=os.getenv("GOOGLE_API_KEY"))

In [194]:
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [195]:
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

Acromegaly is a disorder caused by the pituitary gland's abnormal release of a chemical, leading to increased bone and soft tissue growth and other bodily disturbances.  Gigantism is not explicitly defined in the provided text.  More information is needed to fully explain the relationship between acromegaly and gigantism.


In [215]:
test_questions = [
    {"question": "what the diabetes", "answer": "Diabetes mellitus is a condition where the body doesn't produce enough insulin or doesn't use insulin properly, resulting in high blood sugar (hyperglycemia). If left untreated, it can damage various organs. Early diagnosis is crucial for preventing serious complications."},
]

In [216]:
results = []
for item in test_questions:
    response = rag_chain.invoke({"input": item["question"]})
    results.append({"question": item["question"], "model_answer": response["answer"], "true_answer": item["answer"]})

In [217]:
def f1_score(pred, true):
    pred_tokens = set(pred.split())
    true_tokens = set(true.split())
    common = pred_tokens & true_tokens
    if not common:
        return 0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(true_tokens)
    return 2 * (precision * recall) / (precision + recall)

scores = [f1_score(r["model_answer"], r["true_answer"]) for r in results]
average_f1 = sum(scores) / len(scores)
print("Average F1 Score:", average_f1)

Average F1 Score: 0.8648648648648649
