# PDF Extraction

In [2]:
%pwd

'z:\\VsCode\\Medussa-Medical Chatbot\\Medussa--Medical-Chatbot-\\research'

In [3]:
import os

# The notebook lives in `research/`; step up to the project root so that
# project-relative paths resolve. The guard keeps the cell idempotent:
# re-running it no longer climbs one more directory on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'z:\\VsCode\\Medussa-Medical Chatbot\\Medussa--Medical-Chatbot-'

In [5]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [11]:
# Extract PDF Files
def load_pdfs(directory):
    """Load the PDF files of ``directory`` as documents.

    A ``DirectoryLoader`` is configured with the ``*.pdf`` glob and with
    ``PyPDFLoader`` as the per-file loader; the result of its ``load()``
    call is returned unchanged.
    """
    pdf_loader = DirectoryLoader(directory, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [14]:
# Raw string literal: the Windows path contains backslash sequences ("\V",
# "\M", "\d") that are not valid escapes in a normal string, which makes
# Python emit an invalid-escape warning. The resulting path is unchanged.
extracted_data=load_pdfs(r"Z:\VsCode\Medussa-Medical Chatbot\Medussa--Medical-Chatbot-\data")

  extracted_data=load_pdfs("Z:\VsCode\Medussa-Medical Chatbot\Medussa--Medical-Chatbot-\data")


In [15]:
# extracted_data
# Sanity check: count the items returned by load_pdfs().
len(extracted_data)

637

In [16]:
from typing import List
from langchain.schema import Document  # Document is an object that is used for compatibility

def filter_extracted_data(docs:List[Document])->List[Document]:
    """Return slimmed-down copies of ``docs``.

    Every returned ``Document`` keeps the original ``page_content``, while its
    metadata is reduced to a single ``'source'`` entry (``None`` when the
    original metadata has no such key). A new list is built; the input
    documents themselves are not modified.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={'source': original.metadata.get('source')},
        )
        for original in docs
    ]

In [17]:
# Keep only page_content and the 'source' metadata entry of each loaded document.
minimal_docs=filter_extracted_data(extracted_data)

In [19]:
# minimal_docs

In [7]:
# Split the documents into smaller chunks
def text_split(minimal_docs, chunk_size=700, chunk_overlap=10):
    """Split documents into smaller chunks.

    Args:
        minimal_docs: Documents to split; handed to ``split_documents``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
            Defaults to 700, the value this notebook originally hard-coded.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.
            Defaults to 10, the value this notebook originally hard-coded.

    Returns:
        The chunks produced by ``split_documents`` for ``minimal_docs``.
    """
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,    # overlap kept for understanding the context
    )
    return text_splitter.split_documents(minimal_docs)

In [20]:
# Chunk the slimmed-down documents and report how many chunks were produced.
texts_chunks=text_split(minimal_docs)
print(f"No of chunks {len(texts_chunks)}")

No of chunks 4203


# Embedding creation

In [21]:
from langchain.embeddings import HuggingFaceEmbeddings
def download_embeddings():
    """Build the embedding model used by this notebook.

    Returns a ``HuggingFaceEmbeddings`` instance configured with the
    ``NeuML/pubmedbert-base-embeddings`` model name.
    """
    return HuggingFaceEmbeddings(model_name="NeuML/pubmedbert-base-embeddings")

In [22]:
# Instantiate the embedding model once; it is passed to the vector store later on.
embeddings=download_embeddings()

  embeddings=HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [24]:
# embeddings.embed_query("Hello World")

In [6]:
from dotenv import load_dotenv
import os
# Load settings from a .env file (python-dotenv) so that the os.environ
# lookups for the Weaviate credentials further down can find them.
load_dotenv()

True

# Add to Weaviate vector database

In [None]:
import weaviate
# Read the Weaviate connection settings from the environment (None when unset).
WEAVIATE_URL=os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY=os.getenv("WEAVIATE_API_KEY")


In [63]:
import os

# Publish the Weaviate settings to the process environment for the cells that
# read them back with os.getenv(). os.environ only accepts strings, so a
# missing value is reported by name instead of surfacing as an opaque
# "str expected, not NoneType" TypeError.
for _name, _value in (
    ("WEAVIATE_URL", WEAVIATE_URL),
    ("WEAVIATE_API_KEY", WEAVIATE_API_KEY),
):
    if _value is None:
        raise RuntimeError(f"{_name} is not set; define it in the environment or the .env file")
    os.environ[_name] = _value

In [31]:
import weaviate, os
import weaviate.classes as wvc

# Pull the cluster address and the API key from the environment.
URL = os.environ.get("WEAVIATE_URL")
APIKEY = os.environ.get("WEAVIATE_API_KEY")

# Open a client against the Weaviate Cloud cluster, authenticating with the
# API key.
client = weaviate.connect_to_weaviate_cloud(
    auth_credentials=wvc.init.Auth.api_key(APIKEY),
    cluster_url=URL,
)

# Last expression of the cell, so its result (the readiness flag) is displayed.
client.is_ready()

True

In [32]:
from langchain_weaviate import WeaviateVectorStore
import weaviate
from tqdm import tqdm

# LangChain vector store bound to the "Medussa" index of the connected
# Weaviate client, using the notebook's embedding model and "page_content"
# as the text key.
doc_search = WeaviateVectorStore(
    client=client,
    index_name="Medussa",
    embedding=embeddings,
    text_key="page_content"
)

# Upload the chunks in slices of `batch_size`, with a progress bar over the
# slice start offsets.
batch_size = 100
batch_starts = range(0, len(texts_chunks), batch_size)
for start in tqdm(batch_starts, desc="Uploading to Weaviate"):
    doc_search.add_documents(texts_chunks[start:start+batch_size])


Uploading to Weaviate: 100%|██████████| 43/43 [11:25<00:00, 15.94s/it]


In [36]:
# A small hand-written Document, used below to try adding one extra entry
# to the vector store.
dswith=Document(
    page_content="Hello my friend",
    metadata={"source":"me"}
)

# Add more documents 

In [37]:
# Insert the extra document; the displayed result is the list returned by
# add_documents (presumably the ids of the stored objects).
doc_search.add_documents(documents=[dswith])

['e3debe14-6022-4e2b-9513-e8a74116a27e']

In [41]:
# Retriever view of the vector store: similarity search with k=3 results per query.
retriever=doc_search.as_retriever(search_type='similarity',search_kwargs={"k":3})

In [43]:
# Smoke-test the retriever with a sample question and display what it returns.
retriver_docs=retriever.invoke("What is Acne")
retriver_docs

[Document(metadata={'text': None, 'source': 'Z:\\VsCode\\Medussa-Medical Chatbot\\Medussa--Medical-Chatbot-\\data\\Medical_book.pdf'}, page_content='disease specialist, or an endocrinologist, a specialist who\ntreats diseases of the body’s endocrine (hormones and\nglands) system.\nAcne has a characteristic appearance and is not diffi-\ncult to diagnose. The doctor takes a complete medical\nhistory, including questions about skin care, diet, factors\ncausing flare-ups, medication use, and prior treatment.\nPhysical examination includes the face, upper neck,\nchest, shoulders, back, and other affected areas. Under\ngood lighting, the doctor determines what types and how\nmany blemishes are present, whether they are inflamed,\nwhether they are deep or superficial, and whether there is\nscarring or skin discoloration.'),
 Document(metadata={'text': None, 'source': 'Z:\\VsCode\\Medussa-Medical Chatbot\\Medussa--Medical-Chatbot-\\data\\Medical_book.pdf'}, page_content='and sweating in hot we

In [11]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model; it is handed to the question-answer chain later on.
chatModel = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro",
    temperature=0.0,    # presumably chosen to keep answers as repeatable as possible
    max_tokens=None,    # no explicit token cap configured here
    timeout=None,       # no explicit timeout configured here
    max_retries=2
)



In [57]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


In [58]:
# System message: the assistant's instructions, followed by a blank line and
# the {context} placeholder (presumably filled with the retrieved documents
# by the documents chain).
system_prompt = (
    "You are a Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system instructions above, then the user's
# question supplied through the {input} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


In [59]:
# Create the question-answer chain from the chat model and the prompt template
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
# Create the retrieval-augmented generation (RAG) chain by pairing the
# retriever with the question-answer chain
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [60]:
# Invoke the chain with a query; the question is passed under the "input" key
response = rag_chain.invoke({"input": "What is Acromegaly and gigantism?"})

# The generated text is read from the "answer" key of the result
print(response["answer"])

Acromegaly and gigantism are rare disorders caused by an excess of growth hormone (GH) released from the pituitary gland. Gigantism occurs in children whose bony growth plates have not yet closed, leading to exceptional growth of long bones and unusual height. When this condition occurs in adults after bone growth has stopped, it is called acromegaly.
