In [2]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from typing import List
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv 

In [3]:

import os
load_dotenv()
os.chdir("../.")
%pwd

'c:\\Personal\\AI_Handson\\Projects\\Medical_Chatbot_with_Langchain'

In [4]:

def load_knowledge_files(data):
    loder = DirectoryLoader(
        path=data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    documents = loder.load()
    return documents

In [5]:
loded_data = load_knowledge_files("data")
print(len(loded_data))


52


In [6]:

def filter_required_data_from_Knowledge_files(document):

    filtered_document = []

    for doc in document:
        metadata_src = {
            "source": doc.metadata.get("source"),
            "page": doc.metadata.get("page")
            }
        filtered_document.append(
            Document(
                page_content=doc.page_content,
                metadata = {"source":metadata_src}
            )
        )
    
    return filtered_document

In [7]:
filtered_document = filter_required_data_from_Knowledge_files(loded_data)
print(filtered_document)

[Document(metadata={'source': {'source': 'data\\Medical_Oncology_Handbook_.pdf', 'page': 0}}, page_content='Abhishek Joshi \nCorinne Ryan \nSabe Sabesan \nSuresh Varma \nZulfiquer Otty \n \n \nDEPARTMENT OF MEDICAL ONCOLOGY \nTOWNSVILLE CANCER CENTRE \nTOWNSVILLE UNIVERSITY HOSPITAL  \nTOWNSVILLE, AUSTRALIA \n \n \n \n \n \n \n \n \n \n \nMEDICAL ONCOLOGY HANDBOOK \nFOR JUNIOR MEDICAL OFFICERS'), Document(metadata={'source': {'source': 'data\\Medical_Oncology_Handbook_.pdf', 'page': 1}}, page_content='Medical Oncology Handbook for Junior Medical Officers \n5\nth\n Edition June 2020, \nTownsville, Australia. \n \nTownsville Cancer Centre is a teaching partner of the James Cook University and \nresearch partner of the Australian Institute of Tropical Health &Medicine, \nTownsville, Queensland, Australia.'), Document(metadata={'source': {'source': 'data\\Medical_Oncology_Handbook_.pdf', 'page': 2}}, page_content='1 \n \n \nINTRODUCTION: \nWelcome to the Department of Medical Oncology at t

In [8]:

def perform_ext_spliting(filtered_document):
    text_splitter= RecursiveCharacterTextSplitter(
        chunk_size = 800,
        chunk_overlap = 50
    )
    text = text_splitter.split_documents(filtered_document)
    return text

In [9]:
text= perform_ext_spliting(filtered_document)
print(text[36])

page_content='• Dabrafenib (b-Raf inhibitor) 
Pyrexia, rash, squamous cell carcinomas of skin. 
• Mekinist – MEK inhibitor 
Usually combined with Dabrafenib to reduce in cidence of skin squamous cell 
carcinoma 
• Zolendronic acid 
Renal function, hypocalcaemia, requires dose reduction for renal impairment.  
Need calcium supplements.' metadata={'source': {'source': 'data\\Medical_Oncology_Handbook_.pdf', 'page': 15}}


In [12]:
def download_opensource_embeded_model():
    embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
    
    return embedding_model

In [13]:
embeding_model = download_opensource_embeded_model()

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 448.84it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [30]:
def create_vector_database():

    vector_db = FAISS.from_documents(
    documents=text,
    embedding=embeding_model
    )
    return vector_db
    
def save_vertor_database(vector_db):
    
    vector_db.save_local("vector_database\\oncology_faiss_index")

def load_local_vector_database(database_name):

    vector_db = FAISS.load_local(
    database_name,
    embeding_model,
    allow_dangerous_deserialization=True
    )

    return vector_db

In [19]:
vector_database= create_vector_database()
save_vertor_database(vector_database)

In [None]:
vector_db = load_local_vector_database("vector_database\\oncology_faiss_index")

RuntimeError: Error in __cdecl faiss::FileIOReader::FileIOReader(const char *) at D:\a\faiss-wheels\faiss-wheels\third-party\faiss\faiss\impl\io.cpp:70: Error: 'f' failed: could not open oncology_faiss_index\index.faiss for reading: No such file or directory

In [15]:
def retrive_relevent_document(vector_db):
    retriever = vector_db.as_retriever(
    search_type="mmr",
    search_kwargs={
        "k": 5,
        "fetch_k": 20,
        "lambda_mult": 0.7
        }
    )
    return retriever

In [27]:
retriever = retrive_relevent_document(vector_db)
query = "what is head and neck cancer"

docs= retriever.invoke(query)

In [17]:
chatModel = ChatOpenAI(model="gpt-4o")

In [26]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage

chat_template = ChatPromptTemplate.from_messages(
    [
        ("system","""You are a senior medical oncologist. Use ONLY the provided medical context to answer. 
        If information is missing, say you don't know. Provide structured, step-by-step clinical reasoning. Context: {document}"""),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "Clinical question: {topic}")
    ]
)

In [None]:
chat_history = []

while True:
    question = input("User: ")

    docs = retriever.invoke(question)
    context = "\n\n".join(doc.page_content for doc in docs)

    prompt = chat_template.invoke({
        "document": context,
        "topic": question,
        "chat_history": chat_history
    })

    response = chatModel.invoke(prompt)

    print("AI:", response.content)

    # save conversation
    chat_history.append(HumanMessage(content=question))
    chat_history.append(AIMessage(content=response.content))

chat_history = chat_history[-6:]

AI: Head and neck cancer refers to a group of biologically similar cancers that originate in the upper aerodigestive tract, which includes areas such as the nasal cavity, pharynx, larynx, oral cavity, and other nearby regions. These cancers are known for their potential complexity due to the anatomical structures involved. Treatment can vary based on whether the disease is resectable, unresectable, or if there is a need for organ preservation. For resectable disease, surgery is typically pursued, and postoperative radiotherapy with chemotherapy may improve survival in high-risk cases. When the disease is deemed unresectable or for the purpose of organ preservation, a combination of induction chemotherapy followed by chemoradiotherapy is often indicated. Metastatic head and neck cancer is treated with a regimen of chemotherapy, typically involving Cisplatin or Carboplatin in combination with 5FU/Capecitabine, and Nivolumab can be considered as a second-line treatment.


KeyboardInterrupt: Interrupted by user