In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
os.environ['HF_TOKEN']=os.getenv("HF_TOKEN")

Huggingface embedding model

In [2]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


Load PDF having text, imgage, tables

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader, PDFPlumberLoader
file_path="C:/Users/User/Desktop/AgenticAI2/data/Derm_Handbook_3rd-Edition-_Nov_2020-FINAL.pdf"
mupdfLoader= PyMuPDFLoader(file_path)
plumberLoader= PDFPlumberLoader(file_path)
docs1= mupdfLoader.load()
docs2=plumberLoader.load()
documents = docs1 + docs2


Splitting loaded data into chunks

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = text_splitter.split_documents(documents)

Connect to Mongo DB vector search database. Use flat index and store embeded data into mongo vecor database

In [None]:
from langchain.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient
client = MongoClient("mongodb+srv://me:12345@cluster0.jjamtmf.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
collection = client["dermatology_rag"]["dermatology_rag_collection"]

vectorstore = MongoDBAtlasVectorSearch(
    collection=collection,
    embedding=embeddings,
    index_name="default"
)
vectorstore.add_documents(chunks)
print(f"Inserted {len(chunks)} chunks into the vector store.")


  vectorstore = MongoDBAtlasVectorSearch(


Inserted 4833 chunks into the vector store.


Similarity search . Getting retriever

In [6]:
vector_retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 20, "score_threshold": 0.3}
)

BM25 retriever

In [7]:
from langchain.retrievers import BM25Retriever
from langchain_core.retrievers import BaseRetriever
bm25_retriever = BM25Retriever.from_documents(documents)
bm25_retriever.k = 10

Hybrid vector

In [8]:
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document
from pydantic import PrivateAttr

class HybridRetriever(BaseRetriever):
    _bm25_retriever: any = PrivateAttr()
    _vector_retriever: any = PrivateAttr()
    _alpha: float = PrivateAttr()
    _top_k: int = PrivateAttr()

    def __init__(self, bm25_retriever, vector_retriever, alpha: float = 0.5, top_k: int = 5):
        super().__init__()
        self._bm25_retriever = bm25_retriever
        self._vector_retriever = vector_retriever
        self._alpha = alpha
        self._top_k = top_k

    def _get_relevant_documents(self, query: str) -> list[Document]:
        bm25_docs = self._bm25_retriever.get_relevant_documents(query)
        vector_docs = self._vector_retriever.get_relevant_documents(query)

        scored_docs = {}

        for rank, doc in enumerate(bm25_docs):
            score = 1 - rank / len(bm25_docs)
            scored_docs[doc.page_content] = self._alpha * score

        for rank, doc in enumerate(vector_docs):
            score = 1 - rank / len(vector_docs)
            if doc.page_content in scored_docs:
                scored_docs[doc.page_content] += (1 - self._alpha) * score
            else:
                scored_docs[doc.page_content] = (1 - self._alpha) * score

        sorted_docs = sorted(scored_docs.items(), key=lambda x: x[1], reverse=True)

        result = []
        added = set()
        for doc_text, _ in sorted_docs:
            if doc_text not in added and len(result) < self._top_k:
                for doc in bm25_docs + vector_docs:
                    if doc.page_content == doc_text:
                        result.append(doc)
                        added.add(doc_text)
                        break

        return result


In [9]:
hybrid_retriever = HybridRetriever(bm25_retriever, vector_retriever, alpha=0.6, top_k=8)

In [10]:
print(collection.find_one({}, {"embedding": 1}))

{'_id': ObjectId('68401f3e13052cd1dee5e891'), 'embedding': [-0.005387810058891773, -0.03321627900004387, 0.013909813947975636, 0.030351582914590836, -0.0125221386551857, 0.005901284050196409, 0.08369802683591843, 0.01693200133740902, -0.15055610239505768, -0.01960763707756996, -0.004600342828780413, -0.12214155495166779, -0.0068047684617340565, 0.04622591286897659, -0.0664670541882515, 0.031013553962111473, -0.032221779227256775, -0.02272224612534046, 0.020355792716145515, -0.016892461106181145, -0.08868879079818726, 0.04607740417122841, 0.09465746581554413, -0.014864693395793438, -0.03721733018755913, -0.030444251373410225, 0.036712903529405594, 0.0004241647257003933, -0.06691686809062958, 0.062265440821647644, -0.05031697079539299, 0.04498080536723137, -0.030885547399520874, -0.012237059883773327, 0.030217688530683517, -0.06404484063386917, -0.0020141799468547106, 0.04671221971511841, 0.009758020751178265, 0.12039405852556229, -0.05271968990564346, -0.06231796741485596, -0.0370076633

Invoke hybrid_retriever

In [11]:
results = hybrid_retriever.invoke("what are common skin diseases?")
print(results)

  bm25_docs = self._bm25_retriever.get_relevant_documents(query)
No relevant docs were retrieved using the relevance score threshold 0.3




In [12]:
def get_embedding(text: str) -> list[float]:
    if not text.strip():
        print("empty text")
        return []
    
   
    embedding = embeddings.embed_query(text)
    return embedding


def vectorsearch(user_query: str, collection):
    query_embedding = get_embedding(user_query)
    print(query_embedding)
    
    if not query_embedding:
        return "Invalid or empty query"
    
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": 150,
                "limit": 5
            }
        },
        {
            "$project": {
                "fullplot": 1,
                "title": 1,
                "genres": 1,
                "score": {"$meta": "vectorSearchScore"},
                 "_id": 1,
                "text": 1,
                "chunk": 1
            }
        }
    ]
    
    result = collection.aggregate(pipeline)
    return list(result)


In [13]:
vectorsearch("what is common skin diseases?", collection)

[0.027969878166913986, 0.049181658774614334, -0.004127552732825279, 0.04028607904911041, 0.00453167362138629, 0.01499728299677372, 0.0761299803853035, 0.10632237046957016, -0.08701633661985397, 0.03138210251927376, 0.07858264446258545, -0.07025542110204697, -0.04146458953619003, 0.0640190839767456, -0.017147306352853775, -0.04439431056380272, -0.06132151186466217, -0.04078083857893944, 0.045978646725416183, 0.019097095355391502, -0.07035017013549805, 0.08180100470781326, 0.02937837690114975, -0.06527621299028397, -0.09450968354940414, -0.05165368318557739, 0.005715498700737953, -0.019486993551254272, -0.03276035562157631, -0.00669135944917798, -0.0704919770359993, 0.0038316440768539906, 0.012550466693937778, -0.004565250128507614, -0.027722731232643127, -0.028768518939614296, -0.01807238720357418, 0.015080343000590801, 0.027164170518517494, 0.04664185270667076, -0.08003021776676178, -0.04185783490538597, 0.014554919674992561, -0.013097517192363739, 0.01441480964422226, -0.0031855904962

[{'_id': ObjectId('6841c87d97121a4ffc5e1353'),
  'text': 'ter presents an overview of the causes, prevalence and\nimpact of skin disease.\nCauses\nThe skin is the boundary between ourselves and the\nworld around us. It is an important sense organ, and\ncontrols heat and water loss. It reﬂects internal changes\n1\nSkin disease in perspective\nTable 1.1 The most common categories\nof skin disorder in the UK.\nSkin cancer\nAcne\nAtopic eczema\nPsoriasis\nViral warts\nOther infective skin disorders\nBenign tumours and vascular lesions\nLeg ulcers',
  'title': '',
  'score': 0.85576331615448},
 {'_id': ObjectId('68412321d8ccc62b15d06462'),
  'text': 'Dermatology: Handbook for medical students & junior doctors  \n \n \n \nBritish Association of Dermatologists \n6\n \n \n \n \n• \nDermatology is the study of both normal and abnormal skin and associated \nstructures such as hair, nails, and oral and genital mucous membranes. \n \n \n \n \n• \nSkin diseases are very common, affecting up to a th

In [14]:
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document

class MongoDBVectorRetriever(BaseRetriever):
    def __init__(self, collection):
        self.collection = collection

    def _get_relevant_documents(self, query: str) -> list[Document]:
        results = vectorsearch(query, self.collection)

        documents = []
        for res in results:
            text = res.get("text", "") or res.get("chunk", "")
            metadata = {
                "title": res.get("title", ""),
                "genres": res.get("genres", ""),
                "score": res.get("score", 0),
                "_id": str(res.get("_id"))
            }
            documents.append(Document(page_content=text, metadata=metadata))

        return documents


In [15]:
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document
from typing import List
from pymongo.collection import Collection
from langchain.embeddings import HuggingFaceEmbeddings

# Your vector search function
def get_embedding(text: str) -> list[float]:
    if not text.strip():
        print("empty text")
        return []
    return embeddings.embed_query(text)

def vectorsearch(user_query: str, collection: Collection):
    query_embedding = get_embedding(user_query)
    if not query_embedding:
        return []

    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": 150,
                "limit": 5
            }
        },
        {
            "$project": {
                "fullplot": 1,
                "title": 1,
                "genres": 1,
                "score": {"$meta": "vectorSearchScore"},
                "_id": 1,
                "text": 1,
                "chunk": 1
            }
        }
    ]
    return list(collection.aggregate(pipeline))


class MongoDBVectorRetriever(BaseRetriever):

    def __init__(self, collection: Collection):
        super().__init__()
        self._collection = collection

    def _get_relevant_documents(self, query: str) -> List[Document]:
        results = vectorsearch(query, self._collection)

        docs = []
        for res in results:
            content = res.get("text") or res.get("chunk", "")
            metadata = {
                "title": res.get("title", ""),
                "score": res.get("score", 0),
                "_id": str(res.get("_id"))
            }
            docs.append(Document(page_content=content, metadata=metadata))
        return docs


In [None]:
from pymongo import MongoClient
from langchain.embeddings import HuggingFaceEmbeddings


client = MongoClient("mongodb+srv://me:12345@cluster0.jjamtmf.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
db = client["dermatology_rag"]
collection = db["dermatology_rag_collection"]


embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


mongodb_vector_retriever = MongoDBVectorRetriever(collection)
results = mongodb_vector_retriever.invoke("what is dermetology?")

for doc in results:
    print(doc.page_content, "\n", doc.metadata)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Clinical Dermatology 
 {'title': '', 'score': 0.7981039881706238, '_id': '6841c87d97121a4ffc5e1321'}
Dermatology: Handbook for medical students & junior doctors
What is dermatology?
• Dermatology is the study of both normal and abnormal skin and associated
structures such as hair, nails, and oral and genital mucous membranes.
Why is dermatology important?
• Skin diseases are very common, affecting up to a third of the population at any one
time.
• Skin diseases have serious impacts on life. They can cause physical damage, 
 {'title': '', 'score': 0.7871807813644409, '_id': '6840fbdc085201a2740832d1'}
Dermatology: Handbook for medical students & junior doctors
What is dermatology?
• Dermatology is the study of both normal and abnormal skin and associated
structures such as hair, nails, and oral and genital mucous membranes.
Why is dermatology important?
• Skin diseases are very common, affecting up to a third of the population at any one
time.
• Skin diseases have serious impacts on lif

In [17]:
docs = hybrid_retriever.invoke("what is dermetology？")
print(f"Retrieved {len(docs)} context documents.")
print(docs[0].page_content if docs else "No context found.")

No relevant docs were retrieved using the relevance score threshold 0.3


Retrieved 8 context documents.
298
CHAPTER 20
Nodular prurigo (Fig. 20.7) may be a variant on this
theme as manifested in atopic subjects, who scratch
and rub remorselessly at their extremely itchy nodules.
Hair-pulling habit
Trichotillomania is too dramatic a word for what is
usually only a minor comfort habit in children, rank-
ing alongside nail-biting and lip-licking. Perhaps the
term should be dropped in favour of ‘hair-pulling
habit’. It is usually of little consequence, and children
who twist and pull their hair, often as they are going
to sleep, seldom have major psychiatric disorders. The
habit often goes away most quickly if it is ignored.
However, more severe degrees of hair-pulling are some-
times seen in disturbed adolescents and in those with
learning difﬁculties; then the outlook for full regrowth
is less good, even with formal psychiatric help.
The diagnosis can usually be made on the history,
but some parents do not know what is going on. The
bald areas do not show the

In [24]:
from langchain import hub
prompt=hub.pull("rlm/rag-prompt")

In [25]:
from langchain_core.prompts import PromptTemplate
prompt_template=PromptTemplate(
    template="""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:""",
    input_variables=['context','question'],
)

In [20]:
from langchain_groq import ChatGroq
model=ChatGroq(model="gemma2-9b-it")
model

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x00000206362BB620>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x0000020636878440>, model_name='gemma2-9b-it', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [26]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain=(
    {"context": hybrid_retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    |StrOutputParser()
)

In [34]:
query="what is Dermatology?"
rag_results=rag_chain.invoke(query)
print(rag_results)

No relevant docs were retrieved using the relevance score threshold 0.3


Dermatology is the branch of medicine concerned with the diagnosis, treatment, and prevention of diseases of the skin.  



In [29]:
from docx import Document
import os


output_path = "vector_search_results.docx"


if os.path.exists(output_path):
    doc = Document(output_path)
else:
    doc = Document()
    doc.add_heading('Vector Search Results', level=1)
doc.add_paragraph("\n---\n")
doc.add_paragraph(f"Query: {query}")
doc.add_paragraph(f"Result: {rag_results}")

doc.save(output_path)