# HYBRID SEARCH IN CHROMADB

In [1]:
import os
import sys
from dotenv import load_dotenv


# Load environment variables from a .env file.
# The location can be overridden through the DOTENV_PATH environment variable so
# the notebook is not tied to a single machine; when the variable is unset the
# original path is used, so existing setups keep working unchanged.
load_dotenv(os.environ.get("DOTENV_PATH", "D:/Code/AI/.env"))



True

#### Approach 1

2️⃣ Setup ChromaDB and Collection

In [2]:
import chromadb
from sentence_transformers import SentenceTransformer

# Open the on-disk ChromaDB store located under ./chroma_db
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Sentence-transformer model that produces the dense vectors for documents and queries
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Fetch the "hybrid_search" collection (created if it does not exist yet).
# The collection-level metadata selects cosine distance for the HNSW index;
# it is unrelated to per-document metadata filtering.
collection = chroma_client.get_or_create_collection(
    "hybrid_search",
    metadata={"hnsw:space": "cosine"},
)


  from .autonotebook import tqdm as notebook_tqdm


3️⃣ Add Documents with Metadata

In [3]:
# Toy corpus: each record carries an id, the raw text, and filterable metadata
docs = [
    {"id": "1", "text": "Machine learning is amazing.", "metadata": {"category": "AI", "author": "Alice"}},
    {"id": "2", "text": "The stock market is volatile.", "metadata": {"category": "Finance", "author": "Bob"}},
    {"id": "3", "text": "Neural networks are powerful models.", "metadata": {"category": "AI", "author": "Charlie"}},
]

# Split the records into the parallel lists ChromaDB expects
doc_ids = [doc["id"] for doc in docs]
doc_texts = [doc["text"] for doc in docs]
doc_metadatas = [doc["metadata"] for doc in docs]

# Convert texts to embeddings in a single batched call (one vector per text)
embeddings = embedding_model.encode(doc_texts).tolist()

# Write data to ChromaDB.
# `upsert` is used instead of `add` because the store is persistent and the ids
# are fixed: re-running this cell must not fail or depend on how the installed
# Chroma version treats ids that already exist. Upsert inserts new ids and
# overwrites existing ones, which makes the cell idempotent.
collection.upsert(
    ids=doc_ids,
    documents=doc_texts,
    embeddings=embeddings,
    metadatas=doc_metadatas,
)


3️⃣ Add Documents with Metadata

1. Change needed here: instead of the hard-coded sample documents, load a custom PDF using LangChain's PyMuPDF loader (`PyMuPDFLoader`).
2. Then split the extracted PDF text into chunks using LangChain's `SemanticChunker`.
    

4️⃣ Perform Hybrid Search (Dense + Sparse)

In [4]:
query = "deep learning advancements"

# Convert the query to a dense embedding with the same model used for the documents
query_embedding = embedding_model.encode(query).tolist()

# Dense (vector) search restricted by a metadata filter.
# NOTE: `collection.query` performs no BM25 / sparse scoring. `query_texts` is
# only an alternative way to supply the query (the collection embeds it itself),
# so it must not be passed together with `query_embeddings`: depending on the
# Chroma version the extra argument is either ignored or rejected.
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5,
    where={"category": "AI"},  # Metadata filtering
)

print(results)


Number of requested results 5 is greater than number of elements in index 3, updating n_results = 3


{'ids': [['3', '1']], 'embeddings': None, 'documents': [['Neural networks are powerful models.', 'Machine learning is amazing.']], 'uris': None, 'data': None, 'metadatas': [[{'author': 'Charlie', 'category': 'AI'}, {'author': 'Alice', 'category': 'AI'}]], 'distances': [[0.4906539336261596, 0.562261735540617]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


#### Approach 2

2️⃣ Load the PDF and Extract Text


Each document contains:

- `page_content` (text from the page)
- `metadata` (page number, source)


In [2]:
from langchain.document_loaders import PyMuPDFLoader

# Location of the PDF to index, relative to the notebook's working directory
pdf_path = "if-hp-cancer-guide-lyhe007-cll.pdf"  # Change this to your actual PDF file

# Build the PyMuPDF-backed loader for that file
loader = PyMuPDFLoader(pdf_path)

# Read the PDF into LangChain Document objects (text in page_content, plus metadata)
documents = loader.load()


3️⃣ Chunk the Data using LangChain's Semantic Chunker

We'll use the SemanticChunker from LangChain to split the text intelligently.

In [3]:
from langchain_experimental.text_splitter import SemanticChunker
# from sentence_transformers import SentenceTransformer

from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.text_splitter import SemanticChunker

# Use HuggingFaceEmbeddings (which has embed_documents), as required by SemanticChunker.
# NOTE: this rebinds `embedding_model`; from here on it is the LangChain wrapper,
# not a raw SentenceTransformer instance.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Initialize Semantic Chunker (split points chosen with the "percentile" threshold type)
text_splitter = SemanticChunker(embedding_model, breakpoint_threshold_type="percentile")

# Pull the plain text and the metadata out of the loaded Document objects.
# Both lists are in the same order, so entry i of one belongs to entry i of the other.
document_texts = [doc.page_content for doc in documents]
document_metadatas = [doc.metadata for doc in documents]

# Chunk the text while keeping each source document's metadata on its chunks
chunks = text_splitter.create_documents(document_texts, metadatas=document_metadatas)



  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


4️⃣ Store Chunks in ChromaDB
We'll store:

1. Chunked text

2. Embeddings

3. Metadata (page number, source, etc.)

In [4]:
import chromadb

# Initialize ChromaDB (persistent store on disk) and open the PDF collection,
# configured to use cosine distance
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.get_or_create_collection(name="pdf_search", metadata={"hnsw:space": "cosine"})

# Step 5: Prepare data for ChromaDB
chunk_texts = [chunk.page_content for chunk in chunks]
chunk_embeddings = embedding_model.embed_documents(chunk_texts)
# Fall back to a placeholder when a chunk has no metadata, so every entry is non-empty
chunk_metadatas = [chunk.metadata if chunk.metadata else {"source": "unknown"} for chunk in chunks]
# Ids are the chunk positions ("0", "1", ...), so they are the same on every run
chunk_ids = [str(i) for i in range(len(chunk_texts))]

# Step 6: Store in ChromaDB.
# `upsert` is used instead of `add` because the store is persistent and the ids
# repeat across runs: re-running this cell overwrites the existing entries rather
# than failing or silently keeping stale ones.
# NOTE(review): if a later run yields fewer chunks, entries with higher ids from
# an earlier run remain in the collection; delete the collection first if needed.
collection.upsert(
    ids=chunk_ids,
    documents=chunk_texts,
    embeddings=chunk_embeddings,
    metadatas=chunk_metadatas,
)

🔹 Step 7: Retrieve Documents with Relevance Score

In [7]:
from langchain.vectorstores import Chroma
from langchain.schema import Document

# Step 1: Load ChromaDB collection
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.get_or_create_collection(name="pdf_search", metadata={"hnsw:space": "cosine"})

# Step 2: Wrap Chroma in LangChain's Retriever
# (the query in Step 4 goes through the raw collection, not through this retriever)
vectorstore = Chroma(
    client=chroma_client,
    collection_name="pdf_search",
    embedding_function=embedding_model,
)

retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 5})

# Step 3: User Query
user_query = "Explain the Target Population"  # Replace with user input

# Step 4: Dense search with metadata filtering.
# The query is embedded explicitly with the same model that embedded the chunks.
# Passing `query_texts` instead would let the collection embed the query with its
# own embedding function, which is not guaranteed to match the stored vectors.
# No sparse/BM25 scoring is involved in `collection.query`.
user_query_embedding = embedding_model.embed_query(user_query)
search_results = collection.query(
    query_embeddings=[user_query_embedding],
    n_results=5,  # Retrieve top 5 relevant results
    where={"source": {"$eq": "if-hp-cancer-guide-lyhe007-cll.pdf"}},  # Metadata filtering
    include=["documents", "metadatas", "distances"],  # Distances are needed for the scores below
)

# Step 5: Display Results with Relevance Scores
# Results are nested per query; index 0 selects the single query issued above.
hits = zip(
    search_results["documents"][0],
    search_results["metadatas"][0],
    search_results["distances"][0],
)
for rank, (doc, metadata, distance) in enumerate(hits, start=1):
    print(f"🔹 Result {rank}:")
    print(f"Content: {doc}")
    print(f"Metadata: {metadata}")
    # The collection uses cosine distance, so similarity = 1 - distance
    print(f"Relevance Score: {1 - distance:.4f} (cosine distance: {distance:.4f})")
    print("-" * 50)


🔹 Result 1:
Content: No overall survival (OS) difference has 
yet been reported in this study. The ECOG 1912 study compared ibrutinib + rituximab to chemoimmunotherapy with fludarabine, 
cyclophosphamide and rituximab (FCR) in a young and fit CLL population.30  The results of the ECOG 
1912 study are very important because in contradiction to the other frontline studies of BTKi versus 
chemo-immunotherapy, this study observed both a PFS and an OS advantage of ibrutinib + R 
compared to FCR. The hazard ratio for PFS was 0.352 (95% CI 0.223-0.558; p<0.0001) and 0.168 
(95% CI 0.053-0.538; p=0.0003, pre-specified boundary for superiority p=0.0005) for OS, both 
favoured IR. No significant different was demonstrated in PFS in the subgroup of patients with 
mutated IgHV. The open-label, phase 3, FLAIR trial, randomized previously untreated CLL patients 
(N=771) to receive ibrutinib and rituximab or FCR (1:1). After a median 53 months of follow-up, 
median progression free survival was not r

✅ Delete a Specific Collection

In [None]:
import chromadb

# Re-open the persistent ChromaDB store under ./chroma_db
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Remove the "pdf_search" collection from the store
chroma_client.delete_collection("pdf_search")

print("✅ Collection deleted successfully!")

✅ Step 8: Pass Retrieved Chunks & Query to LLM for Generation<br>
Now that we've retrieved the most relevant chunks using Hybrid Search, let's pass them along with the user query to an LLM to generate an answer.<br>

🔹 How This Works<br>
Retrieve Relevant Chunks → (From ChromaDB, Step 7)<br>
Format Context → (Concatenate chunks into a structured prompt)<br>
Send to LLM → (Use an LLM like OpenAI’s GPT, Mistral, or Llama3)<br>
Generate Response → (LLM provides an answer based on context)<br>

In [9]:
#🔹 Step 8: Generate Response with LLM
from langchain_ollama import ChatOllama
from langchain.schema import Document

# Step 1: Create the chat model (deepseek-r1:14b accessed through ChatOllama)
llm = ChatOllama(model="deepseek-r1:14b", temperature=0.7)

# Step 2: Format Retrieved Chunks as Context
# Index 0 selects the hits of the single query; chunks are separated by blank lines.
retrieved_context = "\n\n".join(search_results["documents"][0])

# Step 3: Construct Prompt for LLM (retrieved context first, then the user's question)
formatted_prompt = f"""
You are an AI assistant with access to relevant documents.

### Context:
{retrieved_context}

### User Query:
{user_query}

Based on the given context, provide a detailed and accurate response.
"""

# Step 4: Generate Answer using LLM
response = llm.invoke(formatted_prompt)

# Step 5: Display Output (the generated text is in response.content)
print("🔹 LLM Response:")
print(response.content)



🔹 LLM Response:
<think>
Okay, so I need to explain the target population based on the provided context. Let me read through the context carefully to understand who is being studied or treated in these trials and guidelines.

First, I notice that there are mentions of studies like ECOG 1912 and FLAIR trial. The ECOG study compared ibrutinib plus rituximab (IR) with FCR (fludarabine, cyclophosphamide, and rituximab). It's noted that this was done in a "young and fit" CLL population. So, the target here seems to be younger, healthier individuals with Chronic Lymphocytic Leukemia.

Looking further, the FLAIR trial also randomized previously untreated CLL patients, specifically N=771, comparing IR versus FCR. The median follow-up was about 53 months. So again, this is a group of CLL patients who are treatment-naive and presumably fit enough to be part of a phase 3 trial.

In the guideline section, it mentions that FCR is compared with FC in younger fit CLL patients as well, from the GCLLSG 