# Safely Store ENV variables for Gemini, Chroma DB connection, and vector embeddings
### Additionl techniques: cleared memory of any existing variables to avoid accidental exposure. Performed due to significant security risk of exposing PII and PHI for healthcare data.
---

In [None]:
# # load dotenv + imports for retriever tool
from getpass import getpass
import os
from dotenv import load_dotenv
from langchain_chroma import Chroma 
from langchain.tools import tool
import langchainhub as hub
import chromadb
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.documents import Document
from langchain_community.retrievers import BM25Retriever
from langchain_classic.retrievers.ensemble import EnsembleRetriever
from langchain_core.runnables import RunnableLambda, RunnableSequence, RunnableMap, RunnablePassthrough, RunnableParallel
from langchain_core.runnables import Runnable

load_dotenv()

api_key = os.getenv("CHROMA_API_KEY") or getpass("Paste CHROMA_API_KEY: ")
tenant = os.getenv("CHROMA_TENANT") or input("Enter CHROMA_TENANT: ")
database = os.getenv("CHROMA_DATABASE") or input("Enter CHROMA_DATABASE: ")
gemini_key = os.getenv("GEMINI_API_KEY") or getpass("Enter GEMINI_API_KEY: ")

os.environ["GOOGLE_API_KEY"] = gemini_key

# Load environment variables from chromaDB --CloudClient
client = chromadb.CloudClient(
    api_key=api_key, 
    tenant=tenant, 
    database=database)

OLD_COLLECTION = "ng12"
NEW_COLLECTION = "NG12_GEMINI3072"

embedding = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

#CHECK DIMENSIONS
vec = embedding.embed_query("dimension check")
print("Gemini embedding dimensions:", len(vec))

# attaching old to new collection with embedding function for semantic search
old_vs = Chroma(
    client=client,
    collection_name=OLD_COLLECTION
)

old_data = old_vs.get(include=["documents", "metadatas"])
old_docs = old_data.get("documents", [])
old_metas = old_data.get("metadatas", [])

if not old_docs:
    raise ValueError(f"No documents found in OLD collection '{OLD_COLLECTION}'.")

docs_to_copy = [
    Document(page_content=text, metadata=(meta or {}))
    for text, meta in zip(old_docs, old_metas)
]

print(f"Found {len(docs_to_copy)} chunks in '{OLD_COLLECTION}'.")

#new colletion with GEMINI
new_vs = Chroma(
    client=client,
    collection_name=NEW_COLLECTION,
    embedding_function=embedding
)

#semantic retriever/ vector retriever
vector_retriever = new_vs.as_retriever(search_kwargs={"k": 5})

# keyword with BM25 retriever
new_data = new_vs.get(include=["documents", "metadatas"])
documents_from_chroma = [
    Document(page_content=text, metadata=(meta or {}))
    for text, meta in zip(new_data["documents"], new_data["metadatas"])
]

bm25_retriever = BM25Retriever.from_documents(documents_from_chroma)
bm25_retriever.k = 20


#hybrid retriever 
hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, bm25_retriever],
    weights=[0.5, 0.5]
)

Gemini embedding dimensions: 3072
Found 300 chunks in 'ng12'.


ValueError: not enough values to unpack (expected 3, got 0)

In [16]:
# 1) What does YOUR embedding object output?
vec = embedding.embed_query("test")
print("embedding class:", type(embedding))
print("embedding dim:", len(vec))

embedding class: <class 'langchain_google_genai.embeddings.GoogleGenerativeAIEmbeddings'>
embedding dim: 3072


In [17]:
print("vectorstore embedding:", type(vectorstore._embedding_function))
test_vec2 = vectorstore._embedding_function.embed_query("test")
print("vectorstore embedding dim:", len(test_vec2))

vectorstore embedding: <class 'langchain_google_genai.embeddings.GoogleGenerativeAIEmbeddings'>
vectorstore embedding dim: 3072


In [2]:
# helper for context engineering
def get_cancer_context(docs):
    context = []
    for i, doc in enumerate(docs):
        page = doc.metatdata.get("page", "?")
        block = f"[Source: Page {page}] {doc.page_content}"
        context.append(block)
    return "\n---\n".join(context)

In [None]:
# ! pip install -qU langchain langchain-core langchain-community
#! pip install -U langchain langchain-community langchain-chroma rank_bm25 flashrank

# Hybrid Retrieval Search with BM25 semantic + keyword search
---

### Semantic Retriever
---

###  Lexical Retriever with BM25
---

In [None]:
from langchain_core.documents import Document

collection_data = vectorstore.get()

documents_from_chroma = [
    Document(page_content=doc, metadata=meta)
    for doc, meta in zip(
        collection_data["documents"],
        collection_data["metadatas"]
    )
]

bm25_retriever = BM25Retriever.from_documents(documents_from_chroma)
bm25_retriever.k = 20

hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, bm25_retriever],
    weights=[0.5, 0.5]
)

results = hybrid_retriever.invoke(
    "Urgent referral criteria for lung cancer"
)

### **Combine retrievers with EnsembleRetriever + BM25 for hybrid retrieval**
---
### **Tradeoffs**: 

#### **Balance vs. Imbalanced Weights for retrievers:**
---

**Why did I use 0.5 semantic and 0.5 lexical for the weights in my hybrid retriever?**

**Reasoning:** 
I wanted to balance the weight + importance of both the semantic and lexical retrieval meaning. The way the query's are formatted is unknown to me at this stage, so the best bet was to start with an equal balance and then adjust as needed.

Also, to compensate for the equal balance, I'm adding a lightweight **re-ranker LLM FlashRank**, it uses a pointwise reranking system. I chose this because it's fast, efficient, and runs FREE locally on my machine. At the end of the day, I would say that the semantic retriever will likely be more important for the types of queries that will be used in my clinical application, but I wanted to give the lexical retriever a fair chance to contribute as well.


In [None]:
# langchain depreacted the get relevant docs method for Ensemble Retrievers, updated method uses runnable utilities
from langchain_core.runnables import RunnableLambda, RunnableSequence, RunnableMap, RunnablePassthrough, RunnableParallel
from langchain_core.runnables import Runnable

# Perform Context Engineering to prepare for Agentic Reasoning
---
### Tradeoffs:

**Why did I choose this approach for context engineering?**

**Reasoning:** 
At first, my goal was to return a single string of relevant documents. However, I realized that my agent would have trouble parsing my actual data in production. Based on the way the data is formatted, I had to make some adjustments, and go a bit deeper with formatting. Also, one of the key constraints in building this agent included a task to cite the specific sources within the relevant data retrieved from the NG12 documents. I could have kept my single string, however, there would have been significant fine-tuning and overhead later. Performing context engineering was the only plausible to way achieve this goal. 

#### Adding Clinical Context Tool

In [None]:
# clinical context tooling for response -pg143
from langchain_core.tools import BaseTool, tool
@tool
def get_cancer_context(docs):
    """
    Tool to extract the relevant clinial context from the retrieved documents. 
    This tool will be how we prepare the reasoning for the agent, to ensure 
    correct hybrid results for the NG12 Cancer Risk Assessor guidelines engine.
    """

    context =[]
    for i, doc in enumerate(docs):
        page = doc.metadata.get("page", "Unknown Page")
        section = doc.metadata.get("section", "General Guidance")

        block = (
            f"[CLINICAL EVIDENCEM {i+1}]\n"
            f"SOURCE: NICE Guideline NG12, Page {page}, Section: {section}\n"
            f"TEXT: {doc.page_content}\n"
        )
        context.append(block)
    return "\n---\n".join(context)

