# Safely Store ENV variables for Gemini, Chroma DB connection, and vector embeddings
### Additional techniques: cleared memory of any existing variables to avoid accidental exposure. This was done because of the significant security risk of exposing PII and PHI in healthcare data.
---

In [None]:
#! pip install -U langchain-google-vertexai


In [None]:
# ! pip install -U llama_index chromadb google-generativeai

In [None]:
#! pip install llama-index llama-index-vector-stores-chroma llama-index-embeddings-huggingface llama-index-retrievers-bm25


In [None]:
#! pip install rank-bm25
#! pip install langchain-classic rank_bm25 langchain-community

In [4]:
# imports 
# # load dotenv + imports for retriever tool
from getpass import getpass
import os
from dotenv import load_dotenv
from langchain_chroma import Chroma 
from langchain.tools import tool
import langchainhub as hub
import chromadb
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import vertexai
from vertexai.language_models import TextEmbeddingModel
from langchain_community.retrievers import BM25Retriever
from langchain_core.runnables import RunnableLambda, RunnableSequence, RunnableMap, RunnablePassthrough, RunnableParallel
from langchain_core.runnables import Runnable
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, VectorStoreIndex, Document
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever


In [5]:

# Configuration cell: gather credentials, export the Google ones to the process
# environment, open the Chroma Cloud client, and create the embedding clients.
load_dotenv()

# Each setting is read from the environment first; when it is unset or empty the
# user is prompted instead. getpass() is used for the secret values and input()
# for the tenant and database names.
api_key = os.getenv("CHROMA_API_KEY") or getpass("Paste CHROMA_API_KEY: ")
tenant = os.getenv("CHROMA_TENANT") or input("Enter CHROMA_TENANT: ")
database = os.getenv("CHROMA_DATABASE") or input("Enter CHROMA_DATABASE: ")
gemini_key = os.getenv("GEMINI_API_KEY") or getpass("Enter GEMINI_API_KEY: ")
# NOTE(review): GOOGLE_APPLICATION_CREDENTIALS is conventionally a path to a
# service-account JSON file rather than a key string -- confirm what users
# are expected to type at this prompt.
vertex_ai_key = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") or getpass("Enter GOOGLE_APPLICATION_CREDENTIALS: ")

# Export the Google settings under the variable names set below so that the
# clients created later in this cell can pick them up from the environment.
os.environ["GOOGLE_API_KEY"] = gemini_key
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = vertex_ai_key

# Chroma Cloud client built from the credentials gathered above
client = chromadb.CloudClient(
    api_key=api_key, 
    tenant=tenant, 
    database=database)

# Gemini embedding client. The embed_query() call is a smoke test only: its
# return value is discarded here, so nothing is displayed or stored.
embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")
embeddings.embed_query("NG12 Cancer Risk Assessor guidelines")

# Vertex AI-backed variant of the same embedding model.
# NOTE(review): this is bound to `embedding` (singular); the later cells in
# this notebook pass `embeddings` (plural) -- confirm which one is intended.
embedding = GoogleGenerativeAIEmbeddings(
    model="gemini-embedding-001",
    project="CancerRiskAgent",
    vertexai=True,
)

In [6]:
# Look up the existing Chroma collection by name.
COLLECTION_NAME = "ng12"
chroma_collection = client.get_collection(name=COLLECTION_NAME)

# Vector store: wrap the Chroma collection in llama_index's ChromaVectorStore.
vector_store = ChromaVectorStore(
    chroma_collection=chroma_collection
)

# Storage context built on top of that vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# persist() is called with no arguments, so the library's default location is used.
# NOTE(review): confirm that writing to the default persist directory is intended.
storage_context.persist()

# Enhancing Retrieval in Responses by Improving Contextual Understanding and Relevance with Llama_index
---
#### **Tradeoffs:**
- Originally, my plan was to use the same framework for both retrievers and then fetch the data from my vector store in Chroma. However, I found that my BM25 retriever simply would not return any data in either framework (LangChain or llama_index). Later on, I found out that, because of how the BM25 retriever is built, the keyword search library needs access to **raw text nodes**. This does not align with my current Chroma vector store architecture, which is built **with stored embeddings, not the original text**. That is why no data was being returned in my prior iteration.
#### **How I solved for this:**
To mitigate this, I decided to change my framework and use llama_index for the BM25/lexical retriever; it provides a much more seamless integration with the Chroma vector store architecture. I then used llama_index to pass my list of nodes directly to the BM25 retriever, which solved the problem.
#### **Improvement techniques with Llama_index:**
- Use raw text nodes for BM25 retrieval to ensure compatibility with the keyword search library.
- Implement response enhancement by providing additional context or *page_content* for my document queries.

In [7]:
# adding response enhancement for generated outputs
# Seed guidance snippets used for response enhancement.
# Every snippet cites the same guideline title, so it is kept in one place and
# each row only carries what varies: (text, referral label, page).
_NG12_SOURCE = "Suspected cancer: recognition and referral (NG12) 2026"

_NG12_SEED_ROWS = (
    (
        "Shortness of breath with cough or fatigue or chest pain or weight loss or appetite loss (unexplained), 40 and over: possible cancer Lung or mesothelioma",
        "Urgent",
        "52",
    ),
    (
        "Bleeding, bruising or petechiae, unexplained: possible cancer Leukaemia",
        "Very urgent",
        "43",
    ),
    (
        "Fracture unexplained, 60 and over: possible cancer Myeloma",
        "Unexplained",
        "55",
    ),
    (
        "Refer people using a suspected cancer pathway referral for oesophageal cancer if they: have dysphagia or, are aged 55 and over, with weight loss, and they have any of the following: upper abdominal pain, reflux, dyspepsia. [2015, amended 2025]",
        "Suspected cancer pathway referral",
        "11",
    ),
    (
        "Skin lesion that raises the suspicion of a basal cell carcinoma: possible cancer Basal cell carcinoma  ",
        "Raises the suspicion of",
        "58",
    ),
    (
        "Urinary urgency or frequency, increased and persistent or frequent, particularly more than 12 times per month in women, especially if 50 and over: possible cancer Ovarian",
        "Persistent",
        "60",
    ),
    (
        "Upper abdominal pain with low haemoglobin levels or raised platelet count or nausea or vomiting, 55 and over: possible cancer Oesophageal or stomach ",
        "Non-urgent",
        "40",
    ),
    (
        "Petechiae unexplained in children and young people: possible cancer Leukaemia",
        "Immediate",
        "74",
    ),
)

# One Document per row; metadata keys are "referral", "source" and "page".
documents = [
    Document(
        text=snippet,
        metadata={"referral": referral, "source": _NG12_SOURCE, "page": page},
    )
    for snippet, referral, page in _NG12_SEED_ROWS
]

# Stopping here for now--will continue to add more techniques 

In [None]:
# Embed the seed documents and write them into the Chroma-backed store.
# llama_index selects the embedding model through the `embed_model` keyword;
# the previous `embeddings=` keyword is not that parameter, so the Gemini
# embeddings were never actually applied to this index.
# NOTE(review): `embeddings` is a LangChain embeddings object; llama_index
# needs its LangChain embedding adapter installed to accept it -- confirm.
VectorStoreIndex.from_documents(documents, storage_context=storage_context, embed_model=embeddings)

In [None]:
# Re-open the index on top of the existing Chroma-backed vector store.
# `embed_model` (not `embeddings`) is the keyword llama_index uses to choose
# the embedding model, so query embeddings match the ones stored in Chroma.
index = VectorStoreIndex.from_vector_store(
    vector_store,
    storage_context=storage_context,
    embed_model=embeddings,
)

# Semantic retriever: top 5 nearest neighbours from the vector index.
vector_retriever = index.as_retriever(similarity_top_k=5)

# Lexical retriever: BM25 works on raw text, so it is built directly from the
# in-memory llama_index documents. The llama_index BM25Retriever (the binding
# left in scope by the import cell) is constructed with from_defaults(nodes=...).
bm25_retriever = BM25Retriever.from_defaults(nodes=documents)

In [None]:
# semantic retriever (earlier iteration)
# NOTE(review): `vector_store` is assigned earlier in this notebook as a
# llama_index ChromaVectorStore, while `as_retriever(search_kwargs=...)` is the
# LangChain vector-store call style -- confirm this cell still runs, or retire
# it in favour of the index-based `index.as_retriever(...)` cell.
vector_retriever = vector_store.as_retriever(search_kwargs={"k": 5})

In [None]:
# Pull the stored texts and metadata out of the vector store and rebuild
# Document objects from them so BM25 has raw text to index.
# NOTE(review): this assumes `vector_store.get()` returns a mapping with
# parallel 'documents' and 'metadatas' lists, and that `Document` accepts
# `page_content` (LangChain style). The `Document` imported at the top of the
# notebook comes from llama_index and is built with `text=` elsewhere -- confirm.
doc_data = vector_store.get()
documents = [Document(page_content=d, metadata=m) for d, m in zip(doc_data['documents'], doc_data['metadatas'])]

In [None]:
# lexical retriever (earlier iteration)
# NOTE(review): `BM25Retriever` is imported twice at the top of the notebook
# (langchain_community first, then llama_index), so the llama_index class is
# the one bound here. `from_documents` and the `.k` attribute look like the
# LangChain API -- confirm which class this cell is meant to use.
bm25_retriever = BM25Retriever.from_documents(documents)
# presumably the number of results to return per query; verify against the class in use
bm25_retriever.k = 20

In [None]:
# semantic retriever / vector retriever (earlier iteration)
# NOTE(review): `new_vs` is not defined anywhere in the visible notebook --
# presumably a LangChain Chroma vector store from a removed cell; confirm.
vector_retriever = new_vs.as_retriever(search_kwargs={"k": 5})

# keyword retriever with BM25: fetch the stored texts and metadata, then wrap
# each pair in a Document (a missing metadata entry becomes an empty dict)
new_data = new_vs.get(include=["documents", "metadatas"])
documents_from_chroma = [
    Document(page_content=text, metadata=(meta or {}))
    for text, meta in zip(new_data["documents"], new_data["metadatas"])
]

bm25_retriever = BM25Retriever.from_documents(documents_from_chroma)
bm25_retriever.k = 20


# hybrid retriever: both retrievers are given equal weight (0.5 / 0.5)
# NOTE(review): `EnsembleRetriever` is not imported in the visible notebook --
# add the import before running this cell.
hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, bm25_retriever],
    weights=[0.5, 0.5]
)

In [None]:
# helper for context engineering
def get_cancer_context(docs):
    """Format retrieved documents into one page-cited context string.

    Args:
        docs: Iterable of document-like objects exposing ``page_content``
            and a ``metadata`` mapping. ``None`` is treated as empty.

    Returns:
        One ``"[Source: Page <page>] <text>"`` block per document, joined by
        ``"\\n---\\n"``. The page falls back to ``"?"`` when the metadata has
        no ``"page"`` entry. An empty input yields an empty string.
    """
    blocks = []
    for doc in docs or []:
        # The attribute is `metadata`; the earlier `metatdata` spelling raised
        # AttributeError on every call. Missing/None metadata is tolerated so
        # one bare document cannot break the whole context.
        meta = getattr(doc, "metadata", None) or {}
        page = meta.get("page", "?")
        blocks.append(f"[Source: Page {page}] {doc.page_content}")
    return "\n---\n".join(blocks)

In [None]:
# ! pip install -qU langchain langchain-core langchain-community
#! pip install -U langchain langchain-community langchain-chroma rank_bm25 flashrank

# Hybrid Retrieval Search with BM25 semantic + keyword search
---

### Semantic Retriever
---

###  Lexical Retriever with BM25
---

In [None]:
from langchain_core.documents import Document

# Fetch everything stored in the vector store so BM25 can index the raw text.
# NOTE(review): `vectorstore` is not defined anywhere in the visible notebook
# (earlier cells use `vector_store`) -- confirm which object is intended.
collection_data = vectorstore.get()

# Pair each stored text with its metadata and wrap them as Documents.
documents_from_chroma = [
    Document(page_content=doc, metadata=meta)
    for doc, meta in zip(
        collection_data["documents"],
        collection_data["metadatas"]
    )
]

# Lexical (keyword) retriever over those documents.
bm25_retriever = BM25Retriever.from_documents(documents_from_chroma)
bm25_retriever.k = 20

# Hybrid retriever: semantic and lexical results weighted equally (0.5 / 0.5).
# NOTE(review): `EnsembleRetriever` is not imported in the visible notebook,
# and `vector_retriever` must already exist from an earlier cell.
hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, bm25_retriever],
    weights=[0.5, 0.5]
)

# Example query to exercise the hybrid retriever.
results = hybrid_retriever.invoke(
    "Urgent referral criteria for lung cancer"
)

### **Combine retrievers with EnsembleRetriever + BM25 for hybrid retrieval**
---
### **Tradeoffs**: 

#### **Balance vs. Imbalanced Weights for retrievers:**
---

**Why did I use 0.5 semantic and 0.5 lexical for the weights in my hybrid retriever?**

**Reasoning:** 
I wanted to balance the weight and importance of the semantic and lexical retrievers. How the queries will be formatted is unknown to me at this stage, so the best bet was to start with an equal balance and then adjust as needed.

Also, to compensate for the equal balance, I'm adding a lightweight **re-ranker, FlashRank**, which uses a pointwise reranking system. I chose it because it's fast, efficient, and runs for FREE locally on my machine. At the end of the day, I would say that the semantic retriever will likely be more important for the types of queries that will be used in my clinical application, but I wanted to give the lexical retriever a fair chance to contribute as well.


In [None]:
# langchain deprecated the get relevant docs method for Ensemble Retrievers; the updated method uses runnable utilities
from langchain_core.runnables import RunnableLambda, RunnableSequence, RunnableMap, RunnablePassthrough, RunnableParallel
from langchain_core.runnables import Runnable

# Perform Context Engineering to prepare for Agentic Reasoning
---
### Tradeoffs:

**Why did I choose this approach for context engineering?**

**Reasoning:** 
At first, my goal was to return a single string of relevant documents. However, I realized that my agent would have trouble parsing my actual data in production. Based on the way the data is formatted, I had to make some adjustments and go a bit deeper with formatting. Also, one of the key constraints in building this agent was a requirement to cite the specific sources within the relevant data retrieved from the NG12 documents. I could have kept my single string; however, there would have been significant fine-tuning and overhead later. Performing context engineering was the only plausible way to achieve this goal.

#### Adding Clinical Context Tool

In [None]:
# clinical context tooling for response -pg143
from langchain_core.tools import BaseTool, tool
@tool
def get_cancer_context(docs):
    """
    Tool to extract the relevant clinical context from the retrieved documents. 
    This tool will be how we prepare the reasoning for the agent, to ensure 
    correct hybrid results for the NG12 Cancer Risk Assessor guidelines engine.
    """
    # The docstring above doubles as the tool description shown to the agent,
    # so it is kept short; implementation notes live in comments instead.
    #
    # Input:  `docs` -- iterable of document-like objects exposing
    #         `page_content` and a `metadata` mapping.
    # Output: one numbered, source-cited evidence block per document, joined
    #         by "\n---\n" so the agent can quote page and section.
    context = []
    for position, doc in enumerate(docs, start=1):
        # Missing/None metadata falls back to the defaults below rather than
        # aborting the whole context build.
        meta = getattr(doc, "metadata", None) or {}
        page = meta.get("page", "Unknown Page")
        section = meta.get("section", "General Guidance")

        # Header label fixed from "CLINICAL EVIDENCEM" (stray trailing "M").
        block = (
            f"[CLINICAL EVIDENCE {position}]\n"
            f"SOURCE: NICE Guideline NG12, Page {page}, Section: {section}\n"
            f"TEXT: {doc.page_content}\n"
        )
        context.append(block)
    return "\n---\n".join(context)

