# Safely Store ENV variables for Gemini, Chroma DB connection, and vector embeddings
### Additional techniques: cleared memory of any existing variables to avoid accidental exposure. This was done because exposing PII and PHI in healthcare data is a significant security risk.
---

In [1]:
# # load dotenv + imports for retriever tool
from getpass import getpass
import os
from dotenv import load_dotenv
from langchain_chroma import Chroma 
from langchain.tools import tool
import langchainhub as hub
import chromadb
from langchain_google_genai import GoogleGenerativeAIEmbeddings


# Read a local .env file (if one exists) into the process environment BEFORE
# the os.getenv lookups below. Without this call only variables already
# exported in the shell are visible, and every value falls through to a prompt.
load_dotenv()

# Single source of truth for the Chroma collection used by both the raw
# client handle and the LangChain vector store.
COLLECTION_NAME = "nice_org"

# Secrets are read with getpass so they are not echoed into the notebook;
# non-secret identifiers use plain input. Each prompt is only shown when the
# corresponding environment variable is unset or empty.
api_key = os.getenv("CHROMA_API_KEY") or getpass("Paste CHROMA_API_KEY: ")
tenant = os.getenv("CHROMA_TENANT") or input("Enter CHROMA_TENANT: ")
database = os.getenv("CHROMA_DATABASE") or input("Enter CHROMA_DATABASE: ")
gemini_key = os.getenv("GEMINI_API_KEY") or getpass("Enter GEMINI_API_KEY: ")

# Connect to the hosted Chroma instance with the credentials gathered above.
client = chromadb.CloudClient(
    api_key=api_key,
    tenant=tenant,
    database=database
)
# Direct collection handle (used later for raw document access).
collection = client.get_collection(name=COLLECTION_NAME)

# Expose the Gemini key under GOOGLE_API_KEY before building the embeddings
# client — presumably the variable langchain_google_genai reads its key from.
os.environ["GOOGLE_API_KEY"] = gemini_key
# Gemini embedding model used to embed queries against the collection.
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

# LangChain wrapper over the same Chroma collection, sharing the client above.
vectorstore = Chroma(
    client=client,
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings
)

print("Connection successful")

Connection successful


In [None]:
# ! pip install -qU langchain langchain-core langchain-community
#! pip install -U langchain langchain-community langchain-chroma rank_bm25 flashrank

  pid, fd = os.forkpty()


In [None]:
# Import the retriever classes used for hybrid search (BM25 keyword + ensemble)
from langchain_community.retrievers import BM25Retriever
from langchain_core.retrievers import BaseRetriever
from langchain_classic.retrievers import EnsembleRetriever # have to import from langchain_classic for the hybrid ensemble retriever

# Hybrid Retrieval Search: semantic (vector) + BM25 keyword search
---

### Semantic Retriever
---

In [11]:
# Semantic (embedding-based) retriever over the Chroma vector store;
# search_kwargs asks for k=5 results per query.
vector_retriever = vectorstore.as_retriever(search_kwargs=dict(k=5))


###  Lexical Retriever with BM25
---

In [12]:
from langchain_core.documents import Document

In [14]:
# Pull the stored records back out of the Chroma-backed vector store so the
# BM25 (keyword) retriever can index the same corpus.
raw_data = vectorstore.get()

# Pair each text with its metadata so the agent can cite sources in answers.
# The keyword must be spelled exactly `metadata`; a misspelled keyword does not
# populate the Document's metadata field. `meta or {}` guards against entries
# whose metadata is None (assumption: Chroma may return None for records
# stored without metadata — TODO confirm).
documents_for_BM25 = [
    Document(page_content=text, metadata=meta or {})
    for text, meta in zip(raw_data["documents"], raw_data["metadatas"])
]
# Raw document texts fetched straight from the collection. Not used in this
# cell; kept in case later cells rely on `documents`.
documents = collection.get(include=["documents"])["documents"]


# BM25 retriever for keyword (lexical) search over the documents built above.
bm25_retriever = BM25Retriever.from_documents(documents_for_BM25)
bm25_retriever.k = 20 # set k to 20 to ensure no matches are missed; using 5 for reranker instead


### Combine retrievers with EnsembleRetriever + BM25 for hybrid retrieval 
---
### **Tradeoffs**: 

#### **Balance vs. Imbalanced Weights for retrievers:**

**Why did I use 0.5 semantic and 0.5 lexical for the weights in my hybrid retriever?**

**Reasoning:** 
I wanted to give equal weight to semantic and lexical retrieval. I don't yet know how the queries will be phrased, so the safest choice was to start with an equal balance and adjust as needed.

Also, to compensate for the equal balance, I'm adding **FlashRank**, a lightweight re-ranker that uses a pointwise reranking system. I chose it because it's fast, efficient, and runs FREE locally on my machine. At the end of the day, I would say that the semantic retriever will likely be more important for the types of queries that will be used in my clinical application, but I wanted to give the lexical retriever a fair chance to contribute as well.


In [None]:
# Combination of retrievers: results from the semantic (vector) retriever and
# the lexical (BM25) retriever are merged with equal weighting. The weights
# list is given in the same order as the retrievers list.
hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, bm25_retriever],
    weights=[0.5, 0.5]
)

# Use `invoke`, the Runnable entry point for retrievers, instead of the
# deprecated `get_relevant_documents`. Smoke-test query for the pipeline.
results = hybrid_retriever.invoke("What symptoms trigger an urgent referral for mastitis?")