# 🔍 Hybrid Retrieval with BioClinicalBERT, FAISS, and BM25

This notebook demonstrates how to build a **hybrid retrieval system** using:
- **DuckDB** → medical data storage
- **BM25** → keyword-based retrieval
- **BioClinicalBERT + FAISS** → semantic retrieval
- **Hybrid search** → combines both methods for better results

In [1]:
pip install duckdb pandas numpy faiss-cpu rank-bm25 transformers torch sentence-transformers tqdm

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m76.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25, faiss-cpu
Successfully installed faiss-cpu-1.12.0 rank-bm25-0.2.2


In [9]:

import os, json, pickle, time, re
from pathlib import Path
from typing import List, Optional, Tuple

import duckdb
import pandas as pd
import numpy as np
from tqdm import tqdm

from rank_bm25 import BM25Okapi
import faiss
from sentence_transformers import SentenceTransformer

# Paths
# DuckDB database file, resolved relative to the current working directory.
DB_PATH = "medwhisper.db"
# Directory for index artifacts; created eagerly so later writes cannot fail on a missing folder.
INDEX_DIR = Path("indexes/medwhisper_hybrid")
INDEX_DIR.mkdir(parents=True, exist_ok=True)

# Build toggles
# NOTE(review): BUILD_FROM_SCRATCH is not read by any cell visible here — confirm it is still needed.
BUILD_FROM_SCRATCH = True
# Number of texts handed to the encoder per outer loop iteration in the embedding cell.
BATCH_SIZE = 64
RANDOM_SEED = 42
# Seeds NumPy's legacy global RNG only.
np.random.seed(RANDOM_SEED)


In [10]:
import duckdb
import pandas as pd

# DB_PATH is defined in the configuration cell; update it there if the database
# file is not in the current working directory.


# Open the database with default (read-write) settings. Keep the connection mode
# consistent across cells; do not mix read_only and normal connections.
con = duckdb.connect(DB_PATH)

# List every table in the database as a DataFrame (column `name`).
tables = con.execute("SHOW TABLES").fetchdf()
print("Tables in DB:")
print(tables)


Tables in DB:
                    name
0                  audit
1             conditions
2     conditions_curated
3             encounters
4     encounters_curated
5               feedback
6                 images
7            medications
8    medications_curated
9           observations
10  observations_curated
11              patients
12      patients_curated
13            procedures
14    procedures_curated
15        raw_conditions
16        raw_encounters
17       raw_medications
18      raw_observations
19          raw_patients
20        raw_procedures
21               reports
22       timeline_events
23           transcripts


In [39]:
# Show all rows/columns without truncation.
# These are global pandas options: every DataFrame displayed after this cell is
# rendered in full, so only display small slices (e.g. LIMIT 5 / .head()).
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", None)

In [40]:
# Keep only the tables whose name ends in "_curated" (taken from the SHOW TABLES result).
curated_tables = [t for t in tables['name'] if t.endswith("_curated")]

print("Curated Tables:", curated_tables)

# Show the first 5 rows of each curated table as a DataFrame.
# The table name is interpolated into the SQL text; it comes from the database's
# own catalog listing above, not from user input.
for t in curated_tables:

    print(f"\n=== {t} ===")
    display(con.execute(f"SELECT * FROM {t} LIMIT 5").fetchdf())


Curated Tables: ['conditions_curated', 'encounters_curated', 'medications_curated', 'observations_curated', 'patients_curated', 'procedures_curated']

=== conditions_curated ===


Unnamed: 0,condition_id,patient_id,encounter_id,code,description,onset_date,abatement_date,clinical_status
0,cond_1,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,8151e1b2-5789-578d-3e3a-d1d9b7c142ee,224299000,Received higher education (finding),2005-03-16,NaT,
1,cond_2,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,6dce0dad-f85f-8a26-9583-f1bcdde4efc1,266948004,Has a criminal record (finding),2006-03-22,NaT,
2,cond_3,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,6dce0dad-f85f-8a26-9583-f1bcdde4efc1,361055000,Misuses drugs (finding),2006-03-22,2021-04-07,
3,cond_4,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,6f2cf873-f64e-74a7-efba-6d772f418395,160904001,Part-time employment (finding),2012-03-28,2018-04-04,
4,cond_5,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,6f2cf873-f64e-74a7-efba-6d772f418395,706893006,Victim of intimate partner abuse (finding),2012-03-28,NaT,



=== encounters_curated ===


Unnamed: 0,encounter_id,patient_id,start_time,end_time,encounter_class,code,description,provider
0,8151e1b2-5789-578d-3e3a-d1d9b7c142ee,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,2005-03-16 06:04:53+00:00,2005-03-16 06:54:52+00:00,wellness,162673000,General examination of patient (procedure),0d67d251-73f5-3118-be75-41f33e95b7d1
1,6dce0dad-f85f-8a26-9583-f1bcdde4efc1,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,2006-03-22 06:04:53+00:00,2006-03-22 06:35:36+00:00,wellness,162673000,General examination of patient (procedure),0d67d251-73f5-3118-be75-41f33e95b7d1
2,6f2cf873-f64e-74a7-efba-6d772f418395,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,2012-03-28 06:04:53+00:00,2012-03-28 06:54:32+00:00,wellness,162673000,General examination of patient (procedure),0d67d251-73f5-3118-be75-41f33e95b7d1
3,87741be8-2223-14c5-375d-976414fcd9aa,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,2016-04-11 11:14:53+00:00,2016-04-11 12:14:53+00:00,emergency,50849002,Emergency room admission (procedure),3169b71a-aa09-3f9e-8ca4-a592bb52e8aa
4,1f9f55cd-ae88-b9ab-15e1-927e73848ca3,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,2017-12-22 02:04:53+00:00,2017-12-22 02:19:53+00:00,ambulatory,185345009,Encounter for symptom (procedure),5f024f16-dc88-3249-8eef-e04d27a7d717



=== medications_curated ===


Unnamed: 0,medication_id,patient_id,encounter_id,code,description,start_date,end_date,status
0,med_1,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,87741be8-2223-14c5-375d-976414fcd9aa,313782,Acetaminophen 325 MG Oral Tablet,2016-04-11,2016-04-26,
1,med_2,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,0d6b6a0a-8f84-c860-ba6f-b241d14f55bf,849574,Naproxen sodium 220 MG Oral Tablet,2018-02-24,2018-03-21,
2,med_3,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,fcd44e1e-0c9f-ab84-f187-11f0418849f4,313782,Acetaminophen 325 MG Oral Tablet,2024-10-05,2024-10-23,
3,med_4,edc17058-55fb-08c7-12df-ece93a402e50,3f9da7ae-4222-4094-dc1b-e30b9189d650,562251,Amoxicillin 250 MG / Clavulanate 125 MG Oral T...,2017-06-18,2017-06-30,
4,med_5,edc17058-55fb-08c7-12df-ece93a402e50,329ee98f-4c76-5422-653d-03fc8eb0bcf9,1043400,Acetaminophen 21.7 MG/ML / Dextromethorphan Hy...,2020-07-12,2020-07-25,



=== observations_curated ===


Unnamed: 0,observation_id,patient_id,encounter_id,code,description,value,unit,observed_at
0,obs_1,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,1f9f55cd-ae88-b9ab-15e1-927e73848ca3,8310-5,Body temperature,37.8,Cel,2017-12-21 21:04:53
1,obs_2,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,e2113f4b-89c7-a231-d3ae-a6a6b4f41566,8302-2,Body Height,185.6,cm,2018-04-04 02:04:53
2,obs_3,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,e2113f4b-89c7-a231-d3ae-a6a6b4f41566,72514-3,Pain severity - 0-10 verbal numeric rating [Sc...,0.0,{score},2018-04-04 02:04:53
3,obs_4,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,e2113f4b-89c7-a231-d3ae-a6a6b4f41566,29463-7,Body Weight,104.5,kg,2018-04-04 02:04:53
4,obs_5,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,e2113f4b-89c7-a231-d3ae-a6a6b4f41566,39156-5,Body mass index (BMI) [Ratio],30.3,kg/m2,2018-04-04 02:04:53



=== patients_curated ===


Unnamed: 0,patient_id,gender,birthdate,deathdate,race,ethnicity,address,city,state,zip
0,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,M,1987-01-21,NaT,white,nonhispanic,840 Grimes Well Apt 27,Duxbury,Massachusetts,2332
1,edc17058-55fb-08c7-12df-ece93a402e50,M,1986-03-31,NaT,white,nonhispanic,166 Funk Burg,Gardner,Massachusetts,1440
2,80e7f50a-3e99-d5ac-cf97-f8a4b4f9e6c7,F,2006-02-17,NaT,white,nonhispanic,218 Hodkiewicz Route,Ludlow,Massachusetts,0
3,782001bc-f712-50ae-04f5-9a488f3ef4aa,F,1991-10-20,NaT,white,hispanic,113 Dooley Extension Unit 99,Hampden,Massachusetts,0
4,30e48e16-2df7-207e-7a3d-1650ef0c1ed8,F,1956-06-10,1961-04-13,white,nonhispanic,1084 Zulauf Park,Bourne,Massachusetts,2532



=== procedures_curated ===


Unnamed: 0,procedure_id,patient_id,encounter_id,code,description,performed_at
0,proc_1,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,87741be8-2223-14c5-375d-976414fcd9aa,288086009,Suture open wound (procedure),2016-04-11 07:14:53
1,proc_2,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,87741be8-2223-14c5-375d-976414fcd9aa,384700001,Injection of tetanus antitoxin (procedure),2016-04-11 07:31:41
2,proc_3,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,0d6b6a0a-8f84-c860-ba6f-b241d14f55bf,288086009,Suture open wound (procedure),2018-02-24 06:48:33
3,proc_4,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,e2113f4b-89c7-a231-d3ae-a6a6b4f41566,710824005,Assessment of health and social care needs (pr...,2018-04-04 02:04:53
4,proc_5,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,e2113f4b-89c7-a231-d3ae-a6a6b4f41566,710841007,Assessment of anxiety (procedure),2018-04-04 02:52:18


# 2. Connect to DuckDB and Build Corpus:

- Connect to the DuckDB database
- Extract tables (conditions, encounters, medications, observations, procedures, timeline)
- Build a **corpus** with `doc_id`, `text`, and metadata

In [11]:

def connect_duckdb(db_path: str):
    """Open a DuckDB connection to the database file at `db_path` and return it."""
    connection = duckdb.connect(db_path)
    return connection

def safe_to_datetime(series):
    """Parse `series` into UTC datetimes without raising.

    Values that cannot be parsed become NaT. If the conversion itself raises,
    a series of the same length is built from None placeholders and converted instead.
    """
    try:
        parsed = pd.to_datetime(series, errors="coerce", utc=True)
    except Exception:
        placeholders = pd.Series([None] * len(series))
        parsed = pd.to_datetime(placeholders, errors="coerce", utc=True)
    return parsed

def make_text_from_row(table: str, row: pd.Series) -> str:
    """Render one database row as the single-line text that gets indexed.

    Each known table has its own template (a label followed by " | "-separated
    fields). Rows from any other table fall back to the string form of the
    row's dict.
    """
    # Renderers are lazy so only the selected template touches `row`.
    renderers = {
        "conditions_curated": lambda: f"CONDITION: {row['description']} | Code: {row['code']} | Onset: {row['onset_date']} | Status: {row['clinical_status']}",
        "encounters_curated": lambda: f"ENCOUNTER: {row['description']} | Class: {row['encounter_class']} | Code: {row['code']} | Start: {row['start_time']} | End: {row['end_time']}",
        "medications_curated": lambda: f"MEDICATION: {row['description']} | Code: {row['code']} | Start: {row['start_date']} | End: {row['end_date']} | Status: {row['status']}",
        "observations_curated": lambda: f"OBSERVATION: {row['description']} | Value: {row['value']} {row['unit']} | Observed at: {row['observed_at']}",
        "procedures_curated": lambda: f"PROCEDURE: {row['description']} | Code: {row['code']} | Performed at: {row['performed_at']}",
        "timeline_events": lambda: f"TIMELINE EVENT: {row['description']} | Type: {row['event_type']} | At: {row['event_time']}",
    }
    render = renderers.get(table)
    if render is None:
        return str(row.to_dict())
    return render()

def build_corpus(con) -> pd.DataFrame:
    """Build the retrieval corpus from the curated tables and the timeline table.

    Args:
        con: An open DuckDB connection exposing the tables queried below.

    Returns:
        A DataFrame with columns doc_id, text, source, patient_id, encounter_id,
        code and ts. `doc_id` is "<table>:<row_id>", `text` comes from
        `make_text_from_row`, and `ts` is the table's primary timestamp parsed
        to UTC (NaT when unparseable). Rows whose text is blank are dropped.
        An empty frame with the same columns is returned when every table is empty.
    """
    out_cols = ["doc_id", "text", "source", "patient_id", "encounter_id", "code", "ts"]
    tables = {
        "conditions_curated": "SELECT condition_id AS row_id, patient_id, encounter_id, code, description, onset_date, abatement_date, clinical_status FROM conditions_curated",
        "encounters_curated": "SELECT encounter_id AS row_id, patient_id, start_time, end_time, encounter_class, code, description, provider FROM encounters_curated",
        "medications_curated": "SELECT medication_id AS row_id, patient_id, encounter_id, code, description, start_date, end_date, status FROM medications_curated",
        "observations_curated": "SELECT observation_id AS row_id, patient_id, encounter_id, code, description, value, unit, observed_at FROM observations_curated",
        "procedures_curated": "SELECT procedure_id AS row_id, patient_id, encounter_id, code, description, performed_at FROM procedures_curated",
        "timeline_events": "SELECT event_id AS row_id, patient_id, event_time, event_type, description FROM timeline_events",
    }
    # Column that supplies each table's timestamp; a single mapping keeps the
    # parsing path (safe_to_datetime) identical for every table.
    ts_columns = {
        "conditions_curated": "onset_date",
        "encounters_curated": "start_time",
        "medications_curated": "start_date",
        "observations_curated": "observed_at",
        "procedures_curated": "performed_at",
        "timeline_events": "event_time",
    }
    frames = []
    for table, sql in tables.items():
        df = con.execute(sql).df()
        if len(df) == 0:
            continue
        ts_col = ts_columns.get(table)
        df["ts"] = safe_to_datetime(df[ts_col]) if ts_col is not None else pd.NaT
        df["source"] = table
        df["doc_id"] = table + ":" + df["row_id"].astype(str)
        # Not every table selects these columns (e.g. timeline_events has no
        # encounter_id/code); add them so all frames share one schema.
        for col in ["encounter_id", "code", "patient_id"]:
            if col not in df.columns:
                df[col] = None
        df["text"] = df.apply(lambda r: make_text_from_row(table, r), axis=1)
        frames.append(df[out_cols])
    if not frames:
        # pd.concat raises on an empty list; return an empty, correctly shaped corpus instead.
        return pd.DataFrame(columns=out_cols)
    corpus = pd.concat(frames, ignore_index=True)
    corpus = corpus[corpus["text"].str.strip() != ""].reset_index(drop=True)
    return corpus

# Rebinds `con` with a connection opened through the helper, then builds the corpus.
con = connect_duckdb(DB_PATH)
corpus = build_corpus(con)
print("Corpus size:", len(corpus))
# Preview a few documents rather than the whole frame.
corpus.head(3)


Corpus size: 245404


Unnamed: 0,doc_id,text,source,patient_id,encounter_id,code,ts
0,conditions_curated:cond_1,CONDITION: Received higher education (finding)...,conditions_curated,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,8151e1b2-5789-578d-3e3a-d1d9b7c142ee,224299000,2005-03-16 00:00:00+00:00
1,conditions_curated:cond_2,CONDITION: Has a criminal record (finding) | C...,conditions_curated,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,6dce0dad-f85f-8a26-9583-f1bcdde4efc1,266948004,2006-03-22 00:00:00+00:00
2,conditions_curated:cond_3,CONDITION: Misuses drugs (finding) | Code: 361...,conditions_curated,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,6dce0dad-f85f-8a26-9583-f1bcdde4efc1,361055000,2006-03-22 00:00:00+00:00


# 3. Build BM25 Index

Tokenize the documents and build a BM25 index for **keyword-based retrieval**.

In [12]:

def simple_tokenize(text: str) -> List[str]:
    """Lower-case `text` and return its maximal runs of ASCII letters/digits, in order."""
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[a-z0-9]+", lowered)]

def build_bm25(texts: List[str]):
    """Tokenize every text with `simple_tokenize` and fit a BM25Okapi index.

    Returns:
        (bm25_index, tokenized) where `tokenized` is the list of token lists
        the index was built from, in the same order as `texts`.
    """
    tokenized = list(map(simple_tokenize, texts))
    bm25_index = BM25Okapi(tokenized)
    return bm25_index, tokenized

# Fit BM25 over the corpus text; row i of `corpus` corresponds to document i of the index.
bm25, tokenized = build_bm25(corpus["text"].tolist())
print("BM25 built on", len(corpus), "documents")


BM25 built on 245404 documents


# 4. Build FAISS Index with BioClinicalBERT

- **BioClinicalBERT** (pretrained Hugging Face model) with mean pooling
- Generate sentence embeddings for all documents
- Build a FAISS index for **semantic retrieval**

In [13]:
from sentence_transformers import SentenceTransformer, models
import numpy as np, faiss
from tqdm import tqdm

BIOCLINICALBERT = "emilyalsentzer/Bio_ClinicalBERT"
# Expected embedding size for this checkpoint (the model reports 768 below).
# The FAISS index is sized from the actual embeddings so swapping the model
# does not require editing this constant.
FORCED_DIM = 768

# 1. Load BioClinicalBERT backbone
we = models.Transformer(BIOCLINICALBERT)

# 2. Add pooling (mean over tokens → sentence embedding)
pool = models.Pooling(
    we.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

# 3. Build final SentenceTransformer with explicit `modules=`
model = SentenceTransformer(modules=[we, pool])
print("✅ Using BioClinicalBERT, dim =", model.get_sentence_embedding_dimension())

# 4. Encode texts in outer chunks of BATCH_SIZE (one progress-bar tick per chunk)
texts = corpus["text"].tolist()
all_embs = []
for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Embedding (BioClinicalBERT)", dynamic_ncols=True, leave=True, position=0 ):
    batch = texts[i:i+BATCH_SIZE]
    embs = model.encode(
        batch,
        batch_size=min(BATCH_SIZE, 32),
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=False
    ).astype("float32")
    all_embs.append(embs)

embeddings = np.vstack(all_embs)
# L2-normalise so inner product equals cosine similarity. The epsilon mirrors
# the query-side normalisation and keeps an all-zero vector from producing NaNs.
embeddings /= (np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12)

# 5. Build FAISS index, sized from the embeddings actually produced
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

print("Embeddings shape:", embeddings.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ Using BioClinicalBERT, dim = 768


Embedding (BioClinicalBERT): 100%|██████████| 3835/3835 [12:05<00:00,  5.28it/s]


Embeddings shape: (245404, 768)


# 5. Hybrid Search Function

- Combine **BM25 scores** and **embedding similarities**
- Apply optional filters (patient, source, code, time range)
- Return top results

In [34]:

from dataclasses import dataclass

@dataclass
class SearchFilters:
    """Optional restrictions applied to the corpus by `prefilter_indices`.

    Every field defaults to None; a None (or otherwise falsy) field adds no restriction.
    """
    # Keep only rows whose `source` column is one of these values.
    sources: Optional[List[str]] = None
    # Keep only rows whose `patient_id` equals this value.
    patient_id: Optional[str] = None
    # Keep only rows whose `code` equals this value.
    code: Optional[str] = None
    # Inclusive lower bound on the row timestamp `ts`; rows without a timestamp are excluded.
    date_from: Optional[pd.Timestamp] = None
    # Inclusive upper bound on the row timestamp `ts`; rows without a timestamp are excluded.
    date_to: Optional[pd.Timestamp] = None

def normalize_scores(vals: np.ndarray) -> np.ndarray:
    """Min-max scale `vals` into [0, 1].

    An empty input is returned unchanged. When all values are (numerically)
    equal, every entry becomes 0.5 because no ordering information exists.
    """
    if len(vals) == 0:
        return vals
    lowest = float(np.min(vals))
    highest = float(np.max(vals))
    if abs(lowest - highest) < 1e-12:
        return np.ones_like(vals) * 0.5
    # Small epsilon keeps the denominator strictly positive.
    spread = highest - lowest + 1e-12
    return (vals - lowest) / spread

def _to_utc_timestamp(bound) -> pd.Timestamp:
    """Return `bound` as a UTC-aware Timestamp (naive inputs are treated as UTC)."""
    stamp = pd.Timestamp(bound)
    if stamp.tzinfo is None:
        return stamp.tz_localize("UTC")
    return stamp.tz_convert("UTC")


def prefilter_indices(corpus: pd.DataFrame, flt: "SearchFilters") -> np.ndarray:
    """Return the positional row indices of `corpus` that satisfy `flt`.

    Args:
        corpus: DataFrame with `source`, `patient_id`, `code` and `ts` columns.
        flt: Filter object; falsy / None fields add no restriction.

    Returns:
        Sorted integer positions (usable with `.iloc` or as array indices).

    The mask is a plain NumPy array so it is combined positionally; a pandas
    Series mask would be aligned on index labels and silently misbehave for a
    corpus whose index is not 0..n-1.
    """
    mask = np.ones(len(corpus), dtype=bool)
    if flt.sources:
        mask &= corpus["source"].isin(flt.sources).to_numpy()
    if flt.patient_id:
        mask &= (corpus["patient_id"] == flt.patient_id).to_numpy()
    if flt.code:
        # Codes may be stored as integers or strings depending on the source
        # table, so compare their string forms.
        mask &= (corpus["code"].astype(str) == str(flt.code)).to_numpy()
    if flt.date_from is not None or flt.date_to is not None:
        ts = pd.to_datetime(corpus["ts"], utc=True)
        # Comparisons against NaT are False, so rows without a timestamp drop out.
        if flt.date_from is not None:
            mask &= (ts >= _to_utc_timestamp(flt.date_from)).to_numpy()
        if flt.date_to is not None:
            mask &= (ts <= _to_utc_timestamp(flt.date_to)).to_numpy()
    return np.flatnonzero(mask)

def faiss_topN(query_emb: np.ndarray, topN: int = 100) -> Tuple[np.ndarray, np.ndarray]:
    """Search the module-level FAISS `index` with a single query vector.

    Returns:
        (ids, scores) for the `topN` nearest entries, as returned by the index.
    """
    query_matrix = query_emb.reshape(1, -1).astype("float32")
    distances, ids = index.search(query_matrix, topN)
    return ids[0], distances[0]

def bm25_topN(query: str, topN: int = 100) -> Tuple[np.ndarray, np.ndarray]:
    """Score every document against `query` with the module-level `bm25` index.

    Returns:
        (ids, scores) of the `topN` highest-scoring documents, best first.
    """
    scores = bm25.get_scores(simple_tokenize(query))
    ranked = np.argsort(scores)[::-1]
    best = ranked[:topN]
    return best, scores[best]

# Encoder used for queries in `hybrid_search`. This loads a second model instance
# from the same checkpoint name; the documents were encoded with `model`, which was
# assembled from explicit Transformer + mean-pooling modules.
# NOTE(review): confirm this constructor applies the same pooling as `model`, or reuse `model` here.
query_model = SentenceTransformer(BIOCLINICALBERT)

def hybrid_search(query: str, k: int = 10, alpha: float = 0.65, flt: Optional[SearchFilters] = None,
                  faiss_candidates: int = 200, bm25_candidates: int = 200) -> pd.DataFrame:
    """Rank corpus rows by a weighted mix of embedding similarity and BM25.

    Args:
        query: Free-text query.
        k: Number of rows to return.
        alpha: Weight of the normalised embedding score; BM25 gets ``1 - alpha``.
        flt: Optional filters. When given, candidates are drawn only from rows
            that pass the filter, so a narrow filter (e.g. one patient) still
            yields up to `k` results instead of whatever happened to survive
            from the global top candidates.
        faiss_candidates: Number of embedding-based candidates to consider.
        bm25_candidates: Number of BM25-based candidates to consider.

    Returns:
        DataFrame sorted by `score` (descending) with columns score, emb, bm25,
        doc_id, source, patient_id, encounter_id, code, ts, snippet, text.
        Empty (same columns) when nothing matches.
    """
    result_columns = ["score", "emb", "bm25", "doc_id", "source", "patient_id",
                      "encounter_id", "code", "ts", "snippet", "text"]

    q_emb = query_model.encode([query], convert_to_numpy=True, normalize_embeddings=False).astype("float32")[0]
    q_emb = q_emb / (np.linalg.norm(q_emb) + 1e-12)

    # BM25 scores for the whole corpus, computed once and reused for both
    # candidate selection and final scoring.
    full_bm25_scores = np.asarray(bm25.get_scores(simple_tokenize(query)))

    if flt is None:
        faiss_idx, _ = faiss_topN(q_emb, topN=faiss_candidates)
        # FAISS pads with -1 when fewer than topN vectors are available.
        faiss_idx = faiss_idx[faiss_idx >= 0]
        bm25_idx = np.argsort(full_bm25_scores)[::-1][:bm25_candidates]
    else:
        allowed = prefilter_indices(corpus, flt)
        if len(allowed) == 0:
            return pd.DataFrame(columns=result_columns)
        # Embeddings are unit-normalised, so a dot product is the same inner
        # product the flat index computes; restrict it to the allowed rows.
        allowed_sims = (embeddings @ q_emb)[allowed]
        faiss_idx = allowed[np.argsort(allowed_sims)[::-1][:faiss_candidates]]
        bm25_idx = allowed[np.argsort(full_bm25_scores[allowed])[::-1][:bm25_candidates]]

    cand = np.unique(np.concatenate([faiss_idx, bm25_idx], axis=0)).astype(int)
    if len(cand) == 0:
        return pd.DataFrame(columns=result_columns)

    emb_sims = (embeddings[cand] @ q_emb.reshape(-1, 1)).ravel()
    bm25_sels = full_bm25_scores[cand]
    # Both signals are min-max scaled over the candidate pool before mixing.
    emb_norm = normalize_scores(emb_sims)
    bm25_norm = normalize_scores(bm25_sels)
    final = alpha * emb_norm + (1.0 - alpha) * bm25_norm

    out = corpus.iloc[cand].copy().reset_index(drop=True)
    out["emb"] = emb_sims
    out["bm25"] = bm25_sels
    out["score"] = final
    out = out.sort_values("score", ascending=False).head(k)
    out["snippet"] = out["text"].str.slice(0, 280)
    return out[result_columns]




# 6. Test Queries

In [8]:

# Smoke-test queries phrased after descriptions seen in the table previews above.
tests = [
    "tetanus antitoxin injection",
    "body temperature and pain severity",
    "acetaminophen oral tablet",
    "general examination of patient",
    "suture open wound procedure",
]
for q in tests:
    print("\n=== Query:", q)
    res = hybrid_search(q, k=5, alpha=0.65)
    # The full text is dropped from the display; the snippet column already previews it.
    display(res.drop(columns=["text"]).reset_index(drop=True))



=== Query: tetanus antitoxin injection


Unnamed: 0,score,emb,bm25,doc_id,source,patient_id,encounter_id,code,ts,snippet
0,0.860117,0.849317,26.519821,timeline_events:proc_2,timeline_events,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,,,2016-04-11 07:31:41+00:00,TIMELINE EVENT: Injection of tetanus antitoxin...
1,0.860084,0.849313,26.519821,timeline_events:proc_7375,timeline_events,ae05f1fa-7913-f7bc-41bd-2dc8827555e7,,,2017-12-29 21:38:14+00:00,TIMELINE EVENT: Injection of tetanus antitoxin...
2,0.836458,0.846397,26.519821,timeline_events:proc_7226,timeline_events,1f0ca842-8c2d-a943-c047-dafce690f5a2,,,2022-12-08 08:55:05+00:00,TIMELINE EVENT: Injection of tetanus antitoxin...
3,0.833806,0.84607,26.519821,timeline_events:proc_15746,timeline_events,28f107b5-e973-ece3-b762-c2dbd9a01ba8,,,2024-05-23 01:14:18+00:00,TIMELINE EVENT: Injection of tetanus antitoxin...
4,0.818731,0.842187,27.822274,procedures_curated:proc_2,procedures_curated,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,87741be8-2223-14c5-375d-976414fcd9aa,384700001.0,2016-04-11 07:31:41+00:00,PROCEDURE: Injection of tetanus antitoxin (pro...



=== Query: body temperature and pain severity


Unnamed: 0,score,emb,bm25,doc_id,source,patient_id,encounter_id,code,ts,snippet
0,0.878304,0.82808,7.288429,observations_curated:obs_34068,observations_curated,e5ed5bc3-51e1-a9a7-01fb-f66b8ac4045d,99a10465-1395-60d1-e572-6c4b1907f55d,75893-8,2014-08-05 05:21:41+00:00,OBSERVATION: Pain severity in the past week - ...
1,0.877023,0.827963,7.288429,observations_curated:obs_34064,observations_curated,e5ed5bc3-51e1-a9a7-01fb-f66b8ac4045d,3554af2d-e6c4-82bb-2cc4-5774e45dfbd7,75893-8,2014-05-27 04:21:41+00:00,OBSERVATION: Pain severity in the past week - ...
2,0.861664,0.826556,7.288429,observations_curated:obs_34060,observations_curated,e5ed5bc3-51e1-a9a7-01fb-f66b8ac4045d,1b62477a-a1a5-c5d0-4367-ace58be2e280,75893-8,2014-03-28 04:21:41+00:00,OBSERVATION: Pain severity in the past week - ...
3,0.859792,0.826384,7.288429,observations_curated:obs_34014,observations_curated,e5ed5bc3-51e1-a9a7-01fb-f66b8ac4045d,af7c5c86-a7f9-db3b-1d68-755a54861dc7,75893-8,2014-02-02 08:21:41+00:00,OBSERVATION: Pain severity in the past week - ...
4,0.85957,0.815216,11.173488,observations_curated:obs_47752,observations_curated,787f9e8e-d3a4-0407-55d1-01a3414fceaf,7174fc0d-87bd-f276-be37-1789719a62c9,8310-5,2020-12-09 16:57:40+00:00,OBSERVATION: Body temperature | Value: 41.8 Ce...



=== Query: acetaminophen oral tablet


Unnamed: 0,score,emb,bm25,doc_id,source,patient_id,encounter_id,code,ts,snippet
0,0.952923,0.884335,11.831079,timeline_events:med_1838,timeline_events,e5ed5bc3-51e1-a9a7-01fb-f66b8ac4045d,,,2014-10-26 00:00:00+00:00,TIMELINE EVENT: Acetaminophen 325 MG / Oxycodo...
1,0.943499,0.883721,11.831079,timeline_events:med_221,timeline_events,641efcda-7397-4172-c6ac-8231342fa53e,,,2017-01-10 00:00:00+00:00,TIMELINE EVENT: Acetaminophen 325 MG / Oxycodo...
2,0.931882,0.882964,11.831079,timeline_events:med_4266,timeline_events,5fda1015-d0a5-e32d-d0b8-4662e6ce6c2b,,,2016-01-21 00:00:00+00:00,TIMELINE EVENT: Acetaminophen 325 MG / Oxycodo...
3,0.928626,0.882752,11.831079,timeline_events:med_4427,timeline_events,5fda1015-d0a5-e32d-d0b8-4662e6ce6c2b,,,2024-10-10 00:00:00+00:00,TIMELINE EVENT: Acetaminophen 325 MG / Oxycodo...
4,0.927255,0.882663,11.831079,timeline_events:med_227,timeline_events,641efcda-7397-4172-c6ac-8231342fa53e,,,2021-05-11 00:00:00+00:00,TIMELINE EVENT: Acetaminophen 325 MG / Oxycodo...



=== Query: general examination of patient


Unnamed: 0,score,emb,bm25,doc_id,source,patient_id,encounter_id,code,ts,snippet
0,0.870653,0.798585,14.809222,timeline_events:0c5a352c-a83c-34b0-42fb-6ad3a8...,timeline_events,b9bacf2f-7027-2e05-fa5b-19167071fdde,,,2023-12-15 10:00:50+00:00,TIMELINE EVENT: General examination of patient...
1,0.854851,0.798055,14.809222,timeline_events:b580f8ad-d16a-e254-afe1-c63e64...,timeline_events,b9bacf2f-7027-2e05-fa5b-19167071fdde,,,2015-10-30 10:00:50+00:00,TIMELINE EVENT: General examination of patient...
2,0.846317,0.797768,14.809222,timeline_events:f17d77c0-2b97-19df-cde2-161e3c...,timeline_events,033cccaf-bc92-3ddd-b64c-9ea45268a971,,,2021-02-13 03:15:47+00:00,TIMELINE EVENT: General examination of patient...
3,0.841051,0.797591,14.809222,timeline_events:affd9222-4e10-e9d8-6e31-2b9e73...,timeline_events,f49221bb-20fb-45cb-9345-09b6a83ae9de,,,2020-07-31 00:53:40+00:00,TIMELINE EVENT: General examination of patient...
4,0.831188,0.79726,14.809222,timeline_events:d15c7e8f-0ac7-8705-8c1b-d362b2...,timeline_events,033cccaf-bc92-3ddd-b64c-9ea45268a971,,,2025-03-08 03:15:47+00:00,TIMELINE EVENT: General examination of patient...



=== Query: suture open wound procedure


Unnamed: 0,score,emb,bm25,doc_id,source,patient_id,encounter_id,code,ts,snippet
0,1.0,0.866081,32.205144,procedures_curated:proc_5991,procedures_curated,033cccaf-bc92-3ddd-b64c-9ea45268a971,bd374338-7f51-3219-3eba-276d0a62e82b,288086009,2020-01-30 02:00:11+00:00,PROCEDURE: Suture open wound (procedure) | Cod...
1,0.997735,0.865855,32.205144,procedures_curated:proc_3,procedures_curated,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,0d6b6a0a-8f84-c860-ba6f-b241d14f55bf,288086009,2018-02-24 06:48:33+00:00,PROCEDURE: Suture open wound (procedure) | Cod...
2,0.985516,0.864633,32.205144,procedures_curated:proc_15491,procedures_curated,70775c58-59fb-a3db-9858-1d427567c195,eac559cb-da69-cb48-32e9-8547ee88df02,288086009,2019-03-29 03:45:42+00:00,PROCEDURE: Suture open wound (procedure) | Cod...
3,0.979929,0.864074,32.205144,procedures_curated:proc_6837,procedures_curated,006c29d1-d868-3a9e-ceab-31f23e398f45,40835bfb-9439-d4ea-7384-320a102aa656,288086009,2018-06-08 09:18:17+00:00,PROCEDURE: Suture open wound (procedure) | Cod...
4,0.974174,0.863498,32.205144,procedures_curated:proc_3039,procedures_curated,6754b3bf-f5ac-f359-fef6-87cf4b8508ab,0b9195ac-948e-33b2-5aed-96047268b7c4,288086009,2017-05-23 15:04:00+00:00,PROCEDURE: Suture open wound (procedure) | Cod...


In [9]:
pip install llama-index-llms-google-genai google-generativeai --upgrade


Collecting llama-index-llms-google-genai
  Downloading llama_index_llms_google_genai-0.6.2-py3-none-any.whl.metadata (3.0 kB)
Collecting llama-index-core<0.15,>=0.14.3 (from llama-index-llms-google-genai)
  Downloading llama_index_core-0.14.3-py3-none-any.whl.metadata (2.5 kB)
Collecting aiosqlite (from llama-index-core<0.15,>=0.14.3->llama-index-llms-google-genai)
  Downloading aiosqlite-0.21.0-py3-none-any.whl.metadata (4.3 kB)
Collecting banks<3,>=2.2.0 (from llama-index-core<0.15,>=0.14.3->llama-index-llms-google-genai)
  Downloading banks-2.2.0-py3-none-any.whl.metadata (12 kB)
Collecting dataclasses-json (from llama-index-core<0.15,>=0.14.3->llama-index-llms-google-genai)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting deprecated>=1.2.9.3 (from llama-index-core<0.15,>=0.14.3->llama-index-llms-google-genai)
  Downloading Deprecated-1.2.18-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting dirtyjson<2,>=1.0.8 (from llama-index-core<0.15,>=0.14.3->ll

In [1]:
import os
from getpass import getpass

# Never hard-code credentials in a notebook: they end up in version control and
# in shared copies. Any key that was previously committed here should be revoked.
# Use a key already present in the environment, otherwise prompt for it
# interactively (the input is not echoed and is not saved in the notebook).
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter GOOGLE_API_KEY: ")


In [50]:
from llama_index.core import Settings
from llama_index.llms.google_genai import GoogleGenAI

# Register the LLM on the global llama-index Settings object; `qa` reads it from there.
# Free/fast choice; change to "gemini-2.5-pro" if you later enable billing.
Settings.llm = GoogleGenAI(model="gemini-2.5-flash")

ERROR:asyncio:Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7b47ec26dd60>


In [4]:
# Connectivity check: one async completion against the configured LLM
# (top-level `await` works because the notebook already runs an event loop).
resp = await Settings.llm.acomplete("Reply with the word READY.")
print(resp.text)

READY


In [35]:
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import NodeWithScore, TextNode, QueryBundle
import pandas as pd, numpy as np, time

# `qa` appends a low-confidence disclaimer to the answer when its confidence is below this value.
CONF_THRESHOLD = 0.58  # tweak if desired
# Default number of documents retrieved by HybridRetriever and `qa`.
DEFAULT_K = 6

class HybridRetriever(BaseRetriever):
    """llama-index retriever backed by `hybrid_search`.

    Args:
        k: Number of documents to return per query.
        alpha: Weight of the embedding score in the hybrid mix.
        flt: Optional `SearchFilters` restricting the searched rows.
    """

    def __init__(self, k: int = DEFAULT_K, alpha: float = 0.65, flt=None):
        # Initialise the base class so its own attributes are set up before
        # `retrieve()` runs.
        super().__init__()
        self.k, self.alpha, self.flt = k, alpha, flt

    def _retrieve(self, qb: QueryBundle):
        """Run the hybrid search for `qb.query_str` and wrap each hit as a scored TextNode."""
        df = hybrid_search(qb.query_str, k=self.k, alpha=self.alpha, flt=self.flt)
        nodes = []
        for _, r in df.iterrows():
            node = TextNode(
                text=r.text,
                metadata={
                    "doc_id": r.doc_id,
                    "source": r.source,
                    "patient_id": r.patient_id,
                    "encounter_id": r.encounter_id,
                    "code": r.code,
                    # Timestamps are stringified; missing ones become None.
                    "ts": None if pd.isna(r.ts) else str(r.ts),
                },
                id_=r.doc_id,
            )
            nodes.append(NodeWithScore(node=node, score=float(r.score)))
        return nodes


In [36]:
# If you're in Jupyter/Colab and want to keep sync calls:
# %pip -q install nest_asyncio
import nest_asyncio, asyncio; nest_asyncio.apply()

def _softmax(x):
    x = np.array(x, dtype=float)
    if x.size == 0: return x
    e = np.exp(x - x.max()); return e / (e.sum() + 1e-9)

def qa(query: str, k: int = DEFAULT_K, alpha: float = 0.65, flt=None):
    """Answer `query` from retrieved evidence and return the answer with citations.

    Args:
        query: Natural-language question.
        k: Number of documents to retrieve, show to the LLM and cite.
        alpha: Embedding weight passed to the hybrid retriever.
        flt: Optional `SearchFilters`.

    Returns:
        dict with keys query, answer_text, confidence, confidence_percent,
        citations, retriever_top_k and meta. `meta["llm_error"]` is None on
        success, or a short description of the failure when the extractive
        fallback was used.
    """
    retr = HybridRetriever(k=k, alpha=alpha, flt=flt)
    nodes = retr.retrieve(query)

    scores = [n.score for n in nodes]
    # Confidence = softmax mass of the two best hits.
    # NOTE(review): retrieval scores lie in [0, 1], so with 6 hits this value
    # cannot exceed 2e / (2e + 4) ≈ 0.576, which is below CONF_THRESHOLD (0.58);
    # the disclaimer below is therefore always appended at default settings.
    # Confirm whether a different confidence measure is intended.
    conf = float(_softmax(scores)[:2].sum()) if scores else 0.0

    # Show the LLM the same k documents that are later returned as citations.
    evidence = "\n\n".join([f"[{i+1}] {n.node.get_text()[:350]}" for i, n in enumerate(nodes[:k])])
    prompt = (
        "Answer using ONLY the evidence below. Be concise and factual. "
        "If something is not supported, say you're uncertain.\n\n"
        f"Question: {query}\n\nEvidence:\n{evidence}\n\nAnswer:"
    )

    # Call LLM (sync style now works in notebook thanks to nest_asyncio)
    llm_error = None
    try:
        out = Settings.llm.complete(prompt)
        answer = out.text.strip()
    except Exception as exc:
        # Extractive fallback if the LLM call fails; the failure is recorded
        # in the result instead of being silently discarded.
        llm_error = f"{type(exc).__name__}: {exc}"
        answer = " ".join([n.node.get_text()[:180] for n in nodes[:3]])

    if conf < CONF_THRESHOLD:
        answer += " I'm not fully confident—please verify with external clinical sources."

    citations = [{
        "doc_id": n.node.metadata.get("doc_id"),
        "source": n.node.metadata.get("source"),
        "patient_id": n.node.metadata.get("patient_id"),
        "encounter_id": n.node.metadata.get("encounter_id"),
        "code": n.node.metadata.get("code"),
        "ts": n.node.metadata.get("ts"),
        "score": round(float(n.score or 0.0), 3),
        "snippet": n.node.get_text()[:280],
    } for n in nodes[:k]]

    return {
        "query": query,
        "answer_text": answer.strip(),
        "confidence": round(conf, 3),
        "confidence_percent": int(round(conf * 100)),
        "citations": citations,
        "retriever_top_k": k,
        "meta": {
            "alpha": alpha,
            "threshold": CONF_THRESHOLD,
            "ts": time.strftime("%Y-%m-%d %H:%M:%S"),
            "llm_error": llm_error,
        },
    }


In [37]:
# Example: definitional question; show the answer, then the confidence and first two citations.
resp = qa("what is tetanus antitoxin injection")
print(resp["answer_text"])
resp["confidence"], resp["citations"][:2]


It is a procedure. What it is beyond being a procedure is uncertain. I'm not fully confident—please verify with external clinical sources.


(0.339,
 [{'doc_id': 'timeline_events:proc_2',
   'source': 'timeline_events',
   'patient_id': '8c8e1c9a-b310-43c6-33a7-ad11bad21c40',
   'encounter_id': None,
   'code': None,
   'ts': '2016-04-11 07:31:41+00:00',
   'score': 0.78,
   'snippet': 'TIMELINE EVENT: Injection of tetanus antitoxin (procedure) | Type: procedure | At: 2016-04-11 07:31:41+00:00'},
  {'doc_id': 'timeline_events:proc_7375',
   'source': 'timeline_events',
   'patient_id': 'ae05f1fa-7913-f7bc-41bd-2dc8827555e7',
   'encounter_id': None,
   'code': None,
   'ts': '2017-12-29 21:38:14+00:00',
   'score': 0.768,
   'snippet': 'TIMELINE EVENT: Injection of tetanus antitoxin (procedure) | Type: procedure | At: 2017-12-29 21:38:14+00:00'}])

In [38]:
# Example: question phrased differently from the stored descriptions.
resp = qa("What procedures involved suturing wounds?")
print(resp["answer_text"])
print("Confidence:", resp["confidence"])
print("Citations:", resp["citations"])


I'm uncertain. The provided evidence only contains observations about pain interference with general activity and does not mention any procedures involving suturing wounds. I'm not fully confident—please verify with external clinical sources.
Confidence: 0.337
Citations: [{'doc_id': 'timeline_events:obs_6290', 'source': 'timeline_events', 'patient_id': 'a331b5bc-cbea-a205-a8bf-dbf3255ef36a', 'encounter_id': None, 'code': None, 'ts': '2025-03-02 09:33:30+00:00', 'score': 0.822, 'snippet': 'TIMELINE EVENT: What number best describes how pain has interfered with your general activity during the past week | Type: observation | At: 2025-03-02 09:33:30+00:00'}, {'doc_id': 'timeline_events:obs_6384', 'source': 'timeline_events', 'patient_id': 'a331b5bc-cbea-a205-a8bf-dbf3255ef36a', 'encounter_id': None, 'code': None, 'ts': '2025-06-30 10:33:30+00:00', 'score': 0.809, 'snippet': 'TIMELINE EVENT: What number best describes how pain has interfered with your general activity during the past week 

# Metrics

In [19]:
import time
import numpy as np
from sklearn.metrics import precision_score, recall_score
from sentence_transformers import util

# ---------- Retrieval Metrics ----------
def precision_at_k(relevant, retrieved, k):
    """Fraction of the top-k cutoff filled by distinct relevant documents.

    Args:
        relevant: iterable of ground-truth doc ids (hashable).
        retrieved: ranked list of retrieved doc ids, best first.
        k: cutoff rank.

    Returns:
        Number of distinct relevant ids found in ``retrieved[:k]`` divided by ``k``.
        Returns 0.0 when ``k <= 0`` instead of dividing by zero (or producing a
        negative value), matching what the sibling metrics return for an empty cutoff.
    """
    if k <= 0:
        return 0.0
    hits = set(relevant) & set(retrieved[:k])
    return len(hits) / k

def recall_at_k(relevant, retrieved, k):
    """Fraction of the distinct relevant documents that appear in the top-k results.

    Args:
        relevant: iterable of ground-truth doc ids (hashable); duplicates are ignored.
        retrieved: ranked list of retrieved doc ids, best first.
        k: cutoff rank.

    Returns:
        ``|relevant ∩ retrieved[:k]| / |relevant|`` over de-duplicated ids, or 0.0
        when there are no relevant documents.
    """
    # De-duplicate so the denominator counts the same thing as the set-based numerator;
    # otherwise a repeated gold id would make perfect recall unreachable.
    relevant_set = set(relevant)
    if not relevant_set:
        return 0.0
    return len(relevant_set & set(retrieved[:k])) / len(relevant_set)

def mrr_at_k(relevant, retrieved, k):
    """Reciprocal rank of the first relevant document within the top-k results.

    Ranks are 1-based. Returns 0.0 when none of ``retrieved[:k]`` is in ``relevant``.
    """
    hit_ranks = (
        rank
        for rank, doc_id in enumerate(retrieved[:k], start=1)
        if doc_id in relevant
    )
    first_hit = next(hit_ranks, None)
    if first_hit is None:
        return 0.0
    return 1.0 / first_hit

def ndcg_at_k(relevant, retrieved, k):
    """Binary-relevance nDCG of the top-k results.

    Args:
        relevant: iterable of ground-truth doc ids (hashable); duplicates are ignored.
        retrieved: ranked list of retrieved doc ids, best first.
        k: cutoff rank.

    Returns:
        DCG of ``retrieved[:k]`` divided by the ideal DCG, as a plain ``float`` in
        [0, 1]. Returns 0.0 when the ideal DCG is zero (no relevant docs or ``k <= 0``).
    """
    relevant_set = set(relevant)
    credited = set()
    dcg = 0.0
    for i, doc in enumerate(retrieved[:k]):
        # Credit each relevant document only at its first (best) rank; counting a
        # repeated id again would let DCG exceed the ideal DCG and push nDCG above 1.
        if doc in relevant_set and doc not in credited:
            credited.add(doc)
            dcg += 1.0 / np.log2(i + 2)
    # Ideal ranking puts every distinct relevant document at the top of the list.
    idcg = sum(1.0 / np.log2(i + 2) for i in range(min(len(relevant_set), k)))
    return float(dcg / idcg) if idcg > 0 else 0.0

# ---------- Augmentation Metrics ----------
def cosine_similarity(a, b, model):
    """Return the cosine similarity between two texts as a Python float.

    Both texts are encoded together in a single ``model.encode`` call (tensor
    output) and compared with ``util.pytorch_cos_sim``.
    """
    embeddings = model.encode([a, b], convert_to_tensor=True)
    first_vec, second_vec = embeddings
    similarity = util.pytorch_cos_sim(first_vec, second_vec)
    return float(similarity)

def faithfulness(answer, citations_text, model, threshold=0.5):
    """Score how closely the answer matches the retrieved evidence.

    Returns a ``(similarity, is_faithful)`` pair, where ``is_faithful`` is True
    when the answer/evidence cosine similarity reaches ``threshold``.
    """
    score = cosine_similarity(answer, citations_text, model)
    is_faithful = score >= threshold
    return score, is_faithful

def hallucination(answer, citations_text, model, threshold=0.3):
    """Return True when the answer/evidence cosine similarity is below ``threshold``."""
    score = cosine_similarity(answer, citations_text, model)
    return threshold > score


In [None]:
def evaluate_query(query, relevant_doc_ids=None, k=5):
    """
    Run `qa` on one query, print a report, and return the collected metrics.

    Args:
        query: user query (str).
        relevant_doc_ids: list of ground-truth doc_ids. Retrieval metrics are only
            computed when this is non-empty; otherwise they are reported as {}.
        k: top-k, passed to `qa` and used as the cutoff for the retrieval metrics.

    Returns:
        dict with three sub-dicts: "retrieval" (Precision@k, Recall@k, MRR@k, nDCG@k,
        or empty), "augmentation" (faithfulness / hallucination / answer relevance),
        and "system" (confidence and latency in seconds).

    Relies on the notebook-level `qa` function and `query_model` embedding model.
    """
    # perf_counter is a monotonic clock intended for measuring durations; time.time()
    # follows the wall clock and can jump if the system time is adjusted mid-call.
    start = time.perf_counter()
    resp = qa(query, k=k)
    latency = time.perf_counter() - start

    # Retrieval metrics (need gold labels)
    retrieval_metrics = {}
    if relevant_doc_ids:
        retrieved_ids = [c['doc_id'] for c in resp['citations']]
        retrieval_metrics = {
            "Precision@k": precision_at_k(relevant_doc_ids, retrieved_ids, k),
            "Recall@k": recall_at_k(relevant_doc_ids, retrieved_ids, k),
            "MRR@k": mrr_at_k(relevant_doc_ids, retrieved_ids, k),
            "nDCG@k": ndcg_at_k(relevant_doc_ids, retrieved_ids, k),
        }

    # Augmentation metrics: compare the answer against the concatenated citation snippets
    citations_text = " ".join([c["snippet"] for c in resp["citations"]])
    faith_sim, faithful = faithfulness(resp["answer_text"], citations_text, query_model)
    halluc = hallucination(resp["answer_text"], citations_text, query_model)

    augmentation_metrics = {
        "Faithfulness_Sim": round(faith_sim, 3),
        "Faithful?": faithful,
        "Hallucination?": halluc,
        "Answer_Relevance_to_Query": round(cosine_similarity(resp["answer_text"], query, query_model), 3),
    }

    # System metrics
    system_metrics = {
        "Confidence": resp["confidence"],
        "Latency_sec": round(latency, 3),
    }

    # Print results
    print(f"\n=== Query: {query} ===")
    print("Answer:", resp["answer_text"])
    print("Confidence:", resp["confidence"])
    print("Latency:", round(latency, 3), "sec")
    print("Citations:", len(resp["citations"]))
    print("\n-- Retrieval Metrics --", retrieval_metrics)
    print("-- Augmentation Metrics --", augmentation_metrics)
    print("-- System Metrics --", system_metrics)

    return {"retrieval": retrieval_metrics, "augmentation": augmentation_metrics, "system": system_metrics}


In [22]:
# NOTE(review): this call passes no `relevant_doc_ids`, so the retrieval metrics come back
# empty. TODO: supply gold doc_ids here, e.g. evaluate_query("Chronic pain", relevant_doc_ids=[...]).
evaluate_query("Chronic pain")

# Without gold labels, you still get augmentation + system metrics
evaluate_query("Chronic pain")



=== Query: Chronic pain ===
Answer: The evidence indicates chronic low back pain with onset dates of 2014-02-06, 2019-05-20, and 2014-02-05. I'm not fully confident—please verify with external clinical sources.
Confidence: 0.401
Latency: 2.274 sec
Citations: 5

-- Retrieval Metrics -- {}
-- Augmentation Metrics -- {'Faithfulness_Sim': 0.851, 'Faithful?': True, 'Hallucination?': False, 'Answer_Relevance_to_Query': 0.737}
-- System Metrics -- {'Confidence': 0.401, 'Latency_sec': 2.274}

=== Query: Chronic pain ===
Answer: Chronic low back pain is present, with onset dates of 2014-02-05, 2014-02-06, and 2019-05-20. I'm not fully confident—please verify with external clinical sources.
Confidence: 0.401
Latency: 3.115 sec
Citations: 5

-- Retrieval Metrics -- {}
-- Augmentation Metrics -- {'Faithfulness_Sim': 0.84, 'Faithful?': True, 'Hallucination?': False, 'Answer_Relevance_to_Query': 0.751}
-- System Metrics -- {'Confidence': 0.401, 'Latency_sec': 3.115}


{'retrieval': {},
 'augmentation': {'Faithfulness_Sim': 0.84,
  'Faithful?': True,
  'Hallucination?': False,
  'Answer_Relevance_to_Query': 0.751},
 'system': {'Confidence': 0.401, 'Latency_sec': 3.115}}