# Hybrid Retrieval with BioClinicalBERT, FAISS, and BM25

- **DuckDB** → medical data storage
- **BM25** → keyword-based retrieval
- **BioClinicalBERT + FAISS** → semantic retrieval
- **Hybrid search** → ranks candidates by a weighted sum of normalized BM25 and embedding-similarity scores


# 1. Setup and Imports

In [5]:

import os, json, pickle, time, re
from pathlib import Path
from typing import List, Optional, Tuple

import duckdb
import pandas as pd
import numpy as np
from tqdm import tqdm

from rank_bm25 import BM25Okapi
import faiss
from sentence_transformers import SentenceTransformer

# Paths
# DuckDB database file opened by connect_duckdb() when the corpus is built.
DB_PATH = "medwhisper.db"
# Directory created up front (no-op if it already exists).
# NOTE(review): presumably where index artifacts are persisted; nothing in the
# visible cells writes to it — confirm.
INDEX_DIR = Path("indexes/medwhisper_hybrid")
INDEX_DIR.mkdir(parents=True, exist_ok=True)

# Build toggles
# NOTE(review): BUILD_FROM_SCRATCH is not read anywhere in the visible cells — confirm it is still needed.
BUILD_FROM_SCRATCH = True
# Number of corpus texts handed to the encoder per iteration of the embedding loop.
BATCH_SIZE = 64
# Seed NumPy's global random number generator.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


# 2. Connect to DuckDB and Build Corpus

- Connect to the DuckDB database
- Extract tables (conditions, encounters, medications, observations, procedures, timeline)
- Build a **corpus** with `doc_id`, `text`, and metadata

In [6]:

def connect_duckdb(db_path: str):
    """Open a DuckDB connection to the database at ``db_path`` and return it."""
    connection = duckdb.connect(db_path)
    return connection

def safe_to_datetime(series):
    """Parse ``series`` into UTC timestamps without ever raising.

    Entries that cannot be parsed become ``NaT``. If the conversion itself
    fails, an all-``NaT`` result of the same length is returned instead, so the
    caller always receives a usable timestamp column.

    Args:
        series: Values to parse, typically a ``pandas.Series``.

    Returns:
        The parsed, UTC-aware timestamps. On the fallback path this is a
        ``pandas.Series`` of ``NaT`` that carries the input's index when it has
        one, so it stays aligned when assigned back into the source frame.
    """
    try:
        return pd.to_datetime(series, errors="coerce", utc=True)
    except Exception:
        # Deliberate best-effort: a column that cannot be converted at all is
        # treated as "no timestamps" rather than aborting the corpus build.
        # Reuse the caller's index; a fresh RangeIndex would mis-align on
        # assignment whenever the input index is not 0..n-1.
        index = getattr(series, "index", None)
        blanks = pd.Series([None] * len(series), index=index, dtype=object)
        return pd.to_datetime(blanks, errors="coerce", utc=True)

# One renderer per known table; each turns a row into the single text line
# that is indexed for retrieval.
_ROW_RENDERERS = {
    "conditions_curated": lambda row: f"CONDITION: {row['description']} | Code: {row['code']} | Onset: {row['onset_date']} | Status: {row['clinical_status']}",
    "encounters_curated": lambda row: f"ENCOUNTER: {row['description']} | Class: {row['encounter_class']} | Code: {row['code']} | Start: {row['start_time']} | End: {row['end_time']}",
    "medications_curated": lambda row: f"MEDICATION: {row['description']} | Code: {row['code']} | Start: {row['start_date']} | End: {row['end_date']} | Status: {row['status']}",
    "observations_curated": lambda row: f"OBSERVATION: {row['description']} | Value: {row['value']} {row['unit']} | Observed at: {row['observed_at']}",
    "procedures_curated": lambda row: f"PROCEDURE: {row['description']} | Code: {row['code']} | Performed at: {row['performed_at']}",
    "timeline_events": lambda row: f"TIMELINE EVENT: {row['description']} | Type: {row['event_type']} | At: {row['event_time']}",
}


def make_text_from_row(table: str, row: pd.Series) -> str:
    """Render one table row as the text line stored in the corpus.

    Args:
        table: Name of the table the row came from; selects the layout.
        row: The row, indexed by column name.

    Returns:
        A ``LABEL: description | field: value ...`` line for the known tables,
        or the string form of ``row.to_dict()`` for any other table name.
    """
    render = _ROW_RENDERERS.get(table)
    if render is None:
        return str(row.to_dict())
    return render(row)

def build_corpus(con) -> pd.DataFrame:
    """Assemble the retrieval corpus from the curated clinical tables.

    Each table is read with a fixed SELECT; empty tables are skipped. Every row
    is rendered to one text line with ``make_text_from_row`` and the per-table
    frames are stacked into a single frame.

    Args:
        con: Connection-like object; ``con.execute(sql).df()`` must return a
            ``pandas.DataFrame``.

    Returns:
        DataFrame with columns ``doc_id`` (``"<table>:<row_id>"``), ``text``,
        ``source`` (table name), ``patient_id``, ``encounter_id``, ``code`` and
        ``ts`` (UTC timestamp parsed from the table's time column). Rows whose
        text is blank are dropped. If every table is empty, an empty frame with
        these columns is returned.
    """
    corpus_cols = ["doc_id","text","source","patient_id","encounter_id","code","ts"]
    # table name -> (query, column that supplies the row timestamp)
    table_specs = {
        "conditions_curated": (
            "SELECT condition_id AS row_id, patient_id, encounter_id, code, description, onset_date, abatement_date,clinical_status FROM conditions_curated",
            "onset_date",
        ),
        "encounters_curated": (
            "SELECT encounter_id AS row_id, patient_id, start_time, end_time, encounter_class, code, description,provider FROM encounters_curated",
            "start_time",
        ),
        "medications_curated": (
            "SELECT medication_id AS row_id, patient_id, encounter_id, code, description, start_date, end_date, status FROM medications_curated",
            "start_date",
        ),
        "observations_curated": (
            "SELECT observation_id AS row_id, patient_id, encounter_id, code, description, value, unit, observed_at FROM observations_curated",
            "observed_at",
        ),
        "procedures_curated": (
            "SELECT procedure_id AS row_id, patient_id, encounter_id, code, description, performed_at FROM procedures_curated",
            "performed_at",
        ),
        "timeline_events": (
            "SELECT event_id AS row_id, patient_id, event_time, event_type, description FROM timeline_events",
            "event_time",
        ),
    }

    frames = []
    for table, (sql, ts_col) in table_specs.items():
        df = con.execute(sql).df()
        if df.empty:
            continue
        # Same parser for every table so all `ts` values are UTC-aware.
        df["ts"] = safe_to_datetime(df[ts_col])
        df["source"] = table
        df["doc_id"] = table + ":" + df["row_id"].astype(str)
        # Not every table selects these columns; add them so all frames share one schema.
        for col in ("encounter_id", "code", "patient_id"):
            if col not in df.columns:
                df[col] = None
        df["text"] = df.apply(lambda r, table=table: make_text_from_row(table, r), axis=1)
        frames.append(df[corpus_cols])

    if not frames:
        # pd.concat raises on an empty list; return an empty corpus instead.
        return pd.DataFrame(columns=corpus_cols)

    corpus = pd.concat(frames, ignore_index=True)
    has_text = corpus["text"].str.strip() != ""
    return corpus[has_text].reset_index(drop=True)

# Open the database and build the retrieval corpus that later cells index and search.
# NOTE(review): `con` is never closed in the visible cells — confirm whether later cells still need it.
con = connect_duckdb(DB_PATH)
corpus = build_corpus(con)
print("Corpus size:", len(corpus))
# Last expression of the cell: previews the first three corpus rows.
corpus.head(3)


Corpus size: 245404


Unnamed: 0,doc_id,text,source,patient_id,encounter_id,code,ts
0,conditions_curated:cond_1,CONDITION: Received higher education (finding)...,conditions_curated,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,8151e1b2-5789-578d-3e3a-d1d9b7c142ee,224299000,2005-03-16 00:00:00+00:00
1,conditions_curated:cond_2,CONDITION: Has a criminal record (finding) | C...,conditions_curated,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,6dce0dad-f85f-8a26-9583-f1bcdde4efc1,266948004,2006-03-22 00:00:00+00:00
2,conditions_curated:cond_3,CONDITION: Misuses drugs (finding) | Code: 361...,conditions_curated,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,6dce0dad-f85f-8a26-9583-f1bcdde4efc1,361055000,2006-03-22 00:00:00+00:00


# 3. Build BM25 Index

Tokenize the documents and build a BM25 index for **keyword-based retrieval**.

In [7]:

# Runs of ASCII lowercase letters and digits; everything else separates tokens.
_TOKEN_PATTERN = re.compile(r"[a-z0-9]+")


def simple_tokenize(text: str) -> List[str]:
    """Lowercase ``text`` and split it into alphanumeric tokens."""
    lowered = text.lower()
    return _TOKEN_PATTERN.findall(lowered)

def build_bm25(texts: List[str]):
    """Tokenize ``texts`` and fit a BM25 ranker on them.

    Returns:
        A ``(ranker, token_lists)`` pair: the fitted ``BM25Okapi`` object and
        the per-document token lists it was built from, in input order.
    """
    token_lists = list(map(simple_tokenize, texts))
    ranker = BM25Okapi(token_lists)
    return ranker, token_lists

# Fit the keyword index over every corpus text. Row i of `corpus` is document i
# of `bm25`; `tokenized` holds the matching per-document token lists.
bm25, tokenized = build_bm25(corpus["text"].tolist())
print("BM25 built on", len(corpus), "documents")


BM25 built on 245404 documents


# 4. Build FAISS Index with BioClinicalBERT

- **BioClinicalBERT** (pretrained Hugging Face model) with mean pooling
- Generate sentence embeddings for all documents
- Build a FAISS index for **semantic retrieval**

In [9]:
from sentence_transformers import SentenceTransformer, models
import numpy as np, faiss
from tqdm import tqdm

BIOCLINICALBERT = "emilyalsentzer/Bio_ClinicalBERT"
# Expected embedding width for this checkpoint. The index below is sized from
# the embeddings actually produced, so this is informational only.
FORCED_DIM = 768

# 1. Load BioClinicalBERT backbone
we = models.Transformer(BIOCLINICALBERT)

# 2. Add pooling (mean over tokens → sentence embedding)
pool = models.Pooling(
    we.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

# 3. Build final SentenceTransformer with explicit `modules=`
model = SentenceTransformer(modules=[we, pool])
print("✅ Using BioClinicalBERT, dim =", model.get_sentence_embedding_dimension())

# 4. Encode texts in chunks of BATCH_SIZE so tqdm can report progress
texts = corpus["text"].tolist()
all_embs = []
for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Embedding (BioClinicalBERT)", dynamic_ncols=True, leave=True, position=0 ):
    batch = texts[i:i+BATCH_SIZE]
    embs = model.encode(
        batch,
        batch_size=min(BATCH_SIZE, 32),
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=False
    ).astype("float32")
    all_embs.append(embs)

embeddings = np.vstack(all_embs)
# L2-normalise rows so inner product equals cosine similarity. The floor keeps
# an all-zero embedding at zero instead of turning it into NaNs.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings /= np.maximum(norms, 1e-12)

# 5. Build FAISS index, sized from the data rather than a hard-coded width
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

print("Embeddings shape:", embeddings.shape)


✅ Using BioClinicalBERT, dim = 768


Embedding (BioClinicalBERT): 100%|██████████| 3835/3835 [12:05<00:00,  5.29it/s]


Embeddings shape: (245404, 768)


# 5. Hybrid Search Function

- Combine **BM25 scores** and **embedding similarities**
- Apply optional filters (patient, source, code, time range)
- Return top results

In [10]:

from dataclasses import dataclass

@dataclass
class SearchFilters:
    """Optional restrictions that ``prefilter_indices`` applies to the corpus.

    Every field defaults to ``None``, which leaves that dimension unrestricted.
    """
    # Keep only rows whose `source` is in this list (an empty list means "no filter").
    sources: Optional[List[str]] = None
    # Keep only rows whose `patient_id` equals this value.
    patient_id: Optional[str] = None
    # Keep only rows whose `code` equals this value.
    code: Optional[str] = None
    # Inclusive lower bound on the row timestamp `ts`; rows without a timestamp are excluded.
    date_from: Optional[pd.Timestamp] = None
    # Inclusive upper bound on the row timestamp `ts`; rows without a timestamp are excluded.
    date_to: Optional[pd.Timestamp] = None

def normalize_scores(vals: np.ndarray) -> np.ndarray:
    """Min-max rescale ``vals`` into the range [0, 1].

    An empty input is returned unchanged. When all values are (numerically)
    equal there is no spread to scale by, so every entry becomes 0.5.
    """
    if not len(vals):
        return vals
    lo = float(np.min(vals))
    hi = float(np.max(vals))
    spread = hi - lo
    if abs(spread) < 1e-12:
        return 0.5 * np.ones_like(vals)
    return (vals - lo) / (spread + 1e-12)

def _as_utc_timestamp(value) -> pd.Timestamp:
    """Coerce ``value`` to a UTC-aware ``pd.Timestamp`` (naive inputs are taken to be UTC)."""
    stamp = pd.Timestamp(value)
    if stamp.tzinfo is None:
        return stamp.tz_localize("UTC")
    return stamp.tz_convert("UTC")


def prefilter_indices(corpus: pd.DataFrame, flt: "SearchFilters") -> np.ndarray:
    """Return the positional row numbers of ``corpus`` that satisfy ``flt``.

    Args:
        corpus: Frame with ``source``, ``patient_id``, ``code`` and ``ts`` columns.
        flt: Filter settings. ``sources``, ``patient_id`` and ``code`` are
            skipped when falsy; ``date_from`` / ``date_to`` are skipped when
            ``None``. Date bounds are inclusive and may be tz-naive (treated as
            UTC) or tz-aware.

    Returns:
        Sorted integer array of positions (suitable for ``corpus.iloc`` and for
        indexing arrays that are row-aligned with ``corpus``). Rows with a
        missing ``ts`` never pass a date bound.
    """
    # Plain boolean array: positions, not labels, so the result is correct
    # whatever index `corpus` carries.
    mask = np.ones(len(corpus), dtype=bool)

    def _apply(condition: pd.Series) -> None:
        # Missing comparisons (e.g. from nullable dtypes) count as "no match".
        mask[:] &= condition.to_numpy(dtype=bool, na_value=False)

    if flt.sources:
        _apply(corpus["source"].isin(flt.sources))
    if flt.patient_id:
        _apply(corpus["patient_id"] == flt.patient_id)
    if flt.code:
        _apply(corpus["code"] == flt.code)
    if flt.date_from is not None or flt.date_to is not None:
        ts = pd.to_datetime(corpus["ts"], utc=True)
        # Comparisons against NaT are False, so undated rows drop out here.
        if flt.date_from is not None:
            _apply(ts >= _as_utc_timestamp(flt.date_from))
        if flt.date_to is not None:
            _apply(ts <= _as_utc_timestamp(flt.date_to))
    return np.flatnonzero(mask)

def faiss_topN(query_emb: np.ndarray, topN: int = 100) -> Tuple[np.ndarray, np.ndarray]:
    """Search the global FAISS ``index`` for the ``topN`` nearest entries.

    Args:
        query_emb: Query vector; it is reshaped to a single float32 row.
        topN: Number of results requested from the index.

    Returns:
        ``(ids, scores)`` for the single query, in the order FAISS returns them.
    """
    query_row = query_emb.reshape(1, -1).astype("float32")
    scores, ids = index.search(query_row, topN)
    return ids[0], scores[0]

def bm25_topN(query: str, topN: int = 100) -> Tuple[np.ndarray, np.ndarray]:
    """Score every document against ``query`` with BM25 and keep the best ``topN``.

    Returns:
        ``(doc_positions, scores)``, highest score first.
    """
    query_tokens = simple_tokenize(query)
    all_scores = bm25.get_scores(query_tokens)
    ranked = np.argsort(all_scores)[::-1]
    best = ranked[:topN]
    return best, all_scores[best]

# Encode queries with the very model that embedded the corpus. Loading the
# checkpoint a second time doubles the memory footprint and risks the query
# encoder drifting from the document encoder (e.g. a different pooling setup).
query_model = model

def _top_positions(scores: np.ndarray, n: int) -> np.ndarray:
    """Return the positions of the ``n`` largest entries of ``scores``, best first."""
    return np.argsort(scores)[::-1][:n]


def hybrid_search(query: str, k: int = 10, alpha: float = 0.65, flt: Optional[SearchFilters] = None,
                  faiss_candidates: int = 200, bm25_candidates: int = 200) -> pd.DataFrame:
    """Rank corpus rows for ``query`` by blending embedding and BM25 scores.

    Candidates are the union of the best ``faiss_candidates`` rows by embedding
    similarity and the best ``bm25_candidates`` rows by BM25. Both score sets
    are min-max normalised over the candidates and combined as
    ``alpha * emb + (1 - alpha) * bm25``.

    Args:
        query: Free-text query.
        k: Number of rows to return.
        alpha: Weight of the embedding score; BM25 gets ``1 - alpha``.
        flt: Optional filters. When given, candidates are drawn only from rows
            that pass the filters, so a restrictive filter (e.g. one patient)
            still yields that subset's best matches.
        faiss_candidates: Embedding-side candidate pool size.
        bm25_candidates: BM25-side candidate pool size.

    Returns:
        Up to ``k`` rows sorted by ``score`` (descending) with columns
        ``score, emb, bm25, doc_id, source, patient_id, encounter_id, code, ts,
        snippet, text``. ``emb`` and ``bm25`` are the raw scores; ``snippet`` is
        the first 280 characters of ``text``. An empty frame with the same
        columns is returned when nothing matches.
    """
    result_cols = ["score","emb","bm25","doc_id","source","patient_id","encounter_id","code","ts","snippet","text"]

    q_emb = query_model.encode([query], convert_to_numpy=True, normalize_embeddings=False).astype("float32")[0]
    q_emb = q_emb / (np.linalg.norm(q_emb) + 1e-12)

    # BM25 scoring touches every document, so do it exactly once per query.
    bm25_scores = np.asarray(bm25.get_scores(simple_tokenize(query)))

    if flt is None:
        dense_idx, _ = faiss_topN(q_emb, topN=faiss_candidates)
        # FAISS pads with -1 when it has fewer than topN vectors; as an array
        # index -1 would silently select the last corpus row.
        dense_idx = dense_idx[dense_idx >= 0]
        sparse_idx = _top_positions(bm25_scores, bm25_candidates)
    else:
        allowed = prefilter_indices(corpus, flt)
        if len(allowed) == 0:
            return pd.DataFrame(columns=result_cols)
        # Choose candidates inside the allowed subset. Filtering the global
        # top-N afterwards would discard most (often all) of the subset's hits.
        all_sims = embeddings @ q_emb
        dense_idx = allowed[_top_positions(all_sims[allowed], faiss_candidates)]
        sparse_idx = allowed[_top_positions(bm25_scores[allowed], bm25_candidates)]

    cand = np.unique(np.concatenate([dense_idx, sparse_idx], axis=0)).astype(int)
    if len(cand) == 0:
        return pd.DataFrame(columns=result_cols)

    emb_sims = (embeddings[cand] @ q_emb.reshape(-1,1)).ravel()
    bm25_sels = bm25_scores[cand]
    final = alpha*normalize_scores(emb_sims) + (1.0-alpha)*normalize_scores(bm25_sels)

    out = corpus.iloc[cand].copy().reset_index(drop=True)
    out["emb"] = emb_sims
    out["bm25"] = bm25_sels
    out["score"] = final
    out = out.sort_values("score", ascending=False).head(k)
    out["snippet"] = out["text"].str.slice(0,280)
    return out[result_cols]




# 6. Test Queries

In [11]:

# Smoke-test queries run against the hybrid index.
tests = [
    "tetanus antitoxin injection",
    "body temperature and pain severity",
    "acetaminophen oral tablet",
    "general examination of patient",
    "suture open wound procedure",
]
# Show the top 5 hits per query without filters; the full `text` column is
# dropped because `snippet` already carries its first 280 characters.
# `display` is not imported in the visible cells; it is expected to come from
# the notebook environment.
for q in tests:
    print("\n=== Query:", q)
    res = hybrid_search(q, k=5, alpha=0.65)
    display(res.drop(columns=["text"]).reset_index(drop=True))



=== Query: tetanus antitoxin injection


Unnamed: 0,score,emb,bm25,doc_id,source,patient_id,encounter_id,code,ts,snippet
0,0.860117,0.849317,26.519821,timeline_events:proc_2,timeline_events,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,,,2016-04-11 07:31:41+00:00,TIMELINE EVENT: Injection of tetanus antitoxin...
1,0.860084,0.849313,26.519821,timeline_events:proc_7375,timeline_events,ae05f1fa-7913-f7bc-41bd-2dc8827555e7,,,2017-12-29 21:38:14+00:00,TIMELINE EVENT: Injection of tetanus antitoxin...
2,0.836458,0.846397,26.519821,timeline_events:proc_7226,timeline_events,1f0ca842-8c2d-a943-c047-dafce690f5a2,,,2022-12-08 08:55:05+00:00,TIMELINE EVENT: Injection of tetanus antitoxin...
3,0.833806,0.84607,26.519821,timeline_events:proc_15746,timeline_events,28f107b5-e973-ece3-b762-c2dbd9a01ba8,,,2024-05-23 01:14:18+00:00,TIMELINE EVENT: Injection of tetanus antitoxin...
4,0.818731,0.842187,27.822274,procedures_curated:proc_2,procedures_curated,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,87741be8-2223-14c5-375d-976414fcd9aa,384700001.0,2016-04-11 07:31:41+00:00,PROCEDURE: Injection of tetanus antitoxin (pro...



=== Query: body temperature and pain severity


Unnamed: 0,score,emb,bm25,doc_id,source,patient_id,encounter_id,code,ts,snippet
0,0.878304,0.82808,7.288429,observations_curated:obs_34068,observations_curated,e5ed5bc3-51e1-a9a7-01fb-f66b8ac4045d,99a10465-1395-60d1-e572-6c4b1907f55d,75893-8,2014-08-05 05:21:41+00:00,OBSERVATION: Pain severity in the past week - ...
1,0.877023,0.827963,7.288429,observations_curated:obs_34064,observations_curated,e5ed5bc3-51e1-a9a7-01fb-f66b8ac4045d,3554af2d-e6c4-82bb-2cc4-5774e45dfbd7,75893-8,2014-05-27 04:21:41+00:00,OBSERVATION: Pain severity in the past week - ...
2,0.861664,0.826556,7.288429,observations_curated:obs_34060,observations_curated,e5ed5bc3-51e1-a9a7-01fb-f66b8ac4045d,1b62477a-a1a5-c5d0-4367-ace58be2e280,75893-8,2014-03-28 04:21:41+00:00,OBSERVATION: Pain severity in the past week - ...
3,0.859792,0.826384,7.288429,observations_curated:obs_34014,observations_curated,e5ed5bc3-51e1-a9a7-01fb-f66b8ac4045d,af7c5c86-a7f9-db3b-1d68-755a54861dc7,75893-8,2014-02-02 08:21:41+00:00,OBSERVATION: Pain severity in the past week - ...
4,0.85957,0.815216,11.173488,observations_curated:obs_47752,observations_curated,787f9e8e-d3a4-0407-55d1-01a3414fceaf,7174fc0d-87bd-f276-be37-1789719a62c9,8310-5,2020-12-09 16:57:40+00:00,OBSERVATION: Body temperature | Value: 41.8 Ce...



=== Query: acetaminophen oral tablet


Unnamed: 0,score,emb,bm25,doc_id,source,patient_id,encounter_id,code,ts,snippet
0,0.952923,0.884335,11.831079,timeline_events:med_1838,timeline_events,e5ed5bc3-51e1-a9a7-01fb-f66b8ac4045d,,,2014-10-26 00:00:00+00:00,TIMELINE EVENT: Acetaminophen 325 MG / Oxycodo...
1,0.943499,0.883721,11.831079,timeline_events:med_221,timeline_events,641efcda-7397-4172-c6ac-8231342fa53e,,,2017-01-10 00:00:00+00:00,TIMELINE EVENT: Acetaminophen 325 MG / Oxycodo...
2,0.931882,0.882964,11.831079,timeline_events:med_4266,timeline_events,5fda1015-d0a5-e32d-d0b8-4662e6ce6c2b,,,2016-01-21 00:00:00+00:00,TIMELINE EVENT: Acetaminophen 325 MG / Oxycodo...
3,0.928626,0.882752,11.831079,timeline_events:med_4427,timeline_events,5fda1015-d0a5-e32d-d0b8-4662e6ce6c2b,,,2024-10-10 00:00:00+00:00,TIMELINE EVENT: Acetaminophen 325 MG / Oxycodo...
4,0.927255,0.882663,11.831079,timeline_events:med_227,timeline_events,641efcda-7397-4172-c6ac-8231342fa53e,,,2021-05-11 00:00:00+00:00,TIMELINE EVENT: Acetaminophen 325 MG / Oxycodo...



=== Query: general examination of patient


Unnamed: 0,score,emb,bm25,doc_id,source,patient_id,encounter_id,code,ts,snippet
0,0.870653,0.798585,14.809222,timeline_events:0c5a352c-a83c-34b0-42fb-6ad3a8...,timeline_events,b9bacf2f-7027-2e05-fa5b-19167071fdde,,,2023-12-15 10:00:50+00:00,TIMELINE EVENT: General examination of patient...
1,0.854851,0.798055,14.809222,timeline_events:b580f8ad-d16a-e254-afe1-c63e64...,timeline_events,b9bacf2f-7027-2e05-fa5b-19167071fdde,,,2015-10-30 10:00:50+00:00,TIMELINE EVENT: General examination of patient...
2,0.846317,0.797768,14.809222,timeline_events:f17d77c0-2b97-19df-cde2-161e3c...,timeline_events,033cccaf-bc92-3ddd-b64c-9ea45268a971,,,2021-02-13 03:15:47+00:00,TIMELINE EVENT: General examination of patient...
3,0.841051,0.797591,14.809222,timeline_events:affd9222-4e10-e9d8-6e31-2b9e73...,timeline_events,f49221bb-20fb-45cb-9345-09b6a83ae9de,,,2020-07-31 00:53:40+00:00,TIMELINE EVENT: General examination of patient...
4,0.831188,0.79726,14.809222,timeline_events:d15c7e8f-0ac7-8705-8c1b-d362b2...,timeline_events,033cccaf-bc92-3ddd-b64c-9ea45268a971,,,2025-03-08 03:15:47+00:00,TIMELINE EVENT: General examination of patient...



=== Query: suture open wound procedure


Unnamed: 0,score,emb,bm25,doc_id,source,patient_id,encounter_id,code,ts,snippet
0,1.0,0.866081,32.205144,procedures_curated:proc_5991,procedures_curated,033cccaf-bc92-3ddd-b64c-9ea45268a971,bd374338-7f51-3219-3eba-276d0a62e82b,288086009,2020-01-30 02:00:11+00:00,PROCEDURE: Suture open wound (procedure) | Cod...
1,0.997735,0.865855,32.205144,procedures_curated:proc_3,procedures_curated,8c8e1c9a-b310-43c6-33a7-ad11bad21c40,0d6b6a0a-8f84-c860-ba6f-b241d14f55bf,288086009,2018-02-24 06:48:33+00:00,PROCEDURE: Suture open wound (procedure) | Cod...
2,0.985516,0.864633,32.205144,procedures_curated:proc_15491,procedures_curated,70775c58-59fb-a3db-9858-1d427567c195,eac559cb-da69-cb48-32e9-8547ee88df02,288086009,2019-03-29 03:45:42+00:00,PROCEDURE: Suture open wound (procedure) | Cod...
3,0.979929,0.864074,32.205144,procedures_curated:proc_6837,procedures_curated,006c29d1-d868-3a9e-ceab-31f23e398f45,40835bfb-9439-d4ea-7384-320a102aa656,288086009,2018-06-08 09:18:17+00:00,PROCEDURE: Suture open wound (procedure) | Cod...
4,0.974174,0.863498,32.205144,procedures_curated:proc_3039,procedures_curated,6754b3bf-f5ac-f359-fef6-87cf4b8508ab,0b9195ac-948e-33b2-5aed-96047268b7c4,288086009,2017-05-23 15:04:00+00:00,PROCEDURE: Suture open wound (procedure) | Cod...
