In [None]:
import pandas as pd
import pickle
from pathlib import Path

In [None]:
# Root folder of the MIMIC sample CSVs. Kept as a Path so it can be joined with "/"
# directly; existing Path(DATA_DIR) call sites keep working unchanged.
DATA_DIR = Path("mimic_sample_1000")

In [None]:
# Load the admissions table plus every event table that is later looked up per admission.

def _read_table(filename, **read_kwargs):
    """Read one CSV from DATA_DIR, forwarding extra keyword arguments to ``pd.read_csv``."""
    return pd.read_csv(Path(DATA_DIR) / filename, low_memory=False, **read_kwargs)


def _merge_icd(events, definitions):
    """Left-join coded events to their ICD dictionary.

    Joins on ``icd_code`` plus ``icd_version`` whenever both frames carry the
    version column: the same code string can exist in more than one ICD
    revision, so a code-only join can attach the wrong title and duplicate
    event rows.
    """
    keys = [
        col for col in ("icd_code", "icd_version")
        if col in events.columns and col in definitions.columns
    ]
    return events.merge(definitions, on=keys, how="left")


# Read ICD codes as text so both sides of the join share a dtype and leading zeros survive.
ICD_CODE_DTYPE = {"icd_code": str}

# Load admissions first
admissions_df = _read_table(
    "admissions.csv_sample1000.csv", parse_dates=["admittime", "dischtime"]
)

# Initialize link_tables: table name -> DataFrame
link_tables = {}

# Diagnoses with ICD definitions
icd_dx = _read_table("d_icd_diagnoses.csv.csv", dtype=ICD_CODE_DTYPE)
dx = _read_table("diagnoses_icd.csv_sample1000.csv", dtype=ICD_CODE_DTYPE)
link_tables["diagnoses_icd"] = _merge_icd(dx, icd_dx)

# Procedures with ICD definitions
icd_proc = _read_table("d_icd_procedures.csv.csv", dtype=ICD_CODE_DTYPE)
pr = _read_table("procedures_icd.csv_sample1000.csv", dtype=ICD_CODE_DTYPE)
link_tables["procedures_icd"] = _merge_icd(pr, icd_proc)

# Lab events with definitions
lab_defs = _read_table("d_labitems.csv.csv")
link_tables["labevents"] = _read_table(
    "labevents.csv_sample1000.csv", parse_dates=["charttime", "storetime"]
).merge(lab_defs, on="itemid", how="left")

# Microbiology events with lab definitions
link_tables["microbiologyevents"] = _read_table(
    "microbiologyevents.csv_sample1000.csv",
    parse_dates=["charttime", "storetime", "chartdate", "storedate"],
).merge(lab_defs, left_on="test_itemid", right_on="itemid", how="left")

#! Note: The following tables are commented out as they are not used in the current context.
# # HCPCS events with definitions
# hcpcs_defs = pd.read_csv(Path(DATA_DIR) / "d_hcpcs.csv.csv", low_memory=False)
# hcp = pd.read_csv(Path(DATA_DIR) / "hcpcsevents.csv_sample1000.csv", low_memory=False)
# link_tables["hcpcsevents"] = (
#     hcp.merge(
#         hcpcs_defs,
#         left_on="hcpcs_cd",
#         right_on="code",
#         how="left",
#         suffixes=("", "_def")
#     )
#     .rename(columns={"short_description": "event_desc",
#                     "short_description_def": "code_desc"})
#     .drop(columns=["code"])
# )

# Provider info is loaded but not merged into any table below
# (the original author did not see the need for provider and services).
prov = _read_table("provider.csv.csv")

# Merging Prescriptions, POE, and EMAR into one wide table keyed by order and admission.
pres = _read_table("prescriptions.csv_sample1000.csv")
poe = _read_table("poe.csv_sample1000.csv", parse_dates=["ordertime"])
emar = _read_table("emar.csv_sample1000.csv", parse_dates=["charttime", "storetime"])

# NOTE(review): pandas matches null join keys to each other, so rows with a missing
# poe_id on both sides will pair up within an admission — confirm this is acceptable.
rx_join_keys = ["poe_id", "hadm_id"]
pres_poe = pres.merge(poe, on=rx_join_keys, how="left")
link_tables["prescriptions"] = pres_poe.merge(emar, on=rx_join_keys, how="left")

# Remaining tables are loaded as-is, without any dictionary or provider merge.
for tbl in ["transfers"]:
    df = _read_table(f"{tbl}.csv_sample1000.csv")
    link_tables[tbl] = df

# Group by hadm_id for constant‐time lookup; tables without that column are skipped.
grouped = {
    name: table.groupby("hadm_id")
    for name, table in link_tables.items()
    if "hadm_id" in table.columns
}

In [None]:
link_tables.keys()  # names of the tables loaded into link_tables

In [None]:
link_tables["labevents"].head()  # preview the lab events table after the d_labitems merge

In [None]:
# Pickle the loaded tables under <DATA_DIR>/exports. The folder is derived from
# DATA_DIR so the exports follow the data directory if it is ever reconfigured.
export_dir = Path(DATA_DIR) / "exports"
export_dir.mkdir(parents=True, exist_ok=True)

# File name -> object to pickle; grouped tables are exported for convenience.
exports = {
    "admissions_df.pkl": admissions_df,
    "link_tables.pkl": link_tables,
    "grouped_tables.pkl": grouped,
}
for filename, payload in exports.items():
    with open(export_dir / filename, "wb") as f:
        pickle.dump(payload, f)


In [None]:
link_tables["microbiologyevents"].columns  # column names of the merged microbiology events table

In [None]:
from langchain.schema import Document

def _safe(val, default="N/A"):
    """Return ``default`` for missing values and blank strings, otherwise ``val`` unchanged."""
    if pd.isna(val) or (isinstance(val, str) and not val.strip()):
        return default
    return val


def _fmt_ts(value, fmt="%Y-%m-%d %H:%M"):
    """Format a timestamp with ``fmt``; return "N/A" when the value is missing."""
    return value.strftime(fmt) if pd.notna(value) else "N/A"


def _icd_line(row):
    """One "code: title" line for a diagnosis or procedure row."""
    return f"{_safe(row.icd_code)}: {_safe(row.long_title)}"


def _lab_line(row):
    """One line describing a lab event: item, times, values, definition fields, flag."""
    chart_time = _fmt_ts(row.charttime)
    store_time = _fmt_ts(row.storetime)
    return (
        f"{_safe(row.itemid)}: {_safe(row.label)} - "
        f"(chart time: {chart_time} ~ store time: {store_time}) "
        f"{_safe(row.value)} {_safe(row.valuenum)} | "
        f"{_safe(row.label)} - {_safe(row.category)} - {_safe(row.fluid)} - {_safe(row.priority)} | "
        f"{_safe(row.flag)}"
    )


def _micro_line(row):
    """One line describing a microbiology event: test, specimen, times/dates, comments."""
    chart_time = _fmt_ts(row.charttime)
    store_time = _fmt_ts(row.storetime)
    chart_date = _fmt_ts(row.chartdate, "%Y-%m-%d")
    store_date = _fmt_ts(row.storedate, "%Y-%m-%d")
    return (
        f"{_safe(row.test_itemid)}: {_safe(row.test_name)} - {_safe(row.spec_type_desc)} "
        f"(chart time: {chart_time} ~ store time: {store_time} "
        f"~ chart date: {chart_date} ~ store date: {store_date}) | "
        f"{_safe(row.comments)}"
    )


def _rx_line(row):
    """One line combining the prescription, its order, and its administration record."""
    order_time = _fmt_ts(row.ordertime)
    chart_time = _fmt_ts(row.charttime)
    return (
        f"{_safe(row.drug_type)} ({_safe(row.drug)}) - {_safe(row.formulary_drug_cd)} "
        f"{_safe(row.dose_unit_rx)} {_safe(row.dose_val_rx)} {_safe(row.prod_strength)} | "
        f"{_safe(row.doses_per_24_hrs)} doses/24hrs | Order at {order_time} ({_safe(row.order_type)}, {_safe(row.order_status)}) | "
        f"Administered: {_safe(row.medication)} at {chart_time}"
    )


# (key in `grouped`, metadata "section" value, heading line, per-row line builder)
_SECTIONS = (
    ("diagnoses_icd", "diagnoses", "Diagnoses (ICD):", _icd_line),
    ("procedures_icd", "procedures", "Procedures (ICD):", _icd_line),
    ("labevents", "labs", "Labs:", _lab_line),
    ("microbiologyevents", "microbiology", "Microbiology:", _micro_line),
    ("prescriptions", "prescriptions", "Combined Prescriptions, Orders, and Administration:", _rx_line),
)


def make_section_docs(adm_row, grouped):
    """Build one Document per clinical section for a single admission.

    Args:
        adm_row: One admissions row exposing ``hadm_id``, ``subject_id``,
            ``admittime``, ``dischtime``, ``admission_type`` and
            ``hospital_expire_flag`` as attributes.
        grouped: Mapping of table name -> that table grouped by ``hadm_id``.
            Tables missing from the mapping are skipped instead of raising.

    Returns:
        A list that always starts with the header Document, followed by one
        Document for each section that has rows for this admission. Every
        Document carries the admission metadata plus a ``section`` label.
    """
    hadm = adm_row.hadm_id
    subj = adm_row.subject_id
    base_meta = {
        "hadm_id": hadm,
        "subject_id": subj,
        "admittime": adm_row.admittime.isoformat() if pd.notna(adm_row.admittime) else "N/A",
        "dischtime": adm_row.dischtime.isoformat() if pd.notna(adm_row.dischtime) else "N/A",
        "admission_type": adm_row.admission_type
    }

    # — Header
    header = (
        f"Admission {hadm} (Subject {subj})\n"
        f"- Admitted: {adm_row.admittime}    Discharged: {adm_row.dischtime}\n"
        f"- Type: {adm_row.admission_type}    ExpireFlag: {adm_row.hospital_expire_flag}"
    )
    docs = [Document(page_content=header, metadata={**base_meta, "section": "header"})]

    for table, section, heading, build_line in _SECTIONS:
        # A table is absent from `grouped` when it was never loaded or has no hadm_id column.
        if table not in grouped:
            continue
        table_groups = grouped[table]
        if hadm not in table_groups.groups:
            continue
        lines = [build_line(row) for _, row in table_groups.get_group(hadm).iterrows()]
        docs.append(Document(
            page_content=heading + "\n" + "\n".join(lines),
            metadata={**base_meta, "section": section}
        ))
    return docs

# Flatten the per-admission section Documents into a single list, in admission order.
section_docs = [
    doc
    for _, adm in admissions_df.iterrows()
    for doc in make_section_docs(adm, grouped)
]

print(f"Emitted {len(section_docs)} small Documents.")


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split each section Document into chunks of at most 512 characters, overlapping by 50.
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
chunked_docs = splitter.split_documents(section_docs)

print(f"→ {len(chunked_docs)} total chunks ready for embedding.")

In [None]:
# Replace each chunk's metadata with only the fields that are filtered on downstream.
for d in chunked_docs:
    meta = d.metadata
    d.metadata = {
        "hadm_id": meta["hadm_id"],
        "subject_id": meta["subject_id"],
        "section": meta["section"],
        # make_section_docs stores the literal "N/A" for a missing timestamp;
        # errors="coerce" turns that into NaT instead of raising a parse error.
        "admittime": pd.to_datetime(meta["admittime"], errors="coerce"),
        "dischtime": pd.to_datetime(meta["dischtime"], errors="coerce"),
    }


In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Embedding model used for the quick dimensionality check in the next cell.
# NOTE: ../models/S-PubMedBert-MS-MARCO is written by the download cell further down,
# so that cell must have been run at least once before this one.
clinical_emb = SentenceTransformerEmbeddings(
    encode_kwargs=dict(batch_size=16),
    model_name="../models/S-PubMedBert-MS-MARCO",
)

In [None]:
# Smoke test: embed the first five chunks and report each vector's length.
texts = [chunk.page_content for chunk in chunked_docs[:5]]
vectors = clinical_emb.embed_documents(texts)
print(list(map(len, vectors)))  # should each be e.g. 768-dimensional

In [None]:
import pickle
# Cache the chunked documents on disk; several index-building cells below reload this pickle.
with open("../mimic_sample_1000/chunked_docs.pkl", "wb") as chunk_file:
    pickle.dump(chunked_docs, chunk_file)

Loading different embedding models

Model: all-MiniLM-L6-v2

In [None]:
from sentence_transformers import SentenceTransformer

# Download all-MiniLM-L6-v2 and keep a copy under ../models for the embedding cell below.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
local_model_dir = Path("../models/all-MiniLM-L6-v2")
local_model_dir.mkdir(parents=True, exist_ok=True)  # no-op when the folder already exists

model = SentenceTransformer(model_name)
model.save(str(local_model_dir))

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Point the LangChain embedding wrapper at the locally saved all-MiniLM-L6-v2 copy.
clinical_emb = SentenceTransformerEmbeddings(
    encode_kwargs=dict(batch_size=16),
    model_name="../models/all-MiniLM-L6-v2",
)

In [None]:
from langchain.vectorstores import FAISS
# Embed every chunk with the active model and persist the resulting FAISS index.
vectorstore = FAISS.from_documents(documents=chunked_docs, embedding=clinical_emb)
vectorstore.save_local("../vector_stores/faiss_mimic_sample1000_mini-lm")

Model: S-PubMedBert-MS-MARCO

In [None]:
from sentence_transformers import SentenceTransformer

# Download S-PubMedBert-MS-MARCO and keep a copy under ../models for the embedding cells.
model_name = "pritamdeka/S-PubMedBert-MS-MARCO"
local_model_dir = Path("../models/S-PubMedBert-MS-MARCO")
local_model_dir.mkdir(parents=True, exist_ok=True)  # no-op when the folder already exists

model = SentenceTransformer(model_name)
model.save(str(local_model_dir))


In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Point the LangChain embedding wrapper at the locally saved S-PubMedBert-MS-MARCO copy.
clinical_emb = SentenceTransformerEmbeddings(
    encode_kwargs=dict(batch_size=16),
    model_name="../models/S-PubMedBert-MS-MARCO",
)

In [None]:
from langchain.vectorstores import FAISS
# Embed every chunk with the active model and persist the resulting FAISS index.
vectorstore = FAISS.from_documents(documents=chunked_docs, embedding=clinical_emb)
vectorstore.save_local("../vector_stores/faiss_mimic_sample1000_ms-marco")

Model: static-retrieval-mrl-en-v1

In [None]:
from sentence_transformers import SentenceTransformer

# Download static-retrieval-mrl-en-v1 and keep a copy under ../models for the embedding cell below.
model_name = "sentence-transformers/static-retrieval-mrl-en-v1"
local_model_dir = Path("../models/static-retrieval-mrl-en-v1")
local_model_dir.mkdir(parents=True, exist_ok=True)  # no-op when the folder already exists

model = SentenceTransformer(model_name)
model.save(str(local_model_dir))

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Point the LangChain embedding wrapper at the locally saved static-retrieval-mrl-en-v1 copy.
clinical_emb = SentenceTransformerEmbeddings(
    encode_kwargs=dict(batch_size=16),
    model_name="../models/static-retrieval-mrl-en-v1",
)

In [None]:
from langchain.vectorstores import FAISS
# Embed every chunk with the active model and persist the resulting FAISS index.
vectorstore = FAISS.from_documents(documents=chunked_docs, embedding=clinical_emb)
vectorstore.save_local("../vector_stores/faiss_mimic_sample1000_static-retr")

Model: multi-qa-mpnet-base-cos-v1

In [None]:
from sentence_transformers import SentenceTransformer

# Download multi-qa-mpnet-base-cos-v1 and keep a copy under ../models.
model_name = "sentence-transformers/multi-qa-mpnet-base-cos-v1"
local_model_dir = Path("../models/multi-qa-mpnet-base-cos-v1")
local_model_dir.mkdir(parents=True, exist_ok=True)  # no-op when the folder already exists

model = SentenceTransformer(model_name)
model.save(str(local_model_dir))

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Load the locally saved multi-qa-mpnet-base-cos-v1 copy. The path must match the
# ../models folder the download cell saves to (it previously pointed at ./models).
clinical_emb = SentenceTransformerEmbeddings(
    model_name="../models/multi-qa-mpnet-base-cos-v1",
    encode_kwargs={"batch_size": 16}
)

In [None]:
from langchain.vectorstores import FAISS
# Reload the cached chunks, embed them with the active model, and persist the FAISS index.
with open("../mimic_sample_1000/chunked_docs.pkl", "rb") as chunk_file:
    chunked_docs = pickle.load(chunk_file)

vectorstore = FAISS.from_documents(documents=chunked_docs, embedding=clinical_emb)
vectorstore.save_local("../vector_stores/faiss_mimic_sample1000_multi-qa")

Model: BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext

In [None]:
from langchain.vectorstores import FAISS
import pickle
# Reload the chunked documents that an earlier cell pickled to disk.
with open("../mimic_sample_1000/chunked_docs.pkl", "rb") as chunk_file:
    chunked_docs = pickle.load(chunk_file)

# Use HuggingFaceEmbeddings for BiomedBERT
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

# Referenced by hub id rather than a local copy; runs on CPU and normalizes the embeddings.
clinical_emb = HuggingFaceEmbeddings(
    model_name="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

In [None]:
# Embed every chunk with the active model and persist the resulting FAISS index.
vectorstore = FAISS.from_documents(documents=chunked_docs, embedding=clinical_emb)
vectorstore.save_local("../vector_stores/faiss_mimic_sample1000_biomedbert")

Model: all-mpnet-base-v2

In [None]:
from sentence_transformers import SentenceTransformer

# Download all-mpnet-base-v2 and keep a copy under ../models for the embedding cell below.
model_name = "sentence-transformers/all-mpnet-base-v2"
local_model_dir = Path("../models/all-mpnet-base-v2")
local_model_dir.mkdir(parents=True, exist_ok=True)  # no-op when the folder already exists

model = SentenceTransformer(model_name)
model.save(str(local_model_dir))

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Point the LangChain embedding wrapper at the locally saved all-mpnet-base-v2 copy.
clinical_emb = SentenceTransformerEmbeddings(
    encode_kwargs=dict(batch_size=16),
    model_name="../models/all-mpnet-base-v2",
)

In [None]:
from langchain.vectorstores import FAISS
# Reload the cached chunks, embed them with the active model, and persist the FAISS index.
with open("../mimic_sample_1000/chunked_docs.pkl", "rb") as chunk_file:
    chunked_docs = pickle.load(chunk_file)

vectorstore = FAISS.from_documents(documents=chunked_docs, embedding=clinical_emb)
vectorstore.save_local("../vector_stores/faiss_mimic_sample1000_mpnet-v2")

Model: e5-base-v2

In [None]:
from sentence_transformers import SentenceTransformer

# Download e5-base-v2 and keep a copy under ../models for the embedding cell below.
model_name = "intfloat/e5-base-v2"
local_model_dir = Path("../models/e5-base-v2")
local_model_dir.mkdir(parents=True, exist_ok=True)  # no-op when the folder already exists

model = SentenceTransformer(model_name)
model.save(str(local_model_dir))

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Point the LangChain embedding wrapper at the locally saved e5-base-v2 copy.
# NOTE(review): e5 models are reported to expect "query: " / "passage: " text prefixes —
# confirm against the model card whether the chunks should be prefixed before indexing.
clinical_emb = SentenceTransformerEmbeddings(
    encode_kwargs=dict(batch_size=16),
    model_name="../models/e5-base-v2",
)

In [None]:
from langchain.vectorstores import FAISS
# Reload the cached chunks, embed them with the active model, and persist the FAISS index.
with open("../mimic_sample_1000/chunked_docs.pkl", "rb") as chunk_file:
    chunked_docs = pickle.load(chunk_file)

vectorstore = FAISS.from_documents(documents=chunked_docs, embedding=clinical_emb)
vectorstore.save_local("../vector_stores/faiss_mimic_sample1000_e5-base")

Model: TimKond/S-PubMedBert-MedQuAD

In [None]:
from sentence_transformers import SentenceTransformer

# Download S-PubMedBert-MedQuAD and keep a copy under ../models for the embedding cell below.
model_name = "TimKond/S-PubMedBert-MedQuAD"
local_model_dir = Path("../models/S-PubMedBert-MedQuAD")
local_model_dir.mkdir(parents=True, exist_ok=True)  # no-op when the folder already exists

model = SentenceTransformer(model_name)
model.save(str(local_model_dir))

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Point the LangChain embedding wrapper at the locally saved S-PubMedBert-MedQuAD copy.
clinical_emb = SentenceTransformerEmbeddings(
    encode_kwargs=dict(batch_size=16),
    model_name="../models/S-PubMedBert-MedQuAD",
)

In [None]:
from langchain.vectorstores import FAISS
# Reload the cached chunks, embed them with the active model, and persist the FAISS index.
with open("../mimic_sample_1000/chunked_docs.pkl", "rb") as chunk_file:
    chunked_docs = pickle.load(chunk_file)

vectorstore = FAISS.from_documents(documents=chunked_docs, embedding=clinical_emb)
vectorstore.save_local("../vector_stores/faiss_mimic_sample1000_MedQuAD")

Model: pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb

In [None]:
from sentence_transformers import SentenceTransformer

# Download the BioBERT NLI/STS sentence model and keep a copy under ../models.
model_name = "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb"
local_model_dir = Path("../models/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
local_model_dir.mkdir(parents=True, exist_ok=True)  # no-op when the folder already exists

model = SentenceTransformer(model_name)
model.save(str(local_model_dir))

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Point the LangChain embedding wrapper at the locally saved BioBERT NLI/STS copy.
clinical_emb = SentenceTransformerEmbeddings(
    encode_kwargs=dict(batch_size=16),
    model_name="../models/BioBERT-mnli-snli-scinli-scitail-mednli-stsb",
)

In [None]:
from langchain.vectorstores import FAISS
# Reload the cached chunks, embed them with the active model, and persist the FAISS index.
with open("../mimic_sample_1000/chunked_docs.pkl", "rb") as chunk_file:
    chunked_docs = pickle.load(chunk_file)

vectorstore = FAISS.from_documents(documents=chunked_docs, embedding=clinical_emb)
vectorstore.save_local("../vector_stores/faiss_mimic_sample1000_BioBERT")

Model: FremyCompany/BioLORD-2023-C

In [None]:
from sentence_transformers import SentenceTransformer

# Download BioLORD-2023-C and keep a copy under ../models for the embedding cell below.
model_name = "FremyCompany/BioLORD-2023-C"
local_model_dir = Path("../models/BioLORD-2023-C")
local_model_dir.mkdir(parents=True, exist_ok=True)  # no-op when the folder already exists

model = SentenceTransformer(model_name)
model.save(str(local_model_dir))

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Point the LangChain embedding wrapper at the locally saved BioLORD-2023-C copy.
clinical_emb = SentenceTransformerEmbeddings(
    encode_kwargs=dict(batch_size=16),
    model_name="../models/BioLORD-2023-C",
)

In [None]:
from langchain.vectorstores import FAISS
# Reload the cached chunks, embed them with the active model, and persist the FAISS index.
with open("../mimic_sample_1000/chunked_docs.pkl", "rb") as chunk_file:
    chunked_docs = pickle.load(chunk_file)

vectorstore = FAISS.from_documents(documents=chunked_docs, embedding=clinical_emb)
vectorstore.save_local("../vector_stores/faiss_mimic_sample1000_BioLORD")