# Unified Clinical Corpus → Chroma Pipeline

This notebook loads the first 1,000 rows from `medical_data.csv`, `patient_notes.csv`, `PMC-Patients.csv`, and `pubmed_dataset.csv`, cleans and de-identifies every note, merges all text into a single master document, and stores both the full document and LangChain-tokenised passages inside a Chroma vector database backed by BioClinicalBERT embeddings.

Workflow outline:
1. **Cleaning & Normalisation** – remove noise/special symbols, expand UMLS-style medical abbreviations, lowercase, and redact PHI markers for HIPAA/GDPR compliance.
2. **LangChain Tokenisation & BioClinicalBERT Embeddings** – build a master document, measure token counts with a Hugging Face tokenizer, and prepare embeddings with a PubMed/BioBERT-family model.
3. **Chunking & ChromaDB Storage** – split the master document into ~100–300 word passages using `RecursiveCharacterTextSplitter`, then insert both the full document and the chunks into a Chroma collection (metadata includes source, chunk index, timestamps).
4. **Final Outputs** – persist the cleaned corpus, chunk metadata, Chroma persistence directory, and demonstrate a sample semantic query for RAG readiness.



In [7]:
import os
import re
import shutil
import subprocess
import sys
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

import nltk
nltk.download('punkt', quiet=True)

import spacy
from scispacy.abbreviation import AbbreviationDetector

from transformers import AutoTokenizer

# Minimal robust import handling for langchain / langchain_community variants.
# Try recommended variants, pip install only if both fail.
try:
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.vectorstores import Chroma
    from langchain_community.embeddings import HuggingFaceEmbeddings
except ImportError:
    try:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'langchain', 'langchain-community', 'sentence-transformers'])
    except Exception as install_exc:
        raise ImportError(
            f"Could not install langchain/langchain-community: {install_exc}"
        )
    try:
        from langchain.text_splitter import RecursiveCharacterTextSplitter
        from langchain_community.vectorstores import Chroma
        from langchain_community.embeddings import HuggingFaceEmbeddings
    except ImportError:
        # Try alternative import style after structure change around 2023/2024
        try:
            from langchain_text_splitters import RecursiveCharacterTextSplitter
            from langchain_community.vectorstores import Chroma
            from langchain_community.embeddings import HuggingFaceEmbeddings
        except ImportError:
            try:
                from langchain_community.text_splitters import RecursiveCharacterTextSplitter
                from langchain_community.vectorstores import Chroma
                from langchain_community.embeddings import HuggingFaceEmbeddings
            except ImportError:
                raise ImportError(
                    "Could not import RecursiveCharacterTextSplitter, Chroma, and HuggingFaceEmbeddings "
                    "from any known langchain or langchain_community package structure. "
                    "Please check your installation of langchain, langchain-community, and langchain-text-splitters."
                )


Defaulting to user installation because normal site-packages is not writeable


2025-11-26 16:32:42.389388: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764154962.468755    4934 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764154962.490220    4934 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764154962.642863    4934 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764154962.642885    4934 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764154962.642888    4934 computation_placer.cc:177] computation placer alr

RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

## Paths & Dataset Configuration

We pull all source data from `data/raw/` and constrain to the first 1,000 rows per file to keep the demonstration tractable.


In [None]:
DATA_ROOT = Path('/home/root495/Inexture/CDSS-RAG/data/raw')
OUTPUT_ROOT = Path('/home/root495/Inexture/CDSS-RAG/data/processed')
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

DATASETS = {
    'medical_data': {'path': DATA_ROOT / 'medical_data.csv', 'text_column': 'TEXT'},
    'patient_notes': {'path': DATA_ROOT / 'patient_notes.csv', 'text_column': 'pn_history'},
    'pmc_patients': {'path': DATA_ROOT / 'PMC-Patients.csv', 'text_column': 'patient'},
    'pubmed': {'path': DATA_ROOT / 'pubmed_dataset.csv', 'text_column': 'contents'}
}

MAX_ROWS = 1000
DATASETS


{'medical_data': {'path': PosixPath('/home/root495/Inexture/CDSS-RAG/data/raw/medical_data.csv'),
  'text_column': 'TEXT'},
 'patient_notes': {'path': PosixPath('/home/root495/Inexture/CDSS-RAG/data/raw/patient_notes.csv'),
  'text_column': 'pn_history'},
 'pmc_patients': {'path': PosixPath('/home/root495/Inexture/CDSS-RAG/data/raw/PMC-Patients.csv'),
  'text_column': 'patient'},
 'pubmed': {'path': PosixPath('/home/root495/Inexture/CDSS-RAG/data/raw/pubmed_dataset.csv'),
  'text_column': 'contents'}}

## Load First 1,000 Rows Per Dataset

Each dataset is truncated with `head(1000)` and tagged with `source` plus `source_row_id` for downstream metadata.


In [None]:
raw_dfs = {}
for name, cfg in DATASETS.items():
    df = pd.read_csv(cfg['path']).head(MAX_ROWS)
    df['source'] = name
    df['source_row_id'] = df.index
    raw_dfs[name] = df

{k: len(v) for k, v in raw_dfs.items()}


{'medical_data': 744,
 'patient_notes': 1000,
 'pmc_patients': 1000,
 'pubmed': 1000}

In [None]:
{name: df[[DATASETS[name]['text_column']]].head(2) for name, df in raw_dfs.items()}


## Cleaning & Normalisation

We remove noisy symbols, standardise spacing/case, expand common medical abbreviations (UMLS-inspired) via ScispaCy’s abbreviation detector, and redact PHI markers such as emails, phone numbers, MRNs, and dates.


In [None]:
NLP_MODEL = 'en_core_sci_sm'
nlp = spacy.load(NLP_MODEL, disable=['ner'])
nlp.add_pipe('abbreviation_detector')

UMLS_ABBREV_MAP = {
    'HTN': 'hypertension',
    'DM': 'diabetes mellitus',
    'SOB': 'shortness of breath',
    'CAD': 'coronary artery disease',
    'COPD': 'chronic obstructive pulmonary disease',
    'CHF': 'congestive heart failure',
    'Pt': 'patient',
    'BP': 'blood pressure',
    'HR': 'heart rate',
    'c/o': 'complains of'
}

PHI_PATTERNS = {
    'emails': re.compile(r'\b[\w.-]+@[\w.-]+\.[A-Za-z]{2,}\b'),
    'phones': re.compile(r'(?:\+?\d{1,2}[ -]?)?(?:\(\d{3}\)|\d{3})[ -]?\d{3}[ -]?\d{4}'),
    'dates': re.compile(r'\b(?:\d{1,2}[/-]){2}\d{2,4}\b'),
    'mrn': re.compile(r'\b(?:mrn|patient id)\s*[:#]?\s*\d+\b', re.IGNORECASE),
    'names': re.compile(r'\b([A-Z][a-z]+\s[A-Z][a-z]+)\b')
}


def normalize_whitespace(text: str) -> str:
    return re.sub(r'\s+', ' ', str(text)).strip()


def remove_special_chars(text: str) -> str:
    return re.sub(r"[^0-9A-Za-z.,;:!?%\-\s'\/]", ' ', text)


def expand_abbreviations(text: str) -> str:
    doc = nlp(text)
    expanded = text
    for abrv in doc._.abbreviations:
        key = abrv.text.strip()
        if key in UMLS_ABBREV_MAP:
            expanded = re.sub(rf'\b{re.escape(key)}\b', UMLS_ABBREV_MAP[key], expanded)
    for short, long in UMLS_ABBREV_MAP.items():
        expanded = re.sub(rf'\b{re.escape(short)}\b', long, expanded, flags=re.IGNORECASE)
    return expanded


def deidentify(text: str) -> str:
    redacted = text
    for pattern in PHI_PATTERNS.values():
        redacted = pattern.sub('[REDACTED]', redacted)
    return redacted


def clean_text(text: str) -> str:
    text = normalize_whitespace(text)
    text = remove_special_chars(text)
    text = expand_abbreviations(text)
    text = text.lower()
    text = deidentify(text)
    return text

cleaned_dfs = {}
for name, df in raw_dfs.items():
    text_col = DATASETS[name]['text_column']
    df = df.copy()
    df['clean_text'] = df[text_col].fillna('').apply(clean_text)
    cleaned_dfs[name] = df

{k: df[['clean_text']].head(1)['clean_text'].iloc[0][:120] for k, df in cleaned_dfs.items()}



  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
  global_matches = self.global_matcher(doc)
  global_matches = self.global_matcher(doc)
  global_matches = self.global_matcher(doc)
  global_matches = self.global_matcher(doc)


{'medical_data': 'admission date:    2162-3-3    discharge date:    2162-3-25    date of birth:    2080-1-4    sex: m service: medicine al',
 'patient_notes': "17-year-old male, has come to the student health clinic complaining of heart pounding. mr. cleveland's mother has given ",
 'pmc_patients': 'this 60-year-old male was hospitalized due to moderate ards from covid-19 with symptoms of fever, dry cough, and dyspnea',
 'pubmed': ' biochemical studies on camomile components/iii. in vitro studies about the antipeptic activity of  -- -alpha-bisabolol '}

## Combine Into a Single Master Document

All cleaned text segments are concatenated (respecting dataset order) into one long document; every downstream step operates on this unified corpus.


In [None]:
combined_df = pd.concat(cleaned_dfs.values(), ignore_index=True)
master_document = '\n\n'.join(combined_df['clean_text'].astype(str).tolist())
master_path = OUTPUT_ROOT / 'master_document.txt'
master_path.write_text(master_document, encoding='utf-8')

corpus_df = pd.DataFrame([
    {
        'source': 'combined_corpus',
        'source_row_id': 0,
        'clean_text': master_document,
        'char_len': len(master_document),
        'doc_path': str(master_path)
    }
])
corpus_df[['source', 'char_len', 'doc_path']]



Unnamed: 0,source,char_len
0,combined_corpus,12946090


In [None]:
## LangChain Tokenisation & BioClinicalBERT Embeddings

We rely on LangChain’s `RecursiveCharacterTextSplitter` with a Hugging Face tokenizer-backed length function to respect ~100–300 word segments. Embeddings come from a PubMed/BioBERT-derived SentenceTransformer (`pritamdeka/pubmedbert-base-embeddings`).


## BPE Tokenisation & BioClinicalBERT Embeddings

We train a fresh BPE tokenizer on the master document to satisfy the BPE requirement, then encode the unified text with `emilyalsentzer/Bio_ClinicalBERT` (WordPiece-compatible) to obtain dense embeddings.


In [None]:
EMBED_MODEL_NAME = 'pritamdeka/pubmedbert-base-embeddings'
embedding_model = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)

length_tokenizer = AutoTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')

def token_length(text: str) -> int:
    return len(length_tokenizer.encode(text, add_special_tokens=False))

splitter = RecursiveCharacterTextSplitter(
    chunk_size=320,  # ≈100–300 words
    chunk_overlap=60,
    length_function=token_length,
    separators=['\n\n', '\n', '. ', ' '],
    add_start_index=True
)

chunk_docs = splitter.create_documents(
    texts=[master_document],
    metadatas=[{'source': 'combined_corpus'}]
)
len(chunk_docs)








2025-11-26 16:03:11.644595: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764153191.731874    7933 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764153191.757037    7933 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764153191.912858    7933 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764153191.912883    7933 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764153191.912885    7933 computation_placer.cc:177] computation placer alr

Unnamed: 0,source,bpe_tokens
0,combined_corpus,"[admission, date, :, 2162, -, 3, -, 3, dischar..."


## Chunk Metadata

`chunk_docs` already contain LangChain `Document` objects with page content and start indices. We capture their metadata in a DataFrame for auditing before pushing everything into Chroma.

In [None]:
chunk_records = []
for idx, doc in enumerate(chunk_docs):
    chunk_text = doc.page_content
    chunk_records.append({
        'chunk_index': idx,
        'chunk_word_count': len(chunk_text.split()),
        'start_index': doc.metadata.get('start_index'),
        'source': doc.metadata.get('source', 'combined_corpus'),
        'chunk_text': chunk_text,
        'text_preview': chunk_text[:200]
    })

chunk_df = pd.DataFrame(chunk_records)
chunk_df.head(3)



In [None]:
## Store the Master Document and Chunks in Chroma

We reset the Chroma persistence directory so that each run writes a fresh collection, then add (1) the entire master document and (2) every LangChain chunk with rich metadata.


In [None]:
CHROMA_DIR = OUTPUT_ROOT / 'chroma_clinical'
if CHROMA_DIR.exists():
    shutil.rmtree(CHROMA_DIR)

vectorstore = Chroma(
    collection_name='clinical_master',
    embedding_function=embedding_model,
    persist_directory=str(CHROMA_DIR)
)

document_entry = {
    'granularity': 'document',
    'source': 'combined_corpus',
    'chunk_index': None,
    'chunk_word_count': len(master_document.split()),
    'timestamp': datetime.utcnow().isoformat()
}
vectorstore.add_texts(texts=[master_document], metadatas=[document_entry])

chunk_metadatas = []
for idx, doc in enumerate(chunk_docs):
    chunk_metadatas.append({
        'granularity': 'chunk',
        'source': doc.metadata.get('source', 'combined_corpus'),
        'chunk_index': idx,
        'chunk_word_count': len(doc.page_content.split()),
        'timestamp': datetime.utcnow().isoformat()
    })
vectorstore.add_texts(
    texts=[doc.page_content for doc in chunk_docs],
    metadatas=chunk_metadatas
)

vectorstore.persist()
vectorstore._collection.count()



## Final Corpus Preparation & Sample Retrieval

We persist the cleaned master document, chunk metadata table, and the Chroma persistence directory under `data/processed/`, then run a quick similarity search to verify the collection is ready for downstream RAG workloads.


In [None]:
chunk_metadata_path = OUTPUT_ROOT / 'chunk_metadata.parquet'
chunk_df.to_parquet(chunk_metadata_path, index=False)

processed_paths = {
    'master_document': str(master_path),
    'chunk_metadata': str(chunk_metadata_path),
    'chroma_dir': str(CHROMA_DIR)
}
processed_paths


In [None]:
query = "patient with hypertension and shortness of breath management"
results = vectorstore.similarity_search(query, k=3)
[
    {
        'rank': idx + 1,
        'chunk_index': doc.metadata.get('chunk_index'),
        'source': doc.metadata.get('source'),
        'preview': doc.page_content[:200] + '...'
    }
    for idx, doc in enumerate(results)
]
