# Enhancing Medical Report Findings with Retrieval-Augmented Generation (RAG): Integrating LLM Models and Chroma DB using the LangChain framework for searchable data.

### Installing the required libraries

In [1]:
!pip install "PyPDF2" "chromadb==0.4.0" "langchain==0.2.0" "langchain-community" "sentence-transformers" 

Collecting langchain==0.2.0
  Using cached langchain-0.2.0-py3-none-any.whl.metadata (13 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.14-py3-none-any.whl.metadata (2.9 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting pydantic<2.0,>=1.9 (from chromadb==0.4.0)
  Using cached pydantic-1.10.19-cp312-cp312-win_amd64.whl.metadata (153 kB)
Collecting numpy>=1.21.6 (from chromadb==0.4.0)
  Downloading numpy-2.2.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting tokenizers>=0.13.2 (from chromadb==0.4.0)
  Using cached tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting tqdm>=4.65.0 (from chromadb==0.4.0)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain==0.2.0)
  Using cached SQLAlchemy-2.0.36-cp312-cp312-win_amd64.whl.metadata (9.9 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain==0.2.0)
  Using cached aioht

### Reading PDF reports and remove sensitive data

In [1]:
from PyPDF2 import PdfReader
def read_pdf(file_path):
    # Initialize the reader for the PDF file
    
    reader = PdfReader(file_path)
    
    # Extract text from each page and store it in a variable
    pdf_text = ""
    for page in reader.pages:
        pdf_text += page.extract_text() + "\n"  # Adding a newline for separation between pages
  
    text = pdf_text.replace("\n", " ")
    
    return text
    #return page

In [2]:
import re
def remove_sensitive(text):
    text = re.sub(r"Patient\s+Name\s+:\s+.*?\s+(?=Patient\s+ID)", "", text)#removing patient name
    text = re.sub(r"Referring\s+Physician\s+.*?\s+Report", "Report", text)
    text = re.sub(r"Dr [A-Za-z\s]+MD\s*|Reg No:\s*\d+", "", text).strip()
    return text


### Creating field for json and removing other sensitive data

In [3]:
import re
import json
def text_json(extracted_data):
# Regex patterns to extract fields
    patterns = {
        "Patient ID": r"Patient\s+ID\s*:\s*(\S+)",
        "Date": r"Report\s+Date\s+/ Time\s+:\s+(\d{1,2}\s+\w+\s+\d{4})",
        "Time": r"(\d{2}:\d{2}:\d{2})",
        "Patient age": r"Patient\s+age\s+/\s+Sex\s+:\s+(\d{3}Y)",
        "Sex": r"Sex\s*:\s*\d{3}[A-Za-z]*\s*/\s*(\w)",
        "TECHNIQUE": r"TECHNIQUE\s*:\s*(.*?)\s*FINDINGS",
        "FINDINGS": r"FINDINGS\s*:\s*(.*?)\s*IMPRESSION",
        "IMPRESSION": r"IMPRESSION\s*:\s*(.*)"
    }
    
    # Extract fields using regex
    data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, extracted_data, re.DOTALL)
        if match:
            data[key] = match.group(1).strip()
    
    # removes the unwanted portion from the "FINDINGS" and "IMPRESSION" field while keeping the rest of the content intact.
    try:
        findings = data["FINDINGS"]
        impression = data["IMPRESSION"]
        updated_findings = re.sub(r"Patient\s+ID\s*:\s*(\S+).*Report\s+Date\s+/ Time\s+:\s+(\d{1,2}\s+\w+\s+\d{4})\s*(\S+)", "", findings).strip()
        updated_impression = re.sub(r"Patient\s+ID\s*:\s*(\S+).*Report\s+Date\s+/ Time\s+:\s+(\d{1,2}\s+\w+\s+\d{4})\s*(\S+)", "", impression).strip()

        # Update the Data
        data["FINDINGS"] = updated_findings
        data["IMPRESSION"] = updated_impression
    except:
        print("no unwanted found")
    # Update the JSON
    json_data = json.dumps(data, indent=4)
    
    return json_data
    

In [19]:
# Define the file path to the PDF file named "MO.pdf" in the current working directory
file_path = r"Data/MO.pdf"  # The 'r' prefix indicates a raw string to handle any special characters in the path

# Read the PDF file and extract its content as text
print(
    # Remove sensitive information like patient name, physician name, etc.
    text_json(
        remove_sensitive(
            read_pdf(file_path)  # Read PDF and extract text content
        )
    )
)
# Convert the sanitized text to a structured JSON format and print it

{
    "Patient ID": "MR0000444444",
    "Date": "20 April  2020",
    "Time": "10:16:04",
    "Patient age": "075Y",
    "Sex": "M",
    "TECHNIQUE": "T2 FSE Axials  / Sagittals  & Coronals.   T1 &T2  FLAIR  Axials.  DWI,  GRE  Axials.    3D TOF  MR Angiography  of Intracranial  Arteries.",
    "FINDINGS": "Large wedge shaped lesion with restricted diffusion low ADC values and hyperintensities  in T2 & FLAIR images noted involving capsuloganglionic region, parietal lobe, adjacent  frontal insular cortex, temporal lobe in right side.   Focal FLAIR hyperintensities without restricted diffusion in bilateral posterior  periventricular wh ite matter, left corona radiata and centrum semiovale.   Curvilinear  blooming  SWI hypointensities  in distal  right  MCA.   Cerebellum, 4th ventricle, brain stem & CP angle regions are within normal limits.  Sella, suprasellar & parasellar areas are normal.   The ex tracerebral spaces and supratentorial ventricular system are normal.  Rest of the cerebra

### Processing multiple pdf file for chromadb format

In [5]:
# Process all PDF files in the folder
import os
folder_path = r"Data/"
all_data = []
for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(folder_path, filename)
        print(pdf_path)
        extracted_data = read_pdf(pdf_path)
        remove_sensitive_data = remove_sensitive(extracted_data)
        extracted_json = text_json(remove_sensitive_data)
        ## Parse the JSON strings into dictionaries and append to the list
        all_data.append(json.loads(extracted_json))
#print(all_data)

Data/MO.pdf
Data/RA.pdf
Data/VA.pdf


### Converting JSON to ChromaDB format

In [7]:
from langchain.schema import Document

documents = [
    Document(page_content=f"{item['Patient ID']} {item['Patient age']} {item['Sex']} {item['TECHNIQUE']} {item['FINDINGS']} {item['IMPRESSION']}", metadata={})
    for item in all_data
]


In [8]:
# Verify the result
for doc in documents:
    print(f"Page Content: {doc.page_content}, Metadata: {doc.metadata}")

Page Content: MR0000444444 075Y M T2 FSE Axials  / Sagittals  & Coronals.   T1 &T2  FLAIR  Axials.  DWI,  GRE  Axials.    3D TOF  MR Angiography  of Intracranial  Arteries. Large wedge shaped lesion with restricted diffusion low ADC values and hyperintensities  in T2 & FLAIR images noted involving capsuloganglionic region, parietal lobe, adjacent  frontal insular cortex, temporal lobe in right side.   Focal FLAIR hyperintensities without restricted diffusion in bilateral posterior  periventricular wh ite matter, left corona radiata and centrum semiovale.   Curvilinear  blooming  SWI hypointensities  in distal  right  MCA.   Cerebellum, 4th ventricle, brain stem & CP angle regions are within normal limits.  Sella, suprasellar & parasellar areas are normal.   The ex tracerebral spaces and supratentorial ventricular system are normal.  Rest of the cerebral parenchyma is normal is normal.   Midline structures and corpus callosum are normal.  No haemorrhagic pathology.   No extraaxial  coll

### Creating chunks from document from chromaDB (vector database)
### Using all-MiniLM-L6-v2 embedings
### Saving the data into chromaDB

In [69]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# -------------------------------
# Split Data into Chunks (Optional for Large Content)
# -------------------------------
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_documents = text_splitter.split_documents(documents)

# -------------------------------
# Use Hugging Face Embeddings (all-MiniLM-L6-v2)
# -------------------------------
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# -------------------------------
# Store Embeddings in ChromaDB
# -------------------------------
chroma_db4 = Chroma.from_documents(split_documents, embeddings, persist_directory="./chroma_sample_db")

# Save embeddings to the ChromaDB directory
chroma_db4.persist()
print("✅ Data has been successfully inserted into ChromaDB!")

✅ Data has been successfully inserted into ChromaDB!


In [70]:
print(embeddings)

client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
) model_name='sentence-transformers/all-MiniLM-L6-v2' cache_folder=None model_kwargs={} encode_kwargs={} multi_process=False show_progress=False


### Test Queries

In [71]:
def retrieve_queries(queries, k):
    # Create a retriever instance from the Chroma database to retrieve relevant documents
    retriever = chroma_db4.as_retriever()
    # Loop through each query in the test_queries list
    for query in queries:
        # Retrieve the top k most relevant documents for the current query
        results = retriever.get_relevant_documents(query, k=k)
        
        # Print the current query being tested
        print(f"🔍 Query: {query}")
        
        # Loop through the top k results and display their content
        for i, doc in enumerate(results[:k]):  # Limit to top k results
            print(f"💡 Result {i+1}: {doc.page_content}")
        
        # Print a separator line for better readability between results
        print("-" * 50)

In [72]:
# Define a list of test queries to be processed
queries = ["Patient is not cooperative"]
k = 1
retrieve_queries(queries, k)

🔍 Query: Patient is not cooperative
💡 Result 1: are normal.  Rest of the cerebral parenchyma is normal.   Midline  structures  and corpus  callosum  are normal.   No haemorrhagic pathology. No extraaxial collection seen.  MR Angiography shows : (Patient is not cooperative)   Distal right MCA branches are not well visualized.   Small  stenotic  segment  in distal  left MCA.   Mild  stenosis  seen in cavernous  and infrapetrous  parts  of right  ICA.         Fetal  type of A1 segment  of right  ACA  -- Normal  varient.   Hypoplastic  right
--------------------------------------------------


In [73]:
# Define a list of test queries to be processed
queries = ["Patient is not cooperative","including right side of splenium of corpus callosum"]
k = 2
retrieve_queries(queries, k)

🔍 Query: Patient is not cooperative
💡 Result 1: are normal.  Rest of the cerebral parenchyma is normal.   Midline  structures  and corpus  callosum  are normal.   No haemorrhagic pathology. No extraaxial collection seen.  MR Angiography shows : (Patient is not cooperative)   Distal right MCA branches are not well visualized.   Small  stenotic  segment  in distal  left MCA.   Mild  stenosis  seen in cavernous  and infrapetrous  parts  of right  ICA.         Fetal  type of A1 segment  of right  ACA  -- Normal  varient.   Hypoplastic  right
💡 Result 2: are normal.  Rest of the cerebral parenchyma is normal.   Midline  structures  and corpus  callosum  are normal.   No haemorrhagic pathology. No extraaxial collection seen.  MR Angiography shows : (Patient is not cooperative)   Distal right MCA branches are not well visualized.   Small  stenotic  segment  in distal  left MCA.   Mild  stenosis  seen in cavernous  and infrapetrous  parts  of right  ICA.         Fetal  type of A1 segment  of r