In [None]:
import re
import os
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.schema import Document
import uuid

In [None]:
# Load every file matching the glob below as a plain-text document
loader = DirectoryLoader(
    "path_to_directory",
    glob="./*.txt",
    loader_cls=TextLoader,
    show_progress=True,
)
documents = loader.load()

In [None]:
# Embedding model used by the cells below
EMBEDDING_MODEL_NAME = "mxbai-embed-large"
oembed = OllamaEmbeddings(model=EMBEDDING_MODEL_NAME)

In [None]:
# Where the vector store is persisted; the directory is created up front if missing
persist_directory = "text_reports/all_embed_db"
os.makedirs(name=persist_directory, exist_ok=True)

#### Step 1: Section-Based Preprocessing

In [None]:
def identify_all_sections(reports):
    """Collect the distinct section headers that appear across a set of reports.

    A header is a run of uppercase letters (words may be separated by spaces
    or tabs) that begins a line, optionally after leading spaces/tabs, and is
    followed by a colon, e.g. ``FINDINGS:`` or ``CLINICAL HISTORY:``.

    Args:
        reports: Iterable of report texts (strings).

    Returns:
        Alphabetically sorted list of unique header names with surrounding
        whitespace removed. Empty list when no header is found.
    """
    # Only horizontal whitespace is allowed inside a header. Using `\s` here
    # would also match newlines, letting a header swallow blank lines or a
    # preceding all-caps line. Requiring a leading letter rules out
    # whitespace-only "headers" that would strip down to an empty name.
    section_pattern = re.compile(r'^[ \t]*([A-Z][A-Z \t]*):', re.MULTILINE)

    # Strip *before* inserting into the set so variants that differ only in
    # surrounding whitespace collapse into a single entry.
    unique_sections = {
        header.strip()
        for report in reports
        for header in section_pattern.findall(report)
    }
    return sorted(unique_sections)
    
# Pull the raw text out of each loaded document
document_texts = [loaded_doc.page_content for loaded_doc in documents]

# Survey which section headers occur anywhere in the corpus
all_sections = identify_all_sections(reports=document_texts)
print("All identified sections:", all_sections)

In [None]:
def parse_report_sections(report_text):
    """Split a report into its canonical sections.

    Each line is stripped and checked for a leading ``HEADER:`` (matched
    case-insensitively). Alias headers are folded into their canonical
    section. Non-empty lines that follow a header are appended to that
    section, joined with single spaces. Lines that appear before the first
    recognised header are ignored.

    When the same section is introduced more than once (a repeated header, or
    two aliases of the same section), the pieces are concatenated in order of
    appearance instead of the later one replacing the earlier one.

    Args:
        report_text: Full text of one report. ``None`` or an empty string
            yields all-empty sections.

    Returns:
        dict mapping each canonical section name (EXAMINATION, INDICATION,
        TECHNIQUE, COMPARISON, FINDINGS, IMPRESSION, HISTORY) to its text;
        sections that do not occur map to ``""``.
    """
    canonical_sections = (
        "EXAMINATION", "INDICATION", "TECHNIQUE",
        "COMPARISON", "FINDINGS", "IMPRESSION",
        "HISTORY",
    )

    section_aliases = {
        "REASON FOR EXAMINATION": "INDICATION",
        "CLINICAL HISTORY": "HISTORY",
        "CLINICAL INDICATION": "INDICATION",
        "ACS TECHNIQUE": "TECHNIQUE",
        "AMS COMPARISON": "COMPARISON",
        "CHF IMPRESSION": "IMPRESSION",
        "FOLLOWUP IMPRESSION": "IMPRESSION"
    }

    # Derive the header regex from the tables above so the two cannot drift
    # apart. Longest names first keeps the alternation unambiguous.
    header_names = sorted(
        [*canonical_sections, *section_aliases], key=len, reverse=True
    )
    header_pattern = re.compile(
        "(" + "|".join(re.escape(name) for name in header_names) + r"):\s*(.*)",
        re.IGNORECASE,
    )

    # Collect text fragments per section; joined once at the end.
    collected = {name: [] for name in canonical_sections}
    current_section = None

    for raw_line in (report_text or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue

        header_match = header_pattern.match(line)
        if header_match:
            header = header_match.group(1).upper()
            current_section = section_aliases.get(header, header)
            content = header_match.group(2).strip()
            if content:
                # Append rather than assign: an earlier occurrence of this
                # section must not be discarded.
                collected[current_section].append(content)
        elif current_section:
            collected[current_section].append(line)

    return {name: " ".join(parts) for name, parts in collected.items()}

#### Step 2: Embed Each Section Separately

In [None]:
def create_embeddings_for_sections(sections, embed_model):
    """Embed the text of every non-blank section.

    Args:
        sections: Mapping of section name to section text.
        embed_model: Object exposing ``embed_query(text)``; it is called once
            per section whose text is not empty or whitespace-only.

    Returns:
        dict mapping section name to whatever ``embed_query`` returned for
        that section's text. Blank sections are left out.
    """
    return {
        name: embed_model.embed_query(body)
        for name, body in sections.items()
        if body.strip()
    }

#### Step 3: Document Loading and Storage

In [None]:
def process_and_store_in_batches(documents, batch_size, embed_model, persist_directory):
    """Split each report into sections and add them to a persisted Chroma store.

    Every report is parsed with ``parse_report_sections``; each non-blank
    section becomes its own ``Document`` whose metadata carries the section
    name and a ``case_id`` shared by all sections of the same report. The
    store is persisted after every batch, so the per-batch log line is
    accurate and completed batches are flushed before the next one starts.

    Args:
        documents: Sequence of loaded documents exposing ``page_content``.
        batch_size: Number of reports per batch; must be a positive integer.
        embed_model: Embedding function handed to ``Chroma``.
        persist_directory: Directory handed to ``Chroma`` for persistence.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # Fail early with a clear message; 0 would otherwise surface as a
    # ZeroDivisionError and a negative value would silently store nothing.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    num_batches = (len(documents) + batch_size - 1) // batch_size  # ceiling division
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embed_model)

    batch_starts = range(0, len(documents), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        print(f"Processing batch {batch_number}/{num_batches}")

        for doc in documents[start:start + batch_size]:
            # NOTE(review): the ID is random, so re-running against the same
            # persist_directory presumably stores the reports again under new
            # IDs -- confirm whether that is intended.
            case_id = str(uuid.uuid4())
            sections = parse_report_sections(doc.page_content)

            section_docs = [
                Document(
                    page_content=content,
                    metadata={"section": section, "case_id": case_id},
                )
                for section, content in sections.items()
                if content.strip()  # Only store non-empty sections
            ]
            # One store call per report instead of one per section.
            if section_docs:
                vectordb.add_documents(section_docs)

        vectordb.persist()
        print(f"Batch {batch_number}/{num_batches} processed and persisted successfully.")

    if num_batches == 0:
        # Keep the original behaviour of calling persist() even when there
        # are no documents to add.
        vectordb.persist()

In [None]:
# Number of reports handled per batch
batch_size = 10_000
process_and_store_in_batches(
    documents=documents,
    batch_size=batch_size,
    embed_model=oembed,
    persist_directory=persist_directory,
)