## Unstructured Document Rertriveal Pipeline

In [None]:
# 1. SETUP - Install once
# pip install langchain langchain-community langchain-unstructured chromadb langchain-openai unstructured[local-inference]

from langchain_community.document_loaders import DirectoryLoader
from langchain_unstructured import UnstructuredLoader
from unstructured.cleaners.core import clean_extra_whitespace
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

# 2. LOAD PDFs → Elements (30-60 mins)
loader = DirectoryLoader(
    "../data/pdf",
    glob="**/*.pdf",
    loader_cls=UnstructuredLoader,
    loader_kwargs={
        "mode": "elements",
        "strategy": "fast",           # Quick text extraction
        "post_processors": [clean_extra_whitespace],
        "languages": ["eng"],
    },
    use_multithreading=True,
    show_progress=True
)

print("Loading 1074 PDFs...")
docs = loader.load()  # ~40k elements
print(f"Loaded: {len(docs)} elements")

Loading 1074 PDFs...


  0%|          | 0/1076 [00:00<?, ?it/s]INFO: pikepdf C++ to Python logger bridge initialized
  8%|▊         | 88/1076 [02:33<22:22,  1.36s/it]INFO: PDF text extraction failed, skip text extraction...
INFO: PDF text extraction failed, skip text extraction...
 32%|███▏      | 344/1076 [06:45<16:02,  1.31s/it]INFO: PDF text extraction failed, skip text extraction...
 50%|█████     | 538/1076 [10:44<1:54:05, 12.72s/it]

In [7]:
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
import json
import glob

# 1. Ollama embeddings
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# 2. Load and CLEAN metadata during loading (your 504K elements already loaded)
def clean_metadata_for_chroma(metadata):
    """Convert all lists/objects to strings for Chroma compatibility"""
    cleaned = {}
    for key, value in metadata.items():
        if key in ["coordinates", "element_id", "parent_id"]:
            continue  # Skip complex fields entirely
        elif isinstance(value, list):
            cleaned[key] = ",".join(str(v) for v in value)  # ["eng"] → "eng"
        elif isinstance(value, dict):
            continue  # Skip dicts entirely
        else:
            cleaned[key] = value
    return cleaned

# Re-process your loaded elements with clean metadata
print("Cleaning metadata for Chroma...")
cleaned_elements = []
for doc in all_elements:
    cleaned_metadata = clean_metadata_for_chroma(doc.metadata)
    cleaned_doc = Document(
        page_content=doc.page_content,
        metadata=cleaned_metadata
    )
    cleaned_elements.append(cleaned_doc)

print(f"Cleaned metadata complete: {len(cleaned_elements)} documents")

# 3. Split large elements
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(cleaned_elements)
print(f"Final chunks ready: {len(chunks)}")

# 4. BATCHED embedding (500K chunks needs batches)
print("Starting batched embedding...")
vectorstore = None
batch_size = 3000  # Conservative batch size

for i in range(0, len(chunks), batch_size):
    batch = chunks[i:i+batch_size]
    print(f"Batch {i//batch_size + 1}: {len(batch)} chunks ({i+len(batch)}/{len(chunks)})")
    
    if vectorstore is None:
        vectorstore = Chroma.from_documents(
            documents=batch,
            embedding=embeddings,
            persist_directory="./chroma_smart_elements",
            collection_name="humanitarian_data"
        )
    else:
        vectorstore.add_documents(batch)

print(f"Success! {vectorstore._collection.count()} chunks indexed in ./chroma_smart_elements")

Cleaning metadata for Chroma...
Cleaned metadata complete: 504065 documents
Final chunks ready: 502106
Starting batched embedding...
Batch 1: 3000 chunks (3000/502106)
Batch 2: 3000 chunks (6000/502106)
Batch 3: 3000 chunks (9000/502106)
Batch 4: 3000 chunks (12000/502106)
Batch 5: 3000 chunks (15000/502106)
Batch 6: 3000 chunks (18000/502106)
Batch 7: 3000 chunks (21000/502106)
Batch 8: 3000 chunks (24000/502106)
Batch 9: 3000 chunks (27000/502106)
Batch 10: 3000 chunks (30000/502106)
Batch 11: 3000 chunks (33000/502106)
Batch 12: 3000 chunks (36000/502106)
Batch 13: 3000 chunks (39000/502106)
Batch 14: 3000 chunks (42000/502106)
Batch 15: 3000 chunks (45000/502106)
Batch 16: 3000 chunks (48000/502106)
Batch 17: 3000 chunks (51000/502106)
Batch 18: 3000 chunks (54000/502106)
Batch 19: 3000 chunks (57000/502106)
Batch 20: 3000 chunks (60000/502106)
Batch 21: 3000 chunks (63000/502106)
Batch 22: 3000 chunks (66000/502106)
Batch 23: 3000 chunks (69000/502106)
Batch 24: 3000 chunks (72000

In [12]:
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

# Load your completed index
embeddings = OllamaEmbeddings(model="nomic-embed-text")
vectorstore = Chroma(
    persist_directory="./chroma_smart_elements",
    collection_name="humanitarian_data",
    embedding_function=embeddings
)

print(f"Loaded: {vectorstore._collection.count()} vectors")

# Test queries
test_queries = [
    "donor data sharing risks",
    "humanitarian data responsibility", 
    "key takeaways data sharing",
    "ACRONYMS",
    "What is Congressional Record?"
]

for query in test_queries:
    results = vectorstore.similarity_search(query, k=7)
    print(f"\nQuery: '{query}'")
    for i, doc in enumerate(results, 1):
        print(f"  {i}. [{doc.metadata['category']}] {doc.metadata['filename'][:25]} (pg {doc.metadata['page_number']})")
        print(f"     {doc.page_content[:150]}...")

Loaded: 502106 vectors

Query: 'donor data sharing risks'
  1. [UncategorizedText] 2544CYX3TC3T5QB2NTVXD3IUF (pg 154)
     ....
  2. [UncategorizedText] 2544CYX3TC3T5QB2NTVXD3IUF (pg 170)
     ....
  3. [UncategorizedText] 2544CYX3TC3T5QB2NTVXD3IUF (pg 179)
     ....
  4. [UncategorizedText] 2544CYX3TC3T5QB2NTVXD3IUF (pg 180)
     ....
  5. [UncategorizedText] 2544CYX3TC3T5QB2NTVXD3IUF (pg 189)
     ....
  6. [UncategorizedText] 2544CYX3TC3T5QB2NTVXD3IUF (pg 217)
     ....
  7. [UncategorizedText] 2544CYX3TC3T5QB2NTVXD3IUF (pg 237)
     ....

Query: 'humanitarian data responsibility'
  1. [Header] e86f389121e94e5a65f89285a (pg 5)
     Scenarios in humanitarian data management...
  2. [Header] e86f389121e94e5a65f89285a (pg 5)
     Scenarios in humanitarian data management...
  3. [NarrativeText] 642c5aed3f342a15e2ae287d5 (pg 4)
     Section 1: Introduction offers an overview of key concepts related to data responsibility in humanitarian action, explains the role of OCHA in humanit...
  