# Module 06 - Notebook 02: Document Processing Pipeline

## Learning Objectives
- Load documents from various formats (PDF, DOCX, TXT, HTML)
- Implement smart chunking strategies
- Extract and preserve metadata
- Clean and preprocess text
- Build a complete ingestion pipeline

---

## 1. Document Loading

RAG systems need to handle multiple document formats.

In [None]:
# chromadb and openai are imported by the ingestion pipeline in section 4.
# pypdf, docx2txt and unstructured are the parsers the LangChain PDF/DOCX/HTML
# loaders are expected to need at load time (verify against your LangChain version).
!pip install -q PyPDF2 python-docx beautifulsoup4 langchain pypdf docx2txt unstructured chromadb openai

In [None]:
from langchain.document_loaders import (
    TextLoader,
    PyPDFLoader,
    Docx2txtLoader,
    UnstructuredHTMLLoader
)
from langchain.schema import Document
from pathlib import Path

class DocumentLoader:
    """Dispatch a file to the LangChain loader that matches its extension."""

    @staticmethod
    def load_file(filepath: str) -> list[Document]:
        """Load ``filepath`` with the loader registered for its suffix.

        The suffix is compared case-insensitively. Supported suffixes are
        ``.txt``, ``.pdf``, ``.docx`` and ``.html``.

        Args:
            filepath: Path of the file to load.

        Returns:
            Whatever the selected loader's ``load()`` returns.

        Raises:
            ValueError: If no loader is registered for the file's suffix.
        """
        registry = {
            '.txt': TextLoader,
            '.pdf': PyPDFLoader,
            '.docx': Docx2txtLoader,
            '.html': UnstructuredHTMLLoader
        }
        suffix = Path(filepath).suffix.lower()

        # Guard clause: reject unknown extensions before building a loader.
        if suffix not in registry:
            raise ValueError(f"Unsupported file type: {suffix}")

        return registry[suffix](filepath).load()

# Example: write a small two-paragraph text file, then load it back.
sample_text = """This is a sample document for RAG.
It contains multiple paragraphs.

This is the second paragraph with more information.
"""

# Persist the sample so the loader has a real file on disk to read.
Path("/tmp/sample.txt").write_text(sample_text)

# Round-trip through the loader and show what came back.
docs = DocumentLoader.load_file("/tmp/sample.txt")
print(f"Loaded {len(docs)} documents")
first_doc = docs[0]
print(f"Content: {first_doc.page_content[:100]}...")
print(f"Metadata: {first_doc.metadata}")

## 2. Smart Chunking

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_documents(documents: list[Document], chunk_size: int = 500) -> list[Document]:
    """Split documents into overlapping chunks and tag each chunk.

    Args:
        documents: Documents to split.
        chunk_size: Target chunk length, measured with ``len``. The overlap
            between chunks is 10% of this value, truncated to an int.

    Returns:
        The chunks produced by the splitter. Each chunk's metadata gains
        ``chunk_id`` (its position in the returned list) and ``chunk_size``
        (the length of its text).
    """
    overlap = int(chunk_size * 0.1)  # 10% overlap

    # Separators are tried in order: paragraph, line, sentence, word, character.
    pieces = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", " ", ""],
        length_function=len,
        chunk_overlap=overlap,
        chunk_size=chunk_size,
    ).split_documents(documents)

    for position, piece in enumerate(pieces):
        piece.metadata.update(
            chunk_id=position,
            chunk_size=len(piece.page_content),
        )

    return pieces

# Try the chunker on a synthetic document made of one repeated sentence.
long_doc = Document(
    metadata={"source": "test.txt"},
    page_content="This is a long document. " * 100,
)

chunks = chunk_documents([long_doc], chunk_size=200)
print(f"Split into {len(chunks)} chunks")
print(f"\nFirst chunk:")
first_chunk = chunks[0]
print(f"  Content: {first_chunk.page_content[:100]}...")
print(f"  Metadata: {first_chunk.metadata}")

## 3. Metadata Extraction

In [None]:
import re
from datetime import datetime

class MetadataExtractor:
    """Extract lightweight metadata (emails, URLs, dates, size stats) from text."""

    # Email addresses. The TLD class is [A-Za-z] with no "|": inside a
    # character class "|" is a literal pipe, not alternation, so including it
    # would accept addresses such as "bob@example.c|m".
    _EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # http/https URLs made of letters, digits, common punctuation and
    # percent-escapes.
    _URL_RE = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )

    # ISO-style dates (YYYY-MM-DD); digits only, no calendar validation.
    _DATE_RE = re.compile(r'\b\d{4}-\d{2}-\d{2}\b')

    # Sentence punctuation that the URL pattern would otherwise swallow when a
    # URL ends a sentence ("see https://example.com.").
    _URL_TRAILING_PUNCTUATION = '.,;:!?'

    @staticmethod
    def extract_from_text(text: str) -> dict:
        """Scan ``text`` for emails, URLs and dates and compute basic stats.

        Args:
            text: The text to scan.

        Returns:
            A dict that always contains ``word_count`` (whitespace-separated
            tokens) and ``char_count`` (``len(text)``). The keys ``emails``,
            ``urls`` and ``dates`` are present only when at least one match
            was found; each maps to a list of matched strings in order of
            appearance.
        """
        metadata = {}

        emails = MetadataExtractor._EMAIL_RE.findall(text)
        if emails:
            metadata['emails'] = emails

        # Drop trailing sentence punctuation so the stored URL is usable as-is.
        urls = [
            url.rstrip(MetadataExtractor._URL_TRAILING_PUNCTUATION)
            for url in MetadataExtractor._URL_RE.findall(text)
        ]
        if urls:
            metadata['urls'] = urls

        dates = MetadataExtractor._DATE_RE.findall(text)
        if dates:
            metadata['dates'] = dates

        # Basic stats
        metadata['word_count'] = len(text.split())
        metadata['char_count'] = len(text)

        return metadata

    @staticmethod
    def enrich_document(doc: "Document") -> "Document":
        """Merge metadata extracted from ``doc.page_content`` into ``doc.metadata``.

        The document is modified in place; extracted keys overwrite existing
        keys of the same name.

        Args:
            doc: Document whose text is scanned and whose metadata is updated.

        Returns:
            The same ``doc`` object, for convenient chaining.
        """
        extracted = MetadataExtractor.extract_from_text(doc.page_content)
        doc.metadata.update(extracted)
        return doc

# Try the extractor on a short snippet containing an email, a URL and a date.
contact_text = """Contact us at support@company.com
    Visit https://example.com for more info.
    Last updated: 2024-01-15"""
test_doc = Document(metadata={"source": "contact.txt"}, page_content=contact_text)

enriched = MetadataExtractor.enrich_document(test_doc)
print("Enriched metadata:", enriched.metadata, sep="\n")

## 4. Complete Ingestion Pipeline

In [None]:
from typing import List
import chromadb
from openai import OpenAI

class IngestionPipeline:
    """Complete pipeline: Load → Enrich → Chunk → Embed → Store.

    Reads a file, enriches document metadata, splits the documents into
    chunks, embeds the chunks with OpenAI and stores them in a Chroma
    collection.
    """

    # Number of texts sent per embeddings request. The endpoint limits how
    # much a single request may carry, so large files are embedded in slices.
    # NOTE(review): 256 is a conservative choice; confirm against the current
    # OpenAI limits.
    _EMBED_BATCH_SIZE = 256

    def __init__(self, collection_name: str):
        """Create the OpenAI and Chroma clients and open the target collection.

        Args:
            collection_name: Name of the Chroma collection to store chunks in.
                An existing collection of that name is reused, so re-running
                the notebook cell does not fail.
        """
        # Local import keeps this class self-contained; the surrounding cell
        # does not import os.
        import os

        self.openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.chroma_client = chromadb.Client()
        self.collection = self.chroma_client.get_or_create_collection(collection_name)

    @staticmethod
    def _sanitize_metadata(metadata: dict) -> dict:
        """Return a copy of ``metadata`` holding only scalar values.

        Lists and tuples (e.g. extracted emails/URLs/dates) are joined into a
        comma-separated string, ``None`` values are dropped, and any other
        non-scalar value is stringified, because the vector store expects
        str/int/float/bool metadata values.
        """
        clean = {}
        for key, value in metadata.items():
            if value is None:
                continue
            if isinstance(value, (str, int, float, bool)):
                clean[key] = value
            elif isinstance(value, (list, tuple)):
                clean[key] = ", ".join(str(item) for item in value)
            else:
                clean[key] = str(value)
        return clean

    def _embed(self, texts: list) -> list:
        """Embed ``texts`` in batches and return one vector per text, in order."""
        vectors = []
        for start in range(0, len(texts), self._EMBED_BATCH_SIZE):
            response = self.openai_client.embeddings.create(
                model="text-embedding-3-small",
                input=texts[start:start + self._EMBED_BATCH_SIZE],
            )
            vectors.extend(item.embedding for item in response.data)
        return vectors

    def process_file(self, filepath: str, chunk_size: int = 500):
        """Run the complete pipeline for a single file.

        Args:
            filepath: Path of the file to ingest.
            chunk_size: Target chunk length passed to ``chunk_documents``.

        Returns:
            None. Progress is reported with ``print``.
        """
        print(f"Processing: {filepath}")

        # Step 1: Load
        docs = DocumentLoader.load_file(filepath)
        print(f"  Loaded: {len(docs)} documents")

        # Step 2: Enrich metadata
        docs = [MetadataExtractor.enrich_document(doc) for doc in docs]
        print("  Enriched metadata")

        # Step 3: Chunk
        chunks = chunk_documents(docs, chunk_size)
        print(f"  Created: {len(chunks)} chunks")

        # An empty file yields no chunks; skip the embed/store calls, which
        # would otherwise be invoked with empty input.
        if not chunks:
            print("  No chunks produced; nothing to embed or store\n")
            return

        # Step 4: Embed
        texts = [chunk.page_content for chunk in chunks]
        embeddings = self._embed(texts)
        print(f"  Generated: {len(embeddings)} embeddings")

        # Step 5: Store
        # IDs include the file path so chunks from different files do not
        # collide; upsert makes re-ingesting the same file overwrite its
        # previous chunks instead of clashing on existing IDs.
        self.collection.upsert(
            documents=texts,
            embeddings=embeddings,
            metadatas=[self._sanitize_metadata(chunk.metadata) for chunk in chunks],
            ids=[f"{filepath}:chunk_{i}" for i in range(len(chunks))],
        )
        print("  Stored in vector database")
        print(f"✓ Pipeline complete: {self.collection.count()} total chunks\n")

# Run the end-to-end pipeline on the sample file, using small chunks.
pipeline = IngestionPipeline(collection_name="rag_demo")
pipeline.process_file(filepath="/tmp/sample.txt", chunk_size=100)

## Summary

You learned:
- ✅ Loading multiple document formats
- ✅ Smart chunking strategies
- ✅ Metadata extraction and enrichment
- ✅ Building complete ingestion pipelines

## Best Practices
1. **Auto-detect** file formats
2. **Preserve metadata** through processing
3. **Use recursive chunking** for best results
4. **Extract structured info** (dates, emails, URLs)
5. **Batch embed** for efficiency

## Next Steps
- 📘 Notebook 03: Retrieval Strategies