# Overview
This notebook demonstrates how to evaluate RAG systems on documents containing images, charts, tables, and other visual elements. Most enterprise documents contain critical information in visual formats that pure text-based RAG systems miss entirely.

# Background
Traditional RAG evaluation focuses on clean text documents, but real-world enterprise documents are complex:
- Financial reports with embedded charts and tables
- Technical manuals with diagrams and flowcharts  
- Research papers with data visualizations
- Forms and structured documents

This notebook bridges the gap between research and reality by evaluating how well RAG systems handle documents with visual content.

**What Metrics Should You Care About?**
- **Visual Content Recall**: How many relevant charts/tables are retrieved?
- **OCR Quality Impact**: How do extraction errors affect retrieval?
- **Completeness Improvement**: Are answers more complete with visual content?
- **Cost vs Quality**: ROI analysis of different OCR approaches

# What Will We Do? 
* Process PDF documents with embedded images/tables using AWS Textract
* Create evaluation dataset with visual-heavy documents
* Compare retrieval performance: text-only vs text+visual
* Measure OCR quality impact on retrieval accuracy
* Analyze cost-benefit of multi-modal RAG

**Let's get started!**

In [2]:
import chromadb
import boto3
import pandas as pd
import numpy as np
import json
import re
import io
import base64
from typing import List, Dict, Any, Optional, Tuple
from pydantic import BaseModel
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
import tempfile
import os
from pathlib import Path

# Initialize clients
session = boto3.Session(profile_name='default')
bedrock_client = boto3.client('bedrock-runtime')
textract_client = boto3.client('textract')
s3_client = boto3.client('s3')

# Initialize Chroma client
chroma_client = chromadb.PersistentClient(path="../data/chroma")

print("All clients initialized successfully")

All clients initialized successfully


# Document Processing Pipeline

We'll create a comprehensive pipeline that can handle various document types and extract both text and visual content using AWS Textract.

In [5]:
class VisualContent(BaseModel):
    content_type: str  # "table", "key_value", "text", "form"
    content: str       # The extracted/formatted content
    confidence: float  # OCR confidence score
    bounding_box: Dict # Location in document
    metadata: Dict = {}

class EnrichedChunk(BaseModel):
    id_: str
    text_content: str
    visual_content: List[VisualContent] = []
    document_type: str  # "financial_report", "technical_manual", etc.
    has_visual_elements: bool = False
    metadata: Dict[str, Any] = {}

class MultiModalDocumentProcessor:
    def __init__(self, textract_client, s3_client, bucket_name: str = None):
        self.textract_client = textract_client
        self.s3_client = s3_client
        self.bucket_name = bucket_name
    
    def upload_to_s3(self, file_path: str, s3_key: str) -> str:
        """Upload file to S3 for Textract processing"""
        if not self.bucket_name:
            raise ValueError("S3 bucket name required for Textract processing")
        
        self.s3_client.upload_file(file_path, self.bucket_name, s3_key)
        return s3_key
    
    def extract_with_textract(self, s3_key: str) -> Dict:
        """Extract text, tables, and forms using AWS Textract"""
        try:
            response = self.textract_client.analyze_document(
                Document={
                    'S3Object': {
                        'Bucket': self.bucket_name,
                        'Name': s3_key
                    }
                },
                FeatureTypes=['TABLES', 'FORMS']
            )
            return response
        except Exception as e:
            print(f"Textract extraction failed: {e}")
            return None
    
    def parse_textract_response(self, response: Dict) -> Tuple[str, List[VisualContent]]:
        """Parse Textract response into text and visual content"""
        if not response:
            return "", []
        
        blocks = response.get('Blocks', [])
        
        # Extract text blocks
        text_blocks = []
        visual_content = []
        
        for block in blocks:
            if block['BlockType'] == 'LINE':
                text_blocks.append(block.get('Text', ''))
            
            elif block['BlockType'] == 'TABLE':
                table_content = self.extract_table_content(block, blocks)
                visual_content.append(VisualContent(
                    content_type="table",
                    content=table_content,
                    confidence=block.get('Confidence', 0),
                    bounding_box=block.get('Geometry', {}),
                    metadata={'block_id': block.get('Id', '')}
                ))
            
            elif block['BlockType'] == 'KEY_VALUE_SET':
                if block.get('EntityTypes') and 'KEY' in block['EntityTypes']:
                    kv_content = self.extract_key_value_content(block, blocks)
                    visual_content.append(VisualContent(
                        content_type="key_value",
                        content=kv_content,
                        confidence=block.get('Confidence', 0),
                        bounding_box=block.get('Geometry', {}),
                        metadata={'block_id': block.get('Id', '')}
                    ))
        
        full_text = '\n'.join(text_blocks)
        return full_text, visual_content
    
    def extract_table_content(self, table_block: Dict, all_blocks: List[Dict]) -> str:
        """Convert table block to readable text format"""
        table_text = f"[TABLE: {table_block.get('Id', 'unknown')}]\n"
        
        # Get table relationships
        if 'Relationships' in table_block:
            for relationship in table_block['Relationships']:
                if relationship['Type'] == 'CHILD':
                    for child_id in relationship['Ids']:
                        child_block = next((b for b in all_blocks if b['Id'] == child_id), None)
                        if child_block and child_block['BlockType'] == 'CELL':
                            cell_text = self.get_cell_text(child_block, all_blocks)
                            if cell_text:
                                table_text += f"{cell_text} | "
            table_text += "\n"
        
        return table_text
    
    def get_cell_text(self, cell_block: Dict, all_blocks: List[Dict]) -> str:
        """Extract text from table cell"""
        cell_text = ""
        if 'Relationships' in cell_block:
            for relationship in cell_block['Relationships']:
                if relationship['Type'] == 'CHILD':
                    for child_id in relationship['Ids']:
                        child_block = next((b for b in all_blocks if b['Id'] == child_id), None)
                        if child_block and 'Text' in child_block:
                            cell_text += child_block['Text'] + " "
        return cell_text.strip()
    
    def extract_key_value_content(self, kv_block: Dict, all_blocks: List[Dict]) -> str:
        """Extract key-value pair content"""
        return f"[KEY_VALUE: {kv_block.get('Id', 'unknown')}]"
    
    def process_document(self, file_path: str, document_type: str = "unknown") -> EnrichedChunk:
        """Process a single document and return enriched chunk"""
        file_name = os.path.basename(file_path)
        s3_key = f"textract-input/{file_name}"
        
        try:
            # Upload to S3
            if self.bucket_name:
                self.upload_to_s3(file_path, s3_key)
                
                # Extract with Textract
                textract_response = self.extract_with_textract(s3_key)
                text_content, visual_content = self.parse_textract_response(textract_response)
            else:
                # Fallback: basic text extraction (for demo purposes)
                text_content = f"Demo text content from {file_name}"
                visual_content = []
            
            return EnrichedChunk(
                id_=f"{file_name}_{hash(text_content)}",
                text_content=text_content,
                visual_content=visual_content,
                document_type=document_type,
                has_visual_elements=len(visual_content) > 0,
                metadata={
                    'source_file': file_name,
                    's3_key': s3_key,
                    'visual_element_count': len(visual_content)
                }
            )
        
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            return EnrichedChunk(
                id_=f"{file_name}_error",
                text_content=f"Error processing {file_name}",
                visual_content=[],
                document_type="error",
                has_visual_elements=False,
                metadata={'source_file': file_name, 'error': str(e)}
            )

# Initialize the processor
# Note: Set your S3 bucket name here for full Textract functionality
#S3_BUCKET_NAME = None  # Replace with your bucket name: "your-textract-bucket"
S3_BUCKET_NAME = genaisystemevaluationmediatestbucket

doc_processor = MultiModalDocumentProcessor(
    textract_client=textract_client,
    s3_client=s3_client,
    bucket_name=S3_BUCKET_NAME
)

print("✅ Document processing pipeline initialized")
if S3_BUCKET_NAME:
    print(f"📦 Using S3 bucket: {S3_BUCKET_NAME}")
else:
    print("⚠️  No S3 bucket configured - using demo mode")

NameError: name 'genaisystemevaluationmediatestbucket' is not defined

In [6]:
# Test processing your uploaded files
def test_existing_files():
    """Test processing the files already in your S3 bucket"""
    
    test_files = [
        {"key": "BusinessLicense.png", "type": "business_license"},
        {"key": "DL.png", "type": "drivers_license"}, 
        {"key": "PayStub.png", "type": "pay_stub"}
    ]
    
    results = []
    
    for file_info in test_files:
        try:
            print(f"\n🔍 Processing {file_info['key']}...")
            
            # Create a mock file path since file is already in S3
            mock_file_path = f"/tmp/{file_info['key']}"
            
            # Process using existing S3 key
            response = textract_client.analyze_document(
                Document={
                    'S3Object': {
                        'Bucket': S3_BUCKET_NAME,
                        'Name': file_info['key']
                    }
                },
                FeatureTypes=['TABLES', 'FORMS']
            )
            
            # Parse the response
            text_content, visual_content = doc_processor.parse_textract_response(response)
            
            result = {
                'file': file_info['key'],
                'type': file_info['type'],
                'text_length': len(text_content),
                'visual_elements': len(visual_content),
                'text_preview': text_content[:200] + "..." if len(text_content) > 200 else text_content,
                'visual_types': [vc.content_type for vc in visual_content]
            }
            
            results.append(result)
            print(f"✅ Extracted {len(text_content)} chars of text, {len(visual_content)} visual elements")
            
        except Exception as e:
            print(f"❌ Error processing {file_info['key']}: {e}")
            results.append({
                'file': file_info['key'],
                'type': file_info['type'],
                'error': str(e)
            })
    
    return results

# Test your files
print("🧪 Testing Textract on your uploaded files...")
test_results = test_existing_files()

# Display results
for result in test_results:
    print(f"\n📄 File: {result['file']}")
    if 'error' in result:
        print(f"   ❌ Error: {result['error']}")
    else:
        print(f"   📝 Text extracted: {result['text_length']} characters")
        print(f"   🖼️  Visual elements: {result['visual_elements']}")
        print(f"   📋 Visual types: {result['visual_types']}")
        print(f"   📖 Preview: {result['text_preview']}")

🧪 Testing Textract on your uploaded files...

🔍 Processing BusinessLicense.png...
❌ Error processing BusinessLicense.png: Parameter validation failed:
Invalid type for parameter Document.S3Object.Bucket, value: None, type: <class 'NoneType'>, valid types: <class 'str'>

🔍 Processing DL.png...
❌ Error processing DL.png: Parameter validation failed:
Invalid type for parameter Document.S3Object.Bucket, value: None, type: <class 'NoneType'>, valid types: <class 'str'>

🔍 Processing PayStub.png...
❌ Error processing PayStub.png: Parameter validation failed:
Invalid type for parameter Document.S3Object.Bucket, value: None, type: <class 'NoneType'>, valid types: <class 'str'>

📄 File: BusinessLicense.png
   ❌ Error: Parameter validation failed:
Invalid type for parameter Document.S3Object.Bucket, value: None, type: <class 'NoneType'>, valid types: <class 'str'>

📄 File: DL.png
   ❌ Error: Parameter validation failed:
Invalid type for parameter Document.S3Object.Bucket, value: None, type: <cla