## 1. Install Dependencies

In [1]:
# === PACKAGE INSTALLATION ===
print("üì¶ Installing dependencies...")

# Core packages for document processing
%pip install requests python-docx PyMuPDF Pillow --quiet

# HTTP client for API calls
%pip install httpx --quiet

# OpenTelemetry for tracing (optional)
%pip install opentelemetry-sdk --quiet

print("‚úÖ Packages installed!")

üì¶ Installing dependencies...
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
‚úÖ Packages installed!


## 2. Setup Fabric OpenAI Configuration

This uses Fabric's built-in authentication - **no API keys needed!**

In [2]:
import os
import json
import requests
from datetime import datetime
from pathlib import Path

# Fabric's built-in authentication
from synapse.ml.fabric.service_discovery import get_fabric_env_config
from synapse.ml.fabric.token_utils import TokenUtils

# ===================== CONFIGURATION =====================
# Only these parameters need to be configured:

# GPT Model Configuration (Fabric built-in)
DEPLOYMENT_NAME = "gpt-5"  # Update to match your Fabric deployment
API_VERSION = "2024-08-01-preview"
# Note: GPT-5 doesn't support max_tokens or temperature parameters

# SharePoint Repository Configuration
SHAREPOINT_DOCUMENT_ROOT = "https://mngenvmcap470378.sharepoint.com/sites/Tobedeleted/Shared%20Documents/"
# This is the direct document root URL where files are stored

# ===================== FABRIC OPENAI SETUP =====================
print("üîß Setting up Fabric OpenAI configuration...")

def get_fabric_openai_config():
    """
    Get OpenAI configuration from Fabric environment.
    Returns the service URL and authentication headers.
    
    No API keys needed - Fabric handles authentication automatically!
    """
    fabric_env_config = get_fabric_env_config().fabric_env_config
    auth_header = TokenUtils().get_openai_auth_header()
    
    openai_base_host = fabric_env_config.ml_workload_endpoint + "cognitive/openai/openai/"
    service_url = f"{openai_base_host}deployments/{DEPLOYMENT_NAME}/chat/completions?api-version={API_VERSION}"
    
    headers = {
        "Authorization": auth_header,
        "Content-Type": "application/json"
    }
    
    return service_url, headers

# Test the configuration
try:
    service_url, auth_headers = get_fabric_openai_config()
    print("‚úÖ Fabric OpenAI configuration successful!")
    print(f"ü§ñ Model: {DEPLOYMENT_NAME}")
    print(f"üîó Endpoint configured")
    print(f"üîê Authentication: Fabric managed (no API key needed)")
except Exception as e:
    print(f"‚ùå Error setting up Fabric OpenAI: {e}")
    print("üí° Make sure you're running this in a Fabric notebook")

üîß Setting up Fabric OpenAI configuration...
‚úÖ Fabric OpenAI configuration successful!
ü§ñ Model: gpt-5
üîó Endpoint configured
üîê Authentication: Fabric managed (no API key needed)


## 3. Ready to Analyze!

All configuration complete. The multimodal approach uses GPT-5's vision capabilities directly - no external services needed!

In [3]:
# === MULTIMODAL PROCESSING READY ===
# No external services needed!
# GPT-5 will analyze documents directly using its vision capabilities

print("‚úÖ Configuration complete!")
print("üí° Using GPT-5's multimodal capabilities:")
print("   - PDF documents: Converted to images for vision analysis")
print("   - DOCX documents: Text extraction or image conversion")
print("   - Images: Direct analysis with GPT-5 Vision")
print("   - No external APIs required!")

‚úÖ Configuration complete!
üí° Using GPT-5's multimodal capabilities:
   - PDF documents: Converted to images for vision analysis
   - DOCX documents: Text extraction or image conversion
   - Images: Direct analysis with GPT-5 Vision
   - No external APIs required!


## 4. Batch Processing Setup

Configure state management and document discovery for automated batch processing.

### 4.1 Processing State Manager

Track processed files to avoid reprocessing.

In [None]:
import hashlib
from typing import Set, Dict

class ProcessingStateManager:
    """
    Manages state of processed documents to avoid reprocessing.
    Tracks file hash, processing date, and status.
    """
    def __init__(self, state_file_path: str = "/lakehouse/default/Files/Document_Summaries/DocProcessingStateManager/.processing_state.json"):
        self.state_file_path = state_file_path
        self.state = self._load_state()
    
    def _load_state(self) -> Dict:
        """Load processing state from JSON file."""
        if os.path.exists(self.state_file_path):
            try:
                with open(self.state_file_path, 'r', encoding='utf-8') as f:
                    return json.load(f)
            except Exception as e:
                print(f"‚ö†Ô∏è  Could not load state file: {e}")
                return {"processed_files": {}, "metadata": {"version": "1.0"}}
        else:
            return {"processed_files": {}, "metadata": {"version": "1.0"}}
    
    def _save_state(self):
        """Save processing state to JSON file."""
        try:
            # Ensure directory exists
            os.makedirs(os.path.dirname(self.state_file_path), exist_ok=True)
            
            with open(self.state_file_path, 'w', encoding='utf-8') as f:
                json.dump(self.state, f, indent=2, ensure_ascii=False)
        except Exception as e:
            print(f"‚ö†Ô∏è  Could not save state file: {e}")
    
    def _calculate_file_hash(self, file_path: str) -> str:
        """Calculate SHA256 hash of file content."""
        sha256_hash = hashlib.sha256()
        with open(file_path, "rb") as f:
            # Read file in chunks to handle large files
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()
    
    def is_processed(self, file_path: str) -> bool:
        """
        Check if file has been processed.
        Returns True if file exists in state with same hash.
        """
        if not os.path.exists(file_path):
            return False
        
        file_hash = self._calculate_file_hash(file_path)
        file_name = os.path.basename(file_path)
        
        if file_name in self.state["processed_files"]:
            stored_hash = self.state["processed_files"][file_name].get("file_hash")
            return stored_hash == file_hash
        
        return False
    
    def mark_processed(self, file_path: str, summary_path: str = None, json_path: str = None, status: str = "success"):
        """
        Mark file as processed with metadata.
        
        Args:
            file_path: Path to processed document
            summary_path: Path to generated summary
            json_path: Path to generated JSON
            status: Processing status (success/failed)
        """
        file_hash = self._calculate_file_hash(file_path)
        file_name = os.path.basename(file_path)
        
        self.state["processed_files"][file_name] = {
            "file_hash": file_hash,
            "file_path": file_path,
            "processed_date": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            "summary_output": summary_path,
            "json_output": json_path,
            "status": status,
            "file_size_kb": os.path.getsize(file_path) / 1024
        }
        
        self._save_state()
    
    def mark_failed(self, file_path: str, error_message: str):
        """Mark file as failed with error message."""
        file_name = os.path.basename(file_path)
        
        self.state["processed_files"][file_name] = {
            "file_hash": self._calculate_file_hash(file_path) if os.path.exists(file_path) else None,
            "file_path": file_path,
            "processed_date": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            "status": "failed",
            "error": error_message
        }
        
        self._save_state()
    
    def get_processed_count(self) -> int:
        """Get count of successfully processed files."""
        return sum(1 for f in self.state["processed_files"].values() if f.get("status") == "success")
    
    def get_failed_count(self) -> int:
        """Get count of failed files."""
        return sum(1 for f in self.state["processed_files"].values() if f.get("status") == "failed")
    
    def reset(self):
        """Reset all processing state (use with caution!)."""
        self.state = {"processed_files": {}, "metadata": {"version": "1.0"}}
        self._save_state()
        print("‚úÖ Processing state reset")


# Initialize state manager
state_manager = ProcessingStateManager()

print("‚úÖ Processing State Manager initialized")
print(f"üìä Previously processed: {state_manager.get_processed_count()} files")
print(f"‚ùå Failed: {state_manager.get_failed_count()} files")
print(f"üíæ State file: {state_manager.state_file_path}")

### 4.2 Document Discovery

Find all documents in the lakehouse SharePointDocuments directory.

In [None]:
import glob
from typing import List

def discover_documents(search_pattern: str = "Files/SharePointDocuments/**/*", 
                       supported_extensions: List[str] = ['.pdf', '.docx', '.txt', '.md']) -> List[str]:
    """
    Discover all documents in the lakehouse SharePointDocuments directory.
    
    Args:
        search_pattern: Glob pattern to search (relative to lakehouse root)
        supported_extensions: List of file extensions to include
    
    Returns:
        List of absolute paths to discovered documents
    """
    lakehouse_path = "/lakehouse/default"
    search_path = os.path.join(lakehouse_path, search_pattern)
    
    print(f"üîç Searching for documents...")
    print(f"   Pattern: {search_path}")
    print(f"   Extensions: {', '.join(supported_extensions)}")
    
    # Find all files matching pattern
    all_files = glob.glob(search_path, recursive=True)
    
    # Filter by extension and exclude hidden/system files
    documents = [
        f for f in all_files 
        if os.path.isfile(f) 
        and any(f.lower().endswith(ext) for ext in supported_extensions)
        and not os.path.basename(f).startswith('.')
        and not os.path.basename(f).startswith('~')
    ]
    
    print(f"‚úÖ Found {len(documents)} document(s)")
    
    return documents


def get_unprocessed_documents(search_pattern: str = "Files/SharePointDocuments/**/*") -> List[str]:
    """
    Get list of documents that haven't been processed yet.
    
    Returns:
        List of absolute paths to unprocessed documents
    """
    all_documents = discover_documents(search_pattern)
    
    unprocessed = []
    for doc_path in all_documents:
        if not state_manager.is_processed(doc_path):
            unprocessed.append(doc_path)
    
    print(f"\nüìã Processing Summary:")
    print(f"   Total documents: {len(all_documents)}")
    print(f"   Already processed: {len(all_documents) - len(unprocessed)}")
    print(f"   New/modified: {len(unprocessed)}")
    
    return unprocessed


# Discover documents
print("="*80)
print("üìÅ DOCUMENT DISCOVERY")
print("="*80)

# Option 1: Discover all documents in Files directory
all_docs = discover_documents()

# Option 2: Get only unprocessed documents
unprocessed_docs = get_unprocessed_documents()

if unprocessed_docs:
    print("\nüÜï Unprocessed documents:")
    for doc in unprocessed_docs:
        file_size = os.path.getsize(doc) / 1024
        print(f"   - {os.path.basename(doc)} ({file_size:.1f} KB)")
else:
    print("\n‚úÖ All documents have been processed!")

print("="*80)

## 5. Helper Functions

Document processing functions and agents.

In [None]:
import httpx
import base64
from typing import Dict, Any, Tuple, Optional

class FabricAgent:
    """
    Custom agent that works with Fabric's OpenAI endpoint.
    Compatible with Microsoft Agent Framework interface.
    Supports multimodal inputs (text + images).
    """
    def __init__(self, deployment_name: str, api_version: str, instructions: str, name: str = "FabricAgent"):
        self.deployment_name = deployment_name
        self.api_version = api_version
        self.instructions = instructions
        self.name = name
        self._service_url = None
        self._headers = None
    
    def _ensure_config(self):
        """Get Fabric configuration."""
        if self._service_url is None:
            fabric_env_config = get_fabric_env_config().fabric_env_config
            auth_header = TokenUtils().get_openai_auth_header()
            
            openai_base_host = fabric_env_config.ml_workload_endpoint + "cognitive/openai/openai/"
            self._service_url = f"{openai_base_host}deployments/{self.deployment_name}/chat/completions?api-version={self.api_version}"
            
            self._headers = {
                "Authorization": auth_header,
                "Content-Type": "application/json"
            }
    
    async def run(self, user_message: str, images: list = None):
        """
        Run the agent with a user message and optional images.
        Compatible with agent_framework.Agent.run() interface.
        
        Args:
            user_message: Text prompt for the agent
            images: Optional list of image data (base64 encoded or file paths)
        """
        self._ensure_config()
        
        messages = [
            {"role": "system", "content": self.instructions}
        ]
        
        # Build user message with multimodal content if images provided
        if images and len(images) > 0:
            content = [{"type": "text", "text": user_message}]
            for img in images:
                content.append({
                    "type": "image_url",
                    "image_url": {"url": img}
                })
            messages.append({"role": "user", "content": content})
        else:
            messages.append({"role": "user", "content": user_message})
        
        payload = {"messages": messages}
        
        async with httpx.AsyncClient(timeout=180.0) as client:
            response = await client.post(
                self._service_url,
                headers=self._headers,
                json=payload
            )
            
            if response.status_code == 200:
                result = response.json()
                return AgentResult(
                    text=result["choices"][0]["message"]["content"],
                    raw_response=result
                )
            else:
                raise Exception(f"API Error: {response.status_code} - {response.text}")


class AgentResult:
    """Result object with .text property - compatible with agent_framework."""
    def __init__(self, text: str, raw_response: dict = None):
        self.text = text
        self.raw_response = raw_response


def encode_image_to_base64(file_path: str) -> str:
    """
    Read an image file and encode it to base64 data URL.
    Supports common image formats (PNG, JPEG, etc.)
    """
    extension = Path(file_path).suffix.lower()
    
    mime_types = {
        '.png': 'image/png',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.gif': 'image/gif',
        '.webp': 'image/webp'
    }
    
    mime_type = mime_types.get(extension, 'image/png')
    
    with open(file_path, 'rb') as f:
        image_data = base64.b64encode(f.read()).decode('utf-8')
    
    return f"data:{mime_type};base64,{image_data}"


def document_to_images(document_path: str, max_dimension: int = 1024, quality: int = 85) -> Tuple[list, Optional[str]]:
    """
    Convert a document (PDF, DOCX) to images for multimodal processing.
    Compresses images to avoid 413 payload too large errors.
    
    Args:
        document_path: Path to document
        max_dimension: Maximum width/height in pixels (default 1024 for API limits)
        quality: JPEG quality 1-100 (default 85 balances size/quality)
    
    Returns:
        Tuple of (list of base64-encoded image data URLs, text content if DOCX)
    """
    try:
        import fitz  # PyMuPDF
        from PIL import Image
        import io
        import zipfile
        
        # Verify file exists first
        if not os.path.exists(document_path):
            raise FileNotFoundError(f"Document not found: {document_path}")
        
        file_path_obj = Path(document_path)
        file_extension = file_path_obj.suffix.lower()
        
        print(f"   Processing file: {file_path_obj.name}")
        print(f"   File type: {file_extension}")
        print(f"   File size: {os.path.getsize(document_path) / 1024:.1f} KB")
        
        images = []
        
        if file_extension == '.pdf':
            # Convert PDF pages to images with compression
            pdf_document = fitz.open(document_path)
            
            for page_num in range(len(pdf_document)):
                page = pdf_document[page_num]
                
                # Render at lower resolution to reduce size (1.5x instead of 2x)
                pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
                
                # Convert to PIL Image for compression
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                
                # Resize if too large
                if img.width > max_dimension or img.height > max_dimension:
                    img.thumbnail((max_dimension, max_dimension), Image.Resampling.LANCZOS)
                
                # Compress to JPEG
                buffer = io.BytesIO()
                img.save(buffer, format='JPEG', quality=quality, optimize=True)
                img_data = buffer.getvalue()
                
                # Encode to base64
                img_base64 = base64.b64encode(img_data).decode('utf-8')
                images.append(f"data:image/jpeg;base64,{img_base64}")
                
                print(f"   Page {page_num + 1}: {len(img_base64) / 1024:.1f} KB")
            
            pdf_document.close()
            return images, None
            
        elif file_extension == '.docx':
            # For DOCX, extract text directly here
            print("   DOCX detected - extracting text content...")
            
            # First check if it's a valid ZIP file (DOCX files are ZIP archives)
            try:
                with zipfile.ZipFile(document_path, 'r') as zip_ref:
                    print(f"   ‚úÖ Valid DOCX structure detected")
            except zipfile.BadZipFile:
                print("   ‚ö†Ô∏è  File is not a valid DOCX (not a ZIP file)")
                print("   üí° This might be a plain text file with .docx extension")
                print("   Attempting to read as plain text...")
                
                # Try reading as plain text
                with open(document_path, 'r', encoding='utf-8', errors='ignore') as f:
                    text_content = f.read()
                
                if text_content.strip():
                    print(f"   Extracted {len(text_content):,} characters as plain text")
                    return [], text_content
                else:
                    raise ValueError("File appears to be corrupted or empty")
            
            # If we get here, it's a valid DOCX file
            from docx import Document
            
            # Read file as binary first to avoid path encoding issues
            with open(document_path, 'rb') as f:
                doc = Document(f)
            
            text_content = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
            print(f"   Extracted {len(text_content):,} characters from {len(doc.paragraphs)} paragraphs")
            
            # Return empty images list and text content
            return [], text_content
            
        elif file_extension in ['.png', '.jpg', '.jpeg', '.gif', '.webp']:
            # Already an image - compress if needed
            img = Image.open(document_path)
            
            # Resize if too large
            if img.width > max_dimension or img.height > max_dimension:
                img.thumbnail((max_dimension, max_dimension), Image.Resampling.LANCZOS)
            
            # Convert to RGB if needed
            if img.mode != 'RGB':
                img = img.convert('RGB')
            
            # Compress
            buffer = io.BytesIO()
            img.save(buffer, format='JPEG', quality=quality, optimize=True)
            img_data = buffer.getvalue()
            
            img_base64 = base64.b64encode(img_data).decode('utf-8')
            images.append(f"data:image/jpeg;base64,{img_base64}")
            
            print(f"   Image compressed: {len(img_base64) / 1024:.1f} KB")
            return images, None
        
        else:
            raise ValueError(f"Unsupported file type: {file_extension}")
        
    except Exception as e:
        print(f"‚ö†Ô∏è Error processing document: {e}")
        import traceback
        traceback.print_exc()
        return [], None


print("‚úÖ Helper functions defined")
print("üí° Using custom FabricAgent with multimodal capabilities!")

‚úÖ Helper functions defined
üí° Using custom FabricAgent with multimodal capabilities!


## 6. Document Summarization Prompts

Configure AI agent instructions for analysis and extraction.

In [9]:
# ===================== SUMMARIZATION PROMPTS =====================

# Agent 1: Document Analyzer (creates summary from visual/multimodal analysis)
DOCUMENT_ANALYZER_PROMPT = """You are an advanced document analysis agent with multimodal capabilities.

**Your Task:**
Analyze the provided document images/content and create a comprehensive executive summary.

**Instructions:**
1. Carefully examine all pages of the document
2. Identify the document type and structure
3. Extract key information, themes, and insights
4. Create a concise executive summary suitable for board-level presentation
5. Focus on business impact and strategic implications

**Output Format:**
Provide a clear, structured summary in markdown format with:
- Document title and type
- Key findings (bullet points)
- Executive summary (2-3 paragraphs)
- Critical takeaways

Be thorough but concise. Focus on what matters most."""


# Agent 2: JSON Extractor (extracts structured data)
JSON_EXTRACTOR_PROMPT = """You are a data extraction specialist.

**Your Task:**
Extract structured information from the document and return it as valid JSON.

**Required Fields:**
- document_title: The main title or subject of the document
- document_type: Type of document (e.g., "Regulatory Report", "Financial Statement", "Risk Assessment")
- document_date: Date mentioned in the document (if available, format as YYYY-MM-DD)
- key_topics: String of main topics/themes discussed
- critical_risks: String of identified risks or concerns comma separated
- action_items: String of action items or recommendations comma separated
- stakeholders: String of mentioned stakeholders or entities comma separated
- risk_rating: Overall risk assessment (Low/Medium/High/Critical) if applicable
- summary: Brief 1-2 sentence summary
- author: Document author or department if mentioned
- confidentiality: Confidentiality level if mentioned (e.g., "Public", "Internal", "Confidential", "Restricted")

**Output Format:**
Return ONLY valid JSON with the above structure. Do not include markdown formatting or code blocks.

Example:
{
  "document_title": "Q4 Risk Report",
  "document_type": "Regulatory Risk Report",
  "document_date": "2024-12-31",
  "key_topics": "Remediation Programs", "Regulatory Engagement",
  "critical_risks": "Compliance breach in X area",
  "action_items": "Review policy Y", "Implement framework Z",
  "stakeholders": "Executives", "Board of Directors",
  "risk_rating": "Medium",
  "summary": "Quarterly regulatory risk report covering remediation programs and regulatory engagement activities.",
  "author": "Risk Management Group",
  "confidentiality": "Internal"
}"""

print("‚úÖ Agent prompts configured")
print(f"üìù Document Analyzer prompt: {len(DOCUMENT_ANALYZER_PROMPT):,} characters")
print(f"üìù JSON Extractor prompt: {len(JSON_EXTRACTOR_PROMPT):,} characters")

‚úÖ Agent prompts configured
üìù Document Analyzer prompt: 720 characters
üìù JSON Extractor prompt: 1,658 characters


## 7. Batch Processing Execution

Automated processing of all unprocessed documents.

### 7.1 Batch Processing Functions

Process multiple documents with state tracking.

In [None]:
async def process_document_batch(document_paths: List[str], max_failures: int = 5):
    """
    Process multiple documents in batch with state tracking.
    Skips already-processed files and tracks failures.
    
    Args:
        document_paths: List of document paths to process
        max_failures: Stop batch if this many consecutive failures occur
    """
    print("="*80)
    print("üì¶ BATCH DOCUMENT PROCESSING")
    print("="*80)
    print(f"Total documents to process: {len(document_paths)}")
    print("="*80)
    
    processed_count = 0
    skipped_count = 0
    failed_count = 0
    consecutive_failures = 0
    
    for idx, doc_path in enumerate(document_paths, 1):
        filename = os.path.basename(doc_path)
        
        print(f"\n{'='*80}")
        print(f"üìÑ Document {idx}/{len(document_paths)}: {filename}")
        print(f"{'='*80}")
        
        # Check if already processed
        if state_manager.is_processed(doc_path):
            print("‚è≠Ô∏è  Already processed (file hash matches) - SKIPPING")
            skipped_count += 1
            consecutive_failures = 0  # Reset on skip
            continue
        
        # Process document
        try:
            result = await analyze_single_document(doc_path)
            
            if result:
                # Mark as successfully processed
                state_manager.mark_processed(
                    file_path=doc_path,
                    summary_path=result.get("summary_path"),
                    json_path=result.get("json_path"),
                    status="success"
                )
                processed_count += 1
                consecutive_failures = 0
                print(f"‚úÖ Successfully processed and saved")
            else:
                # Processing returned None (error occurred)
                state_manager.mark_failed(doc_path, "Processing returned no result")
                failed_count += 1
                consecutive_failures += 1
                print(f"‚ùå Processing failed")
        
        except Exception as e:
            error_msg = f"{type(e).__name__}: {str(e)}"
            print(f"‚ùå Error: {error_msg}")
            state_manager.mark_failed(doc_path, error_msg)
            failed_count += 1
            consecutive_failures += 1
        
        # Check if too many consecutive failures
        if consecutive_failures >= max_failures:
            print(f"\n‚ö†Ô∏è  Stopping batch: {consecutive_failures} consecutive failures")
            break
        
        # Progress update
        print(f"\nüìä Batch Progress: {processed_count} processed | {skipped_count} skipped | {failed_count} failed")
    
    # Final summary
    print("\n" + "="*80)
    print("üìä BATCH PROCESSING COMPLETE")
    print("="*80)
    print(f"‚úÖ Successfully processed: {processed_count}")
    print(f"‚è≠Ô∏è  Skipped (already done): {skipped_count}")
    print(f"‚ùå Failed: {failed_count}")
    print(f"üìÅ Total attempted: {processed_count + failed_count}")
    print("="*80)
    
    return {
        "processed": processed_count,
        "skipped": skipped_count,
        "failed": failed_count
    }


async def analyze_single_document(document_path: str):
    """
    Analyze a single document using GPT-5's multimodal capabilities.
    Modified version of analyze_document_multimodal for batch processing.
    
    Returns:
        Dict with summary, extracted_data, and output paths, or None on failure
    """
    try:
        # Get timestamps
        analysis_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        analysis_date_prefix = datetime.now().strftime('%Y%m%d')
        
        # Extract filename info
        filename = Path(document_path).name
        filename_without_ext = Path(document_path).stem
        
        # Define output paths
        summary_output_dir = "/lakehouse/default/Files/Document_Summaries"
        json_output_dir = "/lakehouse/default/Files/Document_Analysis"
        
        summary_output_path = f"{summary_output_dir}/{analysis_date_prefix}_{filename_without_ext}.md"
        json_output_path = f"{json_output_dir}/{analysis_date_prefix}_{filename_without_ext}.json"
        
        # Create output directories
        os.makedirs(summary_output_dir, exist_ok=True)
        os.makedirs(json_output_dir, exist_ok=True)
        
        print(f"üìÅ Input: {filename}")
        
        # Step 1: Process document
        print("Step 1: Processing document...")
        document_images, text_content = document_to_images(document_path)
        
        if document_images and len(document_images) > 0:
            has_images = True
            text_content = None
        elif text_content:
            has_images = False
            document_images = None
        else:
            # Try fallback for plain text files
            file_extension = Path(document_path).suffix.lower()
            if file_extension in ['.txt', '.md']:
                with open(document_path, 'r', encoding='utf-8') as f:
                    text_content = f.read()
                has_images = False
                document_images = None
            else:
                raise ValueError(f"Could not process file: {document_path}")
        
        # Step 2: Create analyzer agent
        print("Step 2: Creating analyzer agent...")
        analyzer_agent = FabricAgent(
            deployment_name=DEPLOYMENT_NAME,
            api_version=API_VERSION,
            instructions=DOCUMENT_ANALYZER_PROMPT,
            name="DocumentAnalyzer"
        )
        
        # Step 3: Analyze document
        print("Step 3: Analyzing with GPT-5...")
        if has_images:
            total_size_kb = sum(len(img) for img in document_images) / 1024
            
            if total_size_kb > 3000 or len(document_images) > 10:
                # Process in chunks
                summaries = []
                chunk_size = 5
                for i in range(0, len(document_images), chunk_size):
                    chunk = document_images[i:i+chunk_size]
                    chunk_result = await analyzer_agent.run(
                        user_message=f"Analyze pages {i+1}-{i+len(chunk)} of this document. Provide key findings.",
                        images=chunk
                    )
                    summaries.append(chunk_result.text)
                
                combined_summary = "\n".join(summaries)
                final_result = await analyzer_agent.run(
                    user_message=f"Synthesize these summaries into one comprehensive executive summary:\n\n{combined_summary}"
                )
                summary = final_result.text
            else:
                analyzer_result = await analyzer_agent.run(
                    user_message="Please analyze this document and provide a comprehensive executive summary.",
                    images=document_images
                )
                summary = analyzer_result.text
        else:
            if len(text_content) > 100000:
                # Chunk large text
                chunk_size = 50000
                summaries = []
                for i in range(0, len(text_content), chunk_size):
                    chunk = text_content[i:i+chunk_size]
                    chunk_result = await analyzer_agent.run(
                        user_message=f"Analyze this section of the document:\n\n{chunk}"
                    )
                    summaries.append(chunk_result.text)
                
                combined_summary = "\n\n".join(summaries)
                final_result = await analyzer_agent.run(
                    user_message=f"Synthesize these summaries:\n\n{combined_summary}"
                )
                summary = final_result.text
            else:
                analyzer_result = await analyzer_agent.run(
                    user_message=f"Please analyze this document and provide a comprehensive executive summary:\n\n{text_content}"
                )
                summary = analyzer_result.text
        
        # Step 4: Create extractor agent
        print("Step 4: Extracting structured data...")
        extractor_agent = FabricAgent(
            deployment_name=DEPLOYMENT_NAME,
            api_version=API_VERSION,
            instructions=JSON_EXTRACTOR_PROMPT,
            name="JSONExtractor"
        )
        
        # Step 5: Extract data
        if has_images:
            extractor_result = await extractor_agent.run(
                user_message="Extract structured information from this document and return as JSON.",
                images=document_images
            )
        else:
            extractor_result = await extractor_agent.run(
                user_message=f"Extract structured information from this document and return as JSON:\n\n{text_content}"
            )
        
        # Parse JSON
        json_text = extractor_result.text.strip()
        if json_text.startswith("```"):
            json_text = json_text.split("```")[1]
            if json_text.startswith("json"):
                json_text = json_text[4:]
            json_text = json_text.strip()
        
        try:
            extracted_data = json.loads(json_text)
        except json.JSONDecodeError:
            extracted_data = {"raw_response": extractor_result.text}
        
        # Step 6: Save outputs
        print("Step 5: Saving outputs...")
        
        # Save markdown
        with open(summary_output_path, "w", encoding="utf-8") as f:
            f.write("# DOCUMENT SUMMARY\n\n")
            f.write("---\n\n")
            f.write(f"**Source Document:** {filename}  \n")
            f.write(f"**Analysis Date:** {analysis_date}  \n")
            f.write(f"**Model:** {DEPLOYMENT_NAME} (Multimodal)  \n")
            f.write(f"**Approach:** Two-Agent Workflow (Analyzer + Extractor)  \n\n")
            f.write("---\n\n")
            f.write(summary)
            f.write("\n\n---\n")
        
        # Save JSON
        from urllib.parse import quote
        document_filename = Path(document_path).name
        sharepoint_doc_url = f"{SHAREPOINT_DOCUMENT_ROOT}{quote(document_filename)}"
        
        full_json_data = {
            "source_document": filename,
            "document_location": sharepoint_doc_url,
            "analysis_date": analysis_date,
            "analysis_metadata": {
                "document_path": document_path,
                "analysis_model": DEPLOYMENT_NAME,
                "api_version": API_VERSION,
                "authentication": "Fabric Managed Identity",
                "approach": "Multimodal GPT-5 Vision",
                "agents_used": ["DocumentAnalyzer", "JSONExtractor"],
                "has_images": has_images,
                "image_count": len(document_images) if has_images else 0
            },
            "summary_text": summary,
            "extracted_data": extracted_data
        }
        
        with open(json_output_path, "w", encoding="utf-8") as f:
            json.dump(full_json_data, f, indent=2, ensure_ascii=False)
        
        print(f"üíæ Saved: {os.path.basename(summary_output_path)}")
        print(f"üíæ Saved: {os.path.basename(json_output_path)}")
        
        return {
            "summary": summary,
            "extracted_data": extracted_data,
            "document_location": sharepoint_doc_url,
            "summary_path": summary_output_path,
            "json_path": json_output_path
        }
    
    except Exception as e:
        print(f"‚ùå Error processing document: {e}")
        import traceback
        traceback.print_exc()
        return None


print("‚úÖ Batch processing functions defined")
print("üí° Ready to process multiple documents with state tracking!")

### 7.2 Execute Batch Processing

Run automated processing on all unprocessed documents.

In [None]:
# ===================== BATCH PROCESSING EXECUTION =====================

# Get unprocessed documents
unprocessed = get_unprocessed_documents(search_pattern="Files/SharePointDocuments/**/*")

if not unprocessed:
    print("\n‚úÖ No new documents to process!")
    print("üí° All documents in the Files directory have been analyzed.")
else:
    print(f"\nüöÄ Starting batch processing of {len(unprocessed)} document(s)...")
    
    # Run batch processing
    result = await process_document_batch(unprocessed, max_failures=5)
    
    print("\n" + "="*80)
    print("üéâ BATCH PROCESSING FINISHED!")
    print("="*80)
    print(f"üìä Total state:")
    print(f"   ‚úÖ All-time processed: {state_manager.get_processed_count()}")
    print(f"   ‚ùå All-time failed: {state_manager.get_failed_count()}")
    print("="*80)
    
    # Optional: Show failed documents
    if result["failed"] > 0:
        print("\n‚ö†Ô∏è  Failed documents (check state file for details):")
        for file_name, info in state_manager.state["processed_files"].items():
            if info.get("status") == "failed":
                print(f"   - {file_name}: {info.get('error', 'Unknown error')}")