# 🚀 Complete Nanonets OCR Markdown Generator

**Advanced Document Processing Pipeline with Nanonets AI OCR**

This notebook provides a complete pipeline for processing all cropped images produced by layout detection: each image is run through the Nanonets OCR model for high-quality text extraction, and the results are assembled into clean markdown.

## Features:
- 🤖 **Nanonets AI OCR** - State-of-the-art OCR with contextual understanding
- 🖼️ **Smart Image Processing** - Handles all image sizes, including small elements
- 📝 **Clean Text Extraction** - No extra metadata, only extracted content
- 🎯 **Element-Type Aware** - Optimized prompts for different content types
- 📄 **Structured Output** - Clean markdown with proper formatting
- 💾 **Batch Processing** - Process all documents efficiently

In [None]:
# Essential Imports and Configuration
import os
import json
import time
from pathlib import Path
from PIL import Image
import torch
from typing import Dict, List, Tuple, Optional, Any
from collections import defaultdict
import re
from datetime import datetime

# Core ML imports
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText

# --- Pipeline configuration -------------------------------------------------
# Model identifier plus the input/output folders used by every cell below.
NANONETS_MODEL = "nanonets/Nanonets-OCR-s"
LAYOUT_RESULTS_DIR = Path("layout_results")
OUTPUT_DIR = Path("nanonets_clean_results")

# Image-size limits (pixels) and the shortest text accepted as a real result.
MIN_IMAGE_SIZE = 32  # smaller crops are upscaled before OCR
MAX_IMAGE_SIZE = 2048  # larger crops are downscaled to limit memory use
MIN_TEXT_LENGTH = 2  # shorter extractions get a confidence of 0.0

# --- Device selection -------------------------------------------------------
# bfloat16 is only selected when CUDA is available; CPU runs use float32.
USE_GPU = torch.cuda.is_available()
if USE_GPU:
    DEVICE, TORCH_DTYPE = "cuda", torch.bfloat16
else:
    DEVICE, TORCH_DTYPE = "cpu", torch.float32

# Startup banner summarising the effective configuration.
print(
    "\n".join(
        [
            "üöÄ Nanonets OCR Markdown Generator",
            "=" * 50,
            f"ü§ñ Model: {NANONETS_MODEL}",
            f"üñ•Ô∏è Device: {DEVICE}",
            f"üìÅ Input Directory: {LAYOUT_RESULTS_DIR}",
            f"üíæ Output Directory: {OUTPUT_DIR}",
            f"üéØ GPU Available: {'‚úÖ' if USE_GPU else '‚ùå'}",
            "=" * 50,
        ]
    )
)

## 1. Nanonets OCR Engine Initialization

In [None]:
class NanonetsOCREngine:
    """Container for the Nanonets OCR tokenizer, processor and model.

    The object starts empty; nothing is downloaded or loaded until
    ``initialize()`` is called. ``is_ready()`` reports whether loading
    completed and a model object is present.
    """

    def __init__(self):
        # All components stay None until initialize() succeeds.
        self.tokenizer = None
        self.processor = None
        self.model = None
        self.initialized = False

    def initialize(self):
        """Load tokenizer, processor and model for ``NANONETS_MODEL``.

        Progress is printed along the way. On any failure the error is
        printed, ``initialized`` is reset to False and the exception is
        re-raised to the caller.
        """
        try:
            print("üîÑ Initializing Nanonets OCR Engine...")

            # Tokenizer first, then the multimodal processor.
            print("  üì• Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                NANONETS_MODEL, trust_remote_code=True
            )

            print("  üì• Loading processor...")
            self.processor = AutoProcessor.from_pretrained(
                NANONETS_MODEL, trust_remote_code=True
            )

            # device_map is only requested when a GPU is available.
            print("  ? Loading model...")
            placement = "auto" if USE_GPU else None
            self.model = AutoModelForImageTextToText.from_pretrained(
                NANONETS_MODEL,
                trust_remote_code=True,
                torch_dtype=TORCH_DTYPE,
                device_map=placement,
            )

            self.initialized = True
            print("‚úÖ Nanonets OCR Engine initialized successfully!")
            first_param_device = next(self.model.parameters()).device
            print(f"üñ•Ô∏è Model loaded on: {first_param_device}")

        except Exception as e:
            print(f"‚ùå Failed to initialize OCR engine: {e}")
            self.initialized = False
            raise

    def is_ready(self):
        """Return True only after a successful initialize() with a model set."""
        if self.model is None:
            return False
        return self.initialized

# Initialize the OCR engine
# Module-level instance: extract_text_with_nanonets() and the execution cell
# read this global directly. initialize() loads the tokenizer, processor and
# model right here and re-raises if any of them fails to load.
ocr_engine = NanonetsOCREngine()
ocr_engine.initialize()

## 2. Image Processing and OCR Functions

In [None]:
def create_optimized_prompt(element_type: str) -> str:
    """Build the OCR instruction text for a given layout element type.

    The result is a fixed preamble followed by a type-specific hint. Element
    types without a dedicated hint get a generic "visible text" instruction.
    """
    preamble = (
        "Extract ONLY the text content from this image. "
        "Return clean text without any explanations or extra information. "
    )

    hints_by_type = dict(
        table="Format tables as clean HTML using <table>, <tr>, <td>, <th> tags only.",
        title="Extract the title text only.",
        section_header="Extract the header text only.",
        text="Extract the text content preserving natural line breaks.",
        paragraph="Extract the paragraph text maintaining structure.",
        key_value_region="Extract key-value pairs as 'Key: Value' format.",
        list="Extract list items with appropriate bullet points or numbers.",
        page_header="Extract header text from top of page.",
        page_footer="Extract footer text from bottom of page.",
        picture="If text is visible, extract it. If no text, return [Image: brief description]",
    )

    try:
        hint = hints_by_type[element_type]
    except KeyError:
        # Unknown element types fall back to the generic instruction.
        hint = "Extract the visible text content."

    return f"{preamble}{hint}"


def preprocess_image(image_path: Path) -> Optional[Image.Image]:
    """Load an image as RGB and bring both dimensions into the supported range.

    Images with a side below ``MIN_IMAGE_SIZE`` are upscaled (with a 20%
    buffer); images with a side above ``MAX_IMAGE_SIZE`` are downscaled.
    In both cases each resulting dimension is clamped to
    ``[MIN_IMAGE_SIZE, MAX_IMAGE_SIZE]``. Without the clamp, an extreme strip
    such as 10x5000 would be upscaled to roughly 38x19200, and 4000x40 would
    be downscaled to 2048x20. For such extreme aspect ratios the clamp
    necessarily distorts the aspect ratio.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The RGB image (resized when needed), or ``None`` if loading or
        resizing fails for any reason; the error is printed, not raised.
    """

    def clamp_dimension(value: float) -> int:
        # Truncate like the original int() call, then force into range.
        return min(MAX_IMAGE_SIZE, max(MIN_IMAGE_SIZE, int(value)))

    try:
        # convert() returns a separate copy, so the file handle can be
        # released as soon as the pixel data has been read.
        with Image.open(image_path) as source:
            image = source.convert("RGB")
        width, height = image.size

        # Handle small images - resize to meet minimum requirements
        if width < MIN_IMAGE_SIZE or height < MIN_IMAGE_SIZE:
            scale_factor = max(MIN_IMAGE_SIZE / width, MIN_IMAGE_SIZE / height)
            new_width = clamp_dimension(width * scale_factor * 1.2)  # Add 20% buffer
            new_height = clamp_dimension(height * scale_factor * 1.2)

            print(f"    ? Resizing from {width}x{height} to {new_width}x{new_height}")
            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        # Handle large images - downscale to prevent memory issues
        elif width > MAX_IMAGE_SIZE or height > MAX_IMAGE_SIZE:
            scale_factor = min(MAX_IMAGE_SIZE / width, MAX_IMAGE_SIZE / height)
            new_width = clamp_dimension(width * scale_factor)
            new_height = clamp_dimension(height * scale_factor)

            print(f"    üìê Downscaling from {width}x{height} to {new_width}x{new_height}")
            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        return image

    except Exception as e:
        # Best-effort: callers treat None as "skip this image".
        print(f"    ‚ùå Image preprocessing failed: {str(e)[:50]}...")
        return None


def extract_text_with_nanonets(image_path: Path, element_type: str = "text") -> Tuple[str, float]:
    """Run the Nanonets OCR model on one image and score the result.

    Args:
        image_path: Path of the cropped image to read.
        element_type: Layout element type used to pick the prompt via
            ``create_optimized_prompt`` (defaults to ``"text"``).

    Returns:
        ``(text, confidence)``. ``text`` is the stripped model output and
        ``confidence`` is the heuristic score from ``calculate_confidence``
        (it is not a model probability). ``("", 0.0)`` is returned when the
        global ``ocr_engine`` is not ready, when preprocessing fails, or when
        any exception occurs during inference (the error is printed).
    """

    # Relies on the module-level ocr_engine created in the initialization cell.
    if not ocr_engine.is_ready():
        return "", 0.0

    try:
        # Preprocess image
        image = preprocess_image(image_path)
        if image is None:
            return "", 0.0

        # Create optimized prompt
        prompt = create_optimized_prompt(element_type)

        # Prepare messages for the model
        # The message references the original file path, while the pixels
        # actually passed to the processor below are the preprocessed image.
        messages = [
            {
                "role": "system",
                "content": "You are a precise text extraction assistant. Extract only visible text without commentary."
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{image_path}"},
                    {"type": "text", "text": prompt},
                ]
            },
        ]

        # Apply chat template
        # Falls back to the tokenizer when the processor object has no
        # apply_chat_template attribute.
        try:
            text = ocr_engine.processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
        except AttributeError:
            text = ocr_engine.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

        # Process inputs
        inputs = ocr_engine.processor(
            text=[text], images=[image], padding=True, return_tensors="pt"
        )

        # Move to GPU if available
        # Note: this rebuilds `inputs` as a plain dict; non-tensor values are
        # passed through unchanged.
        if USE_GPU:
            inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

        # Generate text
        # NOTE(review): do_sample=False with num_beams=1 requests greedy
        # decoding, so temperature and early_stopping presumably have no
        # effect here - confirm against the transformers generate() docs.
        with torch.inference_mode():
            output = ocr_engine.model.generate(
                **inputs,
                max_new_tokens=1024,
                do_sample=False,
                num_beams=1,
                temperature=0.1,
                repetition_penalty=1.05,
                early_stopping=True,
                pad_token_id=ocr_engine.model.generation_config.pad_token_id,
            )

        # Extract generated text
        # Drop the prompt tokens: each output row is sliced from the length
        # of its corresponding input_ids row onward.
        generated_ids = [o[i.shape[-1]:] for i, o in zip(inputs["input_ids"], output)]
        result = ocr_engine.processor.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )[0]

        # Clean result
        result = result.strip()

        # Calculate confidence
        confidence = calculate_confidence(result, image.size)

        # Cleanup memory
        # Only reached on the success path; an exception above skips it.
        del image, inputs, output
        if USE_GPU:
            torch.cuda.empty_cache()

        return result, confidence

    except Exception as e:
        # Any failure is reported as "no text" so batch processing continues.
        print(f"    ‚ùå OCR failed: {str(e)[:50]}...")
        return "", 0.0


def calculate_confidence(text: str, image_size: Tuple[int, int]) -> float:
    """Heuristically score an OCR result between 0.0 and 1.0.

    Starts at 1.0 and multiplies in penalties for very short text, for
    error-like wording, for a high share of unusual characters and for tiny
    source images; HTML table tags earn a small boost. Empty text, or text
    shorter than ``MIN_TEXT_LENGTH`` after stripping, scores 0.0.
    """
    if not text or len(text.strip()) < MIN_TEXT_LENGTH:
        return 0.0

    score = 1.0

    # Very short extractions are less trustworthy.
    if len(text) < 5:
        score *= 0.6

    # Wording that suggests the model reported a problem instead of content.
    lowered = text.lower()
    if any(marker in lowered for marker in ('failed', 'error', 'unable', 'cannot', 'sorry')):
        score *= 0.2

    # Share of characters that are neither alphanumeric nor common punctuation.
    tolerated = ' .,!?-:;()[]{}'
    unusual = sum(1 for ch in text if not (ch.isalnum() or ch in tolerated))
    if unusual / len(text) > 0.3:
        score *= 0.5

    # Structured table markup counts in favour of the result.
    if '<table>' in text or '<tr>' in text or '<td>' in text:
        score *= 1.1

    # Tiny crops are harder to read reliably.
    width, height = image_size
    if width < 50 or height < 20:
        score *= 0.7

    return min(1.0, max(0.0, score))

print("‚úÖ OCR processing functions defined successfully!")

## 3. Document Processing Functions

In [None]:
def extract_element_info_from_filename(filename: str) -> Dict[str, Any]:
    """Parse page, element id, element type and layout id from a crop filename.

    Expected pattern: ``p<page>_elem<index>_<type>[_<layout_id>].<ext>``,
    for example ``p001_elem000_section_header_12.png``. The type may itself
    contain underscores (``section_header``, ``key_value_region``, ...). A
    trailing all-numeric token after the type is read as the layout id, so a
    type whose own last word is numeric cannot be told apart from one.

    Args:
        filename: Bare file name of a cropped image.

    Returns:
        Dict with keys ``id``, ``type``, ``page`` and ``element_id``, plus
        ``layout_id`` (int) when a trailing numeric token is present. Names
        with fewer than three ``_``-separated parts yield the defaults
        (``'unknown'``, ``'text'``, ``1``, ``'unknown'``).
    """
    info: Dict[str, Any] = {
        'id': 'unknown',
        'type': 'text',
        'page': 1,
        'element_id': 'unknown'
    }

    # Strip only a trailing image extension (case-insensitive), never a
    # matching substring in the middle of the name.
    stem = filename
    lowered = filename.lower()
    for extension in ('.jpeg', '.jpg', '.png'):
        if lowered.endswith(extension):
            stem = filename[:-len(extension)]
            break

    parts = stem.split('_')
    if len(parts) < 3:
        return info  # Not enough structure; use defaults

    page_token, elem_token, *type_tokens = parts

    # Extract page number (p001 -> 1); a malformed page keeps the default
    # without discarding the rest of the filename information.
    if page_token.startswith('p'):
        try:
            info['page'] = int(page_token[1:])
        except ValueError:
            pass

    # Extract element ID (elem000)
    if elem_token.startswith('elem'):
        info['element_id'] = elem_token
        info['id'] = elem_token

    # A trailing integer after at least one type word is the layout id.
    if len(type_tokens) >= 2:
        try:
            info['layout_id'] = int(type_tokens[-1])
        except ValueError:
            pass
        else:
            type_tokens = type_tokens[:-1]

    # Re-join the remaining words so multi-word types survive intact.
    element_type = '_'.join(type_tokens)
    if element_type:
        info['type'] = element_type

    return info


def load_layout_metadata(doc_dir: Path) -> Tuple[Dict[str, Any], Dict[int, Dict]]:
    """Load ``layout_analysis.json`` and index its elements.

    Args:
        doc_dir: Document directory expected to contain
            ``layout_analysis.json``.

    Returns:
        ``(element_info_map, layout_elements)`` where ``element_info_map``
        maps candidate cropped-image file names to their layout element and
        ``layout_elements`` maps each element ``id`` to the element. Both are
        empty when the file is missing, and hold whatever was indexed so far
        if reading fails part-way (a warning is printed).

    Each element dict gets a ``page`` key (from the page's ``page_number``)
    if it does not already have one, so the page is not lost downstream.
    Elements whose id or page number is not an integer are still stored in
    ``layout_elements`` but get no file-name candidates, instead of aborting
    the whole load on the ``:03d`` format.
    """

    layout_json_path = doc_dir / "layout_analysis.json"
    element_info_map: Dict[str, Any] = {}
    layout_elements: Dict[int, Dict] = {}

    if not layout_json_path.exists():
        return element_info_map, layout_elements

    # Cropped images are globbed as png/jpg/jpeg elsewhere in the pipeline,
    # so register candidates for all three extensions.
    extensions = ('.png', '.jpg', '.jpeg')

    try:
        with open(layout_json_path, 'r', encoding='utf-8') as f:
            layout_data = json.load(f)

        print(f"  üìÑ Loaded layout analysis with {layout_data.get('element_statistics', {}).get('total_elements', 0)} elements")

        # Create mapping of layout element IDs to their data
        for page_data in layout_data.get('pages', []):
            page_num = page_data.get('page_number', 1)

            for element in page_data.get('elements', []):
                element_id = element.get('id')
                if element_id is None:
                    continue

                # Keep the page with the element; never overwrite an existing value.
                element.setdefault('page', page_num)
                layout_elements[element_id] = element

                # Zero-padded names can only be built from integers.
                if not (isinstance(element_id, int) and isinstance(page_num, int)):
                    continue

                element_type = element.get('type', 'text')
                prefix = f"p{page_num:03d}"

                # Try to match with filename patterns
                candidate_stems = (
                    f"{prefix}_elem{element_id:03d}_{element_type}_{element_id}",
                    f"{prefix}_elem{element_id:03d}_{element_type}",
                    f"{prefix}_elem000_{element_type}_{element_id}",
                )

                for stem in candidate_stems:
                    for extension in extensions:
                        element_info_map[stem + extension] = element

    except Exception as e:
        # Best-effort: a broken layout file must not stop OCR processing.
        print(f"  ‚ö†Ô∏è Could not read layout JSON: {e}")

    return element_info_map, layout_elements


def calculate_reading_order(elements: List[Dict]) -> List[Dict]:
    """Order processed elements for reading.

    Elements whose ``layout_info`` contains a ``bounding_box`` come first,
    sorted by page, then by row (``top`` bucketed into 20-pixel bands), then
    by ``left``. Remaining elements follow, sorted by page, then by a fixed
    type priority, then by filename.

    The page is part of both sort keys so that elements of a multi-page
    document are not interleaved by position alone. Elements without an
    integer ``page`` are treated as page 1, which reproduces the purely
    spatial order when no page information is present.

    Args:
        elements: Element records; each must carry ``element_type`` and may
            carry ``page``, ``filename`` and ``layout_info``.

    Returns:
        A new list: positioned elements first, then the rest.
    """
    row_tolerance = 20  # pixels; tops in the same band count as one line

    type_priority = {
        'title': 1,
        'section_header': 2,
        'paragraph': 3,
        'text': 4,
        'table': 5,
        'list': 6,
        'key_value_region': 7,
        'picture': 8,
        'page_header': 9,
        'page_footer': 10
    }

    def page_of(element):
        # Guard against missing or non-integer pages so keys stay comparable.
        page = element.get('page', 1)
        return page if isinstance(page, int) else 1

    def spatial_key(element):
        # NOTE(review): assumes bounding_box exposes numeric 'top' and
        # 'left' - confirm against the layout analysis schema.
        bbox = element['layout_info']['bounding_box']
        return (page_of(element), int(bbox['top'] / row_tolerance), bbox['left'])

    def fallback_key(element):
        return (
            page_of(element),
            type_priority.get(element['element_type'], 5),
            element.get('filename', ''),
        )

    # Separate elements with and without usable layout info
    elements_with_layout = []
    elements_without_layout = []
    for element in elements:
        layout_info = element.get('layout_info') or {}
        if 'bounding_box' in layout_info:
            elements_with_layout.append(element)
        else:
            elements_without_layout.append(element)

    elements_with_layout.sort(key=spatial_key)
    elements_without_layout.sort(key=fallback_key)

    # Combine: elements with layout info first (in reading order), then others
    return elements_with_layout + elements_without_layout


def process_single_document(doc_dir: Path) -> Dict[str, Any]:
    """OCR every cropped image of one document and return ordered results.

    Args:
        doc_dir: Document directory containing ``cropped_images/`` and,
            optionally, ``layout_analysis.json``.

    Returns:
        ``{"elements": [...], "stats": {...}}``. ``elements`` are the element
        records in reading order. ``stats`` always has the keys ``total``,
        ``processed``, ``failed`` and ``small_images``, including when no
        images are found, so callers can read them unconditionally.
    """

    def empty_result() -> Dict[str, Any]:
        # Same stats schema as the normal path.
        return {
            "elements": [],
            "stats": {"total": 0, "processed": 0, "failed": 0, "small_images": 0},
        }

    cropped_dir = doc_dir / "cropped_images"

    if not cropped_dir.exists():
        print(f"  ‚ö†Ô∏è No cropped_images directory found")
        return empty_result()

    # Get all image files
    image_files = []
    for ext in ['*.png', '*.jpg', '*.jpeg']:
        image_files.extend(cropped_dir.glob(ext))

    if not image_files:
        print(f"  ‚ö†Ô∏è No image files found")
        return empty_result()

    print(f"  üñºÔ∏è Found {len(image_files)} images")

    # Load layout metadata and element mapping
    element_info_map, layout_elements = load_layout_metadata(doc_dir)

    # Process each image
    processed_elements = []
    stats = {"total": len(image_files), "processed": 0, "failed": 0, "small_images": 0}

    for image_path in sorted(image_files):
        image_name = image_path.name

        print(f"    üîç Processing: {image_name}")

        # The filename is always parsed: it supplies the page number even
        # when the element itself comes from the layout analysis.
        filename_info = extract_element_info_from_filename(image_name)

        # Get element info from layout analysis or filename
        element_info = element_info_map.get(image_name, {})
        if element_info:
            print(f"    üìç Found layout analysis data")
        else:
            layout_id = filename_info.get('layout_id')

            # Compare against None so that layout id 0 is still matched.
            if layout_id is not None and layout_id in layout_elements:
                element_info = layout_elements[layout_id]
                print(f"    üìç Matched with layout element ID {layout_id}")
            else:
                element_info = filename_info
                print(f"    ‚ö†Ô∏è No layout info found, using filename-based info")

        element_type = element_info.get('type', 'text')

        # Extract text
        start_time = time.time()
        extracted_text, confidence = extract_text_with_nanonets(image_path, element_type)
        processing_time = time.time() - start_time

        # Check image size for small image tracking
        try:
            with Image.open(image_path) as img:
                width, height = img.size
                if width < MIN_IMAGE_SIZE or height < MIN_IMAGE_SIZE:
                    stats["small_images"] += 1
        except Exception:
            # The counter is informational only; an unreadable file is
            # already reported as a failed extraction above.
            pass

        # Create element record
        element_record = {
            'filename': image_name,
            'image_path': str(image_path),
            'element_type': element_type,
            'extracted_text': extracted_text,
            'confidence': confidence,
            'processing_time': processing_time,
            'success': confidence > 0.1,
            # Fall back to the filename's page when the element has none.
            'page': element_info.get('page', filename_info.get('page', 1)),
            'element_id': element_info.get('id', 'unknown'),
            'layout_info': element_info if 'bounding_box' in element_info else None
        }

        processed_elements.append(element_record)

        # Update stats
        if element_record['success']:
            stats["processed"] += 1
            print(f"    ‚úÖ Success: {len(extracted_text)} chars, confidence: {confidence:.2f}")
        else:
            stats["failed"] += 1
            print(f"    ‚ùå Failed: No text extracted")

    # Sort elements by reading order using layout analysis
    print(f"  üìñ Calculating reading order from layout analysis...")
    ordered_elements = calculate_reading_order(processed_elements)

    # Show reading order summary
    elements_with_layout = len([e for e in ordered_elements if e.get('layout_info')])
    print(f"  üìç Elements with layout info: {elements_with_layout}/{len(ordered_elements)}")

    return {"elements": ordered_elements, "stats": stats}

print("‚úÖ Document processing functions with layout-based reading order defined successfully!")

## 4. Markdown Generation Functions

In [None]:
def generate_clean_markdown(elements: List[Dict], doc_name: str) -> str:
    """Render processed elements as a markdown document.

    Only successful elements with non-blank text are rendered, in the order
    given. Elements are grouped by ``page`` (default 1); page headings are
    emitted only when more than one page has content. Each element is
    formatted according to its ``element_type`` and low-confidence
    extractions (< 0.5) are followed by a confidence note.
    """
    usable = [
        item for item in elements
        if item['success'] and item['extracted_text'].strip()
    ]
    if not usable:
        return f"# {doc_name}\n\n*No text content extracted*\n"

    # Bucket by page while keeping the incoming (reading) order per page.
    by_page = defaultdict(list)
    for item in usable:
        by_page[item.get('page', 1)].append(item)

    show_page_headings = len(by_page) > 1
    heading_types = ('title', 'section_header')
    output = [f"# {doc_name}\n"]

    for page_num in sorted(by_page):
        if show_page_headings:
            output.append(f"\n## Page {page_num}\n")

        previous_kind = None  # section label of the element rendered last

        for item in by_page[page_num]:
            body = item['extracted_text'].strip()
            kind = item['element_type']
            score = item['confidence']

            if not body:
                continue

            # Blank entry before a heading that follows a different section.
            if previous_kind and previous_kind != kind and kind in heading_types:
                output.append("")

            if kind == 'title':
                rendered = f"### {body}\n"
                previous_kind = 'title'
            elif kind == 'section_header':
                rendered = f"#### {body}\n"
                previous_kind = 'section_header'
            elif kind == 'table':
                # HTML tables pass through; anything else gets a label.
                if '<table>' in body.lower():
                    rendered = f"{body}\n"
                else:
                    rendered = f"**Table:**\n\n{body}\n"
                previous_kind = 'table'
            elif kind == 'list':
                rendered = f"{body}\n"
                previous_kind = 'list'
            elif kind == 'key_value_region':
                rendered = f"**{body}**\n"
                previous_kind = 'key_value'
            elif kind == 'picture':
                already_tagged = body.startswith('[Image:') or 'image:' in body.lower()
                rendered = f"{body}\n" if already_tagged else f"[Image: {body}]\n"
                previous_kind = 'picture'
            elif kind in ('page_header', 'page_footer'):
                rendered = f"*{body}*\n"
                previous_kind = kind
            else:
                # Plain text: a fragment without a final period that follows
                # other text is emitted without the trailing blank line.
                is_continuation = previous_kind == 'text' and not body.endswith('.')
                rendered = body if is_continuation else f"{body}\n"
                previous_kind = 'text'

            output.append(rendered)

            if score < 0.5:
                output.append(f"*(confidence: {score:.2f})*\n")

    return "\n".join(output)


def save_results(doc_name: str, elements: List[Dict], stats: Dict, markdown_content: str) -> bool:
    """Write the markdown and a JSON report for one document.

    Files go to ``OUTPUT_DIR/<doc_name>/``. The directory is created before
    the guarded section, so a failure to create it propagates; any error
    while writing the two files is printed and reported as ``False``.

    Returns:
        True when both files were written, False otherwise.
    """
    target_dir = OUTPUT_DIR / doc_name
    target_dir.mkdir(parents=True, exist_ok=True)

    try:
        # 1) Markdown document
        markdown_path = target_dir / f"{doc_name}_nanonets_clean.md"
        with open(markdown_path, 'w', encoding='utf-8') as markdown_file:
            markdown_file.write(markdown_content)

        # How many elements carried layout information
        elements_with_layout = sum(1 for item in elements if item.get('layout_info'))
        layout_used = elements_with_layout > 0

        # 2) Detailed JSON report
        json_path = target_dir / f"{doc_name}_processing_results.json"
        with open(json_path, 'w', encoding='utf-8') as json_file:
            successes = [item for item in elements if item['success']]
            report = {
                'document_name': doc_name,
                'processing_timestamp': datetime.now().isoformat(),
                'processing_stats': stats,
                'layout_analysis_used': layout_used,
                'elements_with_layout_info': elements_with_layout,
                'total_elements': len(elements),
                'successful_elements': len(successes),
                # max(1, ...) avoids dividing by zero when nothing succeeded
                'average_confidence': sum(item['confidence'] for item in successes) / max(1, len(successes)),
                'reading_order_method': 'layout_analysis_spatial' if layout_used else 'filename_based',
                'elements': elements
            }
            json.dump(report, json_file, indent=2, ensure_ascii=False)

        print(f"  ‚úÖ Saved: {markdown_path}")
        print(f"  üìÑ JSON: {json_path}")
        print(f"  üìç Layout-based reading order: {'‚úÖ' if layout_used else '‚ùå'}")
        return True

    except Exception as e:
        print(f"  ‚ùå Failed to save results: {e}")
        return False


def analyze_layout_coverage(doc_dir: Path) -> Dict[str, Any]:
    """Compare the layout analysis of a document with its cropped images.

    Returns a progressively richer dict: only ``layout_file_exists`` when the
    JSON is missing, plus ``readable`` when it cannot be parsed, plus
    ``cropped_dir_exists`` when the image folder is missing, and otherwise
    the full report with element/image counts, per-type element counts and
    the images-per-element coverage ratio.
    """
    layout_json_path = doc_dir / "layout_analysis.json"
    if not layout_json_path.exists():
        return {"layout_file_exists": False}

    try:
        with open(layout_json_path, 'r', encoding='utf-8') as handle:
            layout_data = json.load(handle)
    except Exception:
        return {"layout_file_exists": True, "readable": False}

    cropped_dir = doc_dir / "cropped_images"
    if not cropped_dir.exists():
        return {"layout_file_exists": True, "readable": True, "cropped_dir_exists": False}

    # Cropped images on disk
    image_files = [
        path
        for pattern in ('*.png', '*.jpg', '*.jpeg')
        for path in cropped_dir.glob(pattern)
    ]

    # Every element from every page of the layout analysis
    layout_elements = [
        element
        for page_data in layout_data.get('pages', [])
        for element in page_data.get('elements', [])
    ]

    # Tally elements per type; elements without a type count as 'unknown'
    type_counts = {}
    for element in layout_elements:
        type_name = element.get('type', 'unknown')
        type_counts[type_name] = type_counts.get(type_name, 0) + 1

    return {
        "layout_file_exists": True,
        "readable": True,
        "cropped_dir_exists": True,
        "layout_elements_count": len(layout_elements),
        "cropped_images_count": len(image_files),
        "element_types_in_layout": type_counts,
        # max(1, ...) keeps the ratio defined for an empty layout
        "coverage_ratio": len(image_files) / max(1, len(layout_elements)),
    }

print("‚úÖ Enhanced markdown generation with layout-based reading order defined successfully!")

In [None]:
## 5. Complete Batch Processing Pipeline

In [None]:
def process_all_documents():
    """Run OCR, markdown generation and saving for every document folder.

    Each sub-directory of ``LAYOUT_RESULTS_DIR`` is treated as one document
    and handled in sorted order. Per-document errors are printed and do not
    stop the batch. A summary is printed and also written to
    ``OUTPUT_DIR/batch_processing_summary.json``.

    Returns:
        The dict of overall statistics, or ``None`` when the input directory
        is missing or contains no document folders.
    """
    if not LAYOUT_RESULTS_DIR.exists():
        print(f"‚ùå Layout results directory not found: {LAYOUT_RESULTS_DIR}")
        return

    # One sub-directory per document, visited in sorted order
    documents = sorted(entry for entry in LAYOUT_RESULTS_DIR.iterdir() if entry.is_dir())

    if not documents:
        print("‚ùå No document directories found")
        return

    print(f"üöÄ Starting batch processing for {len(documents)} documents...")
    print(f"üìÅ Results will be saved to: {OUTPUT_DIR}")
    print("=" * 60)

    # Running totals for the whole batch
    overall_stats = {
        'total_documents': len(documents),
        'processed_documents': 0,
        'successful_documents': 0,
        'total_images': 0,
        'successful_extractions': 0,
        'failed_extractions': 0,
        'small_images_handled': 0,
        'total_processing_time': 0,
        'average_confidence': 0.0
    }

    # (overall key, per-document stats key) pairs, accumulated in this order
    accumulated_counters = (
        ('total_images', 'total'),
        ('successful_extractions', 'processed'),
        ('failed_extractions', 'failed'),
        ('small_images_handled', 'small_images'),
    )

    batch_started = time.time()
    all_confidences = []

    for doc_dir in documents:
        doc_name = doc_dir.name
        print(f"\nüìÑ Processing: {doc_name}")

        doc_started = time.time()

        try:
            outcome = process_single_document(doc_dir)
            elements = outcome['elements']
            stats = outcome['stats']

            if not elements:
                print(f"  ‚ö†Ô∏è No elements processed for {doc_name}")
                continue

            markdown_content = generate_clean_markdown(elements, doc_name)

            # Totals are only updated for documents that were saved
            saved = save_results(doc_name, elements, stats, markdown_content)
            if saved:
                overall_stats['successful_documents'] += 1

                for overall_key, stats_key in accumulated_counters:
                    overall_stats[overall_key] += stats[stats_key]

                confidences = [item['confidence'] for item in elements if item['success']]
                all_confidences.extend(confidences)

                doc_time = time.time() - doc_started
                avg_confidence = sum(confidences) / len(confidences) if confidences else 0

                print(f"  üìä Document Summary:")
                print(f"    üñºÔ∏è Images: {stats['total']}")
                print(f"    ‚úÖ Processed: {stats['processed']}")
                print(f"    üîç Small images: {stats['small_images']}")
                print(f"    üéØ Avg confidence: {avg_confidence:.3f}")
                print(f"    ‚è±Ô∏è Time: {doc_time:.2f}s")

            overall_stats['processed_documents'] += 1

        except Exception as e:
            # Report and move on to the next document
            print(f"  ‚ùå Error processing {doc_name}: {e}")

    # Final figures
    total_time = time.time() - batch_started
    overall_stats['total_processing_time'] = total_time
    if all_confidences:
        overall_stats['average_confidence'] = sum(all_confidences) / len(all_confidences)
    else:
        overall_stats['average_confidence'] = 0

    success_rate = overall_stats['successful_extractions'] / max(1, overall_stats['total_images']) * 100
    images_per_second = overall_stats['total_images'] / max(1, total_time)

    print("\n" + "=" * 60)
    print("üéâ BATCH PROCESSING COMPLETE!")
    print("=" * 60)
    print(f"üìä Final Statistics:")
    print(f"  üìÅ Documents processed: {overall_stats['processed_documents']}/{overall_stats['total_documents']}")
    print(f"  ‚úÖ Successful documents: {overall_stats['successful_documents']}")
    print(f"  ?Ô∏è Total images: {overall_stats['total_images']}")
    print(f"  ‚úÖ Successful extractions: {overall_stats['successful_extractions']}")
    print(f"  ‚ùå Failed extractions: {overall_stats['failed_extractions']}")
    print(f"  üîç Small images handled: {overall_stats['small_images_handled']}")
    print(f"  üéØ Overall success rate: {success_rate:.1f}%")
    print(f"  üéØ Average confidence: {overall_stats['average_confidence']:.3f}")
    print(f"  ‚è±Ô∏è Total time: {total_time:.2f}s ({total_time/60:.1f} minutes)")
    print(f"  üöÄ Processing speed: {images_per_second:.2f} images/second")
    print(f"  üíæ Results saved to: {OUTPUT_DIR}")

    # Persist the batch summary next to the per-document results
    try:
        stats_path = OUTPUT_DIR / "batch_processing_summary.json"
        OUTPUT_DIR.mkdir(exist_ok=True)
        summary = {
            'processing_completed': datetime.now().isoformat(),
            'model_used': NANONETS_MODEL,
            'device_used': DEVICE,
            'overall_statistics': overall_stats
        }
        with open(stats_path, 'w', encoding='utf-8') as summary_file:
            json.dump(summary, summary_file, indent=2)
        print(f"  ? Summary saved: {stats_path}")
    except Exception as e:
        print(f"  ‚ö†Ô∏è Could not save summary: {e}")

    return overall_stats

print("‚úÖ Complete batch processing pipeline defined successfully!")

In [None]:
## 6. Execute Complete Processing

In [None]:
# Execute the complete Nanonets OCR processing pipeline
# Prints a feature banner, checks that the engine is loaded, runs the batch,
# and reports GPU memory before and after when a GPU is in use.

print("üöÄ Starting Complete Nanonets OCR Processing")
print("=" * 80)
print("üéØ Features:")
print("  ‚Ä¢ Clean text extraction only")
print("  ‚Ä¢ Smart image preprocessing (handles all sizes)")
print("  ‚Ä¢ Element-type specific prompts")
print("  ‚Ä¢ Structured markdown output")
print("  ‚Ä¢ Comprehensive batch processing")
print("  ‚Ä¢ No extra commentary in output")
print("=" * 80)

# Check if OCR engine is ready
if not ocr_engine.is_ready():
    print("‚ùå OCR engine not initialized!")
    print("Please run the OCR engine initialization cell first.")
else:
    # Execute the complete processing
    try:
        start_time = time.time()

        # Show GPU memory if available
        if USE_GPU:
            print(f"\nüíæ GPU Memory before processing:")
            print(f"  Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
            print(f"  Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

        # Run batch processing
        results = process_all_documents()

        total_time = time.time() - start_time

        if USE_GPU:
            print(f"\n? GPU Memory after processing:")
            print(f"  Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
            print(f"  Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

        if results is None:
            # process_all_documents() returns None when the input directory
            # is missing or holds no document folders; it has already printed
            # the reason, so do not claim success.
            print("\nNo documents were processed - see the messages above.")
        else:
            print(f"\nüéâ PROCESSING COMPLETED SUCCESSFULLY!")
            print(f"‚úÖ All documents processed with Nanonets OCR")
            print(f"‚úÖ Clean markdown files generated")
            print(f"‚úÖ Results saved to '{OUTPUT_DIR}' directory")
        print(f"‚è±Ô∏è Total execution time: {total_time:.2f}s ({total_time/60:.1f} minutes)")

    except Exception as e:
        print(f"‚ùå Error during processing: {e}")
        import traceback
        traceback.print_exc()

## 7. Test Single Document (Optional)

In [None]:
# Test processing on a single document (optional)

def test_single_document():
    """Run the OCR pipeline on one document as a quick validation.

    Picks the first sub-directory of LAYOUT_RESULTS_DIR, processes it with
    process_single_document(), prints the processing counters, up to five
    successfully extracted elements and the first 500 characters of the
    markdown produced by generate_clean_markdown().

    Returns:
        The value returned by process_single_document(), or None when the
        layout results directory is missing or holds no document folders.
    """
    if not LAYOUT_RESULTS_DIR.exists():
        print("‚ùå Layout results directory not found")
        return

    # Every sub-directory is treated as one document; the first one is used.
    candidates = [entry for entry in LAYOUT_RESULTS_DIR.iterdir() if entry.is_dir()]
    if not candidates:
        print("‚ùå No documents found")
        return

    test_doc = candidates[0]
    doc_name = test_doc.name

    print(f"üß™ Testing single document: {doc_name}")
    print("-" * 50)

    # Run the regular single-document pipeline.
    result = process_single_document(test_doc)
    elements = result['elements']
    stats = result['stats']

    # Processing counters.
    print(f"\nüìä Test Results:")
    print(f"  üì∏ Total images: {stats['total']}")
    print(f"  ‚úÖ Processed: {stats['processed']}")
    print(f"  ‚ùå Failed: {stats['failed']}")
    print(f"  üîç Small images: {stats['small_images']}")

    if elements:
        print(f"\nüìã Sample extracted content:")
        # Only the first five elements are considered; failed ones are
        # skipped but keep their position number.
        for position, item in enumerate(elements[:5], start=1):
            if not item['success']:
                continue
            snippet = item['extracted_text'][:100]
            print(f"  {position}. {item['element_type']}: {snippet}...")
            print(f"     Confidence: {item['confidence']:.3f}")

    # Markdown preview, truncated to 500 characters.
    markdown_content = generate_clean_markdown(elements, doc_name)
    divider = "-" * 30
    print(f"\nüìÑ Markdown preview (first 500 chars):")
    print(divider)
    print(markdown_content[:500])
    if len(markdown_content) > 500:
        print("...")
    print(divider)

    print(f"\n‚úÖ Test completed successfully!")
    return result

# Uncomment the line below to run the test
# test_single_document()

## 8. Results Analysis and Utilities

In [None]:
def analyze_results():
    """Analyze the generated results with layout analysis coverage.

    Scans every sub-directory of OUTPUT_DIR for a ``*_nanonets_clean.md`` and
    a ``*_processing_results.json`` file, then prints:

    * how many markdown / JSON files exist and their combined size,
    * aggregated image-processing counters read from each JSON file's
      ``processing_stats`` entry,
    * how many documents report ``layout_analysis_used``.

    A JSON file that cannot be read or parsed is reported with a warning and
    skipped; it no longer disappears silently from the statistics.

    Returns:
        None. All output goes to stdout.
    """

    if not OUTPUT_DIR.exists():
        print(f"‚ùå Output directory not found: {OUTPUT_DIR}")
        return

    # Get all result directories
    result_dirs = [d for d in OUTPUT_DIR.iterdir() if d.is_dir()]

    if not result_dirs:
        print("‚ùå No results found")
        return

    print(f"üìä Results Analysis with Layout Analysis Coverage")
    print("=" * 60)
    print(f"Documents processed: {len(result_dirs)}")

    total_markdown_files = 0
    total_json_files = 0
    total_size = 0
    all_stats = []
    layout_usage_count = 0

    # Analyze each document
    for doc_dir in result_dirs:
        doc_name = doc_dir.name

        # Check for files (only the first match of each pattern is counted)
        md_files = list(doc_dir.glob("*_nanonets_clean.md"))
        json_files = list(doc_dir.glob("*_processing_results.json"))

        if md_files:
            total_markdown_files += 1
            total_size += md_files[0].stat().st_size

        if not json_files:
            continue

        results_file = json_files[0]
        total_json_files += 1
        total_size += results_file.stat().st_size

        # Load statistics
        try:
            with open(results_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            all_stats.append(data.get('processing_stats', {}))

            # Check layout analysis usage
            if data.get('layout_analysis_used', False):
                layout_usage_count += 1
                elements_with_layout = data.get('elements_with_layout_info', 0)
                total_elements = data.get('total_elements', 0)
                print(f"  üìç {doc_name}: {elements_with_layout}/{total_elements} elements with layout info")

        except (OSError, ValueError, AttributeError) as exc:
            # OSError: unreadable file; ValueError: invalid JSON or encoding;
            # AttributeError: top-level JSON value is not an object.
            # Stay best-effort, but tell the user the document was skipped.
            print(f"  Warning: could not read statistics from {results_file}: {exc}")

    print(f"\nFile Statistics:")
    print(f"  Markdown files: {total_markdown_files}")
    print(f"  JSON files: {total_json_files}")
    print(f"  Total size: {total_size / 1024 / 1024:.2f} MB")

    if all_stats:
        total_images = sum(s.get('total', 0) for s in all_stats)
        total_processed = sum(s.get('processed', 0) for s in all_stats)
        total_failed = sum(s.get('failed', 0) for s in all_stats)
        total_small = sum(s.get('small_images', 0) for s in all_stats)

        print(f"\nProcessing Statistics:")
        print(f"  Total images: {total_images}")
        print(f"  Successfully processed: {total_processed}")
        print(f"  Failed: {total_failed}")
        print(f"  Small images handled: {total_small}")
        # max(1, ...) avoids a division by zero when no images were recorded.
        print(f"  Success rate: {(total_processed/max(1,total_images)*100):.1f}%")

    print(f"\nLayout Analysis Usage:")
    print(f"  Documents using layout analysis: {layout_usage_count}/{len(result_dirs)}")
    print(f"  Layout coverage: {(layout_usage_count/max(1,len(result_dirs))*100):.1f}%")

    print(f"\nüíæ Results location: {OUTPUT_DIR}")


def analyze_layout_coverage_all():
    """Analyze layout analysis coverage for a sample of documents.

    Runs analyze_layout_coverage() on at most the first 10 sub-directories of
    LAYOUT_RESULTS_DIR and prints how many of them have a readable layout
    file, a cropped-images directory, or both, followed by per-document
    details for up to five documents that have both.

    The counts are reported against the number of documents actually
    sampled, so a run with fewer than 10 documents no longer prints
    misleading "n/10" figures.

    Returns:
        None. All output goes to stdout.
    """

    if not LAYOUT_RESULTS_DIR.exists():
        print(f"‚ùå Layout results directory not found: {LAYOUT_RESULTS_DIR}")
        return

    doc_dirs = [d for d in LAYOUT_RESULTS_DIR.iterdir() if d.is_dir()]

    print(f"üìä Layout Analysis Coverage Report")
    print("=" * 50)

    # Only a sample is analyzed to keep the report short.
    sample_dirs = doc_dirs[:10]
    sample_size = len(sample_dirs)

    docs_with_layout = 0
    docs_with_cropped = 0
    docs_with_both = 0

    coverage_details = []

    for doc_dir in sample_dirs:
        analysis = analyze_layout_coverage(doc_dir)

        # A layout file only counts when it exists and could be read.
        has_layout = analysis.get('layout_file_exists', False) and analysis.get('readable', False)
        has_cropped = analysis.get('cropped_dir_exists', False)

        if has_layout:
            docs_with_layout += 1
        if has_cropped:
            docs_with_cropped += 1
        if has_layout and has_cropped:
            docs_with_both += 1
            coverage_details.append({
                'doc': doc_dir.name,
                'layout_elements': analysis.get('layout_elements_count', 0),
                'cropped_images': analysis.get('cropped_images_count', 0),
                'coverage_ratio': analysis.get('coverage_ratio', 0),
                'element_types': analysis.get('element_types_in_layout', {})
            })

    print(f"Sample Analysis (first {sample_size} documents):")
    print(f"  Documents with layout analysis: {docs_with_layout}/{sample_size}")
    print(f"  Documents with cropped images: {docs_with_cropped}/{sample_size}")
    print(f"  Documents with both: {docs_with_both}/{sample_size}")

    if coverage_details:
        print(f"\nDetailed Coverage:")
        for detail in coverage_details[:5]:
            print(f"  üìÑ {detail['doc']}:")
            print(f"    Layout elements: {detail['layout_elements']}")
            print(f"    Cropped images: {detail['cropped_images']}")
            print(f"    Coverage ratio: {detail['coverage_ratio']:.2f}")
            print(f"    Element types: {detail['element_types']}")


def show_reading_order_sample():
    """Print the element order recorded in one processing-results file.

    Uses the first ``*_processing_results.json`` found anywhere below
    OUTPUT_DIR and lists its first ten elements with their type, success
    flag, whether layout info is attached, and a 50-character text preview.
    Any error while reading or printing is reported, not raised.
    """
    if not OUTPUT_DIR.exists():
        print("‚ùå No output directory found")
        return

    # Recursive search; the first hit serves as the sample.
    candidates = list(OUTPUT_DIR.glob("**/*_processing_results.json"))
    if not candidates:
        print("‚ùå No processing results found")
        return

    sample_file = candidates[0]
    print(f"üìÑ Reading Order Sample from: {sample_file.parent.name}")
    print("=" * 50)

    try:
        with open(sample_file, 'r', encoding='utf-8') as handle:
            data = json.load(handle)

        elements = data.get('elements', [])
        layout_used = data.get('layout_analysis_used', False)

        print(f"Layout analysis used: {'‚úÖ' if layout_used else '‚ùå'}")
        print(f"Reading order method: {data.get('reading_order_method', 'unknown')}")
        print(f"\nElement order (first 10):")

        for number, entry in enumerate(elements[:10], start=1):
            element_type = entry.get('element_type', 'unknown')
            succeeded = entry.get('success', False)
            has_layout = entry.get('layout_info') is not None

            # Failed elements get an empty preview; long text is cut at 50
            # characters and marked with an ellipsis.
            text_preview = ""
            if succeeded:
                text = entry.get('extracted_text', '')
                text_preview = text[:50]
                if len(text) > 50:
                    text_preview += "..."

            if has_layout:
                layout_indicator = "?"
            else:
                layout_indicator = "üìù"
            if succeeded:
                status = "‚úÖ"
            else:
                status = "‚ùå"

            print(f"  {number:2d}. {layout_indicator} {status} {element_type:15s} - {text_preview}")

        remaining = len(elements) - 10
        if remaining > 0:
            print(f"     ... and {remaining} more elements")

    except Exception as err:
        print(f"‚ùå Could not read file: {err}")


def cleanup_memory():
    """Release cached GPU memory via torch when a GPU is in use.

    Prints a short status line either way; does nothing else on CPU.
    """
    if not USE_GPU:
        print("üßπ No GPU memory to clean")
        return

    torch.cuda.empty_cache()
    print("üßπ GPU memory cleaned")


def show_sample_output():
    """Print the beginning of one generated markdown file.

    Takes the first ``*.md`` file found anywhere below OUTPUT_DIR and prints
    its first 1000 characters, with a note when the file is longer. Read
    errors are reported, not raised.
    """
    if not OUTPUT_DIR.exists():
        print("‚ùå No output directory found")
        return

    # Recursive search; the first hit serves as the sample.
    markdown_paths = list(OUTPUT_DIR.glob("**/*.md"))
    if not markdown_paths:
        print("‚ùå No markdown files found")
        return

    sample_file = markdown_paths[0]
    print(f"üìÑ Sample output from: {sample_file.name}")
    print("=" * 50)

    preview_limit = 1000
    try:
        with open(sample_file, 'r', encoding='utf-8') as handle:
            content = handle.read()

        print(content[:preview_limit])
        total_chars = len(content)
        if total_chars > preview_limit:
            print(f"\n... (showing first 1000 of {total_chars} characters)")

    except Exception as err:
        print(f"‚ùå Could not read file: {err}")

# Announce the analysis helpers defined in this cell (one output line each).
print(
    "‚úÖ Enhanced analysis functions with layout coverage defined",
    "üìä Use analyze_results() to see processing statistics with layout coverage",
    "üìç Use analyze_layout_coverage_all() to check layout analysis availability",
    "üìñ Use show_reading_order_sample() to see reading order information",
    "üßπ Use cleanup_memory() to free GPU memory",
    "üìÑ Use show_sample_output() to see sample results",
    sep="\n",
)

## 9. Quick Analysis Commands

In [None]:
# Quick analysis commands - run after processing

# Menu of the helper functions that can be run after processing.
print(
    "? Quick Analysis Options:",
    "=" * 40,
    "1. analyze_results()     - Show processing statistics",
    "2. show_sample_output()  - Display sample markdown output",
    "3. test_single_document() - Test on one document",
    "4. cleanup_memory()      - Free GPU memory",
    "=" * 40,
    sep="\n",
)

# Uncomment any line below to run:
# analyze_results()
# show_sample_output()
# cleanup_memory()

---

## 🎉 Complete Nanonets OCR Processing Pipeline

**Summary:** This notebook provides a complete, clean implementation for processing all cropped images using Nanonets OCR with optimized prompts and structured markdown output.

In [None]:
# Final Summary: short recap of what the notebook provides and where it
# reads from / writes to (one output line per entry).
print("\n".join([
    "‚úÖ Nanonets OCR Markdown Generator - Complete Implementation",
    "üéØ Features: Smart image processing, clean text extraction, structured output",
    "üìÅ Input: layout_results/ directory with cropped images",
    "? Output: nanonets_clean_results/ directory with clean markdown files",
    "üöÄ Ready to process all your documents with state-of-the-art OCR!",
]))

## 10. Results Analysis and Validation

Let's analyze the Nanonets OCR results and validate the generated markdown files.

In [None]:
def _markdown_feature_flags(content):
    """Return which advanced-formatting markers occur in a markdown string."""
    lowered = content.lower()
    return {
        'html_tables': '<table>' in lowered,
        'latex_equations': '$$' in content or '\\begin{' in content,
        'image_descriptions': '<img>' in content,
        'watermarks': 'watermark' in lowered,
        'special_elements': any(
            marker in content for marker in ['‚òê', '‚òë', '<page_number>', '*[Page:']
        ),
    }


def analyze_nanonets_results():
    """
    Analyze the generated Nanonets OCR results and print detailed statistics.

    For every sub-directory ``<doc>`` of OUTPUT_DIR this looks for
    ``<doc>_nanonets.md`` and ``<doc>_nanonets_detailed.json`` and reports
    file counts and sizes, which advanced-formatting markers appear in the
    markdown, confidence figures and element-type counts taken from the
    JSON ``processing_stats`` entry, and a short listing of the first five
    documents.

    NOTE(review): these file names differ from the ``*_nanonets_clean.md`` /
    ``*_processing_results.json`` patterns that analyze_results() looks for
    in the same OUTPUT_DIR -- confirm which names the pipeline writes.

    Returns:
        tuple: ``(number_of_result_directories, total_elements_count)``.
        ``(0, 0)`` is returned when OUTPUT_DIR is missing or contains no
        result directories, so callers can always unpack two values.
    """
    if not os.path.exists(OUTPUT_DIR):
        print(f"‚ùå Output directory '{OUTPUT_DIR}' not found")
        return 0, 0

    result_dirs = [d for d in os.listdir(OUTPUT_DIR)
                   if os.path.isdir(os.path.join(OUTPUT_DIR, d))]

    if not result_dirs:
        print("‚ùå No result directories found")
        return 0, 0

    print(f"üìä Nanonets OCR Results Analysis")
    print("=" * 60)
    print(f"ü§ñ OCR Engine: Nanonets OCR ({NANONETS_MODEL})")
    print(f"üìÑ Total documents processed: {len(result_dirs)}")

    total_markdown_files = 0
    total_json_files = 0
    total_size = 0
    total_elements_count = 0
    element_types = {}
    confidence_stats = {'high': 0, 'medium': 0, 'low': 0}
    overall_confidences = []
    feature_usage = {
        'html_tables': 0,
        'latex_equations': 0,
        'image_descriptions': 0,
        'watermarks': 0,
        'special_elements': 0
    }
    # processing_stats per document, kept so the sample listing at the end
    # does not have to read every JSON file a second time.
    stats_by_doc = {}

    # Analyze each document
    for doc_name in result_dirs:
        doc_dir = os.path.join(OUTPUT_DIR, doc_name)

        md_file = os.path.join(doc_dir, f"{doc_name}_nanonets.md")
        json_file = os.path.join(doc_dir, f"{doc_name}_nanonets_detailed.json")

        if os.path.exists(md_file):
            total_markdown_files += 1
            total_size += os.path.getsize(md_file)

            # Analyze markdown content for advanced features
            with open(md_file, 'r', encoding='utf-8') as f:
                content = f.read()
            for feature, present in _markdown_feature_flags(content).items():
                if present:
                    feature_usage[feature] += 1

        if os.path.exists(json_file):
            total_json_files += 1
            total_size += os.path.getsize(json_file)

            # Analyze detailed processing data
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                stats = data.get('processing_stats', {})
                stats_by_doc[doc_name] = stats
                total_elements_count += stats.get('total_elements', 0)

                # Track confidence distribution
                conf_dist = stats.get('confidence_distribution', {})
                confidence_stats['high'] += conf_dist.get('high', 0)
                confidence_stats['medium'] += conf_dist.get('medium', 0)
                confidence_stats['low'] += conf_dist.get('low', 0)

                # Overall confidence
                if stats.get('avg_confidence', 0) > 0:
                    overall_confidences.append(stats['avg_confidence'])

                # Count element types
                elem_types = stats.get('element_types', {})
                for elem_type, count in elem_types.items():
                    element_types[elem_type] = element_types.get(elem_type, 0) + count
            except (OSError, ValueError, AttributeError, TypeError) as exc:
                # Stay best-effort (one bad file must not abort the report),
                # but say so instead of silently dropping the document.
                print(f"  Warning: could not analyze {json_file}: {exc}")

    print(f"üìù Markdown files generated: {total_markdown_files}")
    print(f"üìÑ Detailed JSON files generated: {total_json_files}")
    print(f"üî§ Total elements processed: {total_elements_count}")
    print(f"üíæ Total output size: {total_size / 1024:.2f} KB")

    # Advanced feature usage analysis
    if any(feature_usage.values()):
        print(f"\nüé® Advanced Features Usage:")
        print(f"  üìä HTML Tables: {feature_usage['html_tables']} documents")
        print(f"  üî¢ LaTeX Equations: {feature_usage['latex_equations']} documents")
        print(f"  üñºÔ∏è Image Descriptions: {feature_usage['image_descriptions']} documents")
        print(f"  üè∑Ô∏è Watermarks Detected: {feature_usage['watermarks']} documents")
        print(f"  ‚≠ê Special Elements: {feature_usage['special_elements']} documents")

    # Confidence analysis
    if overall_confidences:
        avg_confidence = sum(overall_confidences) / len(overall_confidences)
        min_confidence = min(overall_confidences)
        max_confidence = max(overall_confidences)

        print(f"\nüéØ Confidence Analysis:")
        print(f"  üìä Average confidence: {avg_confidence:.3f}")
        print(f"  üìà Confidence range: {min_confidence:.3f} - {max_confidence:.3f}")
        print(f"  üü¢ High confidence (‚â•0.8): {confidence_stats['high']}")
        print(f"  üü° Medium confidence (0.6-0.8): {confidence_stats['medium']}")
        print(f"  üî¥ Low confidence (<0.6): {confidence_stats['low']}")

        total_conf_elements = sum(confidence_stats.values())
        if total_conf_elements > 0:
            high_pct = (confidence_stats['high'] / total_conf_elements) * 100
            print(f"  ‚ú® High confidence percentage: {high_pct:.1f}%")

    # Show element type distribution
    if element_types:
        print(f"\nüìä Element types distribution:")
        sorted_types = sorted(element_types.items(), key=lambda x: x[1], reverse=True)
        for elem_type, count in sorted_types:
            print(f"  ‚Ä¢ {elem_type}: {count}")

    # Performance insights
    print(f"\n‚ö° Nanonets OCR Performance Insights:")
    if total_elements_count > 0:
        avg_elements_per_doc = total_elements_count / len(result_dirs)
        print(f"  üìù Average elements per document: {avg_elements_per_doc:.1f}")

    # Show sample of first few documents
    print(f"\nüìÑ Sample Results (first 5 documents):")
    for i, doc_name in enumerate(result_dirs[:5]):
        md_file = os.path.join(OUTPUT_DIR, doc_name, f"{doc_name}_nanonets.md")

        if not os.path.exists(md_file):
            print(f"  {i+1}. {doc_name}: ‚ùå No markdown file")
            continue

        md_size = os.path.getsize(md_file)

        # Confidence is shown only when the JSON was readable above and
        # carries a positive numeric average.
        conf_info = ""
        doc_stats = stats_by_doc.get(doc_name)
        if isinstance(doc_stats, dict):
            avg_conf = doc_stats.get('avg_confidence', 0)
            if isinstance(avg_conf, (int, float)) and avg_conf > 0:
                conf_info = f" (conf: {avg_conf:.3f})"

        print(f"  {i+1}. {doc_name}: {md_size / 1024:.2f} KB{conf_info}")

    return len(result_dirs), total_elements_count

def show_sample_nanonets_markdown():
    """
    Show sample content from the first generated Nanonets markdown file.

    Prints up to 1500 characters of ``<doc>_nanonets.md`` for the first
    result directory in OUTPUT_DIR, lists which advanced-formatting markers
    the file contains and, when ``<doc>_nanonets_detailed.json`` exists and
    can be parsed, prints the statistics and configuration stored in it.

    Returns silently when OUTPUT_DIR does not exist. Returns None in every
    case; all output goes to stdout.
    """
    if not os.path.exists(OUTPUT_DIR):
        return

    result_dirs = [d for d in os.listdir(OUTPUT_DIR)
                   if os.path.isdir(os.path.join(OUTPUT_DIR, d))]

    if not result_dirs:
        print("‚ùå No processed documents found")
        return

    # Show content from first document
    sample_doc = result_dirs[0]
    sample_md_path = os.path.join(OUTPUT_DIR, sample_doc, f"{sample_doc}_nanonets.md")

    if not os.path.exists(sample_md_path):
        print(f"‚ùå Sample markdown file not found: {sample_md_path}")
        return

    print(f"üìÑ Sample Nanonets Markdown Content ({sample_doc}):")
    print("=" * 80)

    with open(sample_md_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Show first 1500 characters to demonstrate quality
    if len(content) > 1500:
        print(content[:1500] + "\n\n... [Content truncated for display] ...")
    else:
        print(content)

    # Show advanced features detected
    print(f"\nüé® Advanced Features Detected:")
    lowered = content.lower()
    feature_checks = [
        ("üìä HTML Tables", '<table>' in lowered),
        ("üî¢ LaTeX Equations", '$$' in content or '\\begin{' in content),
        ("üñºÔ∏è Image Descriptions", '<img>' in content),
        ("üè∑Ô∏è Watermarks", 'watermark' in lowered),
        ("‚≠ê Special Elements",
         any(marker in content for marker in ['‚òê', '‚òë', '<page_number>', '*[Page:'])),
    ]
    features_found = [label for label, present in feature_checks if present]

    if features_found:
        for feature in features_found:
            print(f"   {feature}")
    else:
        print("   Standard text extraction")

    # Show statistics for this document
    json_path = os.path.join(OUTPUT_DIR, sample_doc, f"{sample_doc}_nanonets_detailed.json")
    if not os.path.exists(json_path):
        return

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # The markdown preview above is still useful; report and stop here
        # rather than failing the whole cell on a damaged metadata file.
        print(f"   Warning: could not read {json_path}: {exc}")
        return

    stats = data.get('processing_stats', {})
    config = data.get('configuration', {})

    print(f"\nüìä Sample Document Statistics:")
    print(f"   ü§ñ OCR Engine: {data.get('ocr_engine', 'Nanonets OCR')}")
    print(f"   üî§ Elements processed: {stats.get('total_elements', 0)}")
    print(f"   üìÖ Processing date: {data.get('processed_at', 'Unknown')}")

    print(f"   ‚öôÔ∏è Configuration:")
    print(f"      Performance Mode: {'Fast' if config.get('fast_mode') else 'Quality'}")
    print(f"      Multi-GPU: {'Yes' if config.get('multi_gpu') else 'No'}")
    print(f"      Advanced Formatting: {'Yes' if config.get('advanced_formatting') else 'No'}")

    # Show processing stats
    print(f"   üìä Processing Statistics:")
    print(f"      Elements with text: {stats.get('elements_with_text', 0)}")
    print(f"      Average confidence: {stats.get('avg_confidence', 0):.3f}")
    print(f"      High confidence elements: {stats.get('high_confidence_elements', 0)}")

    # Show element types, most frequent first
    element_types = stats.get('element_types', {})
    if element_types:
        print(f"      Element types found:")
        for elem_type, count in sorted(element_types.items(), key=lambda x: x[1], reverse=True):
            print(f"         ‚Ä¢ {elem_type}: {count}")

# Run analysis
# analyze_nanonets_results() may return None when there is nothing to
# analyze; fall back to zero counts so the tuple unpacking cannot fail.
docs_processed, elements_processed = analyze_nanonets_results() or (0, 0)

# Show sample content (blank line, then a separator rule)
print("\n" + "=" * 80)
show_sample_nanonets_markdown()

## 11. Final Summary and Next Steps

In [None]:
# Final Summary and Completion
# Static recap: feature list, output layout, advantages and next steps.
# One list entry per output line; "" entries are the blank separator lines.
print("\n".join([
    "üéâ Nanonets OCR Markdown Generator - Complete Summary",
    "=" * 80,
    "ü§ñ Features Implemented with Nanonets OCR:",
    "  ‚Ä¢ Advanced AI-powered text recognition",
    "  ‚Ä¢ Multi-GPU acceleration support",
    "  ‚Ä¢ HTML table extraction with proper formatting",
    "  ‚Ä¢ LaTeX equation recognition and formatting",
    "  ‚Ä¢ Intelligent image description generation",
    "  ‚Ä¢ Automatic watermark and page number detection",
    "  ‚Ä¢ Context-aware element processing",
    "  ‚Ä¢ Comprehensive confidence scoring",
    "  ‚Ä¢ Advanced markdown generation",
    "  ‚Ä¢ Robust error handling and retry logic",
    "",
    "üìÅ Output Structure:",
    "  nanonets_results/",
    "    ‚îú‚îÄ‚îÄ document_name_1/",
    "    ‚îÇ   ‚îú‚îÄ‚îÄ document_name_1_nanonets.md",
    "    ‚îÇ   ‚îî‚îÄ‚îÄ document_name_1_nanonets_detailed.json",
    "    ‚îú‚îÄ‚îÄ document_name_2/",
    "    ‚îÇ   ‚îú‚îÄ‚îÄ document_name_2_nanonets.md",
    "    ‚îÇ   ‚îî‚îÄ‚îÄ document_name_2_nanonets_detailed.json",
    "    ‚îî‚îÄ‚îÄ ...",
    "",
    "üöÄ Nanonets OCR Advantages Utilized:",
    "  ‚Ä¢ Superior accuracy on complex documents",
    "  ‚Ä¢ Native support for structured content",
    "  ‚Ä¢ Advanced AI understanding of document context",
    "  ‚Ä¢ Multi-GPU acceleration for faster processing",
    "  ‚Ä¢ Rich formatting with HTML and LaTeX support",
    "  ‚Ä¢ Intelligent element type recognition",
    "",
    "üîß Next Steps:",
    "  ‚Ä¢ Run all cells sequentially to process documents",
    "  ‚Ä¢ Review generated markdown files for quality",
    "  ‚Ä¢ Compare results with other OCR engines",
    "  ‚Ä¢ Fine-tune configuration for specific document types",
    "  ‚Ä¢ Use the zipping feature below for result archival",
    "",
]))

# Run statistics are shown only when a non-empty `processing_results`
# mapping exists in this namespace.
if 'processing_results' in locals() and processing_results:
    print("üìà Final Processing Statistics:")
    print(f"  ü§ñ OCR Engine: Nanonets OCR ({NANONETS_MODEL})")
    print(f"  üìÑ Documents processed: {processing_results.get('successful_docs', 0)}")
    print(f"  ‚ùå Processing failures: {processing_results.get('failed_docs', 0)}")
    print(f"  üî§ Elements extracted: {processing_results.get('total_elements', 0)}")
    print(f"  üéØ Average confidence: {processing_results.get('avg_confidence', 0):.3f}")
    print(f"  ‚è±Ô∏è Processing time: {processing_results.get('processing_time', 0):.2f} seconds")
    print(f"  üìà Success rate: {processing_results.get('success_rate', 0):.1f}%")
    print(f"  üöÄ Processing speed: {processing_results.get('elements_per_second', 0):.2f} elements/second")
    print("")

# Closing lines and pointer to the archive section.
print(
    "‚ú® Nanonets OCR Pipeline Complete! ‚ú®",
    "üî• Your documents have been processed with state-of-the-art AI OCR!",
    "‚ö° Enjoy the advanced features and superior accuracy of Nanonets!",
    "",
    "üì¶ Proceed to the next section to create a compressed archive of results.",
    sep="\n",
)

## 12. Archive Results - Create Compressed Package

Create a compressed archive of all Nanonets OCR results for easy sharing, backup, and distribution.

In [None]:
import zipfile
import shutil
from datetime import datetime

def create_results_archive():
    """
    Create a compressed ZIP archive of everything under OUTPUT_DIR.

    The archive is written to the current working directory as
    ``nanonets_ocr_results_<timestamp>.zip``. It contains every file found
    below OUTPUT_DIR (stored under its path relative to the working
    directory) plus a generated ``README.txt`` summarizing the run
    configuration and the per-document file sizes.

    NOTE(review): FAST_MODE, USE_MULTI_GPU, ENABLE_ADVANCED_FORMATTING,
    ENABLE_IMAGE_DESCRIPTIONS, ENABLE_WATERMARK_DETECTION, PAGE_OCR_TOKENS,
    CROP_OCR_TOKENS and TABLE_OCR_TOKENS are read here but do not appear in
    the configuration cell at the top of the notebook -- confirm they are
    defined before this cell runs. If one is missing, the NameError is
    caught below and no archive is produced.

    Returns:
        str | None: path of the created archive, or None when there is
        nothing to archive or archiving failed. On failure any partially
        written archive is removed so no truncated zip is left behind.
    """
    if not os.path.exists(OUTPUT_DIR):
        print("‚ùå No results directory found to archive")
        print(f"Expected directory: {OUTPUT_DIR}")
        return None

    # Check if there are any results to archive
    result_dirs = [d for d in os.listdir(OUTPUT_DIR)
                   if os.path.isdir(os.path.join(OUTPUT_DIR, d))]

    if not result_dirs:
        print("‚ùå No processed documents found to archive")
        return None

    # Create archive filename with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    archive_name = f"nanonets_ocr_results_{timestamp}.zip"
    archive_path = os.path.join(".", archive_name)

    print("üì¶ Creating Nanonets OCR Results Archive...")
    print(f"üóÇÔ∏è Archive name: {archive_name}")
    print(f"üìÅ Source directory: {OUTPUT_DIR}")

    try:
        # Walk the tree once so the reported totals and the archived files
        # come from the same listing.
        source_files = []
        for root, dirs, files in os.walk(OUTPUT_DIR):
            for file in files:
                source_files.append(os.path.join(root, file))

        total_size = sum(os.path.getsize(path) for path in source_files)
        total_files = len(source_files)

        print(f"üìä Archiving {total_files} files ({total_size / 1024 / 1024:.2f} MB)")

        # Create ZIP archive with compression
        with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED, compresslevel=6) as zipf:

            # Add all files from the results directory, keeping their path
            # relative to the working directory inside the archive.
            for file_path in source_files:
                zipf.write(file_path, os.path.relpath(file_path, "."))

            # Create a summary file with metadata
            summary_content = f"""# Nanonets OCR Results Archive
            
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
OCR Engine: Nanonets OCR ({NANONETS_MODEL})
Performance Mode: {'Fast' if FAST_MODE else 'Quality'}
Multi-GPU: {'Enabled' if USE_MULTI_GPU else 'Disabled'}
Advanced Features: {'Enabled' if ENABLE_ADVANCED_FORMATTING else 'Disabled'}

## Archive Contents:
- Total Documents: {len(result_dirs)}
- Total Files: {total_files}
- Original Size: {total_size / 1024 / 1024:.2f} MB

## Document Processing Results:
"""

            # Add document statistics to summary
            for doc_name in sorted(result_dirs):
                doc_dir = os.path.join(OUTPUT_DIR, doc_name)
                md_file = os.path.join(doc_dir, f"{doc_name}_nanonets.md")
                json_file = os.path.join(doc_dir, f"{doc_name}_nanonets_detailed.json")

                # A missing file is reported with size 0.
                md_size = os.path.getsize(md_file) if os.path.exists(md_file) else 0
                json_size = os.path.getsize(json_file) if os.path.exists(json_file) else 0

                summary_content += f"- {doc_name}: MD={md_size/1024:.1f}KB, JSON={json_size/1024:.1f}KB\n"

            summary_content += f"""
## Features Utilized:
- HTML Table Extraction: {'Yes' if ENABLE_ADVANCED_FORMATTING else 'No'}
- LaTeX Equation Recognition: {'Yes' if ENABLE_ADVANCED_FORMATTING else 'No'}
- Image Descriptions: {'Yes' if ENABLE_IMAGE_DESCRIPTIONS else 'No'}
- Watermark Detection: {'Yes' if ENABLE_WATERMARK_DETECTION else 'No'}

## Processing Configuration:
- Page OCR Tokens: {PAGE_OCR_TOKENS}
- Crop OCR Tokens: {CROP_OCR_TOKENS}
- Table OCR Tokens: {TABLE_OCR_TOKENS}
- Minimum Text Length: {MIN_TEXT_LENGTH}

## File Structure:
Each document folder contains:
- document_name_nanonets.md: Rich markdown with advanced formatting
- document_name_nanonets_detailed.json: Comprehensive processing metadata

For questions or support, refer to the Nanonets OCR documentation.
"""

            # Add summary to archive
            zipf.writestr("README.txt", summary_content)

        # Get final archive size
        archive_size = os.path.getsize(archive_path)
        # Percentage saved relative to the uncompressed total (0 when empty).
        compression_ratio = (1 - archive_size / total_size) * 100 if total_size > 0 else 0

        print(f"‚úÖ Archive created successfully!")
        print(f"üì¶ Archive file: {archive_path}")
        print(f"üíæ Archive size: {archive_size / 1024 / 1024:.2f} MB")
        print(f"üóúÔ∏è Compression ratio: {compression_ratio:.1f}%")

        return archive_path

    except Exception as e:
        print(f"‚ùå Error creating archive: {e}")
        import traceback
        traceback.print_exc()

        # The function reports failure by returning None, so do not leave a
        # truncated or incomplete zip on disk that looks like a valid result.
        if os.path.exists(archive_path):
            try:
                os.remove(archive_path)
            except OSError as cleanup_error:
                print(f"Warning: could not remove partial archive {archive_path}: {cleanup_error}")

        return None

def create_selective_archive(document_names=None, include_json=True):
    """
    Bundle a chosen subset of processed documents into a timestamped zip.

    Args:
        document_names: Optional collection of document folder names to
            include. Names without a matching folder under OUTPUT_DIR are
            dropped; a falsy value selects every available document.
        include_json: When True, each document's detailed JSON file is
            archived next to its markdown file.

    Returns:
        Path of the archive written to the current directory, or None when
        there is nothing to archive or writing the archive failed.
    """
    if not os.path.exists(OUTPUT_DIR):
        print("‚ùå No results directory found")
        return None

    # Every sub-directory of the results folder counts as one document
    available_docs = []
    for entry in os.listdir(OUTPUT_DIR):
        if os.path.isdir(os.path.join(OUTPUT_DIR, entry)):
            available_docs.append(entry)

    if not available_docs:
        print("‚ùå No processed documents found")
        return None

    # Narrow the selection to the requested names that actually exist
    if not document_names:
        docs_to_archive = available_docs
    else:
        docs_to_archive = [name for name in document_names if name in available_docs]
        if not docs_to_archive:
            print("‚ùå None of the specified documents found")
            return None

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    archive_name = f"nanonets_ocr_selective_{timestamp}.zip"
    archive_path = os.path.join(".", archive_name)

    print(f"üì¶ Creating selective archive: {archive_name}")
    print(f"üìÑ Documents to include: {len(docs_to_archive)}")

    # Per-document file suffixes, in the order they are written to the zip
    wanted_suffixes = ["_nanonets.md"]
    if include_json:
        wanted_suffixes.append("_nanonets_detailed.json")

    try:
        with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED, compresslevel=6) as zipf:
            total_files = 0

            for doc_name in docs_to_archive:
                doc_dir = os.path.join(OUTPUT_DIR, doc_name)
                for suffix in wanted_suffixes:
                    file_name = f"{doc_name}{suffix}"
                    source_path = os.path.join(doc_dir, file_name)
                    # Missing files are skipped silently
                    if os.path.exists(source_path):
                        zipf.write(source_path, f"{doc_name}/{file_name}")
                        total_files += 1

            # README describing what this archive contains
            doc_listing = "\n".join(f"- {doc}" for doc in docs_to_archive)
            summary_lines = [
                "# Selective Nanonets OCR Archive",
                "",
                f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
                f"Documents included: {len(docs_to_archive)}",
                f"Include JSON files: {'Yes' if include_json else 'No'}",
                f"Total files: {total_files}",
                "",
                "Documents:",
                doc_listing,
            ]
            zipf.writestr("README.txt", "\n".join(summary_lines) + "\n")

        archive_size = os.path.getsize(archive_path)
        print(f"‚úÖ Selective archive created: {archive_size / 1024 / 1024:.2f} MB")
        return archive_path

    except Exception as e:
        print(f"‚ùå Error creating selective archive: {e}")
        return None

# Create comprehensive results archive
# Notebook-level driver: builds the full results archive and reports where it was written.
print("üöÄ Creating Comprehensive Results Archive...")
print("=" * 60)

# create_results_archive() returns the archive path, or None when archiving failed
archive_path = create_results_archive()

if archive_path:
    # Success: show the archive location and a short description of its contents
    print(f"""
üìÅ Archive Details:
  File: {archive_path}
  Location: {os.path.abspath(archive_path)}
  
üìã Archive Contents:
  ‚Ä¢ All markdown files with rich formatting
  ‚Ä¢ Detailed JSON files with processing metadata
  ‚Ä¢ README.txt with comprehensive documentation
  ‚Ä¢ Complete processing statistics and configuration
  
üéØ Use Cases:
  ‚Ä¢ Share results with colleagues or supervisors
  ‚Ä¢ Backup processed documents
  ‚Ä¢ Submit as part of research deliverables
  ‚Ä¢ Archive for future reference
  
‚úÖ Archive ready for distribution!
    """)
else:
    print("‚ùå Failed to create archive")

# Usage hints for the selective variant defined alongside create_results_archive
print("\nüí° Additional Archive Options:")
print("You can also create selective archives by running:")
print("  ‚Ä¢ create_selective_archive(['doc1', 'doc2'])  # Specific documents")
print("  ‚Ä¢ create_selective_archive(include_json=False)  # Markdown only")
print("  ‚Ä¢ create_selective_archive()  # All documents, selective format")

In [None]:
# Final Summary and Next Steps
# Static, human-readable recap of the pipeline; only the "To view results"
# section interpolates a runtime value (OUTPUT_DIR).
print("üéâ Nanonets OCR Markdown Generator - Complete Summary")
print("=" * 80)
print("ü§ñ Features Implemented with Nanonets OCR:")
print("  ‚Ä¢ Advanced AI-powered text extraction with context understanding")
print("  ‚Ä¢ Multi-GPU acceleration for optimal performance")
print("  ‚Ä¢ HTML table extraction with proper formatting")
print("  ‚Ä¢ LaTeX equation recognition and rendering")
print("  ‚Ä¢ Intelligent image description generation")
print("  ‚Ä¢ Automatic watermark and page number detection")
print("  ‚Ä¢ Context-aware element processing")
print("  ‚Ä¢ Advanced confidence scoring and validation")
print("  ‚Ä¢ Comprehensive batch processing with statistics")
print("  ‚Ä¢ Memory optimization and error recovery")
print()
# NOTE(review): the root folder printed below is hard-coded as "nanonets_results/"
# and may not match the current OUTPUT_DIR value - confirm which one is intended.
print("üìÅ Output Structure:")
print("  nanonets_results/")
print("    ‚îú‚îÄ‚îÄ document_name_1/")
print("    ‚îÇ   ‚îú‚îÄ‚îÄ document_name_1_nanonets.md")
print("    ‚îÇ   ‚îî‚îÄ‚îÄ document_name_1_nanonets_detailed.json")
print("    ‚îú‚îÄ‚îÄ document_name_2/")
print("    ‚îÇ   ‚îú‚îÄ‚îÄ document_name_2_nanonets.md")
print("    ‚îÇ   ‚îî‚îÄ‚îÄ document_name_2_nanonets_detailed.json")
print("    ‚îî‚îÄ‚îÄ ...")
print()
print("üöÄ Nanonets OCR Advantages Utilized:")
print("  ‚Ä¢ State-of-the-art AI models for superior accuracy")
print("  ‚Ä¢ Native support for complex document structures")
print("  ‚Ä¢ Advanced formatting with HTML and LaTeX")
print("  ‚Ä¢ Intelligent element type recognition")
print("  ‚Ä¢ Multi-GPU distributed processing")
print("  ‚Ä¢ Contextual understanding for better extraction")
print("  ‚Ä¢ Robust error handling and recovery")
print()
print("üîß Next Steps:")
print("  ‚Ä¢ Compare results with other OCR engines (PaddleOCR, Tesseract)")
print("  ‚Ä¢ Fine-tune confidence thresholds for optimal quality")
print("  ‚Ä¢ Experiment with different token limits for speed vs quality")
print("  ‚Ä¢ Optimize GPU memory usage for larger documents")
print("  ‚Ä¢ Add post-processing enhancements for specific use cases")
print("  ‚Ä¢ Integrate with downstream document analysis pipelines")
print("  ‚Ä¢ Implement custom prompts for domain-specific documents")
print()
print("üìä To view results:")
print(f"  ‚Ä¢ Check the '{OUTPUT_DIR}' directory")
print("  ‚Ä¢ Review individual markdown files for human-readable content")
print("  ‚Ä¢ Analyze detailed JSON files for processing metadata")
print("  ‚Ä¢ Compare processing times and accuracy across documents")
print()

# Display final statistics if available
# At notebook top level locals() is the module namespace, so this checks whether
# an earlier cell defined a non-empty `processing_results`.
if 'processing_results' in locals() and processing_results:
    print("üìà Final Processing Statistics:")
    print(f"  ü§ñ OCR Engine: Nanonets OCR ({NANONETS_MODEL})")
    # Every metric is read with .get(..., 0) so a missing key prints as zero
    print(f"  üìÑ Documents processed: {processing_results.get('successful_docs', 0)}")
    print(f"  ‚ùå Processing failures: {processing_results.get('failed_docs', 0)}")
    print(f"  üî§ Elements extracted: {processing_results.get('total_elements', 0)}")
    print(f"  üéØ Average confidence: {processing_results.get('avg_confidence', 0):.3f}")
    print(f"  ‚è±Ô∏è Processing time: {processing_results.get('processing_time', 0):.2f} seconds")
    print(f"  üìà Success rate: {processing_results.get('success_rate', 0):.1f}%")
    
    # Throughput is only reported when a positive value was recorded
    if processing_results.get('elements_per_second', 0) > 0:
        print(f"  üöÄ Processing speed: {processing_results['elements_per_second']:.1f} elements/second")
    
    # Feature usage summary
    # NOTE(review): these flags are not defined in this part of the notebook;
    # presumably set in an earlier configuration cell - confirm they exist.
    if USE_MULTI_GPU:
        print(f"  ‚ö° Multi-GPU acceleration was utilized")
    if ENABLE_ADVANCED_FORMATTING:
        print(f"  üé® Advanced formatting features were enabled")
    if ENABLE_IMAGE_DESCRIPTIONS:
        print(f"  üñºÔ∏è Intelligent image descriptions were generated")
    if ENABLE_WATERMARK_DETECTION:
        print(f"  üè∑Ô∏è Watermark and special element detection was active")
    print()

# Static comparison text and closing banner; nothing here is computed at runtime.
print("üéØ Comparison with Other OCR Engines:")
print("vs. Tesseract:")
print("  ‚úÖ Superior accuracy on complex documents")
print("  ‚úÖ Native support for tables and equations")
print("  ‚úÖ Better handling of varied fonts and layouts")
print("  ‚úÖ Advanced AI understanding of document context")
print()
print("vs. PaddleOCR:")
print("  ‚úÖ More advanced formatting capabilities")
print("  ‚úÖ Better integration with modern ML pipelines")
print("  ‚úÖ Superior handling of complex document structures")
print("  ‚úÖ More intelligent content understanding")
print()
print("vs. Cloud OCR Services:")
print("  ‚úÖ Full control over processing and data privacy")
print("  ‚úÖ No API rate limits or usage costs")
print("  ‚úÖ Customizable prompts and processing logic")
print("  ‚úÖ Integration with local GPU infrastructure")

print()
print("‚ú® Nanonets OCR Pipeline Complete! ‚ú®")
print("ü§ñ Your documents have been processed with state-of-the-art AI OCR!")
print("‚ö° Enjoy the advanced formatting and superior accuracy!")

# Performance comparison note
# Report which hardware path is available in this session (none / one / many GPUs).
if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        print(f"üñ•Ô∏è Multi-GPU processing delivered optimal performance across {gpu_count} GPUs")
    else:
        print("üñ•Ô∏è Single GPU processing completed successfully")
        print("üí° Consider multi-GPU setup for even better performance on large document sets")
else:
    print("üí° GPU acceleration would significantly improve processing speed")

# A real newline ("\n") is intended here; the escaped form "\\n" printed a
# literal backslash-n in front of the message.
print(f"\nüìÇ Complete Output Directory: {OUTPUT_DIR}")
print("üîç Explore the generated markdown files to see the advanced OCR results!")

# 🚀 Nanonets OCR Markdown Generator

This notebook processes cropped images from layout detection results and performs OCR using **Nanonets OCR** to generate structured markdown files.

**Features:**
- 🔥 **Nanonets OCR** with multi-GPU acceleration support
- 🎯 **Advanced text recognition** with HTML tables and LaTeX equations
- 📖 **Reading order detection** for logical text flow
- 📝 **Rich markdown generation** with comprehensive formatting
- 📁 **Organized output** in respective directories
- 🎨 **Advanced element handling** (headers, text, tables, equations, images)
- ⚡ **GPU-optimized processing** for faster performance
- 🧠 **Memory management** with automatic optimization

**Input:** Layout detection results from `layout_results/` directory
**Output:** Markdown files saved in `nanonets_results/` directory

**Processing Pipeline:**
1. Load cropped images and layout analysis
2. Initialize Nanonets OCR with optimal settings
3. Perform OCR with advanced prompting for rich content extraction
4. Sort text elements by reading order
5. Generate structured markdown content with HTML tables and LaTeX
6. Save markdown files with comprehensive formatting

**Advantages over Traditional OCR:**
- Superior accuracy on complex documents
- Native support for tables, equations, and images
- Better handling of document structure
- Advanced formatting capabilities
- Multi-GPU acceleration support

## Complete Cropped Files OCR Processing with Clean Output

This section provides a comprehensive solution to process all cropped images using Nanonets OCR with optimized prompts and clean markdown output containing only extracted data.

In [None]:
# Quick OCR Engine Initialization (if not already loaded)
# Loads the model only when no usable `ocr_engine` global exists, so re-running
# this cell does not reload the model.
if 'ocr_engine' not in globals() or ocr_engine is None:
    print("üîÑ Initializing Nanonets OCR Engine...")
    
    import torch
    from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
    
    # Configuration
    NANONETS_MODEL = "nanonets/Nanonets-OCR-s"
    
    try:
        # Load tokenizer and processor
        tokenizer = AutoTokenizer.from_pretrained(NANONETS_MODEL, trust_remote_code=True)
        processor = AutoProcessor.from_pretrained(NANONETS_MODEL, trust_remote_code=True)
        
        # Load model
        # bfloat16 with automatic device placement when CUDA is available,
        # otherwise float32 with no device map
        model = AutoModelForImageTextToText.from_pretrained(
            NANONETS_MODEL,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None
        )
        
        # Create engine class
        class SimpleOCREngine:
            """Plain container exposing the model, processor and tokenizer as attributes."""

            def __init__(self, model, processor, tokenizer):
                self.model = model
                self.processor = processor
                self.tokenizer = tokenizer
        
        ocr_engine = SimpleOCREngine(model, processor, tokenizer)
        
        print("‚úÖ Nanonets OCR Engine initialized successfully!")
        print(f"ü§ñ Model: {NANONETS_MODEL}")
        print(f"üñ•Ô∏è Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")
        
    except Exception as e:
        # Any loading failure leaves ocr_engine as None so later cells can detect it
        print(f"‚ùå Failed to initialize OCR engine: {e}")
        ocr_engine = None
else:
    print("‚úÖ OCR Engine already loaded and ready")

In [None]:
import os
import json
import time
from pathlib import Path
from PIL import Image
import torch
from typing import Dict, List, Tuple, Optional, Any
from collections import defaultdict
import re

class OptimizedNanonetsProcessor:
    """
    Run Nanonets OCR over individual cropped layout elements.

    The processor wraps an engine object exposing ``model``, ``processor`` and
    ``tokenizer`` attributes, builds an element-type specific prompt, normalises
    the image size, generates text and scores the result with a heuristic
    confidence. Running counters (``processed_count``, ``success_count``,
    ``error_count``, ``total_confidence``) accumulate over every call to
    ``process_single_image`` for the lifetime of the instance.
    """

    # Pixel bounds enforced by preprocess_image on both width and height
    MIN_DIMENSION = 32
    MAX_DIMENSION = 2048

    def __init__(self, ocr_engine):
        self.ocr_engine = ocr_engine
        self.processed_count = 0
        self.success_count = 0
        self.error_count = 0
        self.total_confidence = 0.0

    def create_clean_prompt(self, element_type: str) -> str:
        """Return the extraction prompt for ``element_type``.

        Every prompt starts with the same "text only" instruction; known element
        types append a type-specific instruction, unknown types get a generic one.
        """
        # Base prompt for clean extraction
        base_prompt = (
            "Extract ONLY the text content from this image. "
            "Do not add any explanations, metadata, or extra information. "
            "Return only the actual text that appears in the image. "
        )

        # Element-specific optimization
        element_prompts = {
            "table": (
                "Extract the table data and format it as a clean HTML table. "
                "Include only the actual cell content without any styling or extra markup. "
                "Use simple <table>, <tr>, <td>, <th> tags only."
            ),
            "title": (
                "Extract only the title text. Return just the heading text without any formatting markers."
            ),
            "section_header": (
                "Extract only the header text. Return just the heading content without any formatting."
            ),
            "text": (
                "Extract only the text content. Preserve line breaks where they naturally occur. "
                "Do not add any formatting or markup."
            ),
            "paragraph": (
                "Extract only the paragraph text. Maintain natural paragraph structure."
            ),
            "key_value_region": (
                "Extract the key-value pairs as simple text. Format as 'Key: Value' on separate lines."
            ),
            "list": (
                "Extract the list items. Use simple bullet points (-) or numbers (1., 2., etc.) as they appear."
            ),
            "page_header": (
                "Extract only the header text that appears at the top of the page."
            ),
            "page_footer": (
                "Extract only the footer text that appears at the bottom of the page."
            ),
            "picture": (
                "If there is any text visible in this image, extract it. "
                "If no text is visible, return: [Image: brief description]"
            ),
        }

        return base_prompt + element_prompts.get(
            element_type, "Return only the visible text content."
        )

    def preprocess_image(self, image_path: Path) -> "Optional[Image.Image]":
        """Load ``image_path`` as RGB and bring both sides into the allowed range.

        Images whose longest side exceeds ``MAX_DIMENSION`` are scaled down;
        otherwise images whose shortest side is below ``MIN_DIMENSION`` are
        scaled up. The target size is computed once from the real dimensions
        and then clamped, so the result always satisfies both bounds (extreme
        aspect ratios are clamped rather than preserved).

        Returns:
            The prepared image, or ``None`` if loading or resizing failed.
        """
        try:
            # The context manager releases the file handle; convert() returns
            # an independent in-memory image.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")
            width, height = image.size

            if max(width, height) > self.MAX_DIMENSION:
                scale_factor = self.MAX_DIMENSION / max(width, height)
                action = "üìê Downscaling"
            elif min(width, height) < self.MIN_DIMENSION:
                scale_factor = self.MIN_DIMENSION / min(width, height)
                action = "üìè Resizing"
            else:
                return image

            # Clamp so a very elongated crop cannot leave one side out of range
            new_width = min(self.MAX_DIMENSION, max(self.MIN_DIMENSION, int(width * scale_factor)))
            new_height = min(self.MAX_DIMENSION, max(self.MIN_DIMENSION, int(height * scale_factor)))

            print(f"    {action} from {width}x{height} to {new_width}x{new_height}")
            return image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        except Exception as e:
            print(f"    ‚ùå Image preprocessing failed: {str(e)[:50]}...")
            return None

    def extract_clean_text(self, image_path: Path, element_type: str) -> Tuple[str, float]:
        """OCR one image and return ``(text, confidence)``.

        Any failure (unreadable image, model error) is reported on stdout and
        yields ``("", 0.0)`` instead of raising.
        """
        try:
            # Preprocess image
            image = self.preprocess_image(image_path)
            if image is None:
                return "", 0.0

            # Create optimized prompt
            prompt = self.create_clean_prompt(element_type)

            # Prepare messages
            messages = [
                {
                    "role": "system",
                    "content": "You are a precise text extraction assistant. Extract only the visible text content without any additional commentary or formatting."
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": f"file://{image_path}"},
                        {"type": "text", "text": prompt},
                    ]
                },
            ]

            # Apply chat template (fall back to the tokenizer when the
            # processor does not provide one)
            try:
                text = self.ocr_engine.processor.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
            except AttributeError:
                text = self.ocr_engine.tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )

            # Process inputs
            inputs = self.ocr_engine.processor(
                text=[text], images=[image], padding=True, return_tensors="pt"
            )

            # Move to GPU if available
            if torch.cuda.is_available():
                inputs = {k: v.cuda(0) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

            # Greedy decoding. Sampling-only (temperature) and beam-search-only
            # (length_penalty, early_stopping) arguments are not passed because
            # they have no effect with do_sample=False and num_beams=1.
            with torch.inference_mode():
                output = self.ocr_engine.model.generate(
                    **inputs,
                    max_new_tokens=1024,  # Reasonable limit for clean extraction
                    do_sample=False,      # Deterministic output
                    num_beams=1,          # Fast greedy decoding
                    repetition_penalty=1.05,
                    pad_token_id=self.ocr_engine.model.generation_config.pad_token_id,
                )

            # Keep only the newly generated tokens (drop the echoed prompt)
            generated_ids = [o[i.shape[-1]:] for i, o in zip(inputs["input_ids"], output)]
            result = self.ocr_engine.processor.batch_decode(
                generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )[0]

            # Clean up result
            result = result.strip()

            # Calculate confidence based on result quality
            confidence = self.calculate_confidence(result, image.size)

            # Drop references before asking CUDA to release cached blocks
            del image, inputs, output
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            return result, confidence

        except Exception as e:
            print(f"    ‚ùå OCR failed: {str(e)[:50]}...")
            return "", 0.0

    def calculate_confidence(self, text: str, image_size: Tuple[int, int]) -> float:
        """Heuristic quality score in ``[0.0, 1.0]`` for an OCR result.

        Starts at 1.0 and applies multiplicative penalties for very short text,
        error-like wording, a high share of unusual characters and very small
        source images, plus a small bonus for HTML table markup. Text shorter
        than two non-blank characters scores 0.0.
        """
        if not text or len(text.strip()) < 2:
            return 0.0

        confidence = 1.0

        # Reduce confidence for very short text
        if len(text) < 5:
            confidence *= 0.6

        # Reduce confidence for special error indicators
        lowered = text.lower()
        if any(indicator in lowered for indicator in ('failed', 'error', 'unable', 'cannot')):
            confidence *= 0.2

        # Reduce confidence for garbled text (too many special characters)
        expected_punctuation = ' .,!?-:;()[]{}'
        unusual_chars = sum(
            1 for c in text if not c.isalnum() and c not in expected_punctuation
        )
        if unusual_chars / len(text) > 0.3:
            confidence *= 0.5

        # Boost confidence for structured content
        if any(tag in text for tag in ('<table>', '<tr>', '<td>')):
            confidence *= 1.1

        # Consider image size (very small images are less reliable)
        width, height = image_size
        if width < 50 or height < 20:
            confidence *= 0.7

        return min(1.0, max(0.0, confidence))

    def process_single_image(self, image_path: Path, element_info: Dict) -> Dict:
        """OCR one cropped element and return its result record.

        Args:
            image_path: Location of the cropped image.
            element_info: Layout metadata for the element; ``type`` and ``id``
                are read from it (defaulting to ``'text'`` / ``'unknown'``).

        Returns:
            A dict holding every key of ``element_info`` plus the OCR fields
            ``id``, ``type``, ``image_path``, ``extracted_text``,
            ``confidence``, ``processing_time`` and ``success``. On a key
            collision the OCR value wins, so ``confidence`` and ``success``
            always describe this extraction.
        """
        element_type = element_info.get('type', 'text')
        element_id = element_info.get('id', 'unknown')

        print(f"    üîç Processing {element_type} (ID: {element_id})")

        start_time = time.time()
        extracted_text, confidence = self.extract_clean_text(image_path, element_type)
        processing_time = time.time() - start_time

        # An extraction counts as successful when its confidence exceeds 0.1
        success = confidence > 0.1

        # Update statistics
        self.processed_count += 1
        if success:
            self.success_count += 1
            self.total_confidence += confidence
        else:
            self.error_count += 1

        # Start from the layout metadata, then overlay the OCR fields so that
        # metadata keys with the same name cannot overwrite the OCR outcome.
        result = dict(element_info)
        result.update({
            'id': element_id,
            'type': element_type,
            'image_path': str(image_path),
            'extracted_text': extracted_text,
            'confidence': confidence,
            'processing_time': processing_time,
            'success': success,
        })

        if extracted_text and success:
            print(f"    ‚úÖ Success: {len(extracted_text)} chars, confidence: {confidence:.2f}")
        else:
            print(f"    ‚ùå Failed: No text extracted")

        return result

# Initialize the optimized processor
# The processor is only created when an `ocr_engine` global exists and is truthy;
# otherwise `optimized_processor` is left undefined and a hint is printed.
if 'ocr_engine' in globals() and ocr_engine:
    optimized_processor = OptimizedNanonetsProcessor(ocr_engine)
    print("‚úÖ Optimized Nanonets processor initialized")
else:
    print("‚ùå OCR engine not available. Please initialize the OCR engine first.")

In [None]:
def iterate_all_cropped_files():
    """
    Run Nanonets OCR over every cropped image under ``layout_results``.

    Each sub-directory of ``layout_results`` is treated as one document. Its
    ``cropped_images`` folder supplies the images and, when present and
    readable, ``<doc>_layout_results.json`` supplies per-image element
    metadata (matched by image file name). Images without metadata fall back
    to ``extract_info_from_filename``. OCR is delegated to the module-level
    ``optimized_processor``.

    Returns:
        ``(all_results, overall_stats)`` where ``all_results`` maps document
        name to its per-image results and summary, and ``overall_stats``
        aggregates this run. Returns ``None`` when ``layout_results`` is
        missing or contains no document directories.
    """
    layout_results_path = Path("layout_results")

    if not layout_results_path.exists():
        print("‚ùå layout_results directory not found!")
        return None

    # Sorted so documents are processed in a reproducible order
    doc_dirs = sorted(d for d in layout_results_path.iterdir() if d.is_dir())

    if not doc_dirs:
        print("‚ùå No document directories found in layout_results")
        return None

    print(f"üöÄ Found {len(doc_dirs)} document directories")
    print(f"ü§ñ Using Nanonets OCR for clean text extraction")
    print("=" * 80)

    all_results = {}
    overall_stats = {
        'total_documents': len(doc_dirs),
        'processed_documents': 0,
        'total_images': 0,
        'successful_extractions': 0,
        'failed_extractions': 0,
        'average_confidence': 0.0,
        'processing_time': 0.0
    }
    # Confidence summed over this run only, so repeated calls in the same
    # session do not mix in results of earlier runs.
    run_total_confidence = 0.0

    start_time = time.time()

    for doc_dir in doc_dirs:
        doc_name = doc_dir.name
        print(f"\nüìÑ Processing document: {doc_name}")

        # Look for cropped_images directory
        cropped_dir = doc_dir / "cropped_images"
        if not cropped_dir.exists():
            print(f"  ‚ö†Ô∏è No cropped_images directory found in {doc_name}")
            continue

        # Map image file name -> layout element (best effort: a broken JSON
        # file only disables the metadata lookup for this document)
        layout_json_path = doc_dir / f"{doc_name}_layout_results.json"
        element_info_map = {}

        if layout_json_path.exists():
            try:
                with open(layout_json_path, 'r', encoding='utf-8') as f:
                    layout_data = json.load(f)

                for page_data in layout_data.get('pages', []):
                    for element in page_data.get('elements', []):
                        if 'image_path' in element:
                            image_name = Path(element['image_path']).name
                            element_info_map[image_name] = element

            except Exception as e:
                print(f"  ‚ö†Ô∏è Could not read layout JSON: {e}")

        # Get all image files in cropped_images
        image_files = []
        for ext in ['*.png', '*.jpg', '*.jpeg']:
            image_files.extend(cropped_dir.glob(ext))

        if not image_files:
            print(f"  ‚ö†Ô∏è No image files found in {cropped_dir}")
            continue

        print(f"  üñºÔ∏è Found {len(image_files)} cropped images")

        # Process each image
        doc_results = []
        doc_success_count = 0
        doc_total_confidence = 0.0

        for image_path in sorted(image_files):
            image_name = image_path.name

            # Prefer layout metadata; otherwise derive it from the file name
            element_info = element_info_map.get(image_name) or extract_info_from_filename(image_name)

            result = optimized_processor.process_single_image(image_path, element_info)
            doc_results.append(result)

            overall_stats['total_images'] += 1

            if result['success']:
                doc_success_count += 1
                doc_total_confidence += result['confidence']
                overall_stats['successful_extractions'] += 1
            else:
                overall_stats['failed_extractions'] += 1

        run_total_confidence += doc_total_confidence
        doc_average_confidence = (
            doc_total_confidence / doc_success_count if doc_success_count > 0 else 0.0
        )

        # Store document results
        all_results[doc_name] = {
            'images_processed': len(image_files),
            'successful_extractions': doc_success_count,
            'average_confidence': doc_average_confidence,
            'results': doc_results
        }

        overall_stats['processed_documents'] += 1

        print(f"  üìä Document summary:")
        print(f"    üñºÔ∏è Images processed: {len(image_files)}")
        print(f"    ‚úÖ Successful: {doc_success_count}")
        print(f"    ‚ùå Failed: {len(image_files) - doc_success_count}")
        print(f"    üéØ Avg confidence: {doc_average_confidence:.3f}")

    # Calculate overall statistics
    total_time = time.time() - start_time
    total_images = overall_stats['total_images']
    successful = overall_stats['successful_extractions']

    overall_stats['processing_time'] = total_time
    overall_stats['average_confidence'] = (
        run_total_confidence / successful if successful > 0 else 0.0
    )

    # Guard the ratios: a run can legitimately find zero images
    success_rate = successful / total_images * 100 if total_images > 0 else 0.0
    images_per_second = total_images / total_time if total_time > 0 else 0.0

    print("\n" + "=" * 80)
    print("üéâ PROCESSING COMPLETE!")
    print("=" * 80)
    print(f"üìä Overall Statistics:")
    print(f"  üìÅ Documents processed: {overall_stats['processed_documents']}/{overall_stats['total_documents']}")
    print(f"  üñºÔ∏è Total images: {total_images}")
    print(f"  ‚úÖ Successful extractions: {successful}")
    print(f"  ‚ùå Failed extractions: {overall_stats['failed_extractions']}")
    print(f"  üéØ Overall success rate: {success_rate:.1f}%")
    print(f"  üéØ Average confidence: {overall_stats['average_confidence']:.3f}")
    print(f"  ‚è±Ô∏è Total processing time: {total_time:.2f}s ({total_time/60:.1f} minutes)")
    print(f"  üöÄ Processing speed: {images_per_second:.2f} images/second")

    return all_results, overall_stats

def extract_info_from_filename(filename: str) -> Dict:
    """Derive element metadata from a cropped-image file name.

    Expected pattern: ``p<page>_elem<index>_<type>[_<id>].<ext>``, e.g.
    ``p001_elem000_section_header_3.png``. The type may itself contain
    underscores; when the name has four or more ``_``-separated parts the
    last part is treated as the trailing id and everything between the
    element token and that id is the type.

    Args:
        filename: Bare file name (no directory part).

    Returns:
        Dict with ``id`` (default ``'unknown'``), ``type`` (default
        ``'text'``) and ``page`` (default ``1``). Names with fewer than three
        parts yield the defaults; each field is parsed independently, so one
        malformed token does not discard the others.
    """
    info = {
        'id': 'unknown',
        'type': 'text',
        'page': 1
    }

    # Strip only a trailing, known image extension (case-insensitive)
    root, ext = os.path.splitext(filename)
    stem = root if ext.lower() in ('.png', '.jpg', '.jpeg') else filename

    parts = stem.split('_')
    if len(parts) < 3:
        return info

    # Extract page number
    if parts[0].startswith('p'):
        try:
            info['page'] = int(parts[0][1:])
        except ValueError:
            pass  # Not numeric: keep the default page

    # Extract element ID
    if parts[1].startswith('elem'):
        info['id'] = parts[1]

    # Extract element type, keeping multi-word types such as "section_header"
    type_tokens = parts[2:-1] if len(parts) >= 4 else parts[2:]
    info['type'] = '_'.join(type_tokens)

    return info

def generate_clean_markdown_from_results(results: Dict, doc_name: str) -> str:
    """Generate clean markdown containing only extracted text data.

    Args:
        results: Per-document record whose ``'results'`` list holds one dict
            per image with at least ``success``, ``extracted_text`` and
            ``type`` (``page`` and ``id`` are optional).
        doc_name: Used as the top-level heading.

    Returns:
        Markdown text with one block per successful, non-empty extraction,
        blocks separated by blank lines. Elements are grouped by page (page
        headings appear only when more than one page has content) and, within
        a page, ordered by element-type priority and then by ``id``.
    """
    # Lower number = earlier in the page; unknown types sort with tables
    type_priority = {
        'title': 1,
        'section_header': 2,
        'paragraph': 3,
        'text': 4,
        'table': 5,
        'list': 6,
        'key_value_region': 7,
        'page_header': 8,
        'page_footer': 9,
        'picture': 10
    }

    # Group usable results by page
    pages = defaultdict(list)
    for result in results['results']:
        if result['success'] and result['extracted_text'].strip():
            pages[result.get('page', 1)].append(result)

    # Each entry ends with a real newline and entries are joined with another
    # one, which yields the blank line markdown needs between blocks.
    lines = [f"# {doc_name}\n"]
    multiple_pages = len(pages) > 1

    for page_num in sorted(pages):
        if multiple_pages:  # Only add page headers if multiple pages
            lines.append(f"## Page {page_num}\n")

        page_elements = sorted(
            pages[page_num],
            key=lambda x: (type_priority.get(x['type'], 5), x.get('id', '')),
        )

        for result in page_elements:
            text = result['extracted_text'].strip()
            element_type = result['type']

            # Format based on element type
            if element_type == 'title':
                lines.append(f"### {text}\n")
            elif element_type == 'section_header':
                lines.append(f"#### {text}\n")
            elif element_type == 'table':
                if '<table>' in text:
                    # Already HTML: emit unchanged
                    lines.append(f"{text}\n")
                else:
                    lines.append(f"**Table:**\n{text}\n")
            elif element_type == 'key_value_region':
                lines.append(f"**{text}**\n")
            elif element_type in ('page_header', 'page_footer'):
                lines.append(f"*{text}*\n")
            elif element_type == 'picture':
                if text.startswith('[Image:'):
                    lines.append(f"{text}\n")
                else:
                    lines.append(f"[Image: {text}]\n")
            else:
                # Lists, regular text and paragraphs are emitted unchanged
                lines.append(f"{text}\n")

    return "\n".join(lines)

def save_all_results(all_results: Dict, overall_stats: Dict,
                     output_dir: Optional[Path] = None) -> None:
    """Write per-document markdown/JSON files and the overall statistics.

    For every document with at least one successful extraction, two files are
    written into the output directory: ``<doc_name>_clean.md`` (markdown built
    by ``generate_clean_markdown_from_results``) and
    ``<doc_name>_results.json`` (a summary plus the detailed results).
    Afterwards ``processing_statistics.json`` is written from
    ``overall_stats``. Every write is best-effort: a failure is reported on
    stdout and the remaining files are still attempted.

    Args:
        all_results: Maps a document name to its result dict. Each result
            dict is read for the keys ``successful_extractions``,
            ``images_processed``, ``average_confidence`` and ``results``.
        overall_stats: Statistics for the whole run; must be JSON-serializable
            to be saved.
        output_dir: Directory to write into. Defaults to
            ``nanonets_clean_results`` relative to the current working
            directory. Missing parent directories are created.
    """
    if output_dir is None:
        output_dir = Path("nanonets_clean_results")
    else:
        output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    saved_count = 0

    for doc_name, doc_results in all_results.items():
        # Documents without any successful extraction produce no files.
        if doc_results['successful_extractions'] <= 0:
            continue

        # Generate clean markdown
        markdown_content = generate_clean_markdown_from_results(doc_results, doc_name)

        # Save markdown file
        markdown_path = output_dir / f"{doc_name}_clean.md"
        try:
            with open(markdown_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)
            print(f"  ‚úÖ Saved: {markdown_path}")
            saved_count += 1
        except Exception as e:
            # Deliberate best-effort: report and continue with the next file.
            print(f"  ‚ùå Failed to save {markdown_path}: {e}")

        # Save detailed JSON
        json_path = output_dir / f"{doc_name}_results.json"
        try:
            # Serialize BEFORE touching the file so that a missing key or a
            # non-serializable value cannot leave a truncated JSON file behind.
            json_text = json.dumps({
                'document_name': doc_name,
                'processing_timestamp': time.time(),
                'summary': {
                    'images_processed': doc_results['images_processed'],
                    'successful_extractions': doc_results['successful_extractions'],
                    'average_confidence': doc_results['average_confidence']
                },
                'detailed_results': doc_results['results']
            }, indent=2, ensure_ascii=False)
            json_path.write_text(json_text, encoding='utf-8')
        except Exception as e:
            print(f"  ‚ö†Ô∏è Could not save JSON for {doc_name}: {e}")

    # Save overall statistics
    stats_path = output_dir / "processing_statistics.json"
    try:
        # Same serialize-then-write order as above to avoid partial files.
        stats_text = json.dumps(overall_stats, indent=2)
        stats_path.write_text(stats_text, encoding='utf-8')
        print(f"  üìä Saved overall statistics: {stats_path}")
    except Exception as e:
        print(f"  ‚ö†Ô∏è Could not save statistics: {e}")

    # "\n" (not "\\n") so a blank line is printed instead of a literal "\n".
    print(f"\nüíæ Total files saved: {saved_count} markdown files")
    print(f"üìÅ Output directory: {output_dir}")

# Announce that this cell's function definitions have been executed.
print(
    "‚úÖ Complete cropped files processing functions defined",
    "üéØ Features: Clean text extraction, optimized prompts, structured output",
    sep="\n",
)

### Execute Complete Processing

Run the complete processing pipeline for all cropped images.

In [None]:
# Execute the complete processing pipeline
print("üöÄ Starting Complete Cropped Files OCR Processing")
print("=" * 80)
print("üéØ Features:")
print("  ‚Ä¢ Clean text extraction only")
print("  ‚Ä¢ Optimized prompts for each element type")
print("  ‚Ä¢ Automatic image resizing for compatibility")
print("  ‚Ä¢ Structured markdown output")
print("  ‚Ä¢ No extra commentary or metadata in output")
print("=" * 80)

# Check if OCR engine is available: the processor is created by an earlier
# cell, so its absence from globals() means those cells were not run.
if 'optimized_processor' not in globals():
    print("‚ùå Optimized processor not initialized!")
    print("Please run the previous cells first to set up the processor.")
else:
    # Run the complete processing
    try:
        results, stats = iterate_all_cropped_files()

        if results:
            # "\n" (not "\\n") so a blank line precedes the message instead
            # of a literal backslash-n being printed.
            print("\nüíæ Saving results...")
            save_all_results(results, stats)

            print("\nüéâ PROCESSING COMPLETE!")
            print("‚úÖ All cropped images have been processed with Nanonets OCR")
            print("‚úÖ Clean markdown files generated with extracted text only")
            print("‚úÖ Results saved to 'nanonets_clean_results' directory")

        else:
            print("‚ùå No results generated")

    except Exception as e:
        # Top-level notebook boundary: report the failure with a full
        # traceback rather than letting the cell die silently.
        print(f"‚ùå Error during processing: {e}")
        import traceback
        traceback.print_exc()