In [31]:
import fitz  # PyMuPDF
import json
import logging
import time
from pathlib import Path
import pandas as pd
import io
from PIL import Image
import cv2
import numpy as np

logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)

INPUT_PDF_PATH = Path(
    "D:\\WorkSpace\\LOL-PaperReader\\backend\\src\\paperreader\\services\\parser\\1810.04805v2.pdf"
)
OUTPUT_DIR = Path("output_parser_elements_analysis")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


def extract_text_elements(page, page_num):
    """Collect one record per non-empty text block of a page.

    The page is read through ``page.get_text("dict", ...)``. For every block
    that carries a ``"lines"`` entry, the span texts of each line are
    concatenated, blank lines are discarded, and the remaining lines are
    joined with single spaces.

    Args:
        page: Page object exposing ``get_text``.
        page_num: Zero-based page index; records store ``page_num + 1``.

    Returns:
        A list of dicts with the keys ``page``, ``block_id``, ``type``,
        ``text_type`` (``"heading"``, ``"short_text"`` or ``"paragraph"``),
        ``content``, ``bbox``, ``position``, ``font_info`` (span styles of the
        first kept line), ``line_count``, ``char_count`` and ``is_bold``.
        Any exception is printed and whatever was gathered so far is returned.
    """
    bold_bit = 2**4  # span flag bit the classification treats as "bold"
    records = []

    try:
        page_dict = page.get_text("dict", flags=fitz.TEXTFLAGS_DICT)

        for block_id, blk in enumerate(page_dict["blocks"]):
            # Blocks without "lines" are not text blocks.
            if "lines" not in blk:
                continue

            left, top, right, bottom = blk["bbox"]

            kept_lines = []
            kept_styles = []
            for ln in blk["lines"]:
                pieces = []
                styles = []
                for sp in ln["spans"]:
                    pieces.append(sp["text"])
                    styles.append(
                        {
                            "font": sp.get("font", ""),
                            "size": sp.get("size", 0),
                            "flags": sp.get("flags", 0),
                            "color": sp.get("color", 0),
                        }
                    )

                stripped = "".join(pieces).strip()
                if stripped:
                    kept_lines.append(stripped)
                    kept_styles.append(styles)

            if not kept_lines:
                continue

            joined = " ".join(kept_lines)

            # A block counts as bold as soon as any of its spans has the bit set.
            bold = any(
                sp.get("flags", 0) & bold_bit
                for ln in blk["lines"]
                for sp in ln["spans"]
            )

            # Short bold blocks are headings; other very short blocks are
            # "short_text"; everything else is a paragraph.
            if bold and len(joined) < 100:
                kind = "heading"
            elif len(joined) < 50:
                kind = "short_text"
            else:
                kind = "paragraph"

            records.append(
                {
                    "page": page_num + 1,
                    "block_id": block_id,
                    "type": "text",
                    "text_type": kind,
                    "content": joined,
                    "bbox": [left, top, right, bottom],
                    "position": {
                        "x": left,
                        "y": top,
                        "width": right - left,
                        "height": bottom - top,
                    },
                    # Only the first kept line's span styles are recorded.
                    "font_info": kept_styles[0] if kept_styles else [],
                    "line_count": len(kept_lines),
                    "char_count": len(joined),
                    "is_bold": bold,
                }
            )

    except Exception as e:
        print(f"[!] Error extracting text from page {page_num + 1}: {e}")

    return records


# Zoom factor used to rasterise the page for contour-based figure detection.
# Pixel coordinates are divided by the same value to get back to page units.
_VISUAL_RENDER_SCALE = 1.5


def _collect_embedded_images(page, page_num, output_dir, pdf_stem, image_elements):
    """Save every raster image embedded in the page and append its record."""
    try:
        # The owning document is reached through the page itself, so this
        # function does not depend on a module-level ``doc`` variable.
        owner = page.parent
        image_list = page.get_images()
        for img_index, img in enumerate(image_list):
            try:
                xref = img[0]
                base_image = owner.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                image_no = len(image_elements) + 1
                image_filename = f"{pdf_stem}-p{page_num + 1}-img{image_no:03d}-embedded.{image_ext}"
                with open(output_dir / image_filename, "wb") as img_file:
                    img_file.write(image_bytes)

                # Items of get_images() hold (xref, smask, width, height, bpc,
                # ...) and carry no placement, so the on-page rectangle has to
                # be looked up separately. The first placement is recorded; an
                # image that is not displayed keeps the all-zero bbox.
                placements = page.get_image_rects(xref)
                if placements:
                    placed = placements[0]
                    img_bbox = [placed.x0, placed.y0, placed.x1, placed.y1]
                else:
                    img_bbox = [0, 0, 0, 0]
                x0, y0, x1, y1 = img_bbox

                image_elements.append(
                    {
                        "page": page_num + 1,
                        "global_id": f"p{page_num + 1}_img{image_no:03d}",
                        "image_id": f"embedded_{img_index + 1}",
                        "type": "embedded_image",
                        "filename": image_filename,
                        "bbox": img_bbox,
                        "position": {
                            "x": x0,
                            "y": y0,
                            "width": x1 - x0,
                            "height": y1 - y0,
                        },
                        "xref": xref,
                        "format": image_ext,
                        "size_bytes": len(image_bytes),
                        "order": image_no,
                    }
                )

            except Exception as e:
                print(f"[!] Error extracting embedded image {img_index}: {e}")

    except Exception as e:
        print(f"[!] Error in embedded image extraction for page {page_num + 1}: {e}")


def _collect_vector_figures(page, page_num, output_dir, pdf_stem, image_elements):
    """Group nearby vector drawings, render each large group to PNG and record it."""
    try:
        paths = page.get_drawings()
        if not paths:
            return

        # Greedy grouping: a path joins the first group whose rectangle, grown
        # by 30 units on every side, it intersects; otherwise it starts a group.
        path_groups = []
        for path in paths:
            path_rect = path.get("rect", fitz.Rect(0, 0, 0, 0))
            if not (path_rect.width > 0 and path_rect.height > 0):
                continue
            for group in path_groups:
                if path_rect.intersects(group["rect"] + (-30, -30, 30, 30)):
                    group["rect"] = group["rect"] | path_rect
                    group["paths"].append(path)
                    break
            else:
                path_groups.append({"rect": path_rect, "paths": [path]})

        for group_index, group in enumerate(path_groups):
            rect = group["rect"]
            # Keep groups larger than 50x50, smaller than 95% of the page in
            # each dimension, and made of more than two paths.
            is_figure = (
                rect.width > 50
                and rect.height > 50
                and rect.width < page.rect.width * 0.95
                and rect.height < page.rect.height * 0.95
                and len(group["paths"]) > 2
            )
            if not is_figure:
                continue

            # Render with a 10-unit margin, clipped to the page.
            clip_rect = (rect + (-10, -10, 10, 10)) & page.rect
            if not (clip_rect.width > 30 and clip_rect.height > 30):
                continue

            image_no = len(image_elements) + 1
            figure_filename = f"{pdf_stem}-p{page_num + 1}-img{image_no:03d}-vector.png"
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=clip_rect)
            pix.save(output_dir / figure_filename)

            image_elements.append(
                {
                    "page": page_num + 1,
                    "global_id": f"p{page_num + 1}_img{image_no:03d}",
                    "image_id": f"vector_{group_index + 1}",
                    "type": "vector_figure",
                    "filename": figure_filename,
                    "bbox": [rect.x0, rect.y0, rect.x1, rect.y1],
                    "position": {
                        "x": rect.x0,
                        "y": rect.y0,
                        "width": rect.width,
                        "height": rect.height,
                    },
                    "path_count": len(group["paths"]),
                    "complexity": len(group["paths"]),
                    "order": image_no,
                }
            )

    except Exception as e:
        print(f"[!] Error in vector figure extraction for page {page_num + 1}: {e}")


def _collect_visual_figures(page, page_num, output_dir, pdf_stem, image_elements):
    """Find figure-like regions via edge contours on a page render and record them."""
    try:
        scale = _VISUAL_RENDER_SCALE
        pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
        page_image = Image.open(io.BytesIO(pix.tobytes("png")))

        gray = cv2.cvtColor(np.array(page_image), cv2.COLOR_RGB2GRAY)
        edges = cv2.Canny(gray, 30, 100)
        contours, _ = cv2.findContours(
            edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )

        for contour_index, contour in enumerate(contours):
            x, y, w, h = cv2.boundingRect(contour)

            # Size and aspect-ratio filter, in rendered-pixel units.
            plausible = (
                w > 80
                and h > 80
                and w < page_image.width * 0.8
                and h < page_image.height * 0.8
                and 0.2 < w / h < 5.0
            )
            if not plausible:
                continue

            # Fraction of edge pixels inside the box; low values are skipped.
            edge_density = np.sum(edges[y : y + h, x : x + w] > 0) / (w * h)
            if not edge_density > 0.015:
                continue

            # Convert the pixel box back to page coordinates.
            new_rect = fitz.Rect(
                x / scale, y / scale, (x + w) / scale, (y + h) / scale
            )

            # Skip regions touching anything already recorded for this page.
            if any(
                new_rect.intersects(fitz.Rect(existing["bbox"]))
                for existing in image_elements
            ):
                continue

            # Render with a 5-unit margin, clipped to the page.
            clip_rect = (new_rect + (-5, -5, 5, 5)) & page.rect
            if not (clip_rect.width > 30 and clip_rect.height > 30):
                continue

            image_no = len(image_elements) + 1
            region_pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=clip_rect)
            visual_filename = f"{pdf_stem}-p{page_num + 1}-img{image_no:03d}-visual.png"
            region_pix.save(output_dir / visual_filename)

            image_elements.append(
                {
                    "page": page_num + 1,
                    "global_id": f"p{page_num + 1}_img{image_no:03d}",
                    "image_id": f"visual_{contour_index + 1}",
                    "type": "visual_figure",
                    "filename": visual_filename,
                    "bbox": [new_rect.x0, new_rect.y0, new_rect.x1, new_rect.y1],
                    "position": {
                        "x": new_rect.x0,
                        "y": new_rect.y0,
                        "width": new_rect.width,
                        "height": new_rect.height,
                    },
                    "edge_density": edge_density,
                    "detection_method": "visual_analysis",
                    "order": image_no,
                }
            )

    except Exception as e:
        print(f"[!] Error in visual analysis for page {page_num + 1}: {e}")


def extract_image_elements(page, page_num, output_dir, pdf_stem):
    """Extract embedded images, vector figures and visually detected figures.

    Three passes run in order, each best-effort (errors are printed and the
    pass is abandoned, the others still run):

    1. embedded raster images, written in their native format;
    2. groups of vector drawings, rendered to PNG;
    3. contour-detected regions of a page render that do not intersect
       anything found by the first two passes, rendered to PNG.

    Args:
        page: Page object; its owning document is taken from ``page.parent``.
        page_num: Zero-based page index; records and filenames use
            ``page_num + 1``.
        output_dir: Directory (path-like supporting ``/``) receiving the files.
        pdf_stem: Filename prefix for every saved image.

    Returns:
        A list of image records sorted by ``position`` (y, then x). Each
        record has at least ``page``, ``global_id``, ``image_id``, ``type``,
        ``filename``, ``bbox``, ``position`` and ``order``; ``order`` is the
        per-page discovery number and is assigned before the final sort.
    """
    image_elements = []

    _collect_embedded_images(page, page_num, output_dir, pdf_stem, image_elements)
    _collect_vector_figures(page, page_num, output_dir, pdf_stem, image_elements)
    _collect_visual_figures(page, page_num, output_dir, pdf_stem, image_elements)

    # Sort images by position for consistent ordering.
    image_elements.sort(key=lambda img: (img["position"]["y"], img["position"]["x"]))

    # Every recorded element corresponds to exactly one saved file.
    images_saved = len(image_elements)
    print(
        f"[Page {page_num + 1}] Found {len(image_elements)} images, saved {images_saved} files"
    )
    return image_elements


def extract_table_elements(page, page_num, output_dir, pdf_stem):
    """Detect tables on a page, export each as CSV and HTML, and describe them.

    Tables come from ``page.find_tables()`` and are converted with
    ``to_pandas()``; tables whose DataFrame is empty are skipped. A failure on
    one table is printed and does not stop the remaining tables; a failure of
    detection itself is printed and yields whatever was collected so far.

    Args:
        page: Page object exposing ``find_tables``.
        page_num: Zero-based page index; records and filenames use
            ``page_num + 1``.
        output_dir: Directory (path-like supporting ``/``) receiving the files.
        pdf_stem: Filename prefix for the exported files.

    Returns:
        A list of dicts with the keys ``page``, ``table_id``, ``type``,
        ``bbox``, ``position``, ``dimensions``, ``files``, ``columns``,
        ``content_preview`` (at most 500 characters plus ``"..."``) and
        ``cell_count``.
    """
    page_label = page_num + 1
    records = []

    try:
        for idx, tbl in enumerate(page.find_tables()):
            try:
                bbox = tbl.bbox
                left, top, right, bottom = bbox

                frame = tbl.to_pandas()
                if frame.empty:
                    continue

                # "table_N" is numbered by detection order, counting skipped tables.
                table_id = f"table_{idx + 1}"
                csv_name = f"{pdf_stem}-p{page_label}-{table_id}.csv"
                html_name = f"{pdf_stem}-p{page_label}-{table_id}.html"

                frame.to_csv(output_dir / csv_name, index=False)
                with open(output_dir / html_name, "w", encoding="utf-8") as handle:
                    handle.write(frame.to_html(index=False))

                # Plain-text rendering, truncated for the preview field.
                rendered = frame.to_string(index=False)
                if len(rendered) > 500:
                    preview = rendered[:500] + "..."
                else:
                    preview = rendered

                n_rows = len(frame)
                n_cols = len(frame.columns)
                records.append(
                    {
                        "page": page_label,
                        "table_id": table_id,
                        "type": "table",
                        "bbox": list(bbox),
                        "position": {
                            "x": left,
                            "y": top,
                            "width": right - left,
                            "height": bottom - top,
                        },
                        "dimensions": {"rows": n_rows, "columns": n_cols},
                        "files": {"csv": csv_name, "html": html_name},
                        "columns": list(frame.columns),
                        "content_preview": preview,
                        "cell_count": n_rows * n_cols,
                    }
                )

            except Exception as e:
                print(
                    f"[!] Error extracting table {idx} from page {page_num + 1}: {e}"
                )

    except Exception as e:
        print(f"[!] Error in table detection for page {page_num + 1}: {e}")

    # One CSV/HTML pair is written for every recorded table.
    print(
        f"[Page {page_num + 1}] Found {len(records)} tables, saved {len(records)} files"
    )
    return records


def analyze_pdf_elements(pdf_path: Path, output_dir: Path) -> None:
    """Extract text, image and table elements from every page of a PDF.

    For each page the three ``extract_*_elements`` functions are called;
    images are written to ``output_dir / "images"`` and tables to
    ``output_dir / "tables"``. Four JSON files are then written to
    ``output_dir``: a combined analysis (document info, totals, per-page
    counts and all elements) plus one file per element type. A summary is
    printed at the end.

    The document is closed even when extraction or serialisation raises; the
    exception itself is propagated to the caller.

    Args:
        pdf_path: Path of the PDF to analyse.
        output_dir: Directory receiving the JSON files and sub-directories.
    """
    # The open document is published as a module-level name because helper
    # functions in this file may look it up as a global instead of receiving
    # it as an argument.
    global doc
    doc = fitz.open(pdf_path)

    try:
        pdf_stem = pdf_path.stem

        # Create subdirectories
        images_dir = output_dir / "images"
        tables_dir = output_dir / "tables"
        images_dir.mkdir(parents=True, exist_ok=True)
        tables_dir.mkdir(parents=True, exist_ok=True)

        start_time = time.time()

        # Storage for all elements
        all_text_elements = []
        all_image_elements = []
        all_table_elements = []
        page_analysis = []

        total_pages = len(doc)
        print(f"Analyzing PDF: {pdf_path.name}")
        print(f"Total pages: {total_pages}")

        for page_num in range(total_pages):
            page = doc[page_num]
            print(f"\n--- Processing Page {page_num + 1}/{total_pages} ---")

            page_text_elements = extract_text_elements(page, page_num)
            page_image_elements = extract_image_elements(
                page, page_num, images_dir, pdf_stem
            )
            page_table_elements = extract_table_elements(
                page, page_num, tables_dir, pdf_stem
            )

            all_text_elements.extend(page_text_elements)
            all_image_elements.extend(page_image_elements)
            all_table_elements.extend(page_table_elements)

            page_analysis.append(
                {
                    "page": page_num + 1,
                    "text_blocks": len(page_text_elements),
                    "images": len(page_image_elements),
                    "tables": len(page_table_elements),
                    "total_elements": len(page_text_elements)
                    + len(page_image_elements)
                    + len(page_table_elements),
                }
            )

            print(f"  Text blocks: {len(page_text_elements)}")
            print(f"  Images: {len(page_image_elements)}")
            print(f"  Tables: {len(page_table_elements)}")

        analysis_data = {
            "document_info": {
                "filename": pdf_path.name,
                "total_pages": total_pages,
                "processing_time": time.time() - start_time,
                "metadata": dict(doc.metadata),
            },
            "summary": {
                "total_text_blocks": len(all_text_elements),
                "total_images": len(all_image_elements),
                "total_tables": len(all_table_elements),
                "total_elements": len(all_text_elements)
                + len(all_image_elements)
                + len(all_table_elements),
            },
            "page_analysis": page_analysis,
            "elements": {
                "text_elements": all_text_elements,
                "image_elements": all_image_elements,
                "table_elements": all_table_elements,
            },
        }

        def write_json(suffix, payload):
            """Serialise ``payload`` to ``<pdf_stem>-<suffix>.json`` and return the path."""
            target = output_dir / f"{pdf_stem}-{suffix}.json"
            with open(target, "w", encoding="utf-8") as f:
                json.dump(payload, f, indent=2, ensure_ascii=False)
            return target

        # Combined analysis first, then one file per element type for easier
        # inspection.
        analysis_path = write_json("elements-analysis", analysis_data)
        text_path = write_json("text-elements", all_text_elements)
        images_path = write_json("image-elements", all_image_elements)
        tables_path = write_json("table-elements", all_table_elements)
    finally:
        # Release the file handle on both the success and the error path.
        doc.close()

    # Print summary
    print(f"\n{'='*50}")
    print(f"EXTRACTION COMPLETE")
    print(f"{'='*50}")
    print(f"Processing time: {time.time() - start_time:.2f}s")
    print(f"Total text blocks: {len(all_text_elements)}")
    print(f"Total images: {len(all_image_elements)}")
    print(f"Total tables: {len(all_table_elements)}")
    print(f"\nFiles saved:")
    print(f"  - Complete analysis: {analysis_path}")
    print(f"  - Text elements: {text_path}")
    print(f"  - Image elements: {images_path}")
    print(f"  - Table elements: {tables_path}")
    print(f"  - Images saved to: {images_dir}")
    print(f"  - Tables saved to: {tables_dir}")


# --- Run the analysis ---
# Runs as soon as this cell/module executes: processes INPUT_PDF_PATH and
# writes all results under OUTPUT_DIR.
analyze_pdf_elements(INPUT_PDF_PATH, OUTPUT_DIR)

Analyzing PDF: 1810.04805v2.pdf
Total pages: 16

--- Processing Page 1/16 ---
[Page 1] Found 0 images, saved 0 files
[Page 1] Found 0 tables, saved 0 files
  Text blocks: 9
  Images: 0
  Tables: 0

--- Processing Page 2/16 ---
[Page 2] Found 0 images, saved 0 files
[Page 2] Found 0 tables, saved 0 files
  Text blocks: 11
  Images: 0
  Tables: 0

--- Processing Page 3/16 ---
[Page 3] Found 14 images, saved 14 files
[Page 3] Found 2 tables, saved 2 files
  Text blocks: 38
  Images: 14
  Tables: 2

--- Processing Page 4/16 ---
[Page 4] Found 0 images, saved 0 files
[Page 4] Found 0 tables, saved 0 files
  Text blocks: 8
  Images: 0
  Tables: 0

--- Processing Page 5/16 ---
[Page 5] Found 1 images, saved 1 files
[Page 5] Found 0 tables, saved 0 files
  Text blocks: 15
  Images: 1
  Tables: 0

--- Processing Page 6/16 ---
[Page 6] Found 0 images, saved 0 files
[Page 6] Found 0 tables, saved 0 files
  Text blocks: 10
  Images: 0
  Tables: 0

--- Processing Page 7/16 ---
[Page 7] Found 0 imag

In [34]:
import fitz  # PyMuPDF
import json
import logging
import time
from pathlib import Path
import pandas as pd
import io
from PIL import Image
import cv2
import numpy as np


def clean_text_blocks_in_figures_tables(
    text_elements, image_elements, table_elements, overlap_threshold=0.8
):
    """Drop text blocks that lie mostly inside an image or table on their page.

    A text block is removed when the share of its own area covered by a
    same-page image exceeds ``overlap_threshold``; if no image qualifies, the
    same test is applied to same-page tables. Each removal is printed.

    Args:
        text_elements: Dicts with ``bbox``, ``page`` and ``block_id``.
        image_elements: Dicts with ``bbox``, ``page`` and ``image_id``.
        table_elements: Dicts with ``bbox``, ``page`` and ``table_id``.
        overlap_threshold: Covered fraction above which a block is removed.

    Returns:
        A new list with the surviving text elements, in input order.
    """

    def covered_fraction(text_bbox, region_bbox):
        """Return intersection area divided by the text block's own area."""
        inner = fitz.Rect(text_bbox)
        outer = fitz.Rect(region_bbox)
        if not inner.intersects(outer):
            return 0.0

        shared_area = (inner & outer).get_area()
        inner_area = inner.get_area()
        return 0.0 if inner_area == 0 else shared_area / inner_area

    def swallowed_by(block, bbox, page, regions, id_key, label):
        """Report whether any same-page region covers the block past the threshold."""
        for region in regions:
            if region["page"] != page:
                continue
            ratio = covered_fraction(bbox, region["bbox"])
            if ratio > overlap_threshold:
                print(
                    f"[REMOVE] Text block {block['block_id']} on page {page}: {ratio:.2f} overlap with {label} {region[id_key]}"
                )
                return True
        return False

    survivors = []
    dropped = 0

    for block in text_elements:
        bbox = block["bbox"]
        page = block["page"]

        # Images are tested first; tables only if no image claimed the block.
        inside_region = swallowed_by(
            block, bbox, page, image_elements, "image_id", "image"
        ) or swallowed_by(block, bbox, page, table_elements, "table_id", "table")

        if inside_region:
            dropped += 1
        else:
            survivors.append(block)

    print(
        f"[CLEAN TEXT] Removed {dropped} text blocks that overlap with figures/tables"
    )
    print(f"[CLEAN TEXT] Kept {len(survivors)} clean text blocks")

    return survivors


def clean_overlapping_images(image_elements, overlap_threshold=0.3):
    """Remove images that overlap a larger image on the same page.

    Images are processed page by page, largest bounding-box area first. An
    image is discarded when its intersection with an already kept image,
    divided by the smaller of the two areas, exceeds ``overlap_threshold``.
    Each removal is printed.

    Args:
        image_elements: Dicts with ``bbox``, ``page`` and ``image_id``.
        overlap_threshold: Overlap ratio above which the smaller image is
            dropped.

    Returns:
        A new list of kept images, grouped by page in first-seen page order
        and ordered by decreasing area within each page.
    """

    def bbox_area(img):
        """Area of the image's bounding box."""
        left, top, right, bottom = img["bbox"][0:4]
        return (right - left) * (bottom - top)

    def overlap_ratio(bbox_a, bbox_b):
        """Intersection area relative to the smaller of the two rectangles."""
        rect_a = fitz.Rect(bbox_a)
        rect_b = fitz.Rect(bbox_b)
        if not rect_a.intersects(rect_b):
            return 0.0

        shared = (rect_a & rect_b).get_area()
        smaller = min(rect_a.get_area(), rect_b.get_area())
        return 0.0 if smaller == 0 else shared / smaller

    # Bucket the images per page, preserving first-seen page order.
    by_page = {}
    for img in image_elements:
        by_page.setdefault(img["page"], []).append(img)

    result = []
    dropped = 0

    for page_num, candidates in by_page.items():
        winners = []

        for candidate in sorted(candidates, key=bbox_area, reverse=True):
            for winner in winners:
                ratio = overlap_ratio(candidate["bbox"], winner["bbox"])
                if ratio > overlap_threshold:
                    winner_area = bbox_area(winner)
                    candidate_area = bbox_area(candidate)
                    print(
                        f"[REMOVE] Image {candidate['image_id']} (area: {candidate_area:.0f}) on page {page_num}: {ratio:.2f} overlap with larger image {winner['image_id']} (area: {winner_area:.0f})"
                    )
                    dropped += 1
                    break
            else:
                # No kept image overlaps this one enough: keep it.
                winners.append(candidate)

        result.extend(winners)

    print(f"[CLEAN IMAGES] Removed {dropped} overlapping images (kept largest)")
    print(f"[CLEAN IMAGES] Kept {len(result)} unique images")

    return result


def _merged_image_entry(img, page_num):
    """Build the merged-content record (Markdown image reference) for one image."""
    if "filename" in img:
        content = f"![{img['image_id']}]({img['filename']})\n\n"
    else:
        content = f"*[Image: {img['image_id']}]*\n\n"

    return {
        "type": "image",
        "content": content,
        "image_type": img["type"],
        "page": page_num,
        "position": {"x": img["bbox"][0], "y": img["bbox"][1]},
        "image_id": img["image_id"],
    }


def _merged_table_entry(table, page_num):
    """Build the merged-content record (heading, file links, preview) for one table."""
    content = f"### {table['table_id'].replace('_', ' ').title()}\n\n"
    if "files" in table:
        content += f"[CSV]({table['files']['csv']}) | [HTML]({table['files']['html']})\n\n"
    if "content_preview" in table:
        content += f"```\n{table['content_preview'][:300]}...\n```\n\n"

    return {
        "type": "table",
        "content": content,
        "page": page_num,
        "position": {"x": table["bbox"][0], "y": table["bbox"][1]},
        "table_id": table["table_id"],
        "dimensions": table.get("dimensions", {}),
    }


def merge_elements_by_reading_order(
    text_elements, image_elements, table_elements, *, append_remaining=False
):
    """Interleave text, images and tables of each page into one reading order.

    Text blocks keep their incoming order. Images and tables are sorted by
    the top-left corner of their ``bbox`` (y, then x) and walked with one
    pointer each: before every text block, pending images (then pending
    tables) are emitted while the next one starts at or above the block's top
    edge and left of the block's right edge. The walk stops at the first item
    that fails this test, so a later item never overtakes an earlier one.

    Args:
        text_elements: Dicts with ``page``, ``bbox``, ``content``,
            ``block_id`` and optionally ``text_type``.
        image_elements: Dicts with ``page``, ``bbox``, ``image_id``, ``type``
            and optionally ``filename``.
        table_elements: Dicts with ``page``, ``bbox``, ``table_id`` and
            optionally ``files``, ``content_preview``, ``dimensions``.
        append_remaining: When False (default), images/tables that were never
            placed before a text block are left out, and pages without any
            text block are ignored. When True, leftovers are appended at the
            end of their page and pages holding only images/tables are
            included.

    Returns:
        A tuple ``(all_merged_content, page_elements)``: the flat list of
        merged records for all pages in page order, and a dict mapping page
        number to ``{"text", "images", "tables", "merged_content"}``.
    """
    page_elements = {}

    def bucket(page_num):
        """Return the per-page container, creating it on first use."""
        return page_elements.setdefault(
            page_num,
            {"text": [], "images": [], "tables": [], "merged_content": []},
        )

    for text_elem in text_elements:
        bucket(text_elem["page"])["text"].append(text_elem)

    # By default only pages that have text get a bucket, so images/tables on
    # text-less pages are skipped unless append_remaining is requested.
    for kind, elements in (("images", image_elements), ("tables", table_elements)):
        for elem in elements:
            page_num = elem["page"]
            if append_remaining or page_num in page_elements:
                bucket(page_num)[kind].append(elem)

    def top_left(elem):
        """Sort key: vertical position first, horizontal position second."""
        return (elem["bbox"][1], elem["bbox"][0])

    all_merged_content = []

    for page_num in sorted(page_elements):
        page_data = page_elements[page_num]

        print(f"\n--- Processing Page {page_num} ---")
        print(f"Text blocks: {len(page_data['text'])}")
        print(f"Images: {len(page_data['images'])}")
        print(f"Tables: {len(page_data['tables'])}")

        # The pointer walk below is only correct on position-sorted input;
        # callers may hand the images over in a different order (for example
        # sorted by area), so the sort is done here.
        page_data["images"].sort(key=top_left)
        page_data["tables"].sort(key=top_left)
        images = page_data["images"]
        tables = page_data["tables"]

        page_content = [
            {
                "type": "page_header",
                "content": f"# Page {page_num}\n\n",
                "page": page_num,
                "position": {"x": 0, "y": 0},
            }
        ]

        img_index = 0
        table_index = 0

        # Text blocks are kept in their incoming order.
        for text_block in page_data["text"]:
            text_bbox = text_block["bbox"]
            current_y = text_bbox[1]
            x_end = text_bbox[2]

            # Images that start above this block and left of its right edge.
            while img_index < len(images):
                img = images[img_index]
                if not (img["bbox"][1] <= current_y and img["bbox"][0] < x_end):
                    break
                page_content.append(_merged_image_entry(img, page_num))
                print(
                    f"[INSERT] Image {img['image_id']} before text block {text_block['block_id']}"
                )
                img_index += 1

            # Same rule for tables.
            while table_index < len(tables):
                table = tables[table_index]
                if not (table["bbox"][1] <= current_y and table["bbox"][0] < x_end):
                    break
                page_content.append(_merged_table_entry(table, page_num))
                print(
                    f"[INSERT] Table {table['table_id']} before text block {text_block['block_id']}"
                )
                table_index += 1

            text_content = text_block["content"].strip()
            if text_content:
                if text_block.get("text_type") == "heading":
                    final_text = f"## {text_content}\n\n"
                else:
                    final_text = f"{text_content}\n\n"

                page_content.append(
                    {
                        "type": "text",
                        "content": final_text,
                        "text_type": text_block.get("text_type", "paragraph"),
                        "page": page_num,
                        "position": {"x": text_bbox[0], "y": text_bbox[1]},
                        "block_id": text_block.get("block_id", 0),
                    }
                )

        if append_remaining:
            # Opt-in: emit whatever was never placed before a text block.
            for img in images[img_index:]:
                page_content.append(_merged_image_entry(img, page_num))
                print(f"[INSERT] Remaining image {img['image_id']} at end of page")
            for table in tables[table_index:]:
                page_content.append(_merged_table_entry(table, page_num))
                print(f"[INSERT] Remaining table {table['table_id']} at end of page")

        page_data["merged_content"] = page_content
        all_merged_content.extend(page_content)

        print(
            f"[MERGE] Page {page_num}: {len(page_content)} total elements merged in reading order"
        )

    return all_merged_content, page_elements


# Example usage function that combines all three
def process_pdf_elements(
    text_elements, image_elements, table_elements, output_dir, pdf_stem
):
    """Clean the extracted elements, merge them in reading order and save the result.

    The stages run in a fixed order: text blocks that overlap figures or tables
    are filtered out, overlapping images are filtered out, and the survivors
    are merged together with the tables. Two files are then written into
    ``output_dir``:

    * ``<pdf_stem>-merged-document.md`` - the ``"content"`` string of every
      merged item, concatenated without any separator.
    * ``<pdf_stem>-merged-structure.json`` - the merged items, the per-page
      structure and a ``"statistics"`` section with kept/removed counts.

    Args:
        text_elements: Extracted text blocks (must support ``len()``).
        image_elements: Extracted images (must support ``len()``).
        table_elements: Extracted tables; they are handed to the merge step
            as-is (no table is dropped here).
        output_dir: Destination directory as a ``str`` or ``Path``. It is
            created if it does not exist yet.
        pdf_stem: File-name prefix shared by both output files.

    Returns:
        tuple: ``(merged_content, page_structure)`` exactly as produced by
        ``merge_elements_by_reading_order``.
    """
    # Accept plain strings too, and never fail just because the folder is missing.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    banner = "=" * 60
    print(banner)
    print("CLEANING AND MERGING PDF ELEMENTS")
    print(banner)

    # Step 1: drop text blocks that overlap with figures/tables
    print("\n1. Cleaning text blocks overlapping with figures/tables...")
    cleaned_text = clean_text_blocks_in_figures_tables(
        text_elements, image_elements, table_elements
    )

    # Step 2: drop overlapping images
    print("\n2. Cleaning overlapping images...")
    cleaned_images = clean_overlapping_images(image_elements)

    # Step 3: interleave what is left in reading order
    print("\n3. Merging elements in reading order...")
    merged_content, page_structure = merge_elements_by_reading_order(
        cleaned_text, cleaned_images, table_elements
    )

    print("\n4. Saving results...")

    # Markdown document: each item already carries its own trailing newlines,
    # so the pieces are joined with an empty separator.
    md_path = output_dir / f"{pdf_stem}-merged-document.md"
    with open(md_path, "w", encoding="utf-8") as f:
        f.write("".join(item["content"] for item in merged_content))

    # Detailed structure plus statistics. The "removed" counts are simply the
    # difference between what came in and what survived each cleaning stage.
    statistics = {
        "total_elements": len(merged_content),
        "text_blocks_kept": len(cleaned_text),
        "text_blocks_removed": len(text_elements) - len(cleaned_text),
        "images_kept": len(cleaned_images),
        "images_removed": len(image_elements) - len(cleaned_images),
        "tables_kept": len(table_elements),
        "pages_processed": len(page_structure),
    }
    structure_path = output_dir / f"{pdf_stem}-merged-structure.json"
    with open(structure_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "merged_content": merged_content,
                "page_structure": page_structure,
                "statistics": statistics,
            },
            f,
            indent=2,
            ensure_ascii=False,
        )

    print(f"[✓] Merged document saved: {md_path}")
    print(f"[✓] Structure data saved: {structure_path}")
    print("[✓] Processing complete!")

    return merged_content, page_structure

In [None]:
def _pipeline_load_json(path):
    """Return the decoded content of the UTF-8 JSON file at ``path``."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _pipeline_removal_stats(structure_path):
    """Return ``(text_blocks_removed, images_removed)`` read from the structure JSON.

    Both values are ``None`` when the file does not exist; a count that is
    missing from an existing file is reported as 0.
    """
    if not structure_path.exists():
        return None, None
    stats = _pipeline_load_json(structure_path).get("statistics", {})
    return stats.get("text_blocks_removed", 0), stats.get("images_removed", 0)


def _pipeline_write_text_summary(path, summary_report, output_dir, total_time, pdf_stem):
    """Write the human-readable counterpart of ``summary_report`` to ``path``."""
    info = summary_report["pipeline_info"]
    extracted = summary_report["extraction_results"]
    cleaning = summary_report["cleaning_results"]
    files = summary_report["output_files"]

    lines = [
        "PDF PROCESSING PIPELINE SUMMARY\n",
        "=" * 50 + "\n\n",
        f"📄 File: {info['pdf_file']}\n",
        f"⏱️  Processing Time: {total_time:.2f} seconds\n",
        f"📅 Processed: {info['timestamp']}\n\n",
        "📊 EXTRACTION RESULTS:\n",
        f"   • Text blocks: {extracted['original_text_blocks']}\n",
        f"   • Images: {extracted['original_images']}\n",
        f"   • Tables: {extracted['original_tables']}\n\n",
        "🧹 CLEANING RESULTS:\n",
        f"   • Text blocks kept: {cleaning['cleaned_text_blocks']}\n",
        f"   • Images kept: {cleaning['cleaned_images']}\n",
        f"   • Tables kept: {cleaning['tables_processed']}\n",
    ]
    # The removal counts are only known when the structure file could be read.
    if cleaning["text_blocks_removed"] is not None:
        lines.append(f"   • Text blocks removed: {cleaning['text_blocks_removed']}\n")
        lines.append(f"   • Images removed: {cleaning['images_removed']}\n")
    lines += [
        "\n",
        "📁 OUTPUT FILES:\n",
        f"   • Merged document: {files['merged_markdown']}\n",
        f"   • Structure data: {files['structure_json']}\n",
        f"   • Images folder: {files['images_folder']}\n",
        f"   • Tables folder: {files['tables_folder']}\n",
        f"   • Pipeline summary: {pdf_stem}-pipeline-summary.json\n\n",
        "📖 FINAL DOCUMENT:\n",
        f"   • Total pages: {summary_report['pages_processed']}\n",
        f"   • Total elements: {summary_report['total_merged_elements']}\n",
        f"   • Main output: {output_dir / files['merged_markdown']}\n",
    ]

    # One buffered write instead of dozens of tiny ones.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(lines)


def run_pdf_processing_pipeline(pdf_path_str: str, output_dir_str: str = None):
    """
    Complete PDF processing pipeline that extracts, cleans, and merges elements

    The run has three steps: ``analyze_pdf_elements`` extracts the elements
    (it is expected to leave ``<stem>-text-elements.json``,
    ``<stem>-image-elements.json`` and ``<stem>-table-elements.json`` in the
    output directory, which are loaded right afterwards),
    ``process_pdf_elements`` cleans and merges them, and finally a summary is
    written both as ``<stem>-pipeline-summary.json`` and
    ``<stem>-pipeline-summary.txt``.

    Args:
        pdf_path_str: Path to PDF file as string
        output_dir_str: Output directory path as string (optional). When
            omitted, ``output_merged_pipeline/<pdf stem>`` is used. The
            directory is created if necessary.

    Returns:
        dict: Processing results with the keys ``success`` (always ``True``
        when the function returns), ``summary``, ``merged_content``,
        ``page_structure``, ``output_directory`` and ``main_document_path``.

    Raises:
        FileNotFoundError: If ``pdf_path_str`` does not exist.
        Exception: Any error raised during extraction or processing is printed
            and then re-raised unchanged.
    """
    pdf_path = Path(pdf_path_str)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    pdf_stem = pdf_path.stem

    if output_dir_str is None:
        output_dir = Path("output_merged_pipeline") / pdf_stem
    else:
        output_dir = Path(output_dir_str)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 80)
    print(f"PDF PROCESSING PIPELINE: {pdf_path.name}")
    print("=" * 80)

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments that happen while the pipeline is running.
    started = time.perf_counter()

    # Step 1: extract all elements from the PDF and load them back from disk
    print("\n🔍 STEP 1: EXTRACTING ELEMENTS FROM PDF")
    print("-" * 50)

    try:
        analyze_pdf_elements(pdf_path, output_dir)

        text_elements = _pipeline_load_json(
            output_dir / f"{pdf_stem}-text-elements.json"
        )
        image_elements = _pipeline_load_json(
            output_dir / f"{pdf_stem}-image-elements.json"
        )
        table_elements = _pipeline_load_json(
            output_dir / f"{pdf_stem}-table-elements.json"
        )

        print(f"[✓] Extracted {len(text_elements)} text blocks")
        print(f"[✓] Extracted {len(image_elements)} images")
        print(f"[✓] Extracted {len(table_elements)} tables")

    except Exception as e:
        print(f"[✗] Error in extraction: {e}")
        raise

    # Step 2: clean and merge
    print("\n🧹 STEP 2: CLEANING AND MERGING ELEMENTS")
    print("-" * 50)

    try:
        merged_content, page_structure = process_pdf_elements(
            text_elements, image_elements, table_elements, output_dir, pdf_stem
        )

    except Exception as e:
        print(f"[✗] Error in processing: {e}")
        raise

    # Step 3: summary report
    print("\n📊 STEP 3: GENERATING SUMMARY REPORT")
    print("-" * 50)

    total_time = time.perf_counter() - started

    # Count merged items per type in a single pass.
    type_counts = {"text": 0, "image": 0, "table": 0}
    for item in merged_content:
        item_type = item["type"]
        if item_type in type_counts:
            type_counts[item_type] += 1

    # The removal counts are not part of process_pdf_elements' return value;
    # they are only available from the statistics it saved in the structure file.
    merged_markdown_name = f"{pdf_stem}-merged-document.md"
    structure_json_name = f"{pdf_stem}-merged-structure.json"
    text_blocks_removed, images_removed = _pipeline_removal_stats(
        output_dir / structure_json_name
    )

    summary_report = {
        "pipeline_info": {
            "pdf_file": pdf_path.name,
            "pdf_path": str(pdf_path),
            "output_directory": str(output_dir),
            "processing_time_seconds": round(total_time, 2),
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        },
        "extraction_results": {
            "original_text_blocks": len(text_elements),
            "original_images": len(image_elements),
            "original_tables": len(table_elements),
        },
        "cleaning_results": {
            "cleaned_text_blocks": type_counts["text"],
            "cleaned_images": type_counts["image"],
            "tables_processed": type_counts["table"],
            "text_blocks_removed": text_blocks_removed,
            "images_removed": images_removed,
        },
        "output_files": {
            "merged_markdown": merged_markdown_name,
            "structure_json": structure_json_name,
            "text_elements": f"{pdf_stem}-text-elements.json",
            "image_elements": f"{pdf_stem}-image-elements.json",
            "table_elements": f"{pdf_stem}-table-elements.json",
            "images_folder": "images/",
            "tables_folder": "tables/",
        },
        "pages_processed": len(page_structure),
        "total_merged_elements": len(merged_content),
    }

    # Machine-readable summary
    summary_path = output_dir / f"{pdf_stem}-pipeline-summary.json"
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary_report, f, indent=2, ensure_ascii=False)

    # Human-readable summary
    _pipeline_write_text_summary(
        output_dir / f"{pdf_stem}-pipeline-summary.txt",
        summary_report,
        output_dir,
        total_time,
        pdf_stem,
    )

    main_document = output_dir / merged_markdown_name

    print(f"[✓] Pipeline completed in {total_time:.2f} seconds")
    print(f"[✓] Processed {summary_report['pages_processed']} pages")
    print(f"[✓] Generated {summary_report['total_merged_elements']} merged elements")
    print(f"[✓] Summary saved: {summary_path}")
    print(f"[✓] Main document: {main_document}")

    print(f"\n📁 All files saved to: {output_dir}")

    return {
        "success": True,
        "summary": summary_report,
        "merged_content": merged_content,
        "page_structure": page_structure,
        "output_directory": str(output_dir),
        "main_document_path": str(main_document),
    }


# Example usage functions for different scenarios
def process_single_pdf(pdf_path: str, output_dir: str = None):
    """Process a single PDF file and report failure as data instead of raising.

    Args:
        pdf_path: Location of the PDF to process.
        output_dir: Optional output directory; forwarded unchanged to
            ``run_pdf_processing_pipeline``.

    Returns:
        dict: The pipeline's result on success, otherwise
        ``{"success": False, "error": <message of the exception>}``.
    """
    # The broad catch is deliberate: this is the outermost entry point for
    # interactive use, so every failure is converted into a result dict.
    try:
        result = run_pdf_processing_pipeline(pdf_path, output_dir)
        print("\n✅ SUCCESS: PDF processed successfully!")
        return result
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        return {"success": False, "error": str(e)}


def process_multiple_pdfs(pdf_directory: str, output_base_dir: str = None):
    """Process every ``*.pdf`` file found directly inside a directory.

    Files are handled in sorted order so that repeated runs produce the same
    log and the same key order in ``results``. Each PDF gets its own output
    folder named after the file stem. A failure of one file is recorded and
    does not stop the batch.

    Args:
        pdf_directory: Directory to scan (not recursive).
        output_base_dir: Parent folder for the per-PDF output folders. When
            empty or ``None``, ``output_batch_processing`` is used.

    Returns:
        dict: ``{"success": False, "error": "No PDF files found"}`` when the
        directory holds no PDFs. Otherwise a dict with ``success``,
        ``total_files``, ``successful``, ``failed`` and ``results`` (file name
        -> pipeline result or error dict). Note that ``success`` is ``True``
        whenever the batch ran, even if individual files failed - check
        ``failed`` for that.

    Raises:
        ValueError: If ``pdf_directory`` does not exist.
    """
    pdf_dir = Path(pdf_directory)

    if not pdf_dir.exists():
        raise ValueError(f"Directory not found: {pdf_dir}")

    # glob() yields entries in arbitrary, platform-dependent order.
    pdf_files = sorted(pdf_dir.glob("*.pdf"))

    if not pdf_files:
        print(f"No PDF files found in {pdf_dir}")
        return {"success": False, "error": "No PDF files found"}

    total = len(pdf_files)
    print(f"Found {total} PDF files to process")

    # The base folder is the same for every file, so resolve it once.
    base_output = (
        Path(output_base_dir) if output_base_dir else Path("output_batch_processing")
    )

    results = {}
    successful = 0
    failed = 0
    rule = "=" * 20

    for pdf_file in pdf_files:
        print(f"\n{rule} Processing {pdf_file.name} {rule}")

        # Individual output directory for each PDF
        individual_output = base_output / pdf_file.stem

        try:
            result = run_pdf_processing_pipeline(str(pdf_file), str(individual_output))
            results[pdf_file.name] = result
            successful += 1
            print(f"✅ {pdf_file.name} processed successfully!")

        except Exception as e:
            # Best effort: remember the failure and carry on with the next file.
            results[pdf_file.name] = {"success": False, "error": str(e)}
            failed += 1
            print(f"❌ {pdf_file.name} failed: {e}")

    print("\n📊 BATCH PROCESSING COMPLETE")
    print(f"✅ Successful: {successful}/{total}")
    print(f"❌ Failed: {failed}/{total}")

    return {
        "success": True,
        "total_files": total,
        "successful": successful,
        "failed": failed,
        "results": results,
    }


# Quick test function
# def quick_test():
#     """Quick test with your current PDF"""
#     pdf_path = "D:\\WorkSpace\\LOL-PaperReader\\backend\\src\\paperreader\\services\\parser\\1810.04805v2.pdf"
#     return process_single_pdf(pdf_path)


# Run the pipeline. Exactly one of the options below should be active at a
# time; the others stay commented out.

# Option 1 (active): process a single PDF with default settings, i.e. without
# an explicit output directory. This executes as soon as the cell runs and
# leaves the returned dict (pipeline result, or {"success": False, "error": ...}
# on failure) in the global name `result`.
# NOTE: the path is an absolute, machine-specific Windows path - adjust it
# before running this cell anywhere else.
result = process_single_pdf(
    "D:\\WorkSpace\\LOL-PaperReader\\backend\\src\\paperreader\\services\\parser\\2303.14334v2.pdf"
)

# Option 2: Process single PDF with custom output directory
# result = process_single_pdf(
#     "D:\\WorkSpace\\LOL-PaperReader\\backend\\src\\paperreader\\services\\parser\\1810.04805v2.pdf",
#     "my_custom_output"
# )

# Option 3: Process multiple PDFs in a directory
# result = process_multiple_pdfs(
#     "D:\\WorkSpace\\LOL-PaperReader\\backend\\src\\paperreader\\services\\parser\\",
#     "batch_output"
# )

PDF PROCESSING PIPELINE: 2303.14334v2.pdf

🔍 STEP 1: EXTRACTING ELEMENTS FROM PDF
--------------------------------------------------
Analyzing PDF: 2303.14334v2.pdf
Total pages: 11

--- Processing Page 1/11 ---
[Page 1] Found 0 images, saved 0 files
[Page 1] Found 0 tables, saved 0 files
  Text blocks: 22
  Images: 0
  Tables: 0

--- Processing Page 2/11 ---
[Page 2] Found 2 images, saved 2 files
[Page 2] Found 0 tables, saved 0 files
  Text blocks: 8
  Images: 2
  Tables: 0

--- Processing Page 3/11 ---
[Page 3] Found 2 images, saved 2 files
[Page 3] Found 0 tables, saved 0 files
  Text blocks: 10
  Images: 2
  Tables: 0

--- Processing Page 4/11 ---
[Page 4] Found 4 images, saved 4 files
[Page 4] Found 0 tables, saved 0 files
  Text blocks: 11
  Images: 4
  Tables: 0

--- Processing Page 5/11 ---
[Page 5] Found 4 images, saved 4 files
[Page 5] Found 0 tables, saved 0 files
  Text blocks: 11
  Images: 4
  Tables: 0

--- Processing Page 6/11 ---
[Page 6] Found 2 images, saved 2 file

: 