In [None]:
import html
from dataclasses import dataclass


@dataclass
class BoundingBox:
    """Axis-aligned rectangle plus the content to be rendered inside it.

    The two corners are consumed as ``(x, y)`` pairs; width and height are
    derived by subtracting ``top_left`` from ``bottom_right``. ``text`` holds
    either plain text or a ready-made ``<img ...>`` tag.
    """

    top_left: tuple  # (x1, y1)
    bottom_right: tuple  # (x2, y2)
    text: str  # plain text, or raw markup beginning with "<img"


def generate_html(bounding_boxes, container_size=(800, 600)):
    """Render bounding boxes as absolutely positioned ``<div>`` elements.

    Args:
        bounding_boxes: Iterable of objects exposing ``top_left``,
            ``bottom_right`` (both ``(x, y)`` pairs) and ``text``.
        container_size: ``(width, height)`` in pixels of the enclosing
            ``.container`` element.

    Returns:
        A complete HTML document as a single string.

    Content that starts with ``<img`` is emitted verbatim inside an
    ``image-box`` div; anything else is HTML-escaped and placed in a
    ``text-box`` div.
    """
    # Doubled braces are literal CSS braces; {width}/{height} are filled in below.
    page_top = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Bounding Boxes to HTML</title>
    <style>
        .container {{
            position: relative;
            width: {width}px;
            height: {height}px;
            border: 1px solid #000;
        }}
        .text-box {{
            position: absolute;
            background-color: rgba(255, 255, 0, 0.3); /* Optional: Highlight boxes */
            border: 1px dashed #f00; /* Optional: Border for visualization */
            padding: 5px; /* Increased padding */
            box-sizing: border-box;
            overflow: hidden; /* Change to 'auto' for scrollbars */
            white-space: normal; /* Allow text to wrap */
            word-wrap: break-word; /* Break long words */
            font-size: 14px;
            color: #333;
            text-align: left; /* Change to 'center' if desired */
        }}
        .image-box {{
            position: absolute;
            border: 1px solid #000; /* Optional: Border for visualization */
        }}
    </style>
</head>
<body>
    <div class="container">
"""

    page_bottom = """    </div>
</body>
</html>"""

    # Text and image boxes share the same markup; only the CSS class and the
    # inner content differ.
    box_markup = """        <div class="{css_class}" style="
                left: {left}px;
                top: {top}px;
                width: {box_width}px;
                height: {box_height}px;
            ">
                {content}
            </div>
"""

    fragments = []
    for box in bounding_boxes:
        left, top = box.top_left
        right, bottom = box.bottom_right
        box_width = right - left
        box_height = bottom - top

        if box.text.startswith("<img"):
            # Pre-built image tag: passed through untouched.
            css_class, content = "image-box", box.text
        else:
            # Plain text: neutralise HTML special characters.
            css_class, content = "text-box", html.escape(box.text)

        fragments.append(
            box_markup.format(
                css_class=css_class,
                left=left,
                top=top,
                box_width=box_width,
                box_height=box_height,
                content=content,
            )
        )

    rendered_top = page_top.format(width=container_size[0], height=container_size[1])
    return rendered_top + "".join(fragments) + page_bottom


def calculate_container_size(bounding_boxes, padding=10):
    """Compute a container size just large enough to hold every box.

    Args:
        bounding_boxes: Iterable of objects whose ``bottom_right`` is an
            ``(x, y)`` pair.
        padding: Extra pixels added to both the width and the height.

    Returns:
        ``(width, height)`` where each component is the largest right/bottom
        coordinate plus ``padding``. With no boxes at all the result is
        ``(padding, padding)`` instead of an error.
    """
    # Materialise once so one-shot iterables survive both passes.
    boxes = list(bounding_boxes)
    # default=0 keeps an empty page (no boxes) from raising ValueError.
    max_width = max((box.bottom_right[0] for box in boxes), default=0) + padding
    max_height = max((box.bottom_right[1] for box in boxes), default=0) + padding
    return (max_width, max_height)


def export_to_html(bounding_boxes, output_file="output.html", container_size=None):
    """Write the HTML rendering of ``bounding_boxes`` to ``output_file``.

    Args:
        bounding_boxes: Boxes forwarded to ``generate_html``.
        output_file: Destination path; the file is written as UTF-8.
        container_size: ``(width, height)`` of the container. Any falsy value
            means the size is derived via ``calculate_container_size``.
    """
    effective_size = container_size or calculate_container_size(bounding_boxes)
    document = generate_html(bounding_boxes, effective_size)

    with open(output_file, "w", encoding="utf-8") as handle:
        handle.write(document)

    print(f"HTML exported to {output_file}")

In [None]:
def run_1():
    """Export three hard-coded sample text boxes to ``output.html``."""
    # (top_left, bottom_right, text) for each demo box.
    samples = (
        ((50, 100), (200, 150), "Hello World"),
        ((300, 200), (450, 250), "Another Text"),
        ((400, 400), (500, 500), "More Text Here"),
    )
    bounding_boxes = [
        BoundingBox(top_left=corner_a, bottom_right=corner_b, text=label)
        for corner_a, corner_b, label in samples
    ]
    export_to_html(bounding_boxes, output_file="output.html")


# Demo run: writes the three sample boxes to output.html in the current working directory.
run_1()

In [None]:
import base64
import os
from typing import Tuple

import pymupdf


def load_exam_and_extract_text_words(filepath: str):
    """Export each page of a PDF as an HTML block layout plus a JPEG render.

    For every page this collects the text blocks reported by
    ``page.get_text("blocks")`` and every placement of every embedded image,
    turns them into ``BoundingBox`` objects and writes
    ``<pdf-name>/page-<i>-blocks.html``. A rasterised copy of the page is
    saved next to it as ``<pdf-name>/page-<i>.jpg``.

    Args:
        filepath: Path to the PDF. The output directory is the file's base
            name with ``.pdf`` removed, relative to the current working
            directory.

    Text blocks that are blank or start with ``"Page"`` / ``"Spring"`` are
    skipped.
    """
    # The context manager closes the document even if a page fails.
    with pymupdf.open(filepath) as doc:
        output_dir = f"{os.path.basename(filepath).replace('.pdf', '')}"
        os.makedirs(output_dir, exist_ok=True)

        for i, page in enumerate(doc):  # scan through the pages
            bounding_boxes = []

            # Each block tuple starts with (x0, y0, x1, y1, text, ...).
            for x0, y0, x1, y1, text, *_ in page.get_text("blocks"):
                if not text.strip() or text.startswith(("Page", "Spring")):
                    continue
                bounding_boxes.append(
                    BoundingBox(top_left=(x0, y0), bottom_right=(x1, y1), text=text)
                )

            for img in page.get_images(full=True):
                xref = img[0]
                base_image = doc.extract_image(xref)
                if not base_image:
                    continue  # xref could not be extracted as an image

                image_ext = base_image["ext"]
                image_base64 = base64.b64encode(base_image["image"]).decode("utf-8")
                # The surrounding .image-box div is already positioned and
                # sized, so the tag only has to fill it.
                img_html = (
                    f'<img src="data:image/{image_ext};base64,{image_base64}" '
                    'style="width:100%; height:100%;" />'
                )

                # Entries 1-4 of a get_images() tuple are smask/width/height/bpc
                # of the stored image, not its position on the page. The
                # on-page rectangles come from get_image_rects(); one image
                # may be placed several times.
                for rect in page.get_image_rects(xref):
                    bounding_boxes.append(
                        BoundingBox(
                            top_left=(rect.x0, rect.y0),
                            bottom_right=(rect.x1, rect.y1),
                            text=img_html,
                        )
                    )

            file_path = os.path.join(output_dir, f"page-{i}-blocks.html")
            export_to_html(
                bounding_boxes,
                output_file=file_path,
                # A page with nothing to show falls back to the page
                # dimensions so the size computation has something to use.
                container_size=(
                    None if bounding_boxes else (page.rect.width, page.rect.height)
                ),
            )

            # Raster snapshot of the whole page for visual comparison.
            page_image_path = os.path.join(output_dir, f"page-{i}.jpg")
            page.get_pixmap().save(page_image_path)

            print(f"Saved: {page_image_path}")


# Process the FE-Jan24 PDF; results are written to a folder named "FE-Jan24" under the current working directory.
load_exam_and_extract_text_words("../fe_files/exams/FE-Jan24.pdf")

In [None]:
import io
import os
import traceback
from dataclasses import dataclass
from typing import List

import PIL
from PIL import Image, ImageChops

from parser.drawing.drawing import Drawing, get_merged_drawings_from_page


def handle_drawings(page, out_dir, page_idx):
    """Rasterise each merged drawing on ``page`` into its own PNG file.

    Args:
        page: Page object passed to ``get_merged_drawings_from_page`` and
            used to render the clipped pixmaps.
        out_dir: Directory that receives the PNG files.
        page_idx: Page index embedded in each file name.

    Files are named ``merged_drawing_<page_idx>_<n>.png`` where ``n`` is the
    drawing's position in the merged list.
    """
    for drawing_no, drawing in enumerate(get_merged_drawings_from_page(page)):
        # Render only the drawing's own rectangle.
        clipped = page.get_pixmap(clip=drawing.rect.as_tuple())
        target = os.path.join(out_dir, f"merged_drawing_{page_idx}_{drawing_no}.png")
        clipped.save(target)


@dataclass
class ConvertedImage:
    """The two PIL images that ``convert_image`` returns for one embedded PDF image."""

    raw: Image.Image  # image as decoded from the pixmap's PNG bytes, before any PIL processing
    processed: Image.Image  # same image after PIL post-processing; always in RGB mode


def _apply_soft_mask(doc, pix, smask, debug):
    """Return a copy of ``pix`` that uses the soft mask at xref ``smask`` as alpha.

    Any error raised by PyMuPDF while loading or combining the mask
    propagates to the caller.
    """
    mask_pix = pymupdf.Pixmap(doc, smask)
    if debug:
        print(f"    Mask Pixmap: {mask_pix.n} channels")

    if pix.alpha:
        # Pixmap(source, mask) wants a source without alpha: drop the old channel.
        pix = pymupdf.Pixmap(pix, 0)
    return pymupdf.Pixmap(pix, mask_pix)


def _flatten_to_rgb(image, debug):
    """Composite any transparency in a PIL image over white and return it as RGB."""
    if image.mode == "RGBA":
        if debug:
            print(
                f"    Image has an alpha channel in PIL (mode={image.mode}). Compositing over white."
            )
        background = Image.new("RGB", image.size, (255, 255, 255))
        background.paste(image, mask=image.split()[3])  # 3 is the alpha channel
        if debug:
            print("    Composited over white. New mode: RGB.")
        return background

    if image.mode == "LA":
        if debug:
            print(
                f"    Image has a grayscale alpha channel in PIL (mode={image.mode}). Compositing over white."
            )
        background = Image.new("L", image.size, 255)
        background.paste(image, mask=image.split()[1])  # 1 is the alpha channel
        image = background
        if debug:
            print("    Composited over white. New mode: L.")

    if image.mode != "RGB":
        if debug:
            print(f"    Converting image mode from {image.mode} to RGB.")
        image = image.convert("RGB")
    return image


def convert_image(
    doc: pymupdf.Document, img, debug: bool = False
) -> ConvertedImage | None:
    """Decode one embedded PDF image into PIL images.

    Args:
        doc: Document that owns the image.
        img: One entry of ``page.get_images(full=True)``. Index 0 is the image
            xref and index 1 the soft-mask xref (0 when there is none);
            indices 2, 3, 4, 5 and 8 are only used for debug output.
        debug: Print progress information while converting.

    Returns:
        ``ConvertedImage`` holding the decoded image (``raw``) and an RGB
        version with transparency composited over white (``processed``), or
        ``None`` if the pixmap has an unhandled channel count or PIL cannot
        open the encoded bytes.
    """
    xref = img[0]
    smask = img[1]  # Soft mask xref

    if debug:
        print("\nProcessing Image:")
        print(f"  XREF: {xref}")
        print(f"  Soft Mask XREF: {smask}")
        print(f"  Dimensions: {img[2]}x{img[3]}")
        print(f"  Bits per component: {img[4]}")
        print(f"  Color Space: {img[5]}")
        print(f"  Filter: {img[8]}")

    # Extract the main image Pixmap
    pix = pymupdf.Pixmap(doc, xref)
    if debug:
        print(f"  Initial Pixmap: {pix.n} channels")

    if smask:
        if debug:
            print(
                f"  Image has a soft mask (smask xref={smask}). Attempting to apply mask."
            )
        try:
            # set_alpha() only works on pixmaps that already carry alpha, so
            # the mask is merged through the Pixmap(source, mask) constructor.
            pix = _apply_soft_mask(doc, pix, smask, debug)
        except Exception as e:
            # Best effort: a broken mask must not lose the image itself.
            print(f"    Error applying soft mask: {e}")
            print("Traceback: ", traceback.format_exc())
            print("    Proceeding without applying mask.")

    # Normalise the colour space. pix.n counts the alpha channel too, so
    # n == 4 may be RGB+alpha or CMYK; converting to RGB is right for both.
    if pix.n >= 4:
        if debug:
            print(f"  Image has {pix.n} channels. Converting to RGB.")
        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
    elif pix.n == 2:  # Grayscale with alpha
        if debug:
            print("  Image is grayscale with alpha (pix.n=2).")
        pix = pymupdf.Pixmap(pymupdf.csGRAY, pix)
    elif pix.n in (1, 3):  # Grayscale or RGB
        if debug:
            print(f"  Image has {pix.n} channel(s). No conversion needed.")
    else:
        if debug:
            print(f"  Image has an unhandled color space (pix.n={pix.n}). Skipping.")
        return None

    # Round-trip through PNG bytes to hand the pixels over to PIL.
    img_bytes = pix.tobytes("png")
    pix = None  # Free Pixmap resources

    try:
        raw_image = Image.open(io.BytesIO(img_bytes))
        image = Image.open(io.BytesIO(img_bytes))
        if debug:
            print(f"    Opened image with PIL. Mode: {image.mode}")
    except Exception as e:
        print(f"    Error opening image with PIL: {e}")
        return None

    return ConvertedImage(raw=raw_image, processed=_flatten_to_rgb(image, debug))


def handle_images(
    doc: pymupdf.Document,
    page: pymupdf.Page,
    out_dir: str,
    page_idx: int,
    debug: bool = False,
):
    """Save every image listed for ``page`` as PNG files in ``out_dir``.

    Each image that ``convert_image`` can handle yields two files:
    ``page_<n>_img_<k>_raw.png`` and ``page_<n>_img_<k>.png``, where ``n`` is
    ``page_idx + 1`` and ``k`` counts images from 1.

    Args:
        doc: Document that owns ``page``.
        page: Page whose image list is read.
        out_dir: Directory that receives the PNG files.
        page_idx: Zero-based page index.
        debug: Forwarded to ``convert_image``; also enables extra messages.
    """
    page_images = page.get_images(full=True)
    if debug:
        print(f"\nFound {len(page_images)} image(s) on page {page_idx + 1}.")
        if not page_images:
            print(f"No images found on page {page_idx + 1}.")

    # Nothing to save on this page.
    if not page_images:
        return

    for img_index, img in enumerate(page_images, start=1):
        result = convert_image(doc, img, debug)
        if result is None:
            print(f"Error converting image {img_index} on page {page_idx + 1}")
            continue

        # Unprocessed decode first, then the flattened version.
        raw_target = os.path.join(
            out_dir, f"page_{page_idx +1}_img_{img_index}_raw.png"
        )
        result.raw.save(raw_target)

        processed_target = os.path.join(
            out_dir, f"page_{page_idx +1}_img_{img_index}.png"
        )
        result.processed.save(processed_target)

    print(f"\nCompleted processing images on page {page_idx + 1}.\n")


def run(filepath):
    """Extract merged drawings and embedded images from every page of a PDF.

    Args:
        filepath: Path to the PDF. Output goes to a directory named after the
            file's base name with ``.pdf`` removed, relative to the current
            working directory; it is created if missing.
    """
    # The context manager releases the file handle even when a page fails.
    with pymupdf.Document(filepath) as doc:
        print(f"Loaded {filepath} with {doc.page_count} pages")

        out_dir = os.path.basename(filepath).replace(".pdf", "")
        os.makedirs(out_dir, exist_ok=True)

        for i, page in enumerate(doc):
            print(f"Processing page {i}")
            handle_drawings(page, out_dir, i)
            handle_images(doc, page, out_dir, i)


# Extract drawings and images from the FE-Jan24 PDF into a folder named "FE-Jan24" under the current working directory.
run("../fe_files/exams/FE-Jan24.pdf")

In [None]:
# def handle_images(
#    doc: pymupdf.Document,
#    page: pymupdf.Page,
#    out_dir: str,
#    page_idx: int,
#    debug: bool = False,
# ):
#    images = page.get_images(full=True)
#    if debug:
#        print(f"\nFound {len(images)} image(s) on page {page_idx + 1}.")
#
#    if not images:
#        if debug:
#            print(f"No images found on page {page_idx + 1}.")
#        return
#
#    for img_index, img in enumerate(images, start=1):
#        xref = img[0]
#        smask = img[1]  # Soft mask xref
#        width = img[2]
#        height = img[3]
#        colors = img[4]
#        color_space = img[5]
#        filter_type = img[8]
#
#        if debug:
#            print(f"\nProcessing Image {img_index}:")
#            print(f"  XREF: {xref}")
#            print(f"  Soft Mask XREF: {smask}")
#            print(f"  Dimensions: {width}x{height}")
#            print(f"  Colors: {colors}")
#            print(f"  Color Space: {color_space}")
#            print(f"  Filter: {filter_type}")
#
#        try:
#            # Extract the main image Pixmap
#            pix = pymupdf.Pixmap(doc, xref)
#            if debug:
#                print(f"  Initial Pixmap: {pix.n} channels")
#
#            # Handle images with a soft mask
#            if smask:
#                if debug:
#                    print(
#                        f"  Image has a soft mask (smask xref={smask}). Attempting to apply mask."
#                    )
#                try:
#                    mask_pix = pymupdf.Pixmap(doc, smask)
#                    if debug:
#                        print(f"    Mask Pixmap: {mask_pix.n} channels")
#
#                    # set alpha to match mask
#                    pix.set_alpha(mask_pix.samples)
#                    mask_pix = None  # Free mask Pixmap
#                except Exception as e:
#                    print(f"    Error applying soft mask: {e}")
#                    print("Traceback: ", traceback.format_exc())
#                    print("    Proceeding without applying mask.")
#
#            # Handle color spaces and alpha channels
#            if pix.n >= 5:  # CMYK or other complex color spaces
#                if debug:
#                    print(
#                        f"  Image is in a complex color space (pix.n={pix.n}). Converting to RGB."
#                    )
#                pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
#            elif pix.n == 4:  # RGBA
#                if debug:
#                    print(
#                        "  Image has an alpha channel (pix.n=4). Compositing over white."
#                    )
#                pix = pymupdf.Pixmap(
#                    pymupdf.csRGB, pix
#                )  # Composites over white by default
#            elif pix.n == 2:  # Grayscale with alpha
#                if debug:
#                    print(
#                        "  Image is grayscale with alpha (pix.n=2). Compositing over white."
#                    )
#                pix = pymupdf.Pixmap(pymupdf.csGRAY, pix)  # Composites over white
#            elif pix.n == 1:  # Grayscale
#                if debug:
#                    print("  Image is grayscale (pix.n=1). No conversion needed.")
#                # No conversion needed
#            elif pix.n == 3:  # RGB
#                if debug:
#                    print("  Image is RGB (pix.n=3). No conversion needed.")
#                # No conversion needed
#            else:
#                if debug:
#                    print(
#                        f"  Image has an unhandled color space (pix.n={pix.n}). Skipping."
#                    )
#                pix = None
#
#            if pix:
#                # **Option 1: Save Pixmap Directly as PNG**
#                image_filename_raw = f"page_{page_idx +1}_img_{img_index}_raw.png"
#                image_path_raw = os.path.join(out_dir, image_filename_raw)
#                try:
#                    pix.save(image_path_raw)
#                    if debug:
#                        print(f"    Saved Pixmap directly as PNG: {image_path_raw}")
#                except Exception as e:
#                    print(f"    Error saving Pixmap directly: {e}")
#
#                # **Option 2: Continue to PIL Processing (Without Grayscale Conversion)**
#                # Convert Pixmap to PNG bytes
#                img_bytes = pix.tobytes("png")
#                pix = None  # Free Pixmap resources
#
#                # Open the image with PIL
#                try:
#                    image = Image.open(io.BytesIO(img_bytes))
#                    if debug:
#                        print(f"    Opened image with PIL. Mode: {image.mode}")
#                except Exception as e:
#                    print(f"    Error opening image with PIL: {e}")
#                    continue
#
#                # **Skip Grayscale Conversion for Now**
#                # Temporarily comment out grayscale detection and conversion
#                # Additional handling for images that might still have alpha channels
#                if image.mode == "RGBA":
#                    print(
#                        f"    Image has an alpha channel in PIL (mode={image.mode}). Compositing over white."
#                    )
#                    background = Image.new("RGB", image.size, (255, 255, 255))
#                    background.paste(
#                        image, mask=image.split()[3]
#                    )  # 3 is the alpha channel
#                    image = background
#                    print("    Composited over white. New mode: RGB.")
#                elif image.mode == "LA":
#                    print(
#                        f"    Image has a grayscale alpha channel in PIL (mode={image.mode}). Compositing over white."
#                    )
#                    background = Image.new("L", image.size, 255)
#                    background.paste(
#                        image, mask=image.split()[1]
#                    )  # 1 is the alpha channel
#                    image = background
#                    print("    Composited over white. New mode: L.")
#                elif image.mode not in ["RGB", "L"]:
#                    print(f"    Converting image mode from {image.mode} to RGB.")
#                    image = image.convert("RGB")
#
#                # Detect if the image is effectively grayscale despite being in RGB
#                if image.mode == "RGB":
#                    # Check if all channels are equal
#                    r, g, b = image.split()
#                    if (
#                        ImageChops.difference(r, g).getbbox() is None
#                        and ImageChops.difference(r, b).getbbox() is None
#                    ):
#                        print(
#                            "    Image appears to be grayscale. Converting to 'L' mode."
#                        )
#                        image = image.convert("L")
#
#                # **Save the Image in RGB Mode Without Grayscale Conversion**
#                image_filename = f"page_{page_idx +1}_img_{img_index}.png"
#                image_path = os.path.join(out_dir, image_filename)
#
#                try:
#                    # Ensure image is in RGB mode
#                    if image.mode != "RGB":
#                        image = image.convert("RGB")
#                        print("    Converted image mode to RGB.")
#
#                    image.save(image_path)
#                    print(f"    Saved image: {image_path}")
#                except Exception as e:
#                    print(f"    Error saving image: {e}")
#
#        except Exception as e:
#            print(f"  Error processing image {img_index} on page {page_idx + 1}: {e}")
#            continue
#
#    print(f"\nCompleted processing images on page {page_idx + 1}.\n")