# 📑 PDF Content Extraction Tool

This notebook extracts **images and captions** from PDF documents.

## ✨ Features
- Extracts high-resolution images
- Associates captions with images
- Creates a clean PDF (`images_with_captions.pdf`)
- (Optional) Debug visualization with bounding boxes and accordion preview

---

In [None]:
# ✅ Install dependencies
!pip install pymupdf numpy Pillow scikit-learn ipywidgets reportlab

In [None]:
# ✅ Import libraries
import os
import shutil
import tempfile

import fitz  # PyMuPDF
import ipywidgets as widgets
import numpy as np
from IPython.display import HTML, display
from ipywidgets import Accordion, VBox, Image as WImage
from PIL import Image
from reportlab.lib.pagesizes import letter
from reportlab.lib.units import inch
from reportlab.pdfgen import canvas
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler


In [None]:
# ✅ Helper functions
def draw_wrapped_text(c, text, x, y, max_width, line_height,
                      font="Helvetica-Oblique", font_size=11):
    """Draw *text* centred on *x*, greedily word-wrapped to *max_width*.

    Args:
        c: Canvas-like object providing ``setFont``, ``stringWidth`` and
            ``drawCentredString``.
        text: Text to draw; it is split on whitespace.
        x: Horizontal centre of every drawn line.
        y: Baseline of the first line.
        max_width: Maximum measured width of a line, in the units returned
            by ``c.stringWidth``.
        line_height: Amount subtracted from ``y`` after each drawn line.
        font: Font name used for measuring and drawing.
        font_size: Font size used for measuring and drawing.

    Returns:
        The ``y`` position just below the last drawn line (``y`` unchanged
        when *text* contains no words).
    """
    c.setFont(font, font_size)
    lines = []
    current = ""
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        if c.stringWidth(candidate, font, font_size) <= max_width:
            current = candidate
            continue
        # The candidate overflows. Flush the accumulated line, but only if
        # there is one: a single word wider than max_width must not emit an
        # empty line ahead of itself.
        if current:
            lines.append(current)
        current = word
    if current:
        lines.append(current)
    for text_line in lines:
        c.drawCentredString(x, y, text_line)
        y -= line_height
    return y

def boxes_are_close_or_overlap(box1, box2, gap_threshold=10):
    """Return True when two boxes overlap or lie within *gap_threshold*.

    Both boxes must expose ``x0``, ``y0``, ``x1`` and ``y1`` attributes.
    The boxes count as close only if their extents, padded by
    *gap_threshold*, meet on the horizontal AND the vertical axis.
    """
    horizontally_near = (
        box1.x1 + gap_threshold >= box2.x0
        and box1.x0 - gap_threshold <= box2.x1
    )
    if not horizontally_near:
        return False
    return (
        box1.y1 + gap_threshold >= box2.y0
        and box1.y0 - gap_threshold <= box2.y1
    )

def merge_boxes_if_close(box_list, gap_threshold=10):
    """Merge boxes that overlap or lie within *gap_threshold* of each other.

    Closeness is decided by ``boxes_are_close_or_overlap``; two close boxes
    are replaced by the ``fitz.Rect`` that bounds both. Passes are repeated
    until one completes without any merge, because a box that grew through
    a merge may now reach a box it previously did not. A single pass can
    otherwise leave overlapping boxes in the result, depending on input
    order.

    Args:
        box_list: Iterable of boxes exposing ``x0``, ``y0``, ``x1``, ``y1``.
            It is NOT modified.
        gap_threshold: Maximum gap at which two boxes still count as close.

    Returns:
        A new list of boxes in which no two entries are close to each other.
    """
    pending = list(box_list)  # work on a copy; leave the caller's list intact
    merged_any = True
    while merged_any:
        merged_any = False
        merged_boxes = []
        for current in pending:
            for i, existing in enumerate(merged_boxes):
                if boxes_are_close_or_overlap(current, existing, gap_threshold):
                    merged_boxes[i] = fitz.Rect(
                        min(current.x0, existing.x0),
                        min(current.y0, existing.y0),
                        max(current.x1, existing.x1),
                        max(current.y1, existing.y1),
                    )
                    merged_any = True
                    break
            else:
                merged_boxes.append(current)
        # Every pass that merged shrinks the list by at least one entry,
        # so the outer loop terminates.
        pending = merged_boxes
    return pending


In [None]:
# ✅ Full PDFBoxExtractor class (clean extraction + debug visualization)
class PDFBoxExtractor:
    """Extract figure regions and their captions from a PDF.

    For every page the extractor locates embedded images, grows each image
    box over adjacent text, groups the grown boxes (by shared caption, or by
    DBSCAN clustering when no caption is nearby), renders every group at 2x
    resolution and lays the renders out, each followed by its caption, in
    ``images_with_captions.pdf``. With ``debug_visualization=True`` a second
    PDF with coloured rectangle annotations is written and previewed.
    """

    # Lower-cased prefixes that mark a text block as a caption.
    CAPTION_PREFIXES = ("figure", "fig.", "plate", "table")

    def __init__(self, pdf_path, debug_visualization=False):
        """Open *pdf_path* and create a temporary folder for rendered images.

        Args:
            pdf_path: Path of the PDF to process.
            debug_visualization: When true, also build ``debug_visualization.pdf``.
        """
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)
        self.output_folder = tempfile.mkdtemp()
        self.output_pdf = "images_with_captions.pdf"
        self.debug_visualization = debug_visualization
        # Always define the debug attributes so callers can inspect them
        # regardless of the mode.
        self.debug_pdf = None
        self.debug_doc = None
        if debug_visualization:
            self.debug_pdf = "debug_visualization.pdf"
            self.debug_doc = fitz.open()

    def save_image(self, page_num, img_xref):
        """Write the embedded image *img_xref* into the temporary folder.

        Returns:
            The file name (not the full path) of the written image.
        """
        img_dict = self.doc.extract_image(img_xref)
        img_data = img_dict.get("image")
        img_ext = img_dict.get("ext", "png")
        img_filename = f"page_{page_num}_img_{img_xref}.{img_ext}"
        img_path = os.path.join(self.output_folder, img_filename)
        with open(img_path, "wb") as img_file:
            img_file.write(img_data)
        return img_filename

    def draw_boxes(self, page, boxes, color):
        """Add a 1pt rectangle annotation in *color* for every box on *page*."""
        for box in boxes:
            try:
                annot = page.add_rect_annot(box)
                annot.set_colors(stroke=color)
                annot.set_border(width=1.0)
                annot.update()
            except Exception:
                # Deliberate best-effort: some PDFs may not accept
                # annotations, and debug output must never abort extraction.
                pass

    def expand_image_boxes(self, image_rects, text_blocks, protected_captions=None):
        """Grow each image rectangle over text blocks that border it.

        Thin strips on the four sides of the image are tested against the
        text blocks; while any block touches a strip the strips are widened
        (x1.175 per round, at most 7 rounds). The result for an image is the
        bounding box of the image and every touched text block.

        Args:
            image_rects: ``fitz.Rect`` objects of the images on the page.
            text_blocks: Sequences whose first four items are the block's
                ``x0, y0, x1, y1``.
            protected_captions: Optional ``(text, rect)`` pairs; text blocks
                intersecting any of these rects are never absorbed.

        Returns:
            One expanded ``fitz.Rect`` per entry of *image_rects*, same order.
        """
        if not image_rects:
            return []
        if protected_captions is None:
            protected_captions = []
        protected_boxes = [cap_box for _, cap_box in protected_captions]
        # Loop-invariant work: build the text rects and drop protected ones
        # once instead of once per image and expansion round.
        candidate_rects = []
        for block in text_blocks:
            text_rect = fitz.Rect(block[:4])
            if not any(text_rect.intersects(p) for p in protected_boxes):
                candidate_rects.append(text_rect)

        expansion_step, max_expansion, max_rounds = 1.175, 2.0, 7
        expanded_boxes = []
        for image_rect in image_rects:
            x0, y0, x1, y1 = image_rect
            # Text fully inside the image is already covered by it.
            nearby_rects = [r for r in candidate_rects if not image_rect.contains(r)]
            w_margin, h_margin = (x1 - x0) * 0.1, (y1 - y0) * 0.05
            detected_text_boxes = []
            # NOTE(review): each round re-detects blocks found in earlier
            # rounds, so expansion continues while ANY block touches a strip,
            # not only while new blocks appear. Behaviour kept as-is.
            for _ in range(max_rounds):
                strips = (
                    fitz.Rect(x0, y0 - h_margin, x1, y0),  # top
                    fitz.Rect(x0, y1, x1, y1 + h_margin),  # bottom
                    fitz.Rect(x0 - w_margin, y0, x0, y1),  # left
                    fitz.Rect(x1, y0, x1 + w_margin, y1),  # right
                )
                hits = [
                    r for r in nearby_rects
                    if any(strip.intersects(r) for strip in strips)
                ]
                if not hits:
                    break
                detected_text_boxes.extend(hits)
                w_margin *= expansion_step
                h_margin *= expansion_step
                if w_margin > (x1 - x0) * max_expansion:
                    break
            expanded_rect = fitz.Rect(x0, y0, x1, y1)
            for text_rect in detected_text_boxes:
                expanded_rect = fitz.Rect(
                    min(expanded_rect.x0, text_rect.x0),
                    min(expanded_rect.y0, text_rect.y0),
                    max(expanded_rect.x1, text_rect.x1),
                    max(expanded_rect.y1, text_rect.y1),
                )
            expanded_boxes.append(expanded_rect)
        return expanded_boxes

    def cluster_boxes(self, boxes):
        """Group boxes with DBSCAN and return one bounding rect per group.

        Features are the standardised centre x/y, width and height of each
        box. ``eps`` is 1.5x the median nearest-neighbour distance, floored
        at 0.5; ``min_samples=1`` so every box ends up in some group.

        Returns:
            A list of ``fitz.Rect`` (empty when *boxes* is empty).
        """
        if not boxes:
            return []
        features = np.array([
            [
                (box.x0 + box.x1) / 2,
                (box.y0 + box.y1) / 2,
                box.x1 - box.x0,
                box.y1 - box.y0,
            ]
            for box in boxes
        ])
        scaled_features = StandardScaler().fit_transform(features)
        if scaled_features.shape[0] < 2:
            eps = 0.5
        else:
            neighbours = NearestNeighbors(n_neighbors=2).fit(scaled_features)
            distances, _ = neighbours.kneighbors(scaled_features)
            # Column 0 is each point's distance to itself; column 1 is the
            # distance to its nearest other point.
            eps = max(np.median(distances[:, 1]) * 1.5, 0.5)
        labels = DBSCAN(eps=eps, min_samples=1).fit(scaled_features).labels_
        groups = {}
        for box, label in zip(boxes, labels):
            groups.setdefault(label, []).append(box)
        return [
            fitz.Rect(
                min(box.x0 for box in group),
                min(box.y0 for box in group),
                max(box.x1 for box in group),
                max(box.y1 for box in group),
            )
            for group in groups.values()
        ]

    def merge_boxes(self, expanded_boxes, captions, caption_info):
        """Turn expanded boxes into final clusters, attaching captions.

        Args:
            expanded_boxes: Rects produced by ``expand_image_boxes``.
            captions: ``(text, rect)`` pairs in detection order.
            caption_info: Mapping of caption text to its rect.

        Returns:
            A list of ``{"box": fitz.Rect, "caption": str | None}`` dicts:
            first one entry per caption that has boxes, then the clustered
            caption-less boxes.
        """
        # Merge boxes that are close/overlapping first.
        merged_expanded = merge_boxes_if_close(list(expanded_boxes), gap_threshold=10)
        boxes_with_caption = {}
        boxes_without_caption = []
        for box in merged_expanded:
            # The first caption sitting just below the box wins.
            caption = next(
                (text for text, cap_box in captions
                 if self.is_near(box, cap_box, factor=2.0)),
                None,
            )
            if caption is None:
                boxes_without_caption.append(box)
            else:
                boxes_with_caption.setdefault(caption, []).append(box)

        clusters = []
        # Merge boxes associated with the same caption.
        for cap, group in boxes_with_caption.items():
            lowest_edge = max(box.y1 for box in group)
            merged_box = fitz.Rect(
                min(box.x0 for box in group),
                min(box.y0 for box in group),
                max(box.x1 for box in group),
                # Stop at the caption's top edge so the rendered crop does
                # not include the caption text itself.
                min(lowest_edge, caption_info[cap].y0),
            )
            clusters.append({"box": merged_box, "caption": cap})
        # For the rest, cluster heuristically.
        for box in self.cluster_boxes(boxes_without_caption):
            clusters.append({"box": box, "caption": None})
        return clusters

    @staticmethod
    def is_near(exp_box, cap_box, factor=2.0):
        """Return True when *cap_box* sits directly below *exp_box*.

        The caption's top edge must be at or below the box's bottom edge and
        closer than ``factor`` caption heights, and the horizontal overlap
        must exceed half the width of the narrower of the two boxes.
        """
        vertical_distance = cap_box.y0 - exp_box.y1
        cap_height = cap_box.y1 - cap_box.y0
        threshold = cap_height * factor
        horizontal_overlap = max(0, min(exp_box.x1, cap_box.x1) - max(exp_box.x0, cap_box.x0))
        min_overlap = min(cap_box.width * 0.5, exp_box.width * 0.5)
        return (vertical_distance >= 0 and vertical_distance < threshold and horizontal_overlap > min_overlap)

    def _collect_images(self, page, page_num):
        """Return ``{"filename", "bbox"}`` dicts for every image placement on *page*."""
        images = []
        for img in page.get_images(full=True):
            xref = img[0]
            rects = page.get_image_rects(xref)
            if not rects:
                continue
            # Export once per xref rather than once per placement. The
            # exported file is auxiliary, so a failed export is reported but
            # must not abort extraction.
            try:
                img_filename = self.save_image(page_num, xref)
            except Exception as e:
                print(f"Could not save embedded image {xref}: {e}")
                img_filename = None
            images.extend({"filename": img_filename, "bbox": rect} for rect in rects)
        return images

    def _find_captions(self, text_blocks):
        """Return ``(captions, caption_info)`` for blocks starting with a caption prefix."""
        captions, caption_info = [], {}
        for block in text_blocks:
            try:
                text = block[4].strip()
            except (IndexError, KeyError, TypeError, AttributeError):
                continue  # block carries no usable text
            if text.lower().startswith(self.CAPTION_PREFIXES) and text not in caption_info:
                caption_box = fitz.Rect(block[:4])
                captions.append((text, caption_box))
                caption_info[text] = caption_box
        return captions, caption_info

    def _place_cluster(self, c, page, page_num, cluster, y_position):
        """Render one cluster and draw it plus its caption on canvas *c*.

        Returns:
            The updated vertical cursor position on the output page.
        """
        page_width, page_height = letter
        box = cluster["box"]
        try:
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=box)
            img_filename = f"page_{page_num}_box_{int(box.x0)}_{int(box.y0)}.png"
            img_path = os.path.join(self.output_folder, img_filename)
            pix.save(img_path)
        except Exception as e:
            print(f"Could not render cluster image: {e}")
            return y_position
        try:
            # Only the size is needed; close the file handle right away.
            with Image.open(img_path) as img:
                img_width, img_height = img.size
            display_width = 4 * inch
            display_height = display_width * (img_height / img_width)
            # Start a new output page when image + caption would not fit.
            if y_position - display_height - 80 < 50:
                c.showPage()
                y_position = page_height - 70
            x_pos = (page_width - display_width) / 2
            c.drawInlineImage(img_path, x_pos, y_position - display_height,
                              width=display_width, height=display_height)
            y_position -= (display_height + 15)
            caption_text = cluster["caption"] if cluster["caption"] else "(No caption detected)"
            # Draw caption centered and wrapped.
            y_position = draw_wrapped_text(c, caption_text, page_width / 2, y_position,
                                           page_width - 100, 14)
            y_position -= 30
        except Exception as e:
            print(f"Error inserting image {img_path}: {e}")
        return y_position

    def _save_debug_preview(self):
        """Save the debug PDF and display one accordion section per page."""
        try:
            self.debug_doc.save(self.debug_pdf)
            self.debug_doc.close()
            print(f"Debug visualization saved as: {self.debug_pdf}")
            # Build and display accordion preview.
            dbg = fitz.open(self.debug_pdf)
            try:
                pages_widgets = [
                    WImage(
                        value=pg.get_pixmap(matrix=fitz.Matrix(1.5, 1.5)).tobytes("png"),
                        format='png',
                    )
                    for pg in dbg
                ]
            finally:
                dbg.close()
            if pages_widgets:
                accordion = Accordion(children=[VBox([p]) for p in pages_widgets])
                for i in range(len(pages_widgets)):
                    accordion.set_title(i, f"Page {i+1}")
                display(accordion)
        except Exception as e:
            print(f"Could not create debug PDF or preview: {e}")

    def process_pdf(self):
        """Process every page and write ``self.output_pdf``.

        Shows a progress bar, prints status lines and, in debug mode, saves
        and previews the debug PDF. The source document is closed and the
        temporary image folder removed when processing ends, including when
        it ends with an exception.
        """
        total_pages = len(self.doc)
        progress = widgets.FloatProgress(value=0, min=0, max=1, description='Processing:')
        display(progress)
        # Canvas for images+captions.
        c = canvas.Canvas(self.output_pdf, pagesize=letter)
        y_position = letter[1] - 70

        try:
            for page_num, page in enumerate(self.doc, start=1):
                print(f"Processing page {page_num}/{total_pages}")
                debug_page = None
                if self.debug_visualization:
                    debug_page = self.debug_doc.new_page(width=page.rect.width,
                                                         height=page.rect.height)
                    debug_page.show_pdf_page(debug_page.rect, self.doc, page_num - 1)

                images = self._collect_images(page, page_num)
                text_blocks = page.get_text("blocks")
                captions, caption_info = self._find_captions(text_blocks)

                image_rects = [img["bbox"] for img in images]
                expanded_boxes = self.expand_image_boxes(image_rects, text_blocks, captions)
                clusters = self.merge_boxes(expanded_boxes, captions, caption_info)

                if debug_page is not None:
                    # blue: raw images, green: expanded, red: captions,
                    # purple: final clusters.
                    self.draw_boxes(debug_page, image_rects, (0, 0, 1))
                    self.draw_boxes(debug_page, expanded_boxes, (0, 1, 0))
                    self.draw_boxes(debug_page, [cap_box for _, cap_box in captions], (1, 0, 0))
                    self.draw_boxes(debug_page, [cluster["box"] for cluster in clusters],
                                    (0.5, 0, 0.5))

                # Add clusters to the images+captions PDF.
                for cluster in clusters:
                    y_position = self._place_cluster(c, page, page_num, cluster, y_position)
                progress.value = page_num / total_pages

            c.save()
            print(f"Images + captions PDF saved as: {self.output_pdf}")

            if self.debug_visualization:
                self._save_debug_preview()
        finally:
            self.doc.close()
            # Clean up temporary images (best-effort).
            shutil.rmtree(self.output_folder, ignore_errors=True)


In [None]:
# ✅ UI: upload widget + debug option + handler
upload_btn = widgets.FileUpload(accept='.pdf', multiple=False)
debug_checkbox = widgets.Checkbox(value=False, description='Enable debug visualization')
display(widgets.VBox([upload_btn, debug_checkbox]))


def _show_download_link(path, label):
    """Display a button-styled link that downloads the PDF at *path*.

    The file is embedded in the link as a base64 ``data:`` URI.

    Raises:
        OSError: If *path* cannot be read.
    """
    import base64
    with open(path, "rb") as f:
        b64_pdf = base64.b64encode(f.read()).decode('utf-8')
    display(HTML(
        f'<a href="data:application/pdf;base64,{b64_pdf}" '
        f'download="{path}" '
        f'style="padding: 0.5em 1em; background: #007bff; color: white; border-radius: 3px; text-decoration: none;">'
        f'{label}</a>'
    ))


def process_uploaded_file(change):
    """Handle a change of the upload widget: extract and offer downloads.

    Saves the uploaded PDF as ``uploaded_file.pdf``, runs ``PDFBoxExtractor``
    on it and shows a download link for the result, plus one for the debug
    PDF when debug visualization is enabled and that file exists.

    Args:
        change: Change notification from ``observe`` (unused).
    """
    uploads = upload_btn.value
    if not uploads:
        return
    # ipywidgets 7 exposes a dict keyed by file name; ipywidgets 8 exposes a
    # tuple of per-file dicts. Support both.
    if isinstance(uploads, dict):
        uploaded_file = next(iter(uploads.values()))
    else:
        uploaded_file = uploads[0]
    file_path = "uploaded_file.pdf"
    with open(file_path, "wb") as f:
        f.write(uploaded_file["content"])
    # Read the checkbox once so extraction and link creation agree.
    debug_enabled = debug_checkbox.value
    pdf_extractor = PDFBoxExtractor(file_path, debug_visualization=debug_enabled)
    pdf_extractor.process_pdf()
    print("Processing complete!")

    # Provide download link for images_with_captions.pdf (and the debug PDF).
    links = [(pdf_extractor.output_pdf, "Download Images + Captions PDF")]
    debug_pdf = getattr(pdf_extractor, "debug_pdf", None)
    if debug_enabled and debug_pdf and os.path.exists(debug_pdf):
        links.append((debug_pdf, "Download Debug Visualization PDF"))
    for path, label in links:
        try:
            _show_download_link(path, label)
        except OSError as e:
            print(f"Could not create download link: {e}")


upload_btn.observe(process_uploaded_file, names='value')


### ✅ Usage
1. Run the first cell to install dependencies.
2. Run the other cells to load the tool.
3. Upload a PDF using the widget.
4. (Optional) Enable debug visualization before uploading.
5. Download `images_with_captions.pdf` using the link that appears once processing completes. If debug was enabled, an accordion preview of the annotated pages is also shown and `debug_visualization.pdf` is written to the working directory.
