In [2]:
import tabula
import pdfplumber
import fitz
import camelot
import warnings , math

warnings.filterwarnings("ignore", category=UserWarning) 

In [4]:
""" Extracts content blocks from a PDF using PyMuPDF.
    Args:
        input_pdf_path (str): Path to the input PDF.
    Returns:
        list: A list of dictionaries, each representing a page with its details and blocks.
"""

def extract_pdf_blocks(input_pdf_path):
    
    blocks_data = []
    input_doc = fitz.open(input_pdf_path)

    for page_number, page in enumerate(input_doc):
        # Extract page dimensions
        dimensions = (page.rect.width, page.rect.height)

        # Extract blocks of content
        blocks = page.get_text("dict")["blocks"]

        # Create page data dictionary
        page_data = {
            "page_number": page_number + 1,  # Page numbers are 1-based
            "dimensions": dimensions,
            "blocks": blocks
        }

        blocks_data.append(page_data)

    input_doc.close()
    return blocks_data

In [5]:
"""Adds table bounding boxes to the blocks data using pdfplumber.
    Args:
        input_pdf_path (str): Path to the input PDF.
        blocks_data (list): List of page blocks extracted by extract_pdf_blocks.
    Returns:
        list: Updated blocks data with table bounding boxes added."""
def detect_table_blocks(input_pdf_path, blocks_data):
    updated_blocks_data = []

    with pdfplumber.open(input_pdf_path) as pdf:
        for page_data, pdf_page in zip(blocks_data, pdf.pages):
            table_bboxes = [table.bbox for table in pdf_page.find_tables()]

            updated_page_data = {
                "page_number": page_data["page_number"],
                "dimensions": page_data["dimensions"],
                "blocks": page_data["blocks"],
                "table_bboxes": table_bboxes
            }

            updated_blocks_data.append(updated_page_data)

    return updated_blocks_data

In [6]:
"""Separates blocks into text-only and image-only blocks.
    Args:
        blocks_data (list): List of dictionaries containing page blocks extracted by extract_pdf_blocks.
    Returns:
        tuple: Two lists of dictionaries, one for text-only blocks and another for image-only blocks."""
def separate_text_and_image_blocks(blocks_data):

    text_blocks_data = []
    image_blocks_data = []

    for page_data in blocks_data:
        # Extract page details
        page_number = page_data["page_number"]
        dimensions = page_data["dimensions"]
        blocks = page_data["blocks"]
        table_bboxes = page_data["table_bboxes"]

        # Separate blocks by type
        text_blocks = [block for block in blocks if "lines" in block]
        image_blocks = [block for block in blocks if "image" in block]

        # Create text-only page data
        if text_blocks:
            text_blocks_data.append({
                "page_number": page_number,
                "dimensions": dimensions,
                "blocks": text_blocks,
                "table_bboxes": table_bboxes
            })

        # Create image-only page data
        if image_blocks:
            image_blocks_data.append({
                "page_number": page_number,
                "dimensions": dimensions,
                "blocks": image_blocks
            })

    return text_blocks_data, image_blocks_data

In [7]:
"""Extracts tabular blocks from the blocks_with_tables data.
    Args:
        blocks_with_tables (list): List of page blocks with table bounding boxes added.
    Returns:
        list: A list of dictionaries containing only tabular blocks."""
def get_tabular_data(blocks_with_tables):
    tabular_data = []

    for page_data in blocks_with_tables:
        page_number = page_data["page_number"]
        dimensions = page_data["dimensions"]
        table_bboxes = page_data["table_bboxes"]
        blocks = page_data["blocks"]

        tabular_blocks = []

        for block in blocks:
            if "lines" in block:
                for line in block["lines"]:
                    for span in line["spans"]:
                        bbox = span.get("bbox", [0, 0, 0, 0])

                        inside_table = any(
                            bbox[0] >= table_bbox[0] and
                            bbox[1] >= table_bbox[1] and
                            bbox[2] <= table_bbox[2] and
                            bbox[3] <= table_bbox[3]
                            for table_bbox in table_bboxes
                        )

                        if inside_table:
                            tabular_blocks.append(block)
                            break

        tabular_data.append({
            "page_number": page_number,
            "dimensions": dimensions,
            "blocks": tabular_blocks,
            "table_bboxes": table_bboxes
        })

    return tabular_data

In [8]:
"""Extracts non-tabular text blocks from the blocks_with_tables data.
    Args:
        blocks_with_tables (list): List of page blocks with table bounding boxes added.
    Returns:
        list: A list of dictionaries containing only non-tabular text blocks."""
        
def get_textual_data(blocks_with_tables):
    textual_data = []

    for page_data in blocks_with_tables:
        page_number = page_data["page_number"]
        dimensions = page_data["dimensions"]
        table_bboxes = page_data["table_bboxes"]
        blocks = page_data["blocks"]

        non_tabular_blocks = []

        for block in blocks:
            if "lines" in block:
                for line in block["lines"]:
                    for span in line["spans"]:
                        bbox = span.get("bbox", [0, 0, 0, 0])

                        inside_table = any(
                            bbox[0] >= table_bbox[0] and
                            bbox[1] >= table_bbox[1] and
                            bbox[2] <= table_bbox[2] and
                            bbox[3] <= table_bbox[3]
                            for table_bbox in table_bboxes
                        )

                        if not inside_table:
                            non_tabular_blocks.append(block)
                            break

        textual_data.append({
            "page_number": page_number,
            "dimensions": dimensions,
            "blocks": non_tabular_blocks
        })

    return textual_data

In [17]:
"""Creates a PDF from blocks data and saves it, with options for including images and drawing table bounding boxes.
    Args:
        blocks_data (list): List of dictionaries containing page blocks.
        output_pdf_path (str): Path to save the generated PDF.
        include_images (bool): Whether to include images in the PDF.
        draw_table_bboxes (bool): Whether to draw bounding boxes around tables.
        Returns nothing
    """
def create_pdf_from_blocks(blocks_data, output_pdf_path, include_images=False, draw_table_bboxes=False):
    output_doc = fitz.open()

    for page_data in blocks_data:
        # Extract page dimensions
        page_width, page_height = page_data["dimensions"]
        # Create a new page
        page = output_doc.new_page(width=page_width, height=page_height)

        # Handle text and image blocks
        for block in page_data["blocks"]:
            if "lines" in block:
                # Insert text from text blocks
                for line in block["lines"]:
                    for span in line["spans"]:
                        bbox = span.get("bbox", [0, 0, 0, 0])
                        text = span["text"]
                        size = span.get("size", 12)
                        color = span.get("color", (0, 0, 0))

                        # Normalize color if it's in integer form
                        if isinstance(color, int):
                            color = (
                                (color >> 16) & 255, (color >> 8) & 255, color & 255
                            )
                        color = tuple(c / 255 for c in color)  # Convert to 0-1 range

                        try:
                            page.insert_text(
                                (bbox[0], bbox[1]),
                                text,
                                fontsize=size,
                                color=color
                            )
                        except Exception:
                            page.insert_text(
                                (bbox[0], bbox[1]),
                                text,
                                fontsize=size,
                                fontname="helv",  # Fallback font
                                color=color
                            )
            elif "image" in block and include_images:
                # Insert image from image blocks
                image = block["image"]
                rect = fitz.Rect(block["bbox"])  # Get the bounding box for the image
                page.insert_image(rect, stream=image)

        # Draw table bounding boxes if requested
        if draw_table_bboxes:
            table_bboxes = page_data.get("table_bboxes", [])
            for table_bbox in table_bboxes:
                # Draw rectangle for table bounding box
                rect = fitz.Rect(table_bbox)
                page.draw_rect(rect, color=(1, 0, 0), width=2)  # Red color, 2-pixel width

    # Save the output PDF
    output_doc.save(output_pdf_path)
    output_doc.close()

In [12]:
path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\pdf-extractor"

samco_path = r"\files\SamcoFactSheet2024.pdf"
tata_path = r"\files\TataFactSheet2024.pdf"

#samco
no_image_path = path +r"\output\NoImgPdf.pdf"
textual_pdf_path = path + r"\output\TextualPdf.pdf"
tabular_pdf_path = path + r"\output\TabularPdf.pdf"

In [10]:
# Read every page of the Samco fact sheet and attach its table bounding boxes.
blocks_data = detect_table_blocks(
    path + samco_path,
    extract_pdf_blocks(path + samco_path),
)

# From here on `blocks_data` holds the text pages only; the image pages are
# kept separately in `blocks_image`.
blocks_data, blocks_image = separate_text_and_image_blocks(blocks_data)

In [None]:
# Keep only the blocks that touch a detected table and render them.
blocks_data = get_tabular_data(blocks_data)
# Tabular output goes to the tabular PDF; it was previously written to
# textual_pdf_path, leaving tabular_pdf_path unused.
create_pdf_from_blocks(blocks_data, tabular_pdf_path)
print("\nSuccess !!!")

In [1]:
def create_pdf(blocks_data, output_pdf_path):
    """Render the text blocks of a single page into a one-page PDF.

    Each span is first written with its own font name. If PyMuPDF rejects
    that name, the span is written with a built-in Helvetica face instead,
    bold when the original font name contains "Bold".

    Args:
        blocks_data (dict): A single page dictionary (not a list of pages)
            with "dimensions" as ``(width, height)`` and "blocks".
        output_pdf_path (str): Path to store the output PDF.

    Returns:
        None
    """
    output_doc = fitz.open()

    try:
        width, height = blocks_data['dimensions']
        output_page = output_doc.new_page(width=width, height=height)

        for block in blocks_data['blocks']:
            if "lines" not in block:  # Only text blocks are rendered
                continue

            for line in block["lines"]:
                for span in line["spans"]:
                    bbox = span.get("bbox", [0, 0, 0, 0])
                    text = span["text"]
                    size = float(span.get("size", 12))
                    color = span.get("color", (0, 0, 0))
                    font = span.get("font", "helv")

                    # Unpack a packed 0xRRGGBB integer into channels.
                    if isinstance(color, int):
                        color = (
                            (color >> 16) & 255, (color >> 8) & 255, color & 255
                        )
                    color = tuple(c / 255 for c in color)

                    # insert_text expects the baseline start of the text, not
                    # the top-left corner of its box. Use the span's recorded
                    # origin when present, else the bottom-left of the bbox.
                    origin = span.get("origin", (bbox[0], bbox[3]))

                    # The earlier `font.replace(...)` calls discarded their
                    # result, so the name was never changed. The weight is
                    # now preserved through the fallback font instead.
                    fallback_font = "hebo" if "Bold" in font else "helv"

                    try:
                        output_page.insert_text(
                            origin,
                            text,
                            fontsize=size,
                            fontname=font,
                            color=color,
                        )
                    except Exception:
                        # Best effort: the source font is usually not
                        # available to PyMuPDF, so use a built-in face.
                        output_page.insert_text(
                            origin,
                            text,
                            fontsize=size,
                            fontname=fallback_font,
                            color=color,
                        )

        output_doc.save(output_pdf_path)
    finally:
        # Release the document even if building or saving fails.
        output_doc.close()