In [1]:
import pdfplumber
import fitz

In [2]:
path = r"C:\Users\rando\OneDrive\Documents\mywork-repo"
samco_path = r"\files\SamcoFactSheet2024.pdf"
tata_path = r"\files\TataFactSheet2024.pdf"

In [50]:
import pdfplumber
import fitz  # PyMuPDF

def extract_pdf_blocks(input_pdf_path):
    """
    Open the PDF and extract all blocks of text, images, and other content.

    Args:
        input_pdf_path (str): Path to the input PDF.

    Returns:
        list: A list of pages, where each page is a dictionary containing blocks of content.
    """
    blocks_data = []
    input_doc = fitz.open(input_pdf_path)

    with pdfplumber.open(input_pdf_path) as pdf:
        for page_number, pdf_page in enumerate(pdf.pages):
            # Extract blocks of content using PyMuPDF
            doc_page = input_doc[page_number]
            blocks = doc_page.get_text("dict")["blocks"]
            
            # Append the blocks along with table bounding boxes from pdfplumber
            blocks_data.append({
                "blocks": blocks,
                "table_bboxes": [table.bbox for table in pdf_page.find_tables()],
                "page_rect": doc_page.rect
            })

    input_doc.close()
    return blocks_data

def remove_image_blocks_and_create_pdf(blocks_data, output_pdf_path):
    """
    Returns:
        list: Updated block data without image blocks.
    """
    output_doc = fitz.open()

    for page_data in blocks_data:
        page_rect = page_data["page_rect"]
        blocks = page_data["blocks"]

        # Create a new page in the output PDF
        output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

        filtered_blocks = []

        for block in blocks:
            if "image" in block:
                continue  # Skip image blocks

            filtered_blocks.append(block)

            if "lines" in block:  # Process text lines
                for line in block["lines"]:
                    for span in line["spans"]:
                        bbox = span.get("bbox", [0, 0, 0, 0])
                        text = span["text"]
                        size = span.get("size", 12)  # Default font size if not provided
                        color = span.get("color", (0, 0, 0))  # Default color (black)
                        font = span.get("font", "helv")

                        # Normalize color if it's in integer form (e.g., 0x000000)
                        if isinstance(color, int):
                            color = (
                                (color >> 16) & 255, (color >> 8) & 255, color & 255
                            )
                        color = tuple(c / 255 for c in color)  # Convert to 0-1 range

                        is_bold = "Bold" in font
                        fontname = font + ("-b" if is_bold else "")

                        try:
                            output_page.insert_text(
                                (bbox[0], bbox[1]),
                                text,
                                fontsize=size,
                                fontname=fontname,
                                color=color,
                            )
                        except Exception:
                            output_page.insert_text(
                                (bbox[0], bbox[1]),
                                text,
                                fontsize=size,
                                fontname="helv",  # Fallback font
                                color=color,
                            )

    # Save the output PDF
    output_doc.save(output_pdf_path)
    output_doc.close()

    return blocks_data

In [51]:
import re
from collections import Counter

def detect_footer_margin(blocks_data, footer_sample_ratio=0.5):
    """
    Args:
        blocks_data (list): List of pages with blocks of content.
        footer_sample_ratio (float): Ratio of pages to sample for footer detection (0 to 1).
    Returns:
        tuple: The bottom margin that likely contains the footer (y-coordinate value).
    """
    page_count = len(blocks_data)
    sample_size = int(page_count * footer_sample_ratio)  # Sample size based on the ratio
    footer_lines = []

    # Sample pages and collect footer lines (lines near the bottom of the page)
    for page_data in blocks_data[:sample_size]:
        page_rect = page_data["page_rect"]
        blocks = page_data["blocks"]
        
        # Define the region to scan for footer (bottom 20% of the page by default)
        footer_region = page_rect.height * 0.8

        for block in blocks:
            if "lines" in block:
                for line in block["lines"]:
                    for span in line["spans"]:
                        bbox = span.get("bbox", [0, 0, 0, 0])
                        text = span["text"]
                        
                        # If the line is in the bottom region of the page, consider it
                        if bbox[1] >= footer_region:
                            footer_lines.append(text.strip())  # Collect footer lines

    # Use the mode of footer lines to dynamically identify the footer pattern
    footer_line_counts = Counter(footer_lines)
    most_common_footer_line, _ = footer_line_counts.most_common(1)[0] if footer_lines else (None, 0)
    
    return footer_region, most_common_footer_line

def remove_repetitive_footer_header_dynamic(blocks_data):
    """
    Args:
        blocks_data (list): List of pages with blocks of content.   
    Returns:
        list: Updated block data with footer/header content removed.
    """
    # Step 1: Dynamically detect the footer region and text pattern
    footer_region, footer_text = detect_footer_margin(blocks_data)
    if not footer_text:
        return blocks_data  # No footer detected, return original blocks_data
    
    updated_blocks_data = []

    # Step 2: Remove footer/header content from all pages based on detected footer region
    for page_data in blocks_data:
        page_rect = page_data["page_rect"]
        blocks = page_data["blocks"]
        
        # Create new list of blocks for this page (without footer/header content)
        page_updated_blocks = []

        for block in blocks:
            if "lines" in block:
                page_block = []
                for line in block["lines"]:
                    for span in line["spans"]:
                        bbox = span.get("bbox", [0, 0, 0, 0])
                        text = span["text"]
                        
                        # Check if the text is in the footer region and matches the detected footer pattern
                        if bbox[1] < footer_region or text.strip() != footer_text:
                            page_block.append(line)
                
                if page_block:
                    page_updated_blocks.append(page_block)

        updated_blocks_data.append(page_updated_blocks)

    return updated_blocks_data

In [52]:
blocks_data = extract_pdf_blocks(path + samco_path)
blocks_data = detect_footer_margin(blocks_data,0.5)
blocks_data = remove_repetitive_footer_header_dynamic(blocks_data)

TypeError: 'float' object is not subscriptable

In [46]:
def create_non_tabular_pdf(blocks_data, output_pdf_path):
    """
    Returns:
        list: Block data containing only non-tabular content.
    """
    output_doc = fitz.open()
    non_tabular_blocks = []

    for page_data in blocks_data:
        page_rect = page_data["page_rect"]
        blocks = page_data["blocks"]
        table_bboxes = page_data["table_bboxes"]

        # Create a new page in the output PDF
        output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

        page_non_tabular_blocks = []

        for block in blocks:
            if "lines" in block:  # Process text blocks
                for line in block["lines"]:
                    for span in line["spans"]:
                        bbox = span.get("bbox", [0, 0, 0, 0])

                        # Check if the text is within any table bbox (skip if inside table)
                        inside_table = any(
                            bbox[0] >= table_bbox[0] and
                            bbox[1] >= table_bbox[1] and
                            bbox[2] <= table_bbox[2] and
                            bbox[3] <= table_bbox[3]
                            for table_bbox in table_bboxes
                        )

                        if not inside_table:
                            page_non_tabular_blocks.append(block)
                            text = span["text"]
                            size = span.get("size", 12)
                            color = span.get("color", (0, 0, 0))
                            font = span.get("font", "helv")

                            # Normalize color
                            if isinstance(color, int):
                                color = (
                                    (color >> 16) & 255, (color >> 8) & 255, color & 255
                                )
                            color = tuple(c / 255 for c in color)

                            is_bold = "Bold" in font
                            fontname = font + ("-b" if is_bold else "")

                            try:
                                output_page.insert_text(
                                    (bbox[0], bbox[1]),
                                    text,
                                    fontsize=size,
                                    fontname=fontname,
                                    color=color,
                                )
                            except Exception:
                                output_page.insert_text(
                                    (bbox[0], bbox[1]),
                                    text,
                                    fontsize=size,
                                    fontname="helv",
                                    color=color,
                                )

        non_tabular_blocks.append(page_non_tabular_blocks)

    # Save the output PDF
    output_doc.save(output_pdf_path)
    output_doc.close()

    return non_tabular_blocks

def create_tabular_pdf(blocks_data, output_pdf_path):
    """
    Returns:
        list: Block data containing only tabular content.
    """
    output_doc = fitz.open()
    tabular_blocks = []

    for page_data in blocks_data:
        page_rect = page_data["page_rect"]
        blocks = page_data["blocks"]
        table_bboxes = page_data["table_bboxes"]

        # Create a new page in the output PDF
        output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

        page_tabular_blocks = []

        for block in blocks:
            if "lines" in block:  # Process text blocks
                for line in block["lines"]:
                    for span in line["spans"]:
                        bbox = span.get("bbox", [0, 0, 0, 0])

                        # Check if the text is within any table bbox (include if inside table)
                        inside_table = any(
                            bbox[0] >= table_bbox[0] and
                            bbox[1] >= table_bbox[1] and
                            bbox[2] <= table_bbox[2] and
                            bbox[3] <= table_bbox[3]
                            for table_bbox in table_bboxes
                        )

                        if inside_table:
                            page_tabular_blocks.append(block)
                            text = span["text"]
                            size = span.get("size", 12)
                            color = span.get("color", (0, 0, 0))
                            font = span.get("font", "helv")

                            # Normalize color
                            if isinstance(color, int):
                                color = (
                                    (color >> 16) & 255, (color >> 8) & 255, color & 255
                                )
                            color = tuple(c / 255 for c in color)

                            is_bold = "Bold" in font
                            fontname = font + ("-b" if is_bold else "")

                            try:
                                output_page.insert_text(
                                    (bbox[0], bbox[1]),
                                    text,
                                    fontsize=size,
                                    fontname=fontname,
                                    color=color,
                                )
                            except Exception:
                                output_page.insert_text(
                                    (bbox[0], bbox[1]),
                                    text,
                                    fontsize=size,
                                    fontname="helv",
                                    color=color,
                                )

        tabular_blocks.append(page_tabular_blocks)

    # Save the output PDF
    output_doc.save(output_pdf_path)
    output_doc.close()

    return tabular_blocks

In [43]:
no_image_path = path +r"\output\noImgPdf.pdf"
textual_pdf_path = path + r"\output\textalPdf.pdf"
tabular_pdf_path = path + r"\output\tabularPdf.pdf"

In [None]:
# Example usage
blocks_data = remove_image_blocks_and_create_pdf(blocks_data, no_image_path)
create_non_tabular_pdf(blocks_data, textual_pdf_path)
create_tabular_pdf(blocks_data, tabular_pdf_path)

print("\nCode Successful")


Code Successful
