In [2]:
import fitz
import tabula
import pdfplumber
import math

In [34]:
def extract_tabular_data_from_page(input_pdf_path, page_number):
    """
    Extract blocks of tabular data from a single page of the PDF.

    Args:
        input_pdf_path (str): Path to the input PDF.
        page_number (int): The page number to extract from (0-indexed).

    Returns:
        dict: A dictionary containing the extracted tabular data and the page's bounding box.
    """
    blocks_data = []
    input_doc = fitz.open(input_pdf_path)

    with pdfplumber.open(input_pdf_path) as pdf:
        pdf_page = pdf.pages[page_number]  # Get the specific page from pdfplumber
        doc_page = input_doc[page_number]  # Corresponding page from fitz

        # Extract blocks of content using PyMuPDF
        blocks = doc_page.get_text("dict")["blocks"]
        
        # Extract table bounding boxes using pdfplumber
        table_bboxes = [table.bbox for table in pdf_page.find_tables()]
        
        blocks_data.append({
            "blocks": blocks,
            "table_bboxes": table_bboxes,
            "page_rect": doc_page.rect
        })

    input_doc.close()
    return blocks_data

def create_tabular_pdf_from_page(blocks_data, output_pdf_path):
    """
    Create a PDF with only the tabular data extracted from a page.

    Args:
        blocks_data (list): Extracted block data from the page.
        output_pdf_path (str): Path to save the output PDF.

    Returns:
        list: Block data containing only the tabular content.
    """
    output_doc = fitz.open()
    tabular_blocks = []

    for page_data in blocks_data:
        page_rect = page_data["page_rect"]
        blocks = page_data["blocks"]
        table_bboxes = page_data["table_bboxes"]

        # Create a new page in the output PDF
        output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

        page_tabular_blocks = []

        for block in blocks:
            if "lines" in block:  # Process text blocks
                for line in block["lines"]:
                    for span in line["spans"]:
                        bbox = span.get("bbox", [0, 0, 0, 0])
                        bbox = [float(cord) for cord in bbox]

                        # Check if the text is within any table bbox (include if inside table)
                        inside_table = any(
                            bbox[0] >= table_bbox[0] and
                            bbox[1] >= table_bbox[1] and
                            bbox[2] <= table_bbox[2] and
                            bbox[3] <= table_bbox[3]
                            for table_bbox in table_bboxes
                        )

                        if inside_table:
                            page_tabular_blocks.append(block)
                            text = span["text"]
                            size = math.ceil(float(span.get("size", 12)))
                            color = span.get("color", (0, 0, 0))
                            font = span.get("font", "helv")

                            # Normalize color
                            if isinstance(color, int):
                                color = (
                                    (color >> 16) & 255, (color >> 8) & 255, color & 255
                                )
                            color = tuple(c / 255 for c in color)

                            try:
                                output_page.insert_text(
                                    (bbox[0], bbox[1]),
                                    text,
                                    fontsize=size,
                                    fontname=font,
                                    color=color,
                                )
                            except Exception:
                                output_page.insert_text(
                                    (bbox[0], bbox[1]),
                                    text,
                                    fontsize=size,
                                    fontname="helv",
                                    color=color,
                                )

        tabular_blocks.append(page_tabular_blocks)

    # Save the output PDF with only tabular data
    output_doc.save(output_pdf_path)
    output_doc.close()

    return tabular_blocks


In [7]:
output_path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\pdf-extractor\output\singular\tabular_single.pdf"
pdf_path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\pdf-extractor\files\SamcoFactSheet2024.pdf"

In [40]:
block_content = extract_tabular_data_from_page(pdf_path,10)
tabular_block_content = create_tabular_pdf_from_page(block_content, output_path)

In [None]:
print(type(tabular_block_content))
print(len(tabular_block_content))

for content in tabular_block_content[0]:
    print(f"\n{content}")

In [46]:

def detect_table_structure(tabular_blocks, col_tolerance=10, row_tolerance=5):
    """
    Detect the table structure by grouping text blocks into columns and rows.
    
    Args:
        tabular_blocks (list): List of blocks (dicts) with text and bbox information.
        col_tolerance (int): Tolerance to group columns based on x-coordinate.
        row_tolerance (int): Tolerance to group rows based on y-coordinate.
    
    Returns:
        tuple: (num_columns, num_rows) - The number of columns and rows detected.
    """
    # Step 1: Group text blocks into columns based on x-coordinate
    columns = []
    current_column = []
    last_x = None

    # Sort blocks by their x-coordinate (and y-coordinate to maintain row order)
    for blocks in tabular_blocks:
        blocks.sort(key=lambda block: (block['bbox'][1], block['bbox'][0]))  # Sort by y first, then x

    for block in tabular_blocks:
        x = block['bbox'][0]
        if last_x is None or abs(x - last_x) < col_tolerance:  # Same column if x is close enough
            current_column.append(block)
        else:
            columns.append(current_column)
            current_column = [block]
        last_x = x

    if current_column:
        columns.append(current_column)  # Add the last column group

    # Step 2: Group text blocks into rows based on y-coordinate
    rows = []
    current_row = []
    last_y = None

    # For each column, group blocks into rows based on y-coordinate
    for column in columns:
        for block in column:
            y = block['bbox'][1]
            if last_y is None or abs(y - last_y) < row_tolerance:  # Same row if y is close enough
                current_row.append(block)
            else:
                rows.append(current_row)
                current_row = [block]
            last_y = y

        if current_row:
            rows.append(current_row)  # Add the last row group

    # Return the number of columns and rows
    num_columns = len(columns)
    num_rows = len(rows)

    return num_columns, num_rows


In [47]:
row, col = detect_table_structure(tabular_block_content)

TypeError: list indices must be integers or slices, not str