In [32]:
import tabula
import pdfplumber
import fitz
import camelot
import warnings , math

warnings.filterwarnings("ignore", category=UserWarning) 

In [34]:
# Root of the local working copy; every other location in this notebook is
# built by string-concatenating a suffix onto `path`.
path = r"C:\Users\rando\OneDrive\Documents\mywork-repo"

# Alternative root used on a different machine (kept for reference):
#path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\pdf-extractor"

# Input fact sheets, written with a leading backslash so that
# `path + samco_path` / `path + tata_path` yields the full file name.
samco_path, tata_path = (
    r"\files\SamcoFactSheet2024.pdf",
    r"\files\TataFactSheet2024.pdf",
)

In [33]:
def extract_pdf_blocks(input_pdf_path):
    """
    Collect, for every page, the PyMuPDF content blocks and pdfplumber table boxes.

    The same file is opened twice: with PyMuPDF (``fitz``) to read the block
    dictionaries and the page rectangle, and with pdfplumber to locate tables.
    Pages from the two libraries are paired by index.

    Args:
        input_pdf_path (str): Path to the input PDF.

    Returns:
        list: One dictionary per page with the keys
            ``"blocks"``       - ``page.get_text("dict")["blocks"]`` from PyMuPDF,
            ``"table_bboxes"`` - the ``bbox`` of each table pdfplumber finds,
            ``"page_rect"``    - the PyMuPDF rectangle of the page.
    """
    blocks_data = []
    input_doc = fitz.open(input_pdf_path)

    # try/finally guarantees the PyMuPDF handle is released even when
    # pdfplumber fails to open the file or a page cannot be parsed.
    try:
        with pdfplumber.open(input_pdf_path) as pdf:
            for page_number, pdf_page in enumerate(pdf.pages):
                # Extract blocks of content using PyMuPDF
                doc_page = input_doc[page_number]
                blocks = doc_page.get_text("dict")["blocks"]

                # Append the blocks along with table bounding boxes from pdfplumber
                blocks_data.append({
                    "blocks": blocks,
                    "table_bboxes": [table.bbox for table in pdf_page.find_tables()],
                    "page_rect": doc_page.rect
                })
    finally:
        input_doc.close()

    return blocks_data

In [28]:
def remove_image_blocks_and_create_pdf(blocks_data, output_pdf_path):
    """
    Re-render the document as a text-only PDF, leaving out every image block.

    For each page a blank page of the same size is created and every text span
    is re-inserted with a built-in Helvetica variant chosen from the span's
    original font name. The new page is then read back so the returned blocks
    describe the generated PDF rather than the source.

    Args:
        blocks_data (list): Per-page dictionaries with at least the keys
            ``"blocks"`` and ``"page_rect"`` (as produced by
            ``extract_pdf_blocks``).
        output_pdf_path (str): Where the text-only PDF is saved.

    Returns:
        list: Updated block data without image blocks - one dictionary per page
            with the keys ``"blocks"`` and ``"page_rect"``.
    """
    # PyMuPDF's built-in Helvetica family, keyed by (bold, italic).
    # The previous "helv-b"/"helv-i" names are not valid built-in fonts, so
    # every styled span raised and silently fell back to regular "helv".
    helvetica_variants = {
        (False, False): "helv",
        (True, False): "hebo",
        (False, True): "heit",
        (True, True): "hebi",
    }

    output_doc = fitz.open()
    output_blocks_data = []

    try:
        for page_data in blocks_data:
            page_rect = page_data["page_rect"]

            # Create a new page in the output PDF
            output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

            for block in page_data["blocks"]:
                if "image" in block:
                    continue  # skip the image box

                # Blocks without "lines" carry no text to re-insert
                for line in block.get("lines", []):
                    for span in line["spans"]:
                        bbox = span.get("bbox", (0, 0, 0, 0))
                        # insert_text anchors at the baseline start of the first
                        # character, which is the span's "origin"; the bbox top
                        # would shift every line upwards by its ascent.
                        insert_at = span.get("origin", (bbox[0], bbox[3]))
                        text = span["text"]
                        # Font size is rounded up to a whole number (12 if absent)
                        size = math.ceil(float(span.get("size", 12)))
                        color = span.get("color", (0, 0, 0))  # Default color (black)
                        font = span.get("font", "helv").lower()

                        # "light" faces are no longer mapped to bold: a light
                        # weight is the opposite of bold, and they effectively
                        # rendered as regular before because of the fallback.
                        is_bold = "bold" in font
                        is_italic = "italic" in font or "oblique" in font
                        fontname = helvetica_variants[(is_bold, is_italic)]

                        # Unpack an integer sRGB value into (r, g, b); masking
                        # with 255 also handles negative (signed) integers.
                        if isinstance(color, int):
                            color = (
                                (color >> 16) & 255, (color >> 8) & 255, color & 255
                            )
                        color = tuple(c / 255 for c in color)  # Convert to 0-1 range

                        try:
                            output_page.insert_text(
                                insert_at,
                                text,
                                fontsize=size,
                                fontname=fontname,
                                color=color,
                            )
                        except Exception:
                            # Best effort: retry with the plain built-in font
                            output_page.insert_text(
                                insert_at,
                                text,
                                fontsize=size,
                                fontname="helv",  # Fallback font
                                color=color,
                            )

            # After processing the page, extract text blocks from the newly created output page
            output_blocks = output_page.get_text("dict")["blocks"]
            output_blocks_data.append({
                "blocks": output_blocks,
                "page_rect": page_rect})

        # Save the output PDF
        output_doc.save(output_pdf_path)
    finally:
        # Release the in-memory document even if rendering or saving failed
        output_doc.close()

    return output_blocks_data

In [35]:
# Output file names for each fact sheet, in the order
# (image-free PDF, text-only PDF, tables-only PDF).
# These used to live in two separate cells that assigned the same three
# variables, so a top-to-bottom run always ended with the Tata names even
# though the extraction cell below reads the Samco fact sheet.
OUTPUT_FILES = {
    "samco": (
        r"\output\NoImgPdf.pdf",
        r"\output\TextualPdf.pdf",
        r"\output\TabularPdf.pdf",
    ),
    "tata": (
        r"\output\TatanoImgPdf.pdf",
        r"\output\TatatextalPdf.pdf",
        r"\output\TatatabularPdf.pdf",
    ),
}

# Fact sheet currently being processed; keep it in step with the input PDF
# passed to extract_pdf_blocks.
ACTIVE_FACT_SHEET = "samco"

no_image_path, textual_pdf_path, tabular_pdf_path = (
    path + suffix for suffix in OUTPUT_FILES[ACTIVE_FACT_SHEET]
)

In [37]:
# Read every page of the Samco fact sheet, then write an image-free copy to
# `no_image_path` and keep the blocks of that generated copy.
blocks_data = extract_pdf_blocks(input_pdf_path=path + samco_path)
non_image_blocks_data = remove_image_blocks_and_create_pdf(
    blocks_data=blocks_data,
    output_pdf_path=no_image_path,
)
print("\n Success !!")


 Success !!


In [None]:
# Dump every block, page by page, for manual inspection.
#blk_data = blocks_data #pdf_blocks
# The variable assigned by the extraction cell is `non_image_blocks_data`;
# the earlier spelling without the underscore raised NameError.
blk_data = non_image_blocks_data #pdf minus images
print(len(blk_data))
for pgn, page in enumerate(blk_data):
    print(f"\nPage:{pgn}")
    for block in page['blocks']:
       print(f"\n{block}")

In [38]:
"""Returns:list: Block data containing only non-tabular content."""
def create_non_tabular_pdf(blocks_data, output_pdf_path):
    """
    Write a PDF containing only the text spans that lie outside every table.

    Args:
        blocks_data (list): Per-page dictionaries with the keys ``"blocks"``,
            ``"page_rect"`` and, optionally, ``"table_bboxes"``. When the
            table boxes are missing, every span counts as non-tabular.
        output_pdf_path (str): Where the resulting PDF is saved.

    Returns:
        list: Block data containing only non-tabular content - one list per
            page holding each block that contributed at least one span.
    """
    return _write_spans_by_table_membership(
        blocks_data, output_pdf_path, keep_table_spans=False
    )


def create_tabular_pdf(blocks_data, output_pdf_path):
    """
    Write a PDF containing only the text spans that lie inside a table.

    Args:
        blocks_data (list): Per-page dictionaries with the keys ``"blocks"``,
            ``"page_rect"`` and, optionally, ``"table_bboxes"``. When the
            table boxes are missing, no span counts as tabular.
        output_pdf_path (str): Where the resulting PDF is saved.

    Returns:
        list: Block data containing only tabular content - one list per page
            holding each block that contributed at least one span.
    """
    return _write_spans_by_table_membership(
        blocks_data, output_pdf_path, keep_table_spans=True
    )


def _bbox_within(inner, outer):
    """Return True when rectangle `inner` lies completely inside `outer`."""
    return (
        inner[0] >= outer[0] and
        inner[1] >= outer[1] and
        inner[2] <= outer[2] and
        inner[3] <= outer[3]
    )


def _write_spans_by_table_membership(blocks_data, output_pdf_path, keep_table_spans):
    """
    Shared worker: copy the spans whose "inside a table" status equals
    `keep_table_spans` onto fresh pages and save them to `output_pdf_path`.

    Returns one list of blocks per page. A block is listed once when any of
    its spans was copied (it used to be appended once per copied span, which
    produced duplicates). A block with spans on both sides of a table edge
    therefore shows up in the tabular and in the non-tabular result.
    """
    output_doc = fitz.open()
    kept_blocks_per_page = []

    try:
        for page_data in blocks_data:
            page_rect = page_data["page_rect"]
            # Tolerate page dictionaries that carry no table information
            table_bboxes = page_data.get("table_bboxes", [])

            # Create a new page in the output PDF
            output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

            page_kept_blocks = []

            for block in page_data["blocks"]:
                block_recorded = False

                # Only text blocks have "lines"; image blocks are skipped
                for line in block.get("lines", []):
                    for span in line["spans"]:
                        bbox = span.get("bbox", (0, 0, 0, 0))

                        # A span is tabular when it is fully contained in a table box
                        inside_table = any(
                            _bbox_within(bbox, table_bbox) for table_bbox in table_bboxes
                        )
                        if inside_table != keep_table_spans:
                            continue

                        if not block_recorded:
                            page_kept_blocks.append(block)
                            block_recorded = True

                        # insert_text anchors at the baseline start of the first
                        # character, which is the span's "origin"; the bbox top
                        # would place the text one ascent too high.
                        insert_at = span.get("origin", (bbox[0], bbox[3]))
                        text = span["text"]
                        # Font size is rounded up to a whole number (12 if absent)
                        size = math.ceil(float(span.get("size", 12)))
                        color = span.get("color", (0, 0, 0))
                        font = span.get("font", "helv")

                        # Unpack an integer sRGB value into (r, g, b); masking
                        # with 255 also handles negative (signed) integers.
                        if isinstance(color, int):
                            color = (
                                (color >> 16) & 255, (color >> 8) & 255, color & 255
                            )
                        color = tuple(c / 255 for c in color)

                        try:
                            # Try the span's own font name first
                            output_page.insert_text(
                                insert_at,
                                text,
                                fontsize=size,
                                fontname=font,
                                color=color,
                            )
                        except Exception:
                            # Best effort: fall back to built-in Helvetica
                            output_page.insert_text(
                                insert_at,
                                text,
                                fontsize=size,
                                fontname="helv",
                                color=color,
                            )

            kept_blocks_per_page.append(page_kept_blocks)

        # Save the output PDF
        output_doc.save(output_pdf_path)
    finally:
        # Release the in-memory document even if rendering or saving failed
        output_doc.close()

    return kept_blocks_per_page

In [11]:
# Write the text found outside every detected table to `textual_pdf_path`.
# The returned block list is not needed here and is discarded.
create_non_tabular_pdf(
    blocks_data=blocks_data,
    output_pdf_path=textual_pdf_path,
)
print("\nCode Successful")


Code Successful


In [39]:
# Write the text found inside detected tables to `tabular_pdf_path` and keep
# the per-page blocks for the table-structure analysis further down.
tabular_block_data = create_tabular_pdf(
    blocks_data=blocks_data,
    output_pdf_path=tabular_pdf_path,
)
print("\nCode Successful")


Code Successful


In [50]:
# Number of per-page entries in the tabular result, followed by a blank line.
print(len(tabular_block_data), end="\n\n")

24
[{'number': 51, 'type': 0, 'bbox': (258.4447937011719, 137.28489685058594, 418.40057373046875, 145.8828887939453), 'lines': [{'spans': [{'size': 6.0, 'flags': 20, 'font': 'Inter-Bold', 'color': -12220216, 'ascender': 1.1130000352859497, 'descender': -0.3199999928474426, 'text': 'Issuer', 'origin': (258.4447937011719, 143.962890625), 'bbox': (258.4447937011719, 137.28489685058594, 276.6416320800781, 145.8828887939453)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (258.4447937011719, 137.28489685058594, 276.6416320800781, 145.8828887939453)}, {'spans': [{'size': 6.0, 'flags': 20, 'font': 'Inter-Bold', 'color': -12220216, 'ascender': 1.1130000352859497, 'descender': -0.3199999928474426, 'text': 'Industry / Rating', 'origin': (369.7447814941406, 143.962890625), 'bbox': (369.7447814941406, 137.28489685058594, 418.40057373046875, 145.8828887939453)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (369.7447814941406, 137.28489685058594, 418.40057373046875, 145.8828887939453)}]}, {'number': 51, 'type'

Code for detecting rows and columns

In [13]:
def detect_table_structure(tabular_blocks, col_tolerance=10, row_tolerance=5):
    """
    Estimate how many columns and rows a set of table blocks forms.

    Columns are counted by clustering the left edges (``bbox[0]``) of all
    blocks, rows by clustering their top edges (``bbox[1]``). Within the
    sorted coordinates, a value starts a new cluster when it is at least the
    tolerance away from the value before it.

    The earlier implementation walked the blocks in reading order, so every
    jump in x between neighbouring blocks counted as a new column, and the
    same row list was appended again after every column; a 2x2 grid was
    reported as (4, 5). It also sorted the caller's list in place, which this
    version no longer does.

    Args:
        tabular_blocks (list): List of blocks (dicts) with a ``'bbox'`` entry
            whose first two items are the left and top coordinates.
        col_tolerance (int): Tolerance to group columns based on x-coordinate.
        row_tolerance (int): Tolerance to group rows based on y-coordinate.

    Returns:
        tuple: (num_columns, num_rows) - The number of columns and rows
            detected; (0, 0) for an empty input.
    """
    def count_clusters(coordinates, tolerance):
        # Count runs of sorted values whose neighbours are closer than `tolerance`
        clusters = 0
        previous = None
        for value in sorted(coordinates):
            if previous is None or abs(value - previous) >= tolerance:
                clusters += 1
            previous = value
        return clusters

    # Step 1: distinct left edges -> columns
    num_columns = count_clusters(
        (block['bbox'][0] for block in tabular_blocks), col_tolerance
    )

    # Step 2: distinct top edges -> rows
    num_rows = count_clusters(
        (block['bbox'][1] for block in tabular_blocks), row_tolerance
    )

    return num_columns, num_rows
