In [18]:
import os

# Project root. Defaults to the original author's checkout; set the
# PDF_EXTRACTOR_ROOT environment variable to run the notebook from another
# machine or directory without editing this cell.
path = os.environ.get(
    "PDF_EXTRACTOR_ROOT",
    r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\office-projects\pdf-extractor",
)

# Input factsheets, expressed relative to `path`. The leading backslash is
# intentional: these are appended to `path` by plain string concatenation.
samco_path = r"\data\SamcoFactsheetOct24.pdf"
tata_path = r"\data\TataFactNov24.pdf"

In [15]:
import pdfplumber
import fitz  # PyMuPDF

def _span_rgb(span):
    """Return the span's colour as an (r, g, b) tuple of floats in the range 0..1."""
    color = span.get('color', (0, 0, 0))  # Default colour (black) if not provided
    if isinstance(color, int):
        # Packed 0xRRGGBB integer -> separate 0..255 channels.
        color = ((color >> 16) & 255, (color >> 8) & 255, color & 255)
    return tuple(channel / 255 for channel in color)


def _bbox_inside_any(bbox, containers):
    """Return True when ``bbox`` lies entirely within at least one bbox in ``containers``."""
    return any(
        bbox[0] >= outer[0] and bbox[1] >= outer[1]
        and bbox[2] <= outer[2] and bbox[3] <= outer[3]
        for outer in containers
    )


def _insert_span(page, span):
    """Re-draw one PyMuPDF text span onto ``page``, keeping position, size and colour."""
    bbox = span.get('bbox', [0, 0, 0, 0])
    # insert_text() positions text by the baseline start of its first character,
    # not by the top-left corner of its bounding box. The span's own 'origin' is
    # exactly that point; the bbox bottom-left is only an approximation used when
    # 'origin' is missing.
    origin = span.get('origin', (bbox[0], bbox[3]))
    text = span['text']
    font = span.get('font', "helv")
    style = {
        'fontsize': span.get('size', 13),  # Default font size if not provided
        'color': _span_rgb(span),
    }
    # Built-in Helvetica, bold variant when the source font name says "Bold".
    fallback_font = "hebo" if 'Bold' in font else "helv"

    try:
        # Presumably only succeeds when the source font name happens to be one
        # PyMuPDF can resolve without a font file; verify against PyMuPDF docs.
        page.insert_text(origin, text, fontname=font, **style)
    except Exception:
        # Best effort: embedded/subset font names cannot be reused on a new
        # page, so fall back to a built-in font rather than dropping the text.
        page.insert_text(origin, text, fontname=fallback_font, **style)


def detect_tables_and_generate_pdfs(input_pdf_path: str, output_tabular_pdf: str, output_non_tabular_pdf: str) -> None:
    """
    Detect tables in a PDF using pdfplumber and generate two PDFs:
    - One with only tabular data (text spans fully inside a table bounding box).
    - One with all non-tabular data (every other text span).

    Text is re-drawn span by span with PyMuPDF onto blank pages of the same
    size as the source pages. Image blocks are skipped, so neither output
    contains images.

    Args:
        input_pdf_path (str): Path to the input PDF.
        output_tabular_pdf (str): Path to save the PDF with only tabular data.
        output_non_tabular_pdf (str): Path to save the PDF with non-tabular data.

    Returns:
        None. The two PDFs are written to disk and their paths are printed.

    Note:
        Table boxes come from pdfplumber while span boxes come from PyMuPDF;
        the comparison assumes both report coordinates in the same page
        space -- TODO confirm for rotated or cropped pages.
    """
    input_doc = tabular_output_doc = non_tabular_output_doc = None
    try:
        with pdfplumber.open(input_pdf_path) as pdf:
            # Open the original PDF a second time with PyMuPDF to read its text spans
            input_doc = fitz.open(input_pdf_path)

            # Create two new, empty documents for the outputs
            tabular_output_doc = fitz.open()
            non_tabular_output_doc = fitz.open()

            for page_number, pdf_page in enumerate(pdf.pages):
                # Bounding boxes of every table pdfplumber detects on this page
                table_bboxes = [table.bbox for table in pdf_page.find_tables()]

                # Matching page in the PyMuPDF view of the same file
                doc_page = input_doc[page_number]
                width, height = doc_page.rect.width, doc_page.rect.height

                # One blank page per source page in each output, so page
                # numbering stays aligned with the input
                tabular_page = tabular_output_doc.new_page(width=width, height=height)
                non_tabular_page = non_tabular_output_doc.new_page(width=width, height=height)

                for block in doc_page.get_text("dict")['blocks']:
                    # Skip images and anything that is not a text block
                    if 'image' in block or 'lines' not in block:
                        continue

                    for line in block['lines']:
                        for span in line['spans']:
                            span_bbox = span.get('bbox', [0, 0, 0, 0])
                            # A span counts as tabular only when it is fully
                            # contained in a table box; spans that straddle a
                            # table edge go to the non-tabular output.
                            if _bbox_inside_any(span_bbox, table_bboxes):
                                target_page = tabular_page
                            else:
                                target_page = non_tabular_page
                            _insert_span(target_page, span)

            # Save the output PDFs
            tabular_output_doc.save(output_tabular_pdf)
            non_tabular_output_doc.save(output_non_tabular_pdf)
    finally:
        # Close whatever was opened, even when detection, drawing or saving failed
        for doc in (tabular_output_doc, non_tabular_output_doc, input_doc):
            if doc is not None:
                doc.close()

    print(f"Tabular data PDF saved to: {output_tabular_pdf}")
    print(f"Non-tabular data PDF saved to: {output_non_tabular_pdf}")


In [20]:
# Source factsheet and the two destination files, all located under the project root.
input_path = f"{path}{tata_path}"
output_tabular = "".join([path, r"\output\tabular.pdf"])
output_nontabular = "".join([path, r"\output\nontabular.pdf"])

# Split the factsheet into a tables-only PDF and an everything-else PDF.
detect_tables_and_generate_pdfs(
    input_pdf_path=input_path,
    output_tabular_pdf=output_tabular,
    output_non_tabular_pdf=output_nontabular,
)

Tabular data PDF saved to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\office-projects\pdf-extractor\output\tabular.pdf
Non-tabular data PDF saved to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\office-projects\pdf-extractor\output\nontabular.pdf
