In [28]:
import pdfplumber
import fitz
import camelot
import warnings , math, collections

warnings.filterwarnings("ignore", category=UserWarning) 

In [4]:
# Repository root on the local machine; every input and output location in
# this notebook is built by appending a suffix to `path`.
# Alternative root kept for reference:
#path = r"C:\Users\rando\OneDrive\Documents\mywork-repo"

path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo"

# Fact-sheet PDFs. The leading backslash lets each value be appended to `path`
# with plain string concatenation (e.g. `path + samco_path`).
samco_path = r"\files\SamcoFactSheet2024.pdf"
tata_path = r"\files\TataFactSheet2024.pdf"

In [59]:
def extract_pdf_blocks(input_pdf_path):
    """
    Open the PDF and collect, page by page, its content blocks and summary data.

    The file is read twice: PyMuPDF supplies the block dictionaries and the
    page rectangle, pdfplumber supplies the table bounding boxes.

    Args:
        input_pdf_path (str): Path to the input PDF.

    Returns:
        list: One dictionary per page with the keys
            "blocks"       - the page's ``get_text("dict")["blocks"]`` list,
            "table_bboxes" - bbox of every table pdfplumber finds on the page,
            "page_rect"    - the PyMuPDF page rectangle,
            "page_colors"  - {span color: number of spans using it},
            "page_sizes"   - {span size: number of spans using it},
            "total_pages"  - page count of the document (same on every entry).
    """
    blocks_data = []

    input_doc = fitz.open(input_pdf_path)
    try:
        # Loop invariant: read the page count once instead of once per page.
        total_pages = input_doc.page_count

        with pdfplumber.open(input_pdf_path) as pdf:
            for page_number, pdf_page in enumerate(pdf.pages):
                # Extract blocks of content using PyMuPDF
                doc_page = input_doc[page_number]
                blocks = doc_page.get_text("dict")["blocks"]

                # Tally the color and size of every text span on the page.
                page_colors = collections.Counter()
                page_sizes = collections.Counter()
                for block in blocks:
                    # Blocks without "lines" contribute no spans.
                    for line in block.get("lines", ()):
                        for span in line["spans"]:
                            page_colors[span["color"]] += 1
                            page_sizes[span["size"]] += 1

                # Append the blocks along with table bounding boxes from pdfplumber
                blocks_data.append({
                    "blocks": blocks,
                    "table_bboxes": [table.bbox for table in pdf_page.find_tables()],
                    "page_rect": doc_page.rect,
                    "page_colors": dict(page_colors),
                    "page_sizes": dict(page_sizes),
                    "total_pages": total_pages,
                })
    finally:
        # Release the PyMuPDF handle even when a page fails to parse.
        input_doc.close()

    return blocks_data

In [14]:
def remove_image_blocks_and_create_pdf(blocks_data, output_pdf_path):
    """
    Rebuild each page as a text-only PDF, leaving image blocks out.

    Every text span is redrawn with a built-in Helvetica variant; image blocks
    are collected separately instead of being drawn.

    Args:
        blocks_data (list): Per-page dictionaries providing "page_rect",
            "blocks" and "table_bboxes" (the shape returned by
            extract_pdf_blocks).
        output_pdf_path (str): Where the rebuilt PDF is saved.

    Returns:
        tuple: ``(output_blocks_data, image_blocks_data)``.
            ``output_blocks_data`` holds one dictionary per page with the
            blocks re-extracted from the rebuilt page ("blocks"), plus the
            input "page_rect" and "table_bboxes".
            ``image_blocks_data`` holds one list per page with the image
            blocks that were skipped.

    Note:
        The "image" entry is deleted from each image block in place, so the
        caller's ``blocks_data`` is modified as well.
    """
    # PyMuPDF built-in Helvetica codes keyed by (bold, italic). The previous
    # "helv-b" / "helv-i" names are not built-in font codes, so the styled
    # insert always failed and everything fell back to plain "helv".
    helvetica_variants = {
        (False, False): "helv",
        (True, False): "hebo",
        (False, True): "heit",
        (True, True): "hebi",
    }

    output_doc = fitz.open()
    output_blocks_data = []
    image_blocks_data = []

    try:
        for page_data in blocks_data:
            page_rect = page_data["page_rect"]
            tablebbox = page_data["table_bboxes"]

            # Create a new page in the output PDF
            output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

            page_image_data = []

            for block in page_data["blocks"]:
                if "image" in block:
                    # Keep the block's metadata, drop its image payload, and
                    # do not draw anything for it.
                    page_image_data.append(block)
                    del block["image"]
                    continue

                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        bbox = span.get("bbox", [0, 0, 0, 0])
                        text = span["text"]
                        size = math.ceil(float(span.get("size", 12)))  # Default font size if not provided
                        color = span.get("color", (0, 0, 0))  # Default color (black)
                        font = span.get("font", "helv").lower()

                        # NOTE(review): "light" is treated as bold, as in the
                        # original condition - confirm this is intended.
                        is_bold = "bold" in font or "light" in font
                        is_italic = "italic" in font
                        fontname = helvetica_variants[(is_bold, is_italic)]

                        # Normalize color if it's in integer form (e.g. 0x000000)
                        if isinstance(color, int):
                            color = ((color >> 16) & 255, (color >> 8) & 255, color & 255)
                        color = tuple(c / 255 for c in color)  # Convert to 0-1 range

                        # insert_text() expects the baseline start of the first
                        # character, not the top-left corner of the bbox; use
                        # the span origin and fall back to the bbox bottom-left.
                        start = span.get("origin") or (bbox[0], bbox[3])

                        try:
                            output_page.insert_text(
                                start,
                                text,
                                fontsize=size,
                                fontname=fontname,
                                color=color,
                            )
                        except Exception:
                            # Best effort: retry with the plain fallback font.
                            output_page.insert_text(
                                start,
                                text,
                                fontsize=size,
                                fontname="helv",
                                color=color,
                            )

            # One list of skipped image blocks per page
            image_blocks_data.append(page_image_data)

            # After processing the page, extract text blocks from the newly created output page
            output_blocks_data.append({
                "blocks": output_page.get_text("dict")["blocks"],
                "page_rect": page_rect,
                "table_bboxes": tablebbox,
            })

        # Save the output PDF
        output_doc.save(output_pdf_path)
    finally:
        # Close the document even if drawing or saving failed.
        output_doc.close()

    return output_blocks_data, image_blocks_data

In [16]:
#samco
# Output PDF locations for the Samco fact sheet run. The Tata cell assigns the
# same three names, so whichever of the two cells ran last decides where the
# generated PDFs are written.
no_image_path = path +r"\output\NoImgPdf.pdf"
textual_pdf_path = path + r"\output\TextualPdf.pdf"
tabular_pdf_path = path + r"\output\TabularPdf.pdf"

In [8]:
#tata
# Output PDF locations for the Tata fact sheet run. These reuse the variable
# names of the Samco cell, so running this cell redirects all later output.
no_image_path = path +r"\output\TatanoImgPdf.pdf"
textual_pdf_path = path + r"\output\TatatextalPdf.pdf"
tabular_pdf_path = path + r"\output\TatatabularPdf.pdf"

In [60]:
# Parse the Samco fact sheet into per-page block data.
blocks_data = extract_pdf_blocks(path + samco_path)
# Optional step, currently disabled: rebuild the PDF without image blocks.
#text_data, image_data = remove_image_blocks_and_create_pdf(blocks_data, no_image_path)
print("\n Success !!")


 Success !!


In [70]:
# Remove redundant content.
# Span texts that create_non_tabular_pdf leaves out of the textual PDF. That
# function compares each span's text after .strip(), so the entries are stored
# stripped too; an entry with a trailing space could never match.
removeContent = [
    entry.strip()
    for entry in (
        'Mutual fund investments are subject to market risks, read all scheme related documents carefully. ',
        '(An open-ended scheme investing across large cap, midcap and small cap stocks)',
        "(An open-ended Equity Linked Saving Scheme with a statutory lock-in of 3 years and tax benefit.)",
        "(An open-ended dynamic equity scheme investing across large cap, mid cap, small cap stocks)",
        "Disclaimer",
        "(An open-ended equity scheme following momentum theme)",
    )
]

# Line directions accepted as text: the four axis-aligned unit vectors.
textDirection = [(1.0,0), (-1.0,0),(0,1.0),(0,-1.0)]

# Total page count of the file; 0 when nothing was extracted.
page_count = blocks_data[0]['total_pages'] if blocks_data else 0

# Drop bare page numbers as well. The range is inclusive of page_count so the
# number printed on the last page is covered; "0" is kept for compatibility
# with the previous list.
removeContent.extend(str(page_number) for page_number in range(page_count + 1))


In [11]:
# Debug dump: print the page count, then every block of every page.
# Switch the data source by swapping which of the two assignments is active.
# NOTE(review): text_data is only assigned by the remove_image_blocks_and_create_pdf
# call that is commented out in the extraction cell - confirm it exists before running.
#blk_data = blocks_data #pdf_blocks
blk_data = text_data #pdf minus images
print(len(blk_data))
for pgn, page in enumerate(blk_data):
    print(f"\nPage:{pgn}")
    for block in page['blocks']:
       print(f"\n{block}")

In [71]:

"""Returns:list: Block data containing only non-tabular content."""
def create_non_tabular_pdf(blocks_data, output_pdf_path):
    """
    Write a PDF that keeps only the text lying outside every detected table.

    A span is drawn when its stripped text is not listed in the module-level
    ``removeContent``, its line direction is listed in the module-level
    ``textDirection``, its bbox is not fully contained in any table bbox, and
    the same (text, rounded bbox) pair has not been drawn on the page yet.

    Args:
        blocks_data (list): Per-page dictionaries providing "page_rect",
            "blocks" and "table_bboxes".
        output_pdf_path (str): Where the generated PDF is saved.

    Returns:
        list: One list per page holding every block that contributed at least
            one drawn span. Blocks are returned as they came in, i.e. still
            including any spans that were filtered out.
    """
    # Built once per call: a set gives O(1) membership tests, and stripping the
    # entries makes them comparable with the stripped span text below.
    skip_texts = {entry.strip() for entry in removeContent}

    output_doc = fitz.open()
    non_tabular_blocks = []

    try:
        for page_data in blocks_data:
            page_rect = page_data["page_rect"]
            table_bboxes = page_data["table_bboxes"]

            # Create a new page in the output PDF
            output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

            page_non_tabular_blocks = []
            written_texts = set()   # (text, rounded bbox) of spans already drawn
            written_blocks = set()  # hashes of blocks already kept

            for block in page_data["blocks"]:
                block_key = hash(str(block.get("bbox", "")) + str(block.get("lines", [])))

                # Avoid reprocessing the same block
                if block_key in written_blocks:
                    continue

                is_non_tabular_block = False

                # Blocks without "lines" (e.g. images) have nothing to draw.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        text = span["text"].strip()  # Normalize text
                        if text in skip_texts or line["dir"] not in textDirection:
                            continue

                        bbox = span.get("bbox", [0, 0, 0, 0])

                        # A span counts as tabular only when a table bbox
                        # fully contains it.
                        inside_table = any(
                            bbox[0] >= table_bbox[0]
                            and bbox[1] >= table_bbox[1]
                            and bbox[2] <= table_bbox[2]
                            and bbox[3] <= table_bbox[3]
                            for table_bbox in table_bboxes
                        )
                        if inside_table:
                            continue

                        span_key = (text, tuple(map(round, bbox)))
                        if span_key in written_texts:
                            continue
                        written_texts.add(span_key)
                        is_non_tabular_block = True

                        size = float(span.get("size", 12))
                        color = span.get("color", (0, 0, 0))
                        font = span.get("font", "helv").lower()

                        # Normalize an integer sRGB color to a 0-1 tuple
                        if isinstance(color, int):
                            color = ((color >> 16) & 255, (color >> 8) & 255, color & 255)
                        color = tuple(c / 255 for c in color)

                        # insert_text() expects the baseline start of the first
                        # character, not the top-left corner of the bbox; use
                        # the span origin and fall back to the bbox bottom-left.
                        start = span.get("origin") or (bbox[0], bbox[3])

                        try:
                            output_page.insert_text(
                                start,
                                text,
                                fontsize=size,
                                fontname=font,
                                color=color,
                            )
                        except Exception:
                            # The span's own font name is usually not available
                            # to PyMuPDF; retry with the built-in Helvetica.
                            output_page.insert_text(
                                start,
                                text,
                                fontsize=size,
                                fontname="helv",
                                color=color,
                            )

                if is_non_tabular_block:
                    page_non_tabular_blocks.append(block)
                    written_blocks.add(block_key)  # Mark this block as processed

            # Append only unique non-tabular blocks for this page
            non_tabular_blocks.append(page_non_tabular_blocks)

        # Save the output PDF
        output_doc.save(output_pdf_path)
    finally:
        # Close the document even if drawing or saving failed.
        output_doc.close()

    return non_tabular_blocks


"""Returns:list: Block data containing only tabular content."""
def create_tabular_pdf(blocks_data, output_pdf_path):
    """
    Write a PDF that keeps only the text lying inside a detected table.

    A span is drawn when its bbox is fully contained in one of the page's
    table bboxes and the same (text, rounded bbox) pair has not been drawn on
    the page yet. Each table bbox is then outlined in light green.

    Args:
        blocks_data (list): Per-page dictionaries providing "page_rect",
            "blocks" and "table_bboxes".
        output_pdf_path (str): Where the generated PDF is saved.

    Returns:
        list: One list per page holding every block that contributed at least
            one drawn span. Blocks are returned as they came in, i.e. still
            including any spans that lie outside the tables.
    """
    light_green = (0.5, 1, 0.5)  # RGB outline color for table bounding boxes

    output_doc = fitz.open()
    tabular_blocks = []

    try:
        for page_data in blocks_data:
            page_rect = page_data["page_rect"]
            table_bboxes = page_data["table_bboxes"]

            # Create a new page in the output PDF
            output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

            page_tabular_blocks = []
            written_texts = set()   # (text, rounded bbox) of spans already drawn
            # Block hashes get their own set (they used to share written_texts
            # with the span keys), matching create_non_tabular_pdf.
            written_blocks = set()

            for block in page_data["blocks"]:
                block_key = hash(str(block.get("bbox", "")) + str(block.get("lines", [])))

                # Avoid reprocessing the same block
                if block_key in written_blocks:
                    continue

                is_tabular_block = False

                # Blocks without "lines" (e.g. images) have nothing to draw.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        bbox = span.get("bbox", [0, 0, 0, 0])
                        text = span["text"].strip()  # Normalize text

                        # A span counts as tabular only when a table bbox
                        # fully contains it.
                        inside_table = any(
                            bbox[0] >= table_bbox[0]
                            and bbox[1] >= table_bbox[1]
                            and bbox[2] <= table_bbox[2]
                            and bbox[3] <= table_bbox[3]
                            for table_bbox in table_bboxes
                        )
                        if not inside_table:
                            continue

                        span_key = (text, tuple(map(round, bbox)))
                        if span_key in written_texts:
                            continue
                        written_texts.add(span_key)
                        is_tabular_block = True

                        size = float(span.get("size", 12))
                        color = span.get("color", (0, 0, 0))
                        font = span.get("font", "helv").lower()

                        # Normalize an integer sRGB color to a 0-1 tuple
                        if isinstance(color, int):
                            color = ((color >> 16) & 255, (color >> 8) & 255, color & 255)
                        color = tuple(c / 255 for c in color)

                        # insert_text() expects the baseline start of the first
                        # character; starting at the bbox top-left would lift
                        # the text out of the table rectangle drawn below.
                        start = span.get("origin") or (bbox[0], bbox[3])

                        try:
                            output_page.insert_text(
                                start,
                                text,
                                fontsize=size,
                                fontname=font,
                                color=color,
                            )
                        except Exception:
                            # The span's own font name is usually not available
                            # to PyMuPDF; retry with the built-in Helvetica.
                            output_page.insert_text(
                                start,
                                text,
                                fontsize=size,
                                fontname="helv",
                                color=color,
                            )

                if is_tabular_block:
                    page_tabular_blocks.append(block)
                    written_blocks.add(block_key)  # Mark the block as processed

            # Outline every table bounding box
            for table_bbox in table_bboxes:
                output_page.draw_rect(fitz.Rect(table_bbox), color=light_green, width=1)

            # Append only unique tabular blocks for this page
            tabular_blocks.append(page_tabular_blocks)

        # Save the output PDF
        output_doc.save(output_pdf_path)
    finally:
        # Close the document even if drawing or saving failed.
        output_doc.close()

    return tabular_blocks


In [72]:
# Write the PDF that keeps only text outside the detected tables; the blocks
# that were kept are returned per page.
textual_block_data = create_non_tabular_pdf(blocks_data, textual_pdf_path)
# print("\nCode Successful")

# Tabular counterpart, currently disabled. It reads text_data, which comes
# from the image-removal step.
#tabular_block_data = create_tabular_pdf(text_data, tabular_pdf_path)
print("\nCode Successful")


Code Successful


In [22]:
# Display the non-tabular blocks kept for the second page (index 1).
textual_block_data[1]

In [17]:
def extract_tables(blocks):
    """
    Dump the text of the given blocks to a temporary file and run Camelot on it.

    The text of all spans in a line is concatenated and every line ends with a
    newline. The dump is written as UTF-8 to a temporary file, that same file
    is passed to ``camelot.read_pdf`` and it is deleted afterwards.

    NOTE(review): the dump is plain text, not a PDF, so ``camelot.read_pdf``
    is expected to reject it and this function to return ``[]`` - TODO render
    the blocks into a real PDF before calling Camelot.

    Args:
        blocks: List of blocks extracted from the PDF using PyMuPDF. Blocks
            without a "lines" entry are ignored.

    Returns:
        Whatever ``camelot.read_pdf`` returns on success, otherwise an empty
        list (the error is printed, not raised).
    """
    import contextlib
    import os
    import tempfile

    # One entry per text line; join() avoids quadratic string concatenation.
    text_lines = []
    for block in blocks:
        for line in block.get("lines", ()):  # only text blocks have "lines"
            text_lines.append("".join(span["text"] for span in line["spans"]))
    table_text = "".join(f"{text_line}\n" for text_line in text_lines)

    # A temp file works on any machine, unlike a path inside one user's
    # profile, and guarantees the file written is the file that is read.
    file_descriptor, temp_path = tempfile.mkstemp(prefix="temp_table_", suffix=".txt")
    try:
        # Explicit UTF-8: the platform default encoding can fail on symbols
        # such as the rupee sign.
        with os.fdopen(file_descriptor, "w", encoding="utf-8") as handle:
            handle.write(table_text)

        try:
            return camelot.read_pdf(temp_path, flavor='lattice', pages='1')
        except Exception as e:
            print(f"Error extracting tables: {e}")
            return []
    finally:
        # Best-effort cleanup; a failed delete must not mask the result.
        with contextlib.suppress(OSError):
            os.remove(temp_path)