In [16]:
import pdfplumber
import fitz
import camelot
import warnings , math, collections , os
import pickle

# Suppress UserWarning messages only; other warning categories are still shown.
warnings.filterwarnings("ignore", category=UserWarning) 

# Root folder of the working copy. Every path below is joined to it with plain `+`.
# NOTE(review): absolute, machine-specific path -- must be edited before running on another machine.
path = r"C:\Users\rando\OneDrive\Documents\mywork-repo"
#path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo"


# Pickle output files, relative to `path` (the leading backslash is needed because
# the strings are concatenated, not joined with os.path.join).
pickle_path_text = r"\output\pkl\textual_data.pkl"
pickle_path_tab = r"\output\pkl\tabular_data.pkl"


# Input fact sheets, relative to `path`.
samco_path = r"\files\SamcoFactSheet2024.pdf"
tata_path = r"\files\TataFactSheet2024.pdf"


# Output PDF written by the boundary-drawing dry-run cells, relative to `path`.
dry_run_path = r"\output\DryRun.pdf"

IMPORTANT CLEANING FUNCTIONS

In [13]:
# Important cleaning data: strings that are compared for exact equality in
# check_if_redundant_text. The list is lower-cased further down, so any text
# compared against it has to be lower-cased as well.

removeContent =[
    'Mutual fund investments are subject to market risks, read all scheme related documents carefully.',
    '(An open-ended scheme investing across large cap, midcap and small cap stocks)',
    "(An open-ended Equity Linked Saving Scheme with a statutory lock-in of 3 years and tax benefit.)",
    "(An open-ended dynamic equity scheme investing across large cap, mid cap, small cap stocks)",
    "(An open-ended equity scheme following momentum theme)",
    "(An open-ended equity scheme following special situations theme)",
    ".",
    "st",
    "Note:",
    "Disclaimer",
    "93.72",
    "risk-o-meter",
    "scheme risk-o-meter",
    "*Investors should consult their financial advisers if in doubt about whether the product is suitable for them.",
    "94.87",
    "(An open-ended dynamic asset allocation fund)",
    "97.11",
    ":"
]

# Append every page number ("1" .. "<page count>") as a string to the list.
# NOTE(review): the page count is read from the Tata fact sheet, while the dry-run
# cells below extract the Samco fact sheet -- confirm this is intended.
document = fitz.open(path +tata_path)
total_pages = document.page_count
document.close()
for i in range(1,total_pages+1):
    removeContent.append(str(i))


# Normalise all entries to lower case.
removeContent = [content.lower() for content in removeContent]

# Page indices passed to extract_pdf_blocks by the dry-run cell; empty means no page is skipped.
pagesToIgnore = []

# Four axis-aligned unit vectors; not referenced anywhere in the visible cells.
# presumably the allowed text line directions -- TODO confirm against the code that uses it.
textDirection = [(1.0,0), (-1.0,0),(0,1.0),(0,-1.0)]

#regarding colors

def extract_rgb(color_int):
    """Unpack a 0xRRGGBB integer into a (red, green, blue) tuple of ints in 0..255."""
    # Shift each channel down to the low byte, then mask everything above it away.
    return tuple(int((color_int >> shift) & 0xFF) for shift in (16, 8, 0))
    
def adjust_color_if_white(rgb):
    """Replace a near-white colour with orange; return any other colour unchanged.

    A colour counts as near-white when every component is at least 230.
    """
    near_white_floor = 230
    looks_white = all(channel >= near_white_floor for channel in rgb)
    return (255, 165, 0) if looks_white else rgb

def is_white_or_shade(color):
    """Return True when every component of color is at least 240 (white or very close to it)."""
    cutoff = 240
    for channel in color:
        if not channel >= cutoff:
            return False
    return True



#regarding tables

def adjust_bbox(bbox, direction, pixels):
    """
    Move one edge of a bounding box and return the result as a rectangle.

    Args:
        bbox (tuple): Source box as (x0, y0, x1, y1).
        direction (str): Edge to move: 'top', 'bottom', 'left' or 'right'.
        pixels (int): Distance to move the edge. Positive values grow the box, negative values shrink it.

    Returns:
        fitz.Rect: The box with the chosen edge shifted.

    Raises:
        ValueError: If direction is not one of the four supported edges.
    """
    left, top, right, bottom = bbox

    if direction not in ('top', 'bottom', 'left', 'right'):
        raise ValueError("Invalid direction. Choose from 'top', 'bottom', 'left', 'right'.")

    # 'top' and 'left' grow towards smaller coordinates, 'bottom' and 'right' towards larger ones.
    if direction == 'top':
        top -= pixels
    if direction == 'bottom':
        bottom += pixels
    if direction == 'left':
        left -= pixels
    if direction == 'right':
        right += pixels

    return fitz.Rect(left, top, right, bottom)


def is_table_large_enough(table_bbox, min_width, min_height):
    """Tell whether a table bounding box exceeds both minimum dimensions.

    Args:
        table_bbox (tuple): Table box as (x0, y0, x1, y1).
        min_width (float): Width the table must strictly exceed.
        min_height (float): Height the table must strictly exceed.
    Returns:
        bool: True only when width > min_width and height > min_height.
    """
    left, top, right, bottom = table_bbox
    wide_enough = (right - left) > min_width
    return wide_enough and (bottom - top) > min_height


#other texts


def create_new_file(file_name):
    """Make sure an (initially empty) file exists under the module-level `path` folder.

    Missing parent directories are created. An existing file is left untouched.

    Args:
        file_name (str): File location relative to `path`.
    Returns:
        str: The joined full file name.
    """
    target = os.path.join(path, file_name)
    os.makedirs(os.path.dirname(target), exist_ok=True)

    # Only create the file when it is absent, so existing content is never truncated.
    if not os.path.exists(target):
        with open(target, 'w') as handle:
            handle.write("")

    return target

def check_if_redundant_text(text, removeContent):
    """Return False when text equals an entry of removeContent, True otherwise.

    NOTE(review): despite the name, True means the text is NOT redundant (keep it)
    and False means it matched a removal entry.
    """
    return not any(text == unwanted for unwanted in removeContent)

UTILITY AGGREGATOR

In [37]:
def process_text_attributes(blocks, table_bboxes=None):
    """Summarise which colours and (rounded) font sizes occur in the text spans of blocks.

    Args:
        blocks (list): Block dictionaries; blocks without a 'lines' key are skipped.
        table_bboxes (list | None): When given, only spans whose bbox lies fully inside
            one of these boxes are counted. None counts every span.
    Returns:
        dict: {"page_color_summary": [...], "page_size_summary": [...]}, each a list of
        (key, {'count': int, 'examples': [str, ...]}) tuples. Colours keep at most 4
        example texts, sizes at most 6.
    """
    color_stats = {}
    size_stats = {}

    def _record(stats, key, sample, cap):
        # First sighting creates the entry; later ones bump the count and
        # keep the sample only while fewer than `cap` examples are stored.
        entry = stats.get(key)
        if entry is None:
            stats[key] = {'count': 1, 'examples': [sample]}
            return
        entry['count'] += 1
        if len(entry['examples']) < cap:
            entry['examples'].append(sample)

    def _within_some_table(span):
        box = span.get("bbox", [0, 0, 0, 0])
        for table_box in table_bboxes:
            if (box[0] >= table_box[0] and
                    box[1] >= table_box[1] and
                    box[2] <= table_box[2] and
                    box[3] <= table_box[3]):
                return True
        return False

    for block in blocks:
        if 'lines' not in block:
            continue
        for line in block['lines']:
            for span in line['spans']:
                span_color = span['color']
                span_size = round(float(span['size']))
                span_text = span['text'].strip()

                # With table boxes supplied, text outside every table is ignored.
                if table_bboxes is not None and not _within_some_table(span):
                    continue

                _record(color_stats, span_color, span_text, 4)
                _record(size_stats, span_size, span_text, 6)

    # Sizes: largest size first.
    size_summary = sorted(
        size_stats.items(),
        key=lambda item: (item[0], -item[1]['count']),
        reverse=True,
    )
    # Colours: the negated count combined with reverse=True puts the LEAST frequent
    # colour first. NOTE(review): confirm that ascending-by-count is the intended order.
    color_summary = sorted(
        color_stats.items(),
        key=lambda item: (-item[1]['count'], item[0]),
        reverse=True,
    )

    return {
        "page_color_summary": color_summary,
        "page_size_summary": size_summary
    }


PDF DATA SEGREGATION AND EXTRACTION

In [38]:
"""Open the PDF and extract all blocks of text, images, and other content, while collecting examples of text for each font size and color.
Args:input_pdf_path (str): Path to the input PDF.
    page_to_ignore (list): List of 0-based page indices to ignore.
Returns:list: A list of pages, where each page is a dictionary containing blocks of content and examples of text for each color and size.
"""
def extract_pdf_blocks(input_pdf_path, page_to_ignore):
    """Collect the PyMuPDF blocks, table boxes and text-attribute summaries of a PDF.

    Args:
        input_pdf_path (str): Path to the input PDF.
        page_to_ignore (collection | None): 0-based page indices to skip.
            None or an empty collection processes every page.
    Returns:
        dict: {
            "blocks_data": one dict per processed page with "blocks", "table_bboxes",
                "page_rect", "page_color_summary" and "page_size_summary",
            "total_pages": page count of the input document (skipped pages included),
            "grand_page_colors" / "grand_page_sizes": summaries over all processed pages
        }
    """
    # Honour the argument: the previous version silently read the module-level
    # `pagesToIgnore` instead, so callers could not control which pages were skipped.
    skipped_pages = page_to_ignore if page_to_ignore is not None else ()

    document_blocks_data = []
    page_blocks_data = []
    input_doc = fitz.open(input_pdf_path)

    try:
        with pdfplumber.open(input_pdf_path) as pdf:
            for page_number, pdf_page in enumerate(pdf.pages):
                if page_number in skipped_pages:
                    continue

                doc_page = input_doc[page_number]
                blocks = doc_page.get_text("dict")["blocks"]

                # Per-page colour / size summary.
                text_attributes = process_text_attributes(blocks)

                # Keep a flat list of all blocks for the document-wide summary.
                document_blocks_data.extend(blocks)

                # Tables are detected by pdfplumber; each box is extended 10 units
                # upwards and kept only if it is larger than 90 x 30.
                table_bboxes = []
                for table in pdf_page.find_tables():
                    adjusted_bbox = adjust_bbox(table.bbox, 'top', 10)
                    if is_table_large_enough(adjusted_bbox, 90, 30):
                        table_bboxes.append(adjusted_bbox)

                page_blocks_data.append({
                    "blocks": blocks,
                    "table_bboxes": table_bboxes,
                    "page_rect": doc_page.rect,
                    "page_color_summary": text_attributes['page_color_summary'],
                    "page_size_summary": text_attributes['page_size_summary'],
                })

        grand_attributes = process_text_attributes(document_blocks_data)

        return {
            "blocks_data": page_blocks_data,
            "total_pages": input_doc.page_count,
            "grand_page_colors": grand_attributes['page_color_summary'],
            "grand_page_sizes": grand_attributes['page_size_summary']
        }
    finally:
        # Release the PyMuPDF handle even when extraction fails part-way.
        input_doc.close()

In [39]:
"""Removes all images from a PDF, preserving text and layout.
    Args:input_path (str): Path to the original PDF.
        output_path (str): Path to save the modified PDF.
    This function redacts images from the PDF to ensure layout preservation.
"""
def remove_images_from_pdf(input_path, output_path):
    """Write a copy of a PDF in which every image area is redacted with white.

    Args:
        input_path (str): Path to the original PDF.
        output_path (str): Path to save the modified PDF.
    """
    doc = fitz.open(input_path)
    try:
        for page in doc:
            queued = False
            for image in page.get_images(full=True):
                xref = image[0]
                for rect in page.get_image_rects(xref):
                    page.add_redact_annot(rect, fill=(1, 1, 1))
                    queued = True
            # Apply once per page, after all rectangles are queued. The previous
            # version applied redactions after every image, i.e. while still walking
            # the image list that was taken before the page content changed.
            if queued:
                page.apply_redactions()

        doc.save(output_path, garbage=4, deflate=True)
    finally:
        # Close the document even if redaction or saving fails.
        doc.close()


# Runs on cell execution: writes an image-free copy of the Samco fact sheet to output\removeImagePdf.pdf.
remove_images_from_pdf(path + samco_path , path + r"\output\removeImagePdf.pdf")

In [48]:
def create_pdf_file(document_data, output_pdf_path):
    """Re-render every text span of the extracted document into a new PDF.

    Each entry of document_data['blocks_data'] becomes one output page of the same
    size. Spans are drawn in a Helvetica variant matching their bold / italic style,
    at their rounded font size. White or near-white text is drawn in orange so it
    stays visible on the blank page.

    Args:
        document_data (dict): Document data with a "blocks_data" list of page dicts
            (each providing "page_rect" and "blocks").
        output_pdf_path (str): Path of the PDF to write.
    """
    # PyMuPDF Base-14 reference names for Helvetica, keyed by (bold, italic).
    # The former "helv-b" / "helv-i" names are not valid, so styled text always
    # ended up in the except branch and was rendered in regular Helvetica.
    helvetica_variants = {
        (False, False): "helv",
        (True, False): "hebo",
        (False, True): "heit",
        (True, True): "hebi",
    }

    output_doc = fitz.open()
    try:
        for page_data in document_data['blocks_data']:
            page_rect = page_data['page_rect']
            output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

            for block in page_data['blocks']:
                if "lines" not in block:
                    continue  # image blocks carry no text
                for line in block['lines']:
                    for span in line['spans']:
                        bbox = span.get("bbox", [0, 0, 0, 0])
                        text = span["text"]
                        size = round(float(span.get("size", 12)))  # rounded font size
                        color = span.get("color", (0, 0, 0))  # default: black
                        font = span.get("font", "helv").lower()

                        is_bold = "bold" in font
                        is_italic = "italic" in font or "oblique" in font
                        fontname = helvetica_variants[(is_bold, is_italic)]

                        # get_text("dict") reports colour as a packed sRGB integer.
                        if isinstance(color, int):
                            color = ((color >> 16) & 255, (color >> 8) & 255, color & 255)

                        if is_white_or_shade(color):
                            color = (255, 165, 0)  # orange instead of invisible white

                        color = tuple(c / 255 for c in color)  # insert_text wants 0..1 floats

                        # insert_text positions text by the baseline origin of its first
                        # character, not by the top-left bbox corner. Use the span's own
                        # origin, falling back to the bottom-left corner of the bbox.
                        origin = span.get("origin", (bbox[0], bbox[3]))

                        output_page.insert_text(
                            origin,
                            text,
                            fontsize=size,
                            fontname=fontname,
                            color=color,
                        )

        output_doc.save(output_pdf_path)
    finally:
        # Always release the in-memory document, also when rendering or saving fails.
        output_doc.close()
                    
                    

In [57]:
""" Removes image blocks from the provided blocks data and creates a PDF without these blocks.
    Args:
        document_blocks_data (dict): Document data whose "blocks_data" entry lists per-page dictionaries of text and image blocks.
    Returns:
        tuple: (document data without image blocks, list holding the removed image blocks of each page).
"""
def seperate_text_image_blocks(document_blocks_data):
    """Split the blocks of every page into image blocks and all remaining blocks.

    A block counts as an image block when it has an "image" key.

    Args:
        document_blocks_data (dict): Document data with "blocks_data", "total_pages",
            "grand_page_colors" and "grand_page_sizes".
    Returns:
        tuple: (document data of the same layout holding only non-image blocks,
                list with one list of removed image blocks per page).
    """
    pages_without_images = []
    images_per_page = []

    for page_data in document_blocks_data['blocks_data']:
        page_rect = page_data["page_rect"]
        page_blocks = page_data["blocks"]

        kept_blocks = []
        image_blocks = []
        for block in page_blocks:
            bucket = image_blocks if "image" in block.keys() else kept_blocks
            bucket.append(block)

        # Page metadata is carried over unchanged; only the block list is filtered.
        pages_without_images.append({
            "blocks": kept_blocks,
            "page_rect": page_rect,
            "table_bboxes": page_data['table_bboxes'],
            "page_color_summary": page_data['page_color_summary'],
            "page_size_summary": page_data['page_size_summary']
        })
        images_per_page.append(image_blocks)

    # Document-level fields are passed through as they are.
    filtered_document = {
        "blocks_data": pages_without_images,
        "total_pages": document_blocks_data['total_pages'],
        "grand_page_colors": document_blocks_data['grand_page_colors'],
        "grand_page_sizes": document_blocks_data['grand_page_sizes']
    }

    return filtered_document, images_per_page

In [44]:
"""Returns: list of dictionaries, each containing only tabular content for each page."""
def create_tabular_pdf(document_blocks_data, output_pdf_path):
    """Render only the text that lies inside table boxes into a new PDF.

    For every page, spans whose bbox is fully inside one of the page's table boxes
    are written to the output page, and the table boxes are outlined in light green.
    A block is kept in the returned data when at least one of its spans was written.

    Args:
        document_blocks_data (dict): Document data with a "blocks_data" list of page
            dicts (each providing "page_rect", "blocks" and "table_bboxes").
        output_pdf_path (str): Path of the PDF to write.
    Returns:
        dict: {"blocks_data": per-page dicts containing only tabular blocks,
               "total_pages", "grand_page_colors", "grand_page_sizes"}.
    """

    def _inside_any_table(span_bbox, table_bboxes):
        """True when span_bbox is fully contained in at least one table box."""
        return any(
            span_bbox[0] >= table_bbox[0] and
            span_bbox[1] >= table_bbox[1] and
            span_bbox[2] <= table_bbox[2] and
            span_bbox[3] <= table_bbox[3]
            for table_bbox in table_bboxes
        )

    def _to_unit_rgb(color):
        """Turn a packed sRGB integer (or 0..255 triple) into the 0..1 floats insert_text expects."""
        if isinstance(color, int):
            color = ((color >> 16) & 255, (color >> 8) & 255, color & 255)
        if is_white_or_shade(color):
            color = (255, 165, 0)  # same white -> orange substitution as the other renderers
        return tuple(c / 255 for c in color)

    output_doc = fitz.open()
    pages_data = []            # per-page result dictionaries
    document_tabular_blocks = []  # all tabular blocks, for the document-wide summary

    try:
        for page_data in document_blocks_data['blocks_data']:
            page_rect = page_data["page_rect"]
            blocks = page_data["blocks"]
            table_bboxes = page_data["table_bboxes"]

            output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

            page_tabular_blocks = []
            written_spans = set()   # (text, rounded bbox) of spans already drawn
            seen_blocks = set()     # keys of blocks already accepted as tabular

            for block in blocks:
                block_key = hash(str(block.get("bbox", "")) + str(block.get("lines", [])))
                if block_key in seen_blocks:
                    continue  # identical block was already processed

                is_tabular_block = False

                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        bbox = span.get("bbox")
                        if bbox is None or not _inside_any_table(bbox, table_bboxes):
                            continue

                        text = span["text"]
                        span_key = (text, tuple(map(round, bbox)))
                        if span_key in written_spans:
                            continue
                        written_spans.add(span_key)
                        is_tabular_block = True

                        size = span.get("size", 12)
                        color = _to_unit_rgb(span.get("color", 0))
                        font = span.get("font", "helv")
                        # insert_text is positioned by the text baseline origin,
                        # not by the top-left corner of the bbox.
                        origin = span.get("origin", (bbox[0], bbox[3]))

                        try:
                            output_page.insert_text(
                                origin, text, fontsize=size, fontname=font, color=color,
                            )
                        except Exception:
                            # Best effort: font names taken from the source PDF are usually
                            # not available to the new document, so fall back to Helvetica.
                            output_page.insert_text(
                                origin, text, fontsize=size, fontname="helv", color=color,
                            )

                if is_tabular_block:
                    page_tabular_blocks.append(block)
                    seen_blocks.add(block_key)

            text_attributes = process_text_attributes(page_tabular_blocks)
            document_tabular_blocks.extend(page_tabular_blocks)

            # Outline the table boxes in light green.
            light_green = (0.2, 1, 0.5)
            for table_bbox in table_bboxes:
                output_page.draw_rect(fitz.Rect(table_bbox), color=light_green, width=.4)

            pages_data.append({
                "blocks": page_tabular_blocks,
                "page_rect": page_rect,
                "table_bboxes": table_bboxes,
                "page_color_summary": text_attributes['page_color_summary'],
                "page_size_summary": text_attributes['page_size_summary']
            })

        output_doc.save(output_pdf_path)
    finally:
        # Release the in-memory document even when rendering or saving fails.
        output_doc.close()

    # Built after the loop so the result exists for a document without pages too
    # (it used to be assigned inside the loop and was undefined for zero pages).
    grand_attributes = process_text_attributes(document_tabular_blocks)
    return {
        "blocks_data": pages_data,
        "total_pages": document_blocks_data.get('total_pages', len(pages_data)),
        "grand_page_colors": grand_attributes['page_color_summary'],
        "grand_page_sizes": grand_attributes['page_size_summary']
    }


In [136]:
def create_textual_pdf(blocks_data, output_pdf_path):
    """Render only the text that lies outside all table boxes into a new PDF.

    For every page, spans whose bbox is NOT fully inside a table box are written to
    the output page (text stripped of surrounding whitespace), and the table boxes
    are outlined in light green. White or near-white text is drawn in orange.

    Args:
        blocks_data (list | dict): List of page dicts (each providing "page_rect",
            "blocks" and "table_bboxes"), or a document dict holding that list under
            "blocks_data".
        output_pdf_path (str): Path of the PDF to write.
    Returns:
        list: One dict per page with "blocks" (non-tabular blocks), "page_rect",
        "table_bboxes", "page_colors" and "page_sizes". The last two map a colour
        (0..1 float triple) / font size to {'example': str, 'count': int}.
    """
    # Also accept the document-level dict used by the other helpers in this notebook.
    if isinstance(blocks_data, dict):
        blocks_data = blocks_data['blocks_data']

    output_doc = fitz.open()
    pages_data = []  # per-page result dictionaries

    try:
        for page_data in blocks_data:
            page_rect = page_data["page_rect"]
            blocks = page_data["blocks"]
            table_bboxes = page_data["table_bboxes"]

            output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

            page_non_tabular_blocks = []
            written_texts = set()   # (text, rounded bbox) of spans already drawn
            written_blocks = set()  # keys of blocks already accepted as non-tabular
            page_sizes = {}
            page_colors = {}

            for block in blocks:
                block_key = hash(str(block.get("bbox", "")) + str(block.get("lines", [])))
                if block_key in written_blocks:
                    continue

                is_non_tabular_block = False

                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        bbox = span.get("bbox", [0, 0, 0, 0])
                        text = span["text"].strip()

                        inside_table = any(
                            bbox[0] >= table_bbox[0] and
                            bbox[1] >= table_bbox[1] and
                            bbox[2] <= table_bbox[2] and
                            bbox[3] <= table_bbox[3]
                            for table_bbox in table_bboxes
                        )
                        if inside_table:
                            continue

                        span_key = (text, tuple(map(round, bbox)))
                        if span_key in written_texts:
                            continue
                        written_texts.add(span_key)
                        is_non_tabular_block = True

                        size = float(span.get("size", 12))
                        color = span.get("color", (0, 0, 0))
                        font = span.get("font", "helv").lower()

                        # Packed sRGB integer -> 0..255 triple -> 0..1 floats.
                        if isinstance(color, int):
                            color = ((color >> 16) & 255, (color >> 8) & 255, color & 255)
                        if is_white_or_shade(color):
                            color = (255, 165, 0)
                        color = tuple(c / 255 for c in color)

                        # Remember one example text and a count per size / colour.
                        if size not in page_sizes:
                            page_sizes[size] = {'example': text, 'count': 1}
                        else:
                            page_sizes[size]['count'] += 1

                        if color not in page_colors:
                            page_colors[color] = {'example': text, 'count': 1}
                        else:
                            page_colors[color]['count'] += 1

                        # insert_text is positioned by the text baseline origin,
                        # not by the top-left corner of the bbox.
                        origin = span.get("origin", (bbox[0], bbox[3]))

                        try:
                            output_page.insert_text(
                                origin, text, fontsize=size, fontname=font, color=color,
                            )
                        except Exception:
                            # Best effort: font names taken from the source PDF are usually
                            # not available to the new document, so fall back to Helvetica
                            # instead of aborting the whole run.
                            output_page.insert_text(
                                origin, text, fontsize=size, fontname="helv", color=color,
                            )

                if is_non_tabular_block:
                    page_non_tabular_blocks.append(block)
                    written_blocks.add(block_key)

            # Outline the table boxes in light green.
            light_green = (0.2, 1, 0.3)
            for table_bbox in table_bboxes:
                output_page.draw_rect(fitz.Rect(table_bbox), color=light_green, width=.5)

            pages_data.append({
                "blocks": page_non_tabular_blocks,
                "page_rect": page_rect,
                "table_bboxes": table_bboxes,
                "page_colors": page_colors,
                "page_sizes": page_sizes
            })

        output_doc.save(output_pdf_path)
    finally:
        # Release the in-memory document even when rendering or saving fails.
        output_doc.close()

    return pages_data


In [97]:
"""
Separates text and tabular data based on bounding boxes (bboxes).
Args:document_blocks_data (dict): Dictionary containing block data for each page.
Returns:tuple: A tuple containing two dicts (each with "blocks_data", "grand_page_colors" and "grand_page_sizes"):
- updated block data with non-tabular (text) blocks
- updated block data with tabular blocks
"""
def separate_text_and_tabular_blocks(document_blocks_data):
    """Split every page's blocks into tabular and textual blocks.

    A block is tabular when at least one of its spans has a bbox fully inside one of
    the page's table boxes. Every other block, including blocks without 'lines',
    is textual.

    Args:
        document_blocks_data (dict): Document data with a "blocks_data" list of page
            dicts (each providing "blocks", "table_bboxes" and "page_rect").
    Returns:
        tuple: (textual document data, tabular document data). Each is a dict with
        "blocks_data", "grand_page_colors" and "grand_page_sizes", plus "total_pages"
        when the input provides it.
    """

    def _has_span_in_table(block, table_bboxes):
        """True when at least one span bbox of block lies fully inside a table box."""
        for line in block.get('lines', ()):
            for span in line['spans']:
                box = span.get('bbox', [0, 0, 0, 0])
                if any(
                    box[0] >= table_bbox[0]
                    and box[1] >= table_bbox[1]
                    and box[2] <= table_bbox[2]
                    and box[3] <= table_bbox[3]
                    for table_bbox in table_bboxes
                ):
                    return True
        return False

    # Document-level collections, used for the grand summaries.
    document_tabular_blocks = []
    document_textual_blocks = []

    # Page-level results.
    tabular_blocks_data = []
    textual_blocks_data = []

    for page_data in document_blocks_data['blocks_data']:
        blocks = page_data['blocks']
        table_bboxes = page_data['table_bboxes']

        page_tabular_blocks = []
        page_textual_blocks = []

        for block in blocks:
            if _has_span_in_table(block, table_bboxes):
                page_tabular_blocks.append(block)
                document_tabular_blocks.append(block)
            else:
                page_textual_blocks.append(block)
                document_textual_blocks.append(block)

        # Colour / size summaries per page and per kind.
        text_attributes = process_text_attributes(page_textual_blocks)
        tab_attributes = process_text_attributes(page_tabular_blocks)

        tabular_blocks_data.append({
            "blocks": page_tabular_blocks,
            "page_rect": page_data['page_rect'],
            "table_bboxes": table_bboxes,
            "page_color_summary": tab_attributes['page_color_summary'],
            "page_size_summary": tab_attributes['page_size_summary'],
        })

        textual_blocks_data.append({
            "blocks": page_textual_blocks,
            "page_rect": page_data['page_rect'],
            "table_bboxes": table_bboxes,
            "page_color_summary": text_attributes['page_color_summary'],
            "page_size_summary": text_attributes['page_size_summary'],
        })

    # Document-wide summaries.
    tabular_agg = process_text_attributes(document_tabular_blocks)
    textual_agg = process_text_attributes(document_textual_blocks)

    final_textual_blocks_data = {
        "blocks_data": textual_blocks_data,
        "grand_page_colors": textual_agg['page_color_summary'],
        "grand_page_sizes": textual_agg['page_size_summary']
    }

    final_tabular_blocks_data = {
        "blocks_data": tabular_blocks_data,
        "grand_page_colors": tabular_agg['page_color_summary'],
        "grand_page_sizes": tabular_agg['page_size_summary']
    }

    # Carry the page count through, as seperate_text_image_blocks does, so the
    # document-level layout stays the same along the whole pipeline.
    if 'total_pages' in document_blocks_data:
        final_textual_blocks_data['total_pages'] = document_blocks_data['total_pages']
        final_tabular_blocks_data['total_pages'] = document_blocks_data['total_pages']

    return final_textual_blocks_data, final_tabular_blocks_data


In [None]:
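# Shape of the dict returned by extract_pdf_blocks (the separation helpers return
# the same layout; "total_pages" is only present where the function sets it):
#     grand_dict = {
#         "blocks_data": [                 # one dict per page
#             {
#                 "blocks": [],
#                 "table_bboxes": [],
#                 "page_rect": (),
#                 "page_color_summary": [],
#                 "page_size_summary": [],
#             },
#         ],
#         "total_pages": 0,
#         "grand_page_colors": [],
#         "grand_page_sizes": [],
#     }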

DRY RUN CODE

In [84]:
# Samco output paths (active). Full paths: `path` plus a location under \output.
no_image_path = path +r"\output\NoImgPdf.pdf"
textual_pdf_path = path + r"\output\TextualPdf.pdf"
tabular_pdf_path = path + r"\output\TabularPdf.pdf"

# Tata output paths (disabled). Uncomment these and comment out the Samco ones to switch.
# no_image_path = path +r"\output\TatanoImgPdf.pdf"
# textual_pdf_path = path + r"\output\TatatextalPdf.pdf"
# tabular_pdf_path = path + r"\output\TatatabularPdf.pdf"

In [85]:
# Step 1: extract blocks, table boxes and attribute summaries from the Samco fact sheet.
blocks_data = extract_pdf_blocks(path + samco_path, pagesToIgnore)
print("\n Success !!")


 Success !!


In [86]:
# Step 2: drop image blocks, then re-render the remaining text to no_image_path.
non_image_data, image_data = seperate_text_image_blocks(blocks_data)
create_pdf_file(non_image_data,no_image_path)
print("\n Success !!")


 Success !!


In [None]:
# Step 3: split the image-free data into textual and tabular document data.
textual_data,tabular_data = separate_text_and_tabular_blocks(non_image_data)
print("\n Success !!")


 Success !!


In [101]:
# Step 4: render the textual and the tabular data into separate PDFs for visual inspection.
create_pdf_file(textual_data, textual_pdf_path)
create_pdf_file(tabular_data, tabular_pdf_path)

In [100]:
# Step 5: persist both results as pickle files.
# NOTE: open(..., 'wb') does not create missing folders, so output\pkl must already exist.
with open(path + pickle_path_text , 'wb') as file:
    pickle.dump(textual_data, file)
    
with open(path + pickle_path_tab , 'wb') as file:
    pickle.dump(tabular_data, file)

In [230]:
"""Open the pdf , get all text data and blocks and draw a boundary along each boundary boxes
    Args:input_pdf_path(str) , output_pdf_path (str)
    Returns: nothing, a new pdf created
"""
def draw_boundaries_on_pdf(input_pdf_path, output_pdf_path):
    """Save a copy of a PDF with an orange rectangle around every text block.

    Args:
        input_pdf_path (str): PDF to read.
        output_pdf_path (str): Path of the annotated PDF to write.
    """
    doc = fitz.open(input_pdf_path)
    try:
        for page in doc:
            # get_text("blocks") yields tuples whose first four items are the block bbox.
            for block in page.get_text("blocks"):
                bbox = block[:4]
                # overlay=False draws the rectangle behind the existing page content.
                page.draw_rect(bbox, color=(1.0, 0.647, 0.0), width=1.5, overlay=False)

        doc.save(output_pdf_path)
    finally:
        # Close the document even when drawing or saving fails.
        doc.close()
# Outline the text blocks of the textual PDF and write the result to the dry-run file.
# NOTE: the draw_table_boundaries cell writes to the same dry_run_path, so the cell run last wins.
draw_boundaries_on_pdf(textual_pdf_path, path + dry_run_path)

In [77]:
def draw_table_boundaries(input_pdf_path, output_pdf_path):
    """Save a copy of a PDF with a blue rectangle around every table pdfplumber detects.

    Args:
        input_pdf_path (str): PDF to read (opened with both pdfplumber and PyMuPDF).
        output_pdf_path (str): Path of the annotated PDF to write.
    """
    with pdfplumber.open(input_pdf_path) as pdf:
        doc = fitz.open(input_pdf_path)
        try:
            for page_number, page in enumerate(pdf.pages):
                fitz_page = doc[page_number]
                for table in page.find_tables():
                    x0, y0, x1, y1 = table.bbox[:4]
                    # overlay=False draws the rectangle behind the existing page content.
                    fitz_page.draw_rect(fitz.Rect(x0, y0, x1, y1), color=(0, 0, 1), width=1.5, overlay=False)
            doc.save(output_pdf_path)
        finally:
            # Close the PyMuPDF document even when detection, drawing or saving fails.
            doc.close()

# Outline the detected tables of the image-free PDF; this overwrites the dry-run file at path + dry_run_path.
draw_table_boundaries(no_image_path, path + dry_run_path)

In [17]:
# def extract_tables(blocks):
#     """
#     Extracts tables from a list of fitz blocks.

#     Args:
#         blocks: List of blocks extracted from the PDF using PyMuPDF.

#     Returns:
#         A list of Camelot Table objects.
#     """

#     # 1. Create a temporary string to hold the table data
#     table_text = ""
#     for block in blocks:
#         if "lines" in block:  # Check if the block is text
#             for line in block['lines']:
#                 for span in line['spans']:
#                     table_text += span['text']  # Extract the text from the span
#                 table_text += "\n"  # Add a newline after each line

#     path = r"C:\Users\rando\OneDrive\Documents\mywork-repo\output\temp_table.txt"
#     with open(path, 'w') as f:
#         f.write(table_text)

#     try:
#         tables = camelot.read_pdf('temp_table.txt', flavor='lattice', pages='1')
#         return tables
#     except Exception as e:
#         print(f"Error extracting tables: {e}")
#         return []