In [1]:
import pdfplumber
import fitz  # PyMuPDF

In [6]:
path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\office-projects\pdf-extractor\data\scanned-pdf\text_with_header_footer.pdf"

In [4]:

import pdfplumber
import random

def calculate_randomized_header_footer_positions(path, num_pages=15):
    """
    Analyze random pages of a PDF to determine the header and footer positions 
    based on the average positions of text elements.
    
    Args:
        path (str): The file path to the PDF document.
        num_pages (int): The number of pages to analyze for header and footer positions.
    
    Returns:
        tuple: A tuple containing:
            - header_bottom (float): Average bottom position of the header.
            - footer_top (float): Average top position of the footer.
            If no header/footer is detected, returns None for respective values.
    """
    with pdfplumber.open(path) as pdf:
        total_pages = len(pdf.pages)
        
        if total_pages == 0:
            raise ValueError("The PDF contains no pages.")
        
        # Randomly select page indices
        pages_to_analyze = random.sample(range(total_pages), min(total_pages, num_pages))
        
        header_positions = []
        footer_positions = []

        for page_index in pages_to_analyze:
            page = pdf.pages[page_index]
            words = page.extract_words()
            
            if words:  # Ensure there are words on the page
                header_positions.append(words[0]["bottom"])  # First word's bottom position
                footer_positions.append(words[-1]["top"])    # Last word's top position
        
        # Calculate averages, or return None if no valid positions found
        header_bottom = sum(header_positions) / len(header_positions) if header_positions else None
        footer_top = sum(footer_positions) / len(footer_positions) if footer_positions else None

        return header_bottom, footer_top




def parse_document(file_path):
    """
    Parse the document, extract structured content (headings and paragraphs),
    after removing the header and footer.
    """
    document = {}

    # Step 1: Analyze first 10 pages to omit header/footer using `omit_header_footer`
    header_bottom, footer_top = calculate_randomized_header_footer_positions(file_path)

    # Step 2: Open the PDF using fitz (PyMuPDF) for structured content extraction (headings and paragraphs)
    pdf_doc = fitz.open(file_path)
    total_pages = len(pdf_doc)

    for page_number in range(total_pages):
        # Get the page content from fitz (PyMuPDF)
        page = pdf_doc[page_number]
        parsed_content = get_page_content_after_omit(page, header_bottom, footer_top)

        # Add the structured content (headings and paragraphs after removing header/footer) to the document dictionary
        document[f"page{page_number + 1}"] = parsed_content

    return document


def get_page_content_after_omit(page, header_bottom, footer_top):
    """
    Extract structured content (headers and paragraphs) from a page after removing header and footer.
    """
    block_content = {"header": [], "content": {}}
    
    # Obtain each block in the paragraph
    blocks = page.get_text("dict")["blocks"]
    total_blocks = len(blocks)

    for block_index in range(total_blocks):
        block = blocks[block_index]
        text_font_size = []

        if "lines" in block:
            for line in block["lines"]:
                for span in line["spans"]:
                    text_font_size.append((span["text"], span["size"], span["font"]))

        # Organize content based on font size
        header_text = []
        paragraph_text = []
        for text, size, font in text_font_size:
            # Only add text that's between the header and footer bounds
            if size == 12:  # Assuming size 12 is the paragraph text
                paragraph_text.append(text)
            else:  # If the text is not of size 12, it's treated as part of the header
                header_text.append(text)

        # Process header and content separately
        if header_text:
            block_content["header"].append(" ".join(header_text))  # Combine header text
        if paragraph_text:
            block_content["content"][f"p{block_index + 1}"] = " ".join(paragraph_text)  # Combine paragraph text

    return block_content




In [None]:
import pdfplumber
import random

def calculate_randomized_header_footer_positions(path, num_pages=20):
    """
    Analyze random pages of a PDF to determine the header and footer positions 
    based on the average positions of text elements.
    
    Args:
        path (str): The file path to the PDF document.
        num_pages (int): The number of pages to analyze for header and footer positions.
    
    Returns:
        tuple: A tuple containing:
            - header_bottom (float): Average bottom position of the header.
            - footer_top (float): Average top position of the footer.
            If no header/footer is detected, returns None for respective values.
    """
    with pdfplumber.open(path) as pdf:
        total_pages = len(pdf.pages)
        
        if total_pages == 0:
            raise ValueError("The PDF contains no pages.")
        
        # Randomly select page indices
        pages_to_analyze = random.sample(range(total_pages), min(total_pages, num_pages))
        
        header_positions = []
        footer_positions = []

        for page_index in pages_to_analyze:
            page = pdf.pages[page_index]
            words = page.extract_words()
            
            if words:  # Ensure there are words on the page
                header_positions.append(words[0]["bottom"])  # First word's bottom position
                footer_positions.append(words[-1]["top"])    # Last word's top position
        
        # Calculate averages, or return None if no valid positions found
        header_bottom = sum(header_positions) / len(header_positions) if header_positions else None
        footer_top = sum(footer_positions) / len(footer_positions) if footer_positions else None

        return header_bottom, footer_top


In [7]:

document = parse_document(path)

# Print structured content for the first page after omitting header/footer
print(f"Structured Content for Page 1: {document['page1']}")

Structured Content for Page 1: {'header': ['6', 'The Macro picture', 'November 2024 October 2024', 'Source:  RBI, Bloomberg', 'Index Name Return in % As on', '29-Nov-24 31-Oct-24 30-Nov-23 1 Month 1 Year Index Name Return in % As on', '29-Nov-24 31-Oct-24 30-Nov-23 1 Month 1 Year', 'www.tatamutualfund.com TATA MUTUAL FUND', 'MARKET OUTLOOK', 'Equity market', ' ', 'Disclaimer:  The views expressed are in no way trying to predict the markets or to time them. The views expressed are for information purpose only and do not construe to be any investment, legal or taxation advice.  Please consult your Financial/Investment Adviser before investing.  The views expressed may not reflect in the scheme portfolios of Tata Mutual Fund. This note has been prepared using information believed to be  accurate at the time of its use.', 'BSE-30 and Nifty-50 indices consolidated for the month of November 2024 post a sharp correction in the previous month. Mid-cap and Small Cap indices also performed inlin