In [4]:
import pprint, json, math, os, sys
import fitz
import pandas as pd
from collections import defaultdict
import pdfminer

# Absolute locations on the work machine: the "Dec 24" documents folder and
# the root of the repository checkout.
fund_path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\Dec 24"
dir_path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo"
# Put the repository root on the module search path so packages inside it
# can be imported.
sys.path += [os.path.abspath(dir_path)]

from app.fundData import *
from app.helper import Helper


# Location of the dry-run output PDF, written as a suffix starting with a
# backslash (meant to be appended to a base directory).
dry_path = r"\data\output\DryRun.pdf"

In [29]:
# Sample PDF, expressed as a suffix to be appended to the repository root.
tata_path = r"\pdf-extractor\select_tata.pdf"
# Repository root on the work machine.
# Alternate checkout, kept for reference:
#path = r"C:\Users\rando\OneDrive\Documents\mywork-repo"
path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo"

In [None]:
def extract_pdf_blocks(input_pdf_path):
    """
    Open the PDF and extract all blocks of text, images, and other content.

    The file is opened twice: PyMuPDF (``fitz``) provides the layout blocks of
    each page, and pdfplumber provides the bounding boxes of the tables it
    finds on the same page.

    Args:
        input_pdf_path (str): Path to the input PDF.

    Returns:
        list: One dictionary per page with the keys ``"blocks"`` (the blocks of
            ``get_text("dict")``), ``"table_bboxes"`` (bbox of every table found
            by pdfplumber) and ``"page_rect"`` (the PyMuPDF page rectangle).
    """
    # Imported locally: the notebook's import cell does not import pdfplumber,
    # so the name would otherwise only exist if a star-import exposed it.
    import pdfplumber

    blocks_data = []
    input_doc = fitz.open(input_pdf_path)

    try:
        with pdfplumber.open(input_pdf_path) as pdf:
            for page_number, pdf_page in enumerate(pdf.pages):
                # Extract blocks of content using PyMuPDF
                doc_page = input_doc[page_number]
                blocks = doc_page.get_text("dict")["blocks"]

                # Append the blocks along with table bounding boxes from pdfplumber;
                # the table bboxes are required to separate tabular from text content
                blocks_data.append({
                    "blocks": blocks,
                    "table_bboxes": [table.bbox for table in pdf_page.find_tables()],
                    "page_rect": doc_page.rect
                })
    finally:
        # Release the PyMuPDF handle even when extraction fails part-way.
        input_doc.close()

    return blocks_data

def remove_image_blocks_and_create_pdf(blocks_data, output_pdf_path):
    """
    Write a text-only copy of the document and return the block data with
    image blocks removed.

    Every text span of every non-image block is re-inserted on a fresh page of
    the same size, keeping its font size (rounded up) and colour.

    Args:
        blocks_data (list): Per-page dictionaries with at least the keys
            "blocks" and "page_rect".
        output_pdf_path (str): Destination of the generated PDF.

    Returns:
        list: Updated block data without image blocks. The per-page structure
            matches the input; the input list itself is not modified.
    """
    output_doc = fitz.open()
    filtered_data = []

    try:
        for page_data in blocks_data:
            page_rect = page_data["page_rect"]

            # Create a new page in the output PDF
            output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

            # Image blocks are the ones carrying an "image" entry.
            filtered_blocks = [
                block for block in page_data["blocks"] if "image" not in block
            ]
            # Copy the page entry so the caller's data is left untouched.
            filtered_data.append({**page_data, "blocks": filtered_blocks})

            for block in filtered_blocks:
                if "lines" not in block:  # Only text blocks carry lines
                    continue

                for line in block["lines"]:
                    for span in line["spans"]:
                        bbox = span.get("bbox", [0, 0, 0, 0])
                        text = span["text"]
                        size = math.ceil(float(span.get("size", 12)))  # Default font size if not provided
                        color = span.get("color", (0, 0, 0))  # Default color (black)
                        font = span.get("font", "helv")

                        # Normalize color if it's in integer form (e.g., 0x000000)
                        if isinstance(color, int):
                            color = (
                                (color >> 16) & 255, (color >> 8) & 255, color & 255
                            )
                        color = tuple(c / 255 for c in color)  # Convert to 0-1 range

                        is_bold = "Bold" in font
                        fontname = font + ("-b" if is_bold else "")

                        # NOTE(review): the text is anchored at the span's top-left
                        # corner; confirm this is the intended anchor for insert_text.
                        try:
                            output_page.insert_text(
                                (bbox[0], bbox[1]),
                                text,
                                fontsize=size,
                                fontname=fontname,
                                color=color,
                            )
                        except Exception:
                            # Best effort: retry with the built-in "helv" font when
                            # the span's own font name is rejected.
                            output_page.insert_text(
                                (bbox[0], bbox[1]),
                                text,
                                fontsize=size,
                                fontname="helv",  # Fallback font
                                color=color,
                            )

        # Save the output PDF
        output_doc.save(output_pdf_path)
    finally:
        output_doc.close()

    return filtered_data

def create_non_tabular_pdf(blocks_data, output_pdf_path):
    """
    Write a PDF holding only the text spans that lie outside every detected
    table, and collect the blocks those spans belong to.

    A span counts as tabular only when its bbox is fully contained in one of
    the page's table bboxes; a span that merely overlaps a table is treated as
    non-tabular.

    Args:
        blocks_data (list): Per-page dictionaries with the keys "blocks",
            "table_bboxes" and "page_rect".
        output_pdf_path (str): Destination of the generated PDF.

    Returns:
        list: Block data containing only non-tabular content. One list per
            page; a block appears once if at least one of its spans lies
            outside all tables.
    """
    output_doc = fitz.open()
    non_tabular_blocks = []

    try:
        for page_data in blocks_data:
            page_rect = page_data["page_rect"]
            blocks = page_data["blocks"]
            table_bboxes = page_data["table_bboxes"]

            # Create a new page in the output PDF
            output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

            page_non_tabular_blocks = []

            for block in blocks:
                if "lines" not in block:  # Only text blocks carry lines
                    continue

                # Record each block at most once, however many of its spans qualify.
                block_recorded = False

                for line in block["lines"]:
                    for span in line["spans"]:
                        bbox = span.get("bbox", [0, 0, 0, 0])

                        # Check if the text is within any table bbox (skip if inside table)
                        inside_table = any(
                            bbox[0] >= table_bbox[0] and
                            bbox[1] >= table_bbox[1] and
                            bbox[2] <= table_bbox[2] and
                            bbox[3] <= table_bbox[3]
                            for table_bbox in table_bboxes
                        )
                        if inside_table:
                            continue

                        if not block_recorded:
                            page_non_tabular_blocks.append(block)
                            block_recorded = True

                        text = span["text"]
                        size = math.ceil(float(span.get("size", 12)))
                        color = span.get("color", (0, 0, 0))
                        font = span.get("font", "helv")

                        # Normalize color: packed integer -> (r, g, b) -> 0-1 range
                        if isinstance(color, int):
                            color = (
                                (color >> 16) & 255, (color >> 8) & 255, color & 255
                            )
                        color = tuple(c / 255 for c in color)

                        is_bold = "Bold" in font
                        fontname = font + ("-b" if is_bold else "")

                        # NOTE(review): the text is anchored at the span's top-left
                        # corner; confirm this is the intended anchor for insert_text.
                        try:
                            output_page.insert_text(
                                (bbox[0], bbox[1]),
                                text,
                                fontsize=size,
                                fontname=fontname,
                                color=color,
                            )
                        except Exception:
                            # Best effort: retry with the built-in "helv" font when
                            # the span's own font name is rejected.
                            output_page.insert_text(
                                (bbox[0], bbox[1]),
                                text,
                                fontsize=size,
                                fontname="helv",
                                color=color,
                            )

            non_tabular_blocks.append(page_non_tabular_blocks)

        # Save the output PDF
        output_doc.save(output_pdf_path)
    finally:
        output_doc.close()

    return non_tabular_blocks

def create_tabular_pdf(blocks_data, output_pdf_path):
    """
    Write a PDF holding only the text spans that lie inside a detected table,
    and collect the blocks those spans belong to.

    A span counts as tabular only when its bbox is fully contained in one of
    the page's table bboxes.

    Args:
        blocks_data (list): Per-page dictionaries with the keys "blocks",
            "table_bboxes" and "page_rect".
        output_pdf_path (str): Destination of the generated PDF.

    Returns:
        list: Block data containing only tabular content. One list per page;
            a block appears once if at least one of its spans lies inside a
            table.
    """
    output_doc = fitz.open()
    tabular_blocks = []

    try:
        for page_data in blocks_data:
            page_rect = page_data["page_rect"]
            blocks = page_data["blocks"]
            table_bboxes = page_data["table_bboxes"]

            # Create a new page in the output PDF
            output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

            page_tabular_blocks = []

            for block in blocks:
                if "lines" not in block:  # Only text blocks carry lines
                    continue

                # Record each block at most once, however many of its spans qualify.
                block_recorded = False

                for line in block["lines"]:
                    for span in line["spans"]:
                        bbox = span.get("bbox", [0, 0, 0, 0])

                        # Check if the text is within any table bbox (include if inside table)
                        inside_table = any(
                            bbox[0] >= table_bbox[0] and
                            bbox[1] >= table_bbox[1] and
                            bbox[2] <= table_bbox[2] and
                            bbox[3] <= table_bbox[3]
                            for table_bbox in table_bboxes
                        )
                        if not inside_table:
                            continue

                        if not block_recorded:
                            page_tabular_blocks.append(block)
                            block_recorded = True

                        text = span["text"]
                        size = math.ceil(float(span.get("size", 12)))
                        color = span.get("color", (0, 0, 0))
                        font = span.get("font", "helv")

                        # Normalize color: packed integer -> (r, g, b) -> 0-1 range
                        if isinstance(color, int):
                            color = (
                                (color >> 16) & 255, (color >> 8) & 255, color & 255
                            )
                        color = tuple(c / 255 for c in color)

                        is_bold = "Bold" in font
                        fontname = font + ("-b" if is_bold else "")

                        # NOTE(review): the text is anchored at the span's top-left
                        # corner; confirm this is the intended anchor for insert_text.
                        try:
                            output_page.insert_text(
                                (bbox[0], bbox[1]),
                                text,
                                fontsize=size,
                                fontname=fontname,
                                color=color,
                            )
                        except Exception:
                            # Best effort: retry with the built-in "helv" font when
                            # the span's own font name is rejected.
                            output_page.insert_text(
                                (bbox[0], bbox[1]),
                                text,
                                fontsize=size,
                                fontname="helv",
                                color=color,
                            )

            tabular_blocks.append(page_tabular_blocks)

        # Save the output PDF
        output_doc.save(output_pdf_path)
    finally:
        output_doc.close()

    return tabular_blocks

In [7]:
# Output files for the Tata sample, built on top of the repository root `path`.
tabular_pdf_path = rf"{path}\pdf-extractor\dryrun.pdf"
# textual_pdf_path = path + r"\output\TatatextalPdf.pdf"
no_image_path = rf"{path}\pdf-extractor\noimg.pdf"

In [None]:
# Example usage: read the Tata sample, write its image-free copy, and keep the
# block data returned by the second step.
blocks_data = remove_image_blocks_and_create_pdf(
    extract_pdf_blocks(path + tata_path),
    no_image_path,
)


In [32]:
def get_pdf_text(path: str) -> dict:
    """
    Read the plain text of every page of a PDF.

    Args:
        path (str): Path to the PDF file.

    Returns:
        dict: Maps the 0-based page index to that page's text as returned by
            ``get_text("text")``, with characters that cannot be encoded as
            UTF-8 dropped.
    """
    doc = fitz.open(path)
    try:
        text_data = {}
        for pgn in range(doc.page_count):
            text = doc[pgn].get_text("text")
            # Round-trip through UTF-8, silently discarding anything that
            # cannot be encoded.
            text_data[pgn] = text.encode('utf-8', 'ignore').decode('utf-8')
        return text_data
    finally:
        # Always release the document handle.
        doc.close()
data = get_pdf_text(path+ tata_path)

In [None]:
# Dump the extracted text page by page; each page is preceded by a banner
# showing its key in `data`.
for c, content in data.items():
    print(f'===={c}====')
    for text in content.split('\n'):
        print(text)

In [None]:
import camelot
import pandas as pd

# Source document: the Tata sample below the repository root `path`.
tata_path = r"\pdf-extractor\select_tata.pdf"
input_pdf = f"{path}{tata_path}"

# Parse every page with Camelot's "stream" flavor and keep one DataFrame per
# detected table.
tables = camelot.read_pdf(input_pdf, pages="all", flavor="stream")
df_list = [found.df for found in tables]
# final_df = pd.concat(df_list, ignore_index=True)

In [None]:
import os
import camelot
import pandas as pd

# Folder containing PDFs, and the folder that receives the extracted tables
path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo"
pdf_folder = path + r"\pdf-extractor\pdf"
output_folder = path + r"\pdf-extractor\output"

os.makedirs(output_folder, exist_ok=True)

# Master Excel file: one sheet per PDF, all of that PDF's tables stacked
master_excel_path = os.path.join(output_folder, "extracted_tables.xlsx")

# Initialize an Excel writer
with pd.ExcelWriter(master_excel_path, engine="openpyxl") as writer:

    for pdf_file in os.listdir(pdf_folder):
        # Compare the extension case-insensitively so files such as
        # "REPORT.PDF" are processed as well.
        if not pdf_file.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_folder, pdf_file)

        # Extract tables from all pages
        tables = camelot.read_pdf(pdf_path, pages="all", flavor="stream")

        if tables.n == 0:
            print(f"No tables found in {pdf_file}")
            continue

        pdf_name = os.path.splitext(pdf_file)[0]
        csv_path = os.path.join(output_folder, f"{pdf_name}.csv")

        all_tables = []

        for i, table in enumerate(tables):
            # Work on a copy so the DataFrame held by the Camelot table is
            # not modified by the extra column.
            df = table.df.copy()

            # Add Page Number column, using the table's public `page` attribute
            df.insert(0, "Page Number", table.page)

            # The first table starts the CSV afresh (with header) so a file
            # left by an earlier run is replaced; later tables are appended.
            df.to_csv(
                csv_path,
                mode="w" if i == 0 else "a",
                index=False,
                header=(i == 0),
            )

            all_tables.append(df)

        # At least one table exists here (empty results were skipped above);
        # stack them into a single sheet named after the PDF.
        final_df = pd.concat(all_tables, ignore_index=True)
        final_df.to_excel(writer, sheet_name=pdf_name, index=False)

        print(f"Extracted tables from {pdf_file} and saved to {csv_path}")

print(f"All extracted tables saved in {master_excel_path}")
