In [81]:
import pdfplumber
import fitz
import camelot
import warnings , math, collections , os
import pickle
import pandas as pd

import pandas as pd
warnings.filterwarnings("ignore", category=UserWarning) 

# Root folder of the local working copy. The PDF locations below are appended
# to it with plain string concatenation (`path + samco_path`).
# NOTE(review): this is a machine-specific absolute path; anyone else running the
# notebook has to edit it by hand.
path = r"C:\Users\rando\OneDrive\Documents\mywork-repo"
#path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo"


# Fact-sheet PDF locations relative to `path`. Each starts with a backslash so
# that `path + <name>` forms a complete Windows path.
samco_path = r"\files\SamcoFactSheet2024.pdf"
tata_path = r"\files\TataFactSheet2024.pdf"


In [90]:
def int_to_normalized_rgb(value):
    """Unpack a packed integer colour into an ``(r, g, b)`` tuple of floats.

    Args:
        value (int): Colour with red in bits 16-23, green in bits 8-15 and
            blue in bits 0-7. A negative value is shifted up by ``2**32``
            before the channels are read.

    Returns:
        tuple: Three floats, each channel divided by 255.0.
    """
    packed = value + (1 << 32) if value < 0 else value

    # Red, green and blue live at bit offsets 16, 8 and 0 respectively.
    channels = ((packed >> shift) & 0xFF for shift in (16, 8, 0))
    return tuple(channel / 255.0 for channel in channels)

def map_inter_fonts_to_times(pdf_fonts):
    """Choose a replacement font name for every font in ``pdf_fonts``.

    A name containing "inter" (case-insensitive) becomes "Times-Bold" when it
    also contains a bold-type weight marker, or "Times-Roman" when it contains
    a regular/light-type marker. Every other name, including an "inter" name
    with no recognised marker, becomes "Helvetica".

    Args:
        pdf_fonts (iterable): Font names (strings).

    Returns:
        dict: Original font name -> replacement font name.
    """
    bold_markers = ("bold", "extrabold", "semibold")
    roman_markers = ("medium", "regular", "light", "extra light")

    def substitute(name):
        lowered = name.lower()
        if "inter" not in lowered:
            return "Helvetica"
        # Bold markers are tested first, so a bold weight wins over a roman one.
        if any(marker in lowered for marker in bold_markers):
            return "Times-Bold"
        if any(marker in lowered for marker in roman_markers):
            return "Times-Roman"
        return "Helvetica"

    replacements = {}
    for name in pdf_fonts:
        replacements[name] = substitute(name)
    return replacements

def extract_fonts_colors_sizes(all_blocks, isRound = False):
    """Collect the distinct fonts, sizes and colours used by all text spans.

    Args:
        all_blocks (list): Per-page dicts, each with a "blocks" list whose
            entries hold "lines", which in turn hold "spans".
        isRound (bool): When True, each numeric size is passed through
            ``round()`` before being recorded.

    Returns:
        dict: Keys "fonts", "sizes" and "colors", each a sorted list of the
        distinct values seen. A span without a font contributes "Unknown" and
        a span without a colour contributes 0. If any span has no size, the
        string "Unknown" is appended after the sorted numeric sizes.
    """
    fonts = set()
    numeric_sizes = set()
    colors = set()
    size_missing = False

    for page_data in all_blocks:
        for block in page_data.get("blocks", []):
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    fonts.add(span.get("font", "Unknown"))
                    colors.add(span.get("color", 0))

                    size = span.get("size")
                    if size is None:
                        # Keep the sentinel out of the numeric set: rounding a
                        # string or sorting it alongside numbers raises TypeError.
                        size_missing = True
                    else:
                        numeric_sizes.add(round(size) if isRound else size)

    sizes = sorted(numeric_sizes)
    if size_missing:
        sizes.append("Unknown")

    return {
        "fonts": sorted(fonts),
        "sizes": sizes,
        "colors": sorted(colors),
    }

In [115]:
def generate_counts(data, round_size=False):
    """Count span sizes, colours and (mapped) fonts per page and overall.

    Args:
        data (list): Per-page dicts, each with a "blocks" list whose entries
            hold "lines", which in turn hold "spans". "page_num" is copied to
            the per-page result when present.
        round_size (bool): When True, sizes are passed through ``round()``
            before being counted.

    Returns:
        dict: ``{"per_page": [...], "overall": {...}}``. Each per-page entry
        has "page_num", "sizes", "colors" and "fonts"; "overall" has the same
        three count dicts for the whole input. Every count dict is ordered by
        count, highest first.

    Notes:
        A missing size counts as 12 and a missing colour as ``(0, 0, 0)``.
        Integer colours are unpacked to 0-255 ``(r, g, b)`` tuples. Font names
        that are missing or absent from the mapping table count as
        "Helvetica", matching the fallback used by ``generate_pair_counts``.
    """
    # Built once rather than once per span.
    map_fonts = {
        'Heebo-Medium': 'Helvetica',
        'Inter-Black': 'Helvetica',
        'Inter-Bold': 'Times-Bold',
        'Inter-ExtraBold': 'Times-Bold',
        'Inter-ExtraLight': 'Times-Roman',
        'Inter-Light': 'Times-Roman',
        'Inter-Medium': 'Times-Roman',
        'Inter-Regular': 'Times-Roman',
        'Inter-SemiBold': 'Times-Bold',
        'Kailasa': 'Helvetica',
        'MyriadPro-Regular': 'Helvetica',
        'Helvetica': 'Helvetica'
    }
    categories = ("sizes", "colors", "fonts")

    def by_count_desc(counts):
        # sorted() is stable, so ties keep their first-seen order.
        return dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))

    overall_counts = {category: {} for category in categories}
    page_counts = []

    for page_data in data:
        current = {category: {} for category in categories}

        for block in page_data.get("blocks", []):
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    size = span.get("size", 12)
                    if round_size:
                        size = round(size)

                    color = span.get("color", (0, 0, 0))
                    if isinstance(color, int):
                        color = (
                            (color >> 16) & 255,  # Red
                            (color >> 8) & 255,   # Green
                            color & 255           # Blue
                        )

                    # .get() so an unlisted font (including the "helv"
                    # default) falls back instead of raising KeyError.
                    font = map_fonts.get(span.get("font", "helv"), "Helvetica")

                    for category, value in zip(categories, (size, color, font)):
                        current[category][value] = current[category].get(value, 0) + 1
                        overall_counts[category][value] = overall_counts[category].get(value, 0) + 1

        page_counts.append({
            "page_num": page_data.get("page_num"),
            "sizes": by_count_desc(current["sizes"]),
            "colors": by_count_desc(current["colors"]),
            "fonts": by_count_desc(current["fonts"])
        })

    return {
        "per_page": page_counts,
        "overall": {category: by_count_desc(overall_counts[category]) for category in categories}
    }

In [135]:
def generate_pair_counts(data, pair_keys, round_size=False):
    """
    Generate counts based on pairs of attributes (e.g., font-size, size-color) per page and for the entire PDF.

    Each attribute is normalised according to its own key only:
    'font' is looked up in the font mapping table (unlisted or missing fonts
    become 'Helvetica'); 'size' defaults to 12 and is rounded when
    ``round_size`` is set; an integer 'color' is unpacked to a 0-255
    ``(r, g, b)`` tuple. Any other key is read from the span as-is, with
    ``None`` when absent. Colour unpacking is never applied to the other
    member of the pair, so a rounded integer size stays a size.

    Args:
        data (list): List of page data extracted from the PDF.
        pair_keys (tuple): A tuple of two keys (e.g., ('font', 'size'), ('size', 'color')).
        round_size (bool): Whether to round font sizes to the nearest integer.

    Returns:
        dict: ``{"per_page": [...], "overall": {...}}``. Each per-page entry
        holds "page_num" and "pairs"; "pairs" and "overall" map
        ``(value1, value2)`` tuples to counts, ordered by count, highest first.
    """
    map_fonts = {
        'Heebo-Medium': 'Helvetica',
        'Inter-Black': 'Helvetica',
        'Inter-Bold': 'Times-Bold',
        'Inter-ExtraBold': 'Times-Bold',
        'Inter-ExtraLight': 'Times-Roman',
        'Inter-Light': 'Times-Roman',
        'Inter-Medium': 'Times-Roman',
        'Inter-Regular': 'Times-Roman',
        'Inter-SemiBold': 'Times-Bold',
        'Kailasa': 'Helvetica',
        'MyriadPro-Regular': 'Helvetica',
        'Helvetica': 'Helvetica'
    }

    def attribute_value(span, key):
        # Normalise one attribute based on its own key.
        if key == 'font':
            return map_fonts.get(span.get('font', None), 'Helvetica')
        if key == 'size':
            size = span.get("size", 12)
            return round(size) if round_size else size
        value = span.get(key, None)
        if key == 'color' and isinstance(value, int):
            return (
                (value >> 16) & 255,  # Red
                (value >> 8) & 255,   # Green
                value & 255           # Blue
            )
        return value

    def by_count_desc(counts):
        # sorted() is stable, so ties keep their first-seen order.
        return dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))

    first_key, second_key = pair_keys[0], pair_keys[1]
    overall_pair_counts = {}
    page_pair_counts = []

    for page_data in data:
        page_counts = {}

        for block in page_data.get("blocks", []):
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    pair = (attribute_value(span, first_key), attribute_value(span, second_key))
                    page_counts[pair] = page_counts.get(pair, 0) + 1
                    overall_pair_counts[pair] = overall_pair_counts.get(pair, 0) + 1

        page_pair_counts.append({
            "page_num": page_data.get("page_num"),
            "pairs": by_count_desc(page_counts)
        })

    return {
        "per_page": page_pair_counts,
        "overall": by_count_desc(overall_pair_counts)
    }

In [101]:
def extract_pdf_blocks(pdf_path, ignore_page=()):
    """Read per-page text blocks and table bounding boxes from a PDF.

    The file is opened twice: with ``fitz`` for the page size and the
    ``get_text("dict")`` blocks, and with ``pdfplumber`` for ``find_tables()``.

    Args:
        pdf_path (str): Path of the PDF to read.
        ignore_page (collection): Zero-based page indices to skip. Defaults
            to skipping nothing.

    Returns:
        list: One dict per kept page with "page_num" (one-based),
        "page_width", "page_height", "blocks" (each block also tagged with
        "page_num") and "tables" (the ``bbox`` of every table found).
    """
    if ignore_page is None:
        ignore_page = ()

    all_data = []
    document = fitz.open(pdf_path)
    try:
        with pdfplumber.open(pdf_path) as plumber_pdf:
            for page_num in range(len(document)):
                if page_num in ignore_page:
                    continue

                page = document[page_num]
                page_data = {
                    "page_num": page_num + 1,
                    "page_width": page.rect.width,
                    "page_height": page.rect.height,
                    "blocks": [],
                    "tables": []
                }

                for block in page.get_text("dict").get("blocks", []):
                    block["page_num"] = page_num + 1
                    page_data["blocks"].append(block)

                # Table bounding boxes come from the pdfplumber view of the same page.
                plumber_page = plumber_pdf.pages[page_num]
                for table in plumber_page.find_tables():
                    page_data["tables"].append(table.bbox)

                all_data.append(page_data)
    finally:
        # Close the fitz handle even when extraction fails part-way through.
        document.close()

    return all_data

In [57]:
def extract_non_image_blocks_and_images(all_blocks):
    """Split every page's blocks into image blocks and everything else.

    A block counts as an image block when it has an "image" key.

    Args:
        all_blocks (list): Per-page dicts with "page_num", "page_width",
            "page_height" and, optionally, "blocks".

    Returns:
        tuple: ``(non_image_blocks, image_blocks_by_page)``. The first is a
        list of per-page dicts carrying the page number, the page dimensions
        and the non-image blocks. The second is a list with one list of image
        blocks per page, in the same page order.
    """
    remaining_pages, images_per_page = [], []

    for page in all_blocks:
        pictures, others = [], []
        for blk in page.get("blocks", []):
            (pictures if "image" in blk else others).append(blk)

        remaining_pages.append({
            "page_num": page["page_num"],
            "page_width": page["page_width"],
            "page_height": page["page_height"],
            "blocks": others
        })
        images_per_page.append(pictures)

    return remaining_pages, images_per_page

In [60]:
def separate_text_and_tabular_blocks(non_image_blocks):
    """Split every page's blocks into table content and remaining text.

    A block is tabular when it has a non-empty "bbox" that lies fully inside
    at least one of the page's table bounding boxes (edges included). All
    other blocks, including those without a bbox, are text blocks. Each text
    block is modified in place: its "tables" key is set to the page's table
    bounding boxes.

    Args:
        non_image_blocks (list): Per-page dicts with "page_num", "page_width",
            "page_height" and, optionally, "blocks" and "tables".

    Returns:
        tuple: ``(text_blocks, tabular_blocks)``, two lists of per-page dicts
        with the page number, the page dimensions and the matching blocks. The
        text entries also carry the page's "tables" list.
    """
    def lies_within(inner, outer):
        return (
            inner[0] >= outer[0]
            and inner[1] >= outer[1]
            and inner[2] <= outer[2]
            and inner[3] <= outer[3]
        )

    text_pages = []
    table_pages = []

    for page in non_image_blocks:
        page_tables = page.get("tables", [])
        prose, tabular = [], []

        for blk in page.get("blocks", []):
            box = blk.get("bbox", [])
            if box and any(lies_within(box, tbl) for tbl in page_tables):
                tabular.append(blk)
                continue

            # Text blocks keep a reference to the page's tables for later drawing.
            blk["tables"] = page_tables
            prose.append(blk)

        text_pages.append({
            "page_num": page["page_num"],
            "page_width": page["page_width"],
            "page_height": page["page_height"],
            "blocks": prose,
            "tables": page_tables
        })
        table_pages.append({
            "page_num": page["page_num"],
            "page_width": page["page_width"],
            "page_height": page["page_height"],
            "blocks": tabular
        })

    return text_pages, table_pages


In [130]:
# Read the Samco fact sheet, skipping the pages at zero-based indices 0 and 23.
document_data = extract_pdf_blocks(path + samco_path, [0,23])
# Separate blocks that sit inside a detected table box from the remaining text blocks.
text_data, tab_data = separate_text_and_tabular_blocks(document_data)
# Size / colour / font tallies with sizes rounded; only the text blocks are counted for now.
font_size_color = generate_counts(text_data, round_size=True) #for text data rn

In [None]:
# Document-wide colour tallies: (r, g, b) tuple -> span count, most frequent first.
font_size_color['overall']['colors']

In [None]:
# Document-wide size tallies: rounded size -> span count, most frequent first.
font_size_color['overall']['sizes']

In [None]:
# Document-wide font tallies: mapped font name -> span count, most frequent first.
font_size_color['overall']['fonts']

In [136]:
# Count (mapped font, size) combinations over the text blocks. Sizes are left
# unrounded here, so near-identical float sizes are tallied separately.
pair_counts = generate_pair_counts(text_data, ('font', 'size'), round_size=False)

In [137]:
# Whole-document (font, size) pair tallies, most frequent first.
pair_counts['overall']

{('Times-Roman', 7.0): 255,
 ('Times-Bold', 8.0): 209,
 ('Times-Roman', 8.0): 182,
 ('Times-Bold', 7.0): 154,
 ('Times-Roman', 10.0): 137,
 ('Times-Bold', 9.0): 57,
 ('Times-Bold', 10.0): 53,
 ('Times-Roman', 7.959899425506592): 48,
 ('Times-Roman', 9.0): 41,
 ('Times-Roman', 6.5): 34,
 ('Times-Roman', 3.5952820777893066): 29,
 ('Times-Bold', 6.5): 28,
 ('Times-Roman', 9.5): 27,
 ('Times-Roman', 3.700000047683716): 26,
 ('Times-Roman', 6.498578071594238): 25,
 ('Times-Roman', 4.109248161315918): 24,
 ('Times-Roman', 4.109339237213135): 24,
 ('Times-Roman', 4.109270095825195): 24,
 ('Times-Roman', 6.0): 23,
 ('Helvetica', 7.0): 20,
 ('Times-Roman', 6.599999904632568): 20,
 ('Times-Bold', 24.0): 17,
 ('Times-Roman', 6.634067535400391): 16,
 ('Times-Roman', 5.878775596618652): 16,
 ('Helvetica', 8.0): 14,
 ('Times-Bold', 8.132010459899902): 12,
 ('Times-Bold', 16.0): 12,
 ('Times-Roman', 4.109289646148682): 12,
 ('Times-Roman', 4.109335899353027): 12,
 ('Times-Roman', 4.109299659729004): 