In [7]:
import pdfplumber
import fitz
import camelot
import warnings , math, collections , os, re
import pickle

warnings.filterwarnings("ignore", category=UserWarning) 

IMPORTANT CLEANING FUNCTIONS

In [5]:

# Boilerplate strings (disclaimers, scheme taglines, stray punctuation and
# isolated numeric fragments) to be filtered out of extracted text.
# Presumably this is the list handed to check_if_redundant_text, which compares
# by exact equality, so entries must match the extracted text character for
# character.
# NOTE(review): values such as "93.72" look specific to one fact sheet - confirm
# they are still wanted before reusing this list on other documents.
removeContent =[
    'Mutual fund investments are subject to market risks, read all scheme related documents carefully.',
    '(An open-ended scheme investing across large cap, midcap and small cap stocks)',
    "(An open-ended Equity Linked Saving Scheme with a statutory lock-in of 3 years and tax benefit.)",
    "(An open-ended dynamic equity scheme investing across large cap, mid cap, small cap stocks)",
    "(An open-ended equity scheme following momentum theme)",
    "(An open-ended equity scheme following special situations theme)",
    ".",
    "st",
    "Note:",
    "Disclaimer",
    "93.72",
    "risk-o-meter",
    "scheme risk-o-meter",
    "*Investors should consult their financial advisers if in doubt about whether the product is suitable for them.",
    "94.87",
    "(An open-ended dynamic asset allocation fund)",
    "97.11",
    ":"
]

#regarding colors

def normalize_rgb_color(color):
    """Return ``color`` as a tuple of floats scaled by 1/255.

    Args:
        color: Either a packed integer (red in bits 16-23, green in bits 8-15,
            blue in bits 0-7; higher bits are ignored) or an iterable of
            channel values on a 0-255 scale.

    Returns:
        tuple: The scaled channels. When every channel is above 0.9 (i.e. the
        colour is close to white) the fixed orange ``(1.0, 0.647, 0.0)`` is
        returned instead.
    """
    if isinstance(color, int):
        red = (color >> 16) & 255
        green = (color >> 8) & 255
        blue = color & 255
        color = (red, green, blue)

    scaled = tuple(channel / 255 for channel in color)

    # Near-white is swapped for orange.
    near_white = all(value > 0.9 for value in scaled)
    return (1.0, 0.647, 0.0) if near_white else scaled

def map_font(font_name):
    """Pick a replacement font name from keywords found in ``font_name``.

    Matching is case-insensitive and by substring. Rules are tried in order
    and the first hit wins:

    1. contains ``bold``                               -> ``'Times-Bold'``
    2. contains ``medium``, ``regular``, ``light``,
       ``extralight``, ``italic`` or ``condens``       -> ``'Times'``
    3. contains ``black`` or ``extrabold``             -> ``'Times-Bold'``
    4. anything else                                   -> ``'Helvetica'``

    Args:
        font_name (str): Font name as reported for a text span.

    Returns:
        str: One of ``'Times-Bold'``, ``'Times'`` or ``'Helvetica'``.
    """
    lowered = font_name.lower()

    def mentions(*keywords):
        # True when at least one keyword occurs anywhere in the lowered name.
        return any(keyword in lowered for keyword in keywords)

    if mentions('bold'):
        return 'Times-Bold'
    # NOTE(review): italic/condensed faces get the same upright 'Times' as
    # regular ones, and 'Times' may not be a font name the PDF writer
    # recognises - confirm against the PyMuPDF Base-14 names.
    if mentions('medium', 'regular', 'light', 'extralight', 'italic', 'condens'):
        return 'Times'
    if mentions('black', 'extrabold'):
        return 'Times-Bold'
    return 'Helvetica'

#regarding tables

def adjust_bbox(bbox, direction, pixels):
    """
    Move one edge of a bounding box outward (or inward) by ``pixels``.

    Args:
        bbox (tuple): The original bounding box (x0, y0, x1, y1).
        direction (str): Edge to move: 'top', 'bottom', 'left' or 'right'.
        pixels (int): Amount to move the edge by. Positive values push the
            edge away from the box (expand), negative values pull it in.

    Returns:
        fitz.Rect: A new rectangle carrying the adjusted coordinates.

    Raises:
        ValueError: If ``direction`` is not one of the four edge names.
    """
    left, top, right, bottom = bbox
    edges = {'left': left, 'top': top, 'right': right, 'bottom': bottom}

    if direction in ('top', 'left'):
        # These edges hold the smaller coordinate, so expanding means subtracting.
        edges[direction] -= pixels
    elif direction in ('bottom', 'right'):
        edges[direction] += pixels
    else:
        raise ValueError("Invalid direction. Choose from 'top', 'bottom', 'left', 'right'.")

    return fitz.Rect(edges['left'], edges['top'], edges['right'], edges['bottom'])

def is_table_large_enough(table_bbox, min_width, min_height):
    """Tell whether a table box is strictly wider AND strictly taller than the minimums.

    Args:
        table_bbox: Bounding box as (x0, y0, x1, y1).
        min_width: Width the box must exceed.
        min_height: Height the box must exceed.

    Returns:
        bool: True only when both dimensions exceed their thresholds.
    """
    left, top, right, bottom = table_bbox
    width = right - left
    height = bottom - top
    return width > min_width and height > min_height

#other texts

def create_new_file(file_name, base_dir=None):
    """Make sure an (initially empty) file exists and return its full path.

    Args:
        file_name (str): File name, optionally with sub-folders, relative to
            ``base_dir``.
        base_dir (str, optional): Folder the file lives under. When omitted,
            the notebook-level global ``path`` is used, which preserves the
            original behaviour but requires the configuration cell to have
            been run first.

    Returns:
        str: ``os.path.join(base_dir, file_name)``.

    Raises:
        NameError: If ``base_dir`` is omitted and the global ``path`` has not
            been defined yet.
    """
    if base_dir is None:
        base_dir = path  # notebook global, defined in the configuration cell

    full_file_name = os.path.join(base_dir, file_name)

    # os.makedirs('') raises, so only create a parent folder when there is one.
    parent_dir = os.path.dirname(full_file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if not os.path.exists(full_file_name):
        # Append mode creates the file if missing but can never truncate a
        # file that appeared between the existence check and the open.
        with open(full_file_name, 'a'):
            pass

    return full_file_name

def check_if_redundant_text(text, removeContent):
    """Report whether ``text`` should be KEPT.

    Despite the name, the result is inverted: the function answers False when
    ``text`` is exactly equal to one of the entries in ``removeContent`` and
    True when it matches none of them.

    Args:
        text: The candidate text.
        removeContent: Iterable of strings considered redundant.

    Returns:
        bool: False if ``text`` equals any entry, True otherwise.
    """
    return not any(text == unwanted for unwanted in removeContent)



PDF DATA SEGREGATION AND EXTRACTION

In [7]:
"""Open the PDF and extract all blocks of text, images, and other content, while collecting examples of text for each font size and color.
Args:input_pdf_path (str): Path to the input PDF.
pagesToIgnore (list): List of page numbers to ignore.
Returns:list: A list of pages, where each page is a dictionary containing blocks of content and examples of text for each color and size."""
def extract_pdf_blocks(input_pdf_path, pagesToIgnore):
    """Collect PyMuPDF blocks and pdfplumber table boxes for every wanted page.

    The PDF is opened twice: with PyMuPDF for ``get_text("dict")`` blocks and
    the page rectangle, and with pdfplumber for table detection.

    Args:
        input_pdf_path (str): Path to the input PDF.
        pagesToIgnore (list): Zero-based page indices to skip.

    Returns:
        dict: ``{"blocks_data": [...], "total_pages": int}``. Each entry of
        ``blocks_data`` is a dict with keys ``"blocks"``, ``"table_bboxes"``
        (list of ``fitz.Rect``) and ``"page_rect"``. Skipped pages produce no
        entry, so when ``pagesToIgnore`` is non-empty a list index is no longer
        the page index. ``total_pages`` is always the page count of the whole
        document.
    """
    # Tables must be strictly larger than this to be kept.
    min_table_width, min_table_height = 70, 50
    # Kept tables have their bottom edge pulled up by this many units.
    bottom_shrink = -6

    page_blocks_data = []
    input_doc = fitz.open(input_pdf_path)
    try:
        with pdfplumber.open(input_pdf_path) as pdf:
            for page_number, pdf_page in enumerate(pdf.pages):
                if page_number in pagesToIgnore:
                    continue

                doc_page = input_doc[page_number]
                blocks = doc_page.get_text("dict")["blocks"]

                filtered_table_bboxes = [
                    adjust_bbox(table.bbox, 'bottom', bottom_shrink)
                    for table in pdf_page.find_tables()
                    if is_table_large_enough(table.bbox, min_table_width, min_table_height)
                ]

                page_blocks_data.append({
                    "blocks": blocks,
                    "table_bboxes": filtered_table_bboxes,
                    "page_rect": doc_page.rect
                })

        return {
            "blocks_data": page_blocks_data,
            "total_pages": input_doc.page_count
        }
    finally:
        # Release the PyMuPDF handle even if extraction fails part-way.
        input_doc.close()

In [8]:
""" Removes image blocks from the provided blocks data and creates a PDF without these blocks.
Args:blocks_data (list): List of dictionaries containing page data including text and image blocks.
output_pdf_path (str): Path to save the output PDF.
Returns:tuple: A tuple containing two lists - updated block data without image blocks and data of removed image blocks."""
def seperate_text_image_blocks(document_blocks_data):
    """Split every page's blocks into image blocks and everything else.

    A block counts as an image block when it has an ``"image"`` key.

    Args:
        document_blocks_data (dict): Must provide ``"blocks_data"`` (a list of
            page dicts with ``"blocks"``, ``"page_rect"`` and
            ``"table_bboxes"``) and ``"total_pages"``.

    Returns:
        tuple: ``(document, images)`` where ``document`` has the same layout as
        the input but with image blocks removed from each page, and ``images``
        is a list with one list of image blocks per page.
    """
    pages_without_images = []
    images_by_page = []

    for page in document_blocks_data['blocks_data']:
        rect = page["page_rect"]
        all_blocks = page["blocks"]

        image_blocks = [blk for blk in all_blocks if "image" in blk.keys()]
        other_blocks = [blk for blk in all_blocks if "image" not in blk.keys()]

        pages_without_images.append({
            "blocks": other_blocks,
            "page_rect": rect,
            "table_bboxes": page['table_bboxes']
        })
        images_by_page.append(image_blocks)

    stripped_document = {
        "blocks_data": pages_without_images,
        "total_pages": document_blocks_data['total_pages']
    }
    return stripped_document, images_by_page

In [9]:
"""Separates text and tabular data based on bounding boxes (bboxes).
Args:document_blocks_data (dict): Dictionary containing block data for each page.
Returns:tuple: A tuple containing two lists:
- updated block data with non-tabular (text) blocks
- updated block data with tabular block"""
def separate_text_and_tabular_blocks(document_blocks_data):
    """Split each page's blocks into table blocks and free-text blocks.

    A block is treated as tabular as soon as one of its spans has a bbox that
    lies entirely within any of the page's table boxes. Blocks without a
    ``"lines"`` key always land on the textual side.

    Args:
        document_blocks_data (dict): Needs ``"blocks_data"``, a list of page
            dicts with ``"blocks"``, ``"table_bboxes"`` and ``"page_rect"``.

    Returns:
        tuple: ``(textual, tabular)``. Each is ``{"blocks_data": [...]}`` with
        one page dict per input page (keys ``"blocks"``, ``"page_rect"``,
        ``"table_bboxes"``). No ``"total_pages"`` key is carried over.
    """
    def lies_within_a_table(span_box, table_boxes):
        # Full containment: all four edges of the span sit inside the table.
        return any(
            span_box[0] >= box[0]
            and span_box[1] >= box[1]
            and span_box[2] <= box[2]
            and span_box[3] <= box[3]
            for box in table_boxes
        )

    textual_pages = []
    tabular_pages = []

    for page in document_blocks_data['blocks_data']:
        page_blocks = page['blocks']
        tables = page['table_bboxes']

        in_table = []
        free_text = []

        for block in page_blocks:
            block_lines = block['lines'] if 'lines' in block else ()
            span_boxes = (
                span.get('bbox', [0, 0, 0, 0])
                for line in block_lines
                for span in line['spans']
            )
            # any() stops at the first span found inside a table.
            if any(lies_within_a_table(box, tables) for box in span_boxes):
                in_table.append(block)
            else:
                free_text.append(block)

        rect = page['page_rect']
        tabular_pages.append({
            "blocks": in_table,
            "page_rect": rect,
            "table_bboxes": tables
        })
        textual_pages.append({
            "blocks": free_text,
            "page_rect": rect,
            "table_bboxes": tables
        })

    return {"blocks_data": textual_pages}, {"blocks_data": tabular_pages}


In [10]:
""" Creates a pdf for the data
Args:document_data (dict): data to make pdf of.
output_path (str): Path to save the modified PDF.
"""
def create_pdf_file(document_data, output_pdf_path):
    """Render extracted span text and table outlines into a new PDF.

    One output page is created per entry of ``document_data['blocks_data']``,
    sized from that entry's ``page_rect``. Every span of every block that has
    ``"lines"`` is written with a substitute font and colour, and each table
    bbox is outlined in red.

    Args:
        document_data (dict): ``{"blocks_data": [...]}`` where each page dict
            provides ``"page_rect"``, ``"blocks"`` and ``"table_bboxes"``.
        output_pdf_path (str): Where the generated PDF is saved.

    Returns:
        None. The PDF is written to ``output_pdf_path``.
    """
    def draw_span(output_page, span):
        # Write one span's text onto the page.
        text = span["text"].strip()
        if not text:
            return  # nothing visible to draw

        bbox = span.get("bbox", [0, 0, 0, 0])
        # insert_text positions text by the bottom-left (baseline) point of the
        # first character. The span's own "origin" is that point; the bbox
        # top-left would draw the text roughly one line too high. Fall back to
        # the bbox bottom-left when no origin is present.
        insertion_point = span.get("origin", (bbox[0], bbox[3]))
        size = round(float(span.get("size", 12)))  # whole-number font size
        color = normalize_rgb_color(span.get("color", (0, 0, 0)))  # default black
        fontname = map_font(span.get("font", "Helvetica"))

        try:
            output_page.insert_text(
                insertion_point,
                text,
                fontsize=size,
                fontname=fontname,
                color=color,
            )
        except Exception:
            # Deliberate best-effort: the mapped font name may be rejected, so
            # retry with the built-in "helv" font instead of aborting the page.
            output_page.insert_text(
                insertion_point,
                text,
                fontsize=size,
                fontname="helv",
                color=color,
            )

    output_doc = fitz.open()
    try:
        for page_data in document_data['blocks_data']:
            page_rect = page_data['page_rect']
            output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

            for block in page_data['blocks']:
                if "lines" not in block:
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        draw_span(output_page, span)

            # Outline the detected tables.
            for table_bbox in page_data['table_bboxes']:
                output_page.draw_rect(fitz.Rect(table_bbox), color=(.8, 0, 0), width=0.5)

        output_doc.save(output_pdf_path)
    finally:
        # Always release the document, including when rendering or saving fails.
        output_doc.close()

DRY RUN CODE

In [5]:
# Root folder of the working repository; every other location below is built
# from it by string concatenation with Windows-style separators.
# NOTE(review): this is a hard-coded, user-specific absolute path - it must be
# edited before the notebook can run on another machine.
#path = r"C:\Users\rando\OneDrive\Documents\mywork-repo"
path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo"

#file data paths
samco_path = path + r"\files\SamcoFactSheet2024.pdf"
tata_path = path + r"\files\TataFactSheet2024.pdf"

#dry run paths
# dry_run_path already includes `path`; do not prefix it again.
dry_run_path = path + r"\output\DryRun.pdf"

# Relative to `path` (unlike dry_run_path); it is joined as `path + pickle_indices` when opened.
pickle_indices = r"\output\pkl\indices_var.pkl"

In [124]:
#SAMCO PATHS
#samco output path
# no_image_path = path + r"\output\sam\NoImgPdf.pdf"
# textual_pdf_path = path + r"\output\sam\TextualPdf.pdf"
# tabular_pdf_path = path + r"\output\sam\TabularPdf.pdf"

#pickle data paths samco
# pickle_text = r"\output\pkl\sam\textual_data.pkl"
# pickle_tab = r"\output\pkl\sam\tabular_data.pkl"
# pickle_nonimg = r"\output\pkl\sam\nonimg_data.pkl"

In [26]:
#TATA PATHS
#tata output paths (absolute: already prefixed with `path`)
no_image_path = path +r"\output\tata\TatanoImgPdf.pdf"
textual_pdf_path = path + r"\output\tata\TatatextalPdf.pdf"
tabular_pdf_path = path + r"\output\tata\TatatabularPdf.pdf"

#pickle data paths tata (relative: used as `path + <name>` when opened)
pickle_text = r"\output\pkl\tata\textual_data.pkl"
pickle_tab = r"\output\pkl\tata\tabular_data.pkl"
pickle_nonimg = r"\output\pkl\tata\nonimg_data.pkl"
pickle_all = r'\output\pkl\tata\all_data.pkl'

In [21]:
# Extract every page of the Tata fact sheet (no pages ignored).
blocks_data = extract_pdf_blocks(tata_path, [])
# Remove image blocks, then split the remainder into free-text and table blocks.
non_image_data, image_data = seperate_text_image_blocks(blocks_data)
textual_data,tabular_data = separate_text_and_tabular_blocks(non_image_data)

In [125]:
# Write each intermediate result to its own PDF (paths come from the Tata paths cell).
create_pdf_file(non_image_data,no_image_path)
create_pdf_file(textual_data, textual_pdf_path)
create_pdf_file(tabular_data, tabular_pdf_path)
print("\n Success !!")


 Success !!


In [33]:
# Blocks of the entry at index 34, ordered by bbox[1] then bbox[0]
# (top-to-bottom, then left-to-right).
sorted34 = sorted(blocks_data['blocks_data'][34]['blocks'], key = lambda x: (x['bbox'][1], x['bbox'][0]))

In [None]:
# Display the sorted blocks.
sorted34

In [27]:
# Persist intermediate results so later sessions can skip extraction.
# The textual/tabular dumps are currently disabled.
# NOTE(review): the target folders must already exist - open() does not create them.
# with open(path + pickle_text , 'wb') as file:
#     pickle.dump(textual_data, file)
    
# with open(path + pickle_tab , 'wb') as file:
#     pickle.dump(tabular_data, file)
    
with open(path + pickle_nonimg , 'wb') as file:
    pickle.dump(non_image_data, file)
    
with open(path + pickle_all, 'wb') as file:
    pickle.dump(blocks_data,file)

In [24]:
# Inspect the image-free data of the entry at index 34.
non_image_data['blocks_data'][34]

{'blocks': [{'number': 0,
   'type': 0,
   'bbox': (178.50050354003906,
    492.3822021484375,
    591.50830078125,
    541.4861450195312),
   'lines': [{'spans': [{'size': 6.0,
       'flags': 0,
       'font': 'Swiss721BT-RomanCondense',
       'color': -14475488,
       'ascender': 0.9629999995231628,
       'descender': -0.23600000143051147,
       'text': 'Past performance may or may not be sustained in the future. Returns greater than 1 year period are compounded annualized. Income Distribution cum capital withdrawals are assumed ',
       'origin': (178.50050354003906, 498.1601867675781),
       'bbox': (178.50050354003906,
        492.3822021484375,
        591.50830078125,
        499.576171875)}],
     'wmode': 0,
     'dir': (1.0, 0.0),
     'bbox': (178.50050354003906,
      492.3822021484375,
      591.50830078125,
      499.576171875)},
    {'spans': [{'size': 6.0,
       'flags': 0,
       'font': 'Swiss721BT-RomanCondense',
       'color': -14475488,
       'ascender': 

In [20]:
"""Open the pdf , get all text data and blocks and draw a boundary along each boundary boxes
    Args:input_pdf_path(str) , output_pdf_path (str)
    Returns: nothing, a new pdf created"""
    
def draw_table_boundaries(input_pdf_path, output_pdf_path):
    """Outline every table pdfplumber finds and save the result as a new PDF.

    Unlike extract_pdf_blocks, no size filter is applied: every detected table
    is drawn.

    Args:
        input_pdf_path (str): PDF to analyse.
        output_pdf_path (str): Where the annotated copy is written.

    Returns:
        None. The annotated PDF is written to ``output_pdf_path``.
    """
    doc = fitz.open(input_pdf_path)
    try:
        with pdfplumber.open(input_pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages):
                fitz_page = doc[page_number]
                for table in page.find_tables():
                    # Blue outline, drawn with overlay=False (behind existing content).
                    fitz_page.draw_rect(
                        fitz.Rect(table.bbox), color=(0, 0, 1), width=1.5, overlay=False
                    )
        doc.save(output_pdf_path)
    finally:
        # Close the PyMuPDF handle even if detection, drawing or saving fails.
        doc.close()

# dry_run_path is already an absolute path built from `path`; prefixing it
# with `path` again produced a doubled, invalid location.
draw_table_boundaries(no_image_path, dry_run_path)

HIGHLIGHT CORE INDEXES

In [8]:
""" Get the indices to be checked from the dumped file, create a grand list to check the content"""
# SECURITY: pickle.load can execute arbitrary code from the file - only load
# pickles that were produced by this project.
with open(path + pickle_indices , 'rb') as file:
    indices = pickle.load(file)  
# Flatten the mapping into one list: each key followed by the items of its value.
# Assumes every value is a list (it is concatenated with [k]) - TODO confirm.
final_indices = []
for k,v in indices.items():
   temp = [k] + v
   for t in temp:
      final_indices.append(t)
      
# De-duplicate. Going through set() means the resulting order is not guaranteed.
final_indices = list(set(final_indices))

In [9]:
def add_highlights_to_data(indices_variations, data):
    """Mark spans whose text mentions one of the given terms.

    Only spans whose ``flags`` value is 20 or 25 are examined (the original
    author treats these as bold). Matching is case-insensitive and requires
    the term to stand on its own, i.e. not be glued to a letter, digit or
    underscore on either side.

    Args:
        indices_variations: Iterable of search terms (strings). Empty or
            whitespace-only terms are ignored.
        data (dict): ``{"blocks_data": [...]}`` in the layout produced by the
            extraction functions. It is modified in place.

    Returns:
        dict: The same ``data`` object; matching spans gain
        ``span['highlighted'] = True``.
    """
    # Build the patterns once instead of once per span and term.
    # (?<!\w) / (?!\w) are equivalent to \b for terms that start and end with a
    # word character, but - unlike \b - still match terms that start or end
    # with punctuation such as "(TRI)".
    term_patterns = [
        re.compile(r'(?<!\w)' + re.escape(term.lower()) + r'(?!\w)')
        for term in indices_variations
        if term.strip()  # an empty term would match almost any text
    ]

    for page_data in data['blocks_data']:
        for block in page_data['blocks']:
            if "lines" not in block:  # e.g. image blocks
                continue
            for line in block['lines']:
                for span in line['spans']:
                    if span['flags'] not in (20, 25):
                        continue
                    span_text = span['text'].lower()
                    # any() stops at the first matching term.
                    if any(pattern.search(span_text) for pattern in term_patterns):
                        span['highlighted'] = True
    return data
def check_indices_and_highlight(indices_variations, path):
    """Highlight spans mentioning any search term and save an annotated copy.

    Spans are considered only when their ``flags`` value is one of
    20, 25, 4, 16 or 0. Matching is case-insensitive and requires the term not
    to be glued to a letter, digit or underscore on either side.

    Args:
        indices_variations: Iterable of search terms (strings). Empty or
            whitespace-only terms are ignored.
        path (str): PDF to scan. (This parameter shadows the notebook-level
            ``path`` variable inside the function.)

    Returns:
        tuple: ``(pages, output_path)``. ``pages`` is the ascending list of
        1-based page numbers containing at least one match. ``output_path`` is
        the annotated file, named by inserting ``_highlighted`` before the
        extension, or ``None`` when nothing matched (no file is written).
    """
    # Compile once. (?<!\w)/(?!\w) behave like \b for terms that begin and end
    # with a word character, and additionally work for terms that begin or end
    # with punctuation such as "(TRI)".
    term_patterns = [
        re.compile(r'(?<!\w)' + re.escape(term.lower()) + r'(?!\w)')
        for term in indices_variations
        if term.strip()  # an empty term would match almost any text
    ]
    # NOTE(review): the meaning of these flag values is taken from the
    # original comment ("bold or large text") - confirm against PyMuPDF.
    accepted_flags = (20, 25, 4, 16, 0)

    important_pages = set()
    doc = fitz.open(path)
    try:
        for page_number, page in enumerate(doc):
            for block in page.get_text('dict')["blocks"]:
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        if span['flags'] not in accepted_flags:
                            continue
                        span_text = span['text'].lower()
                        # One highlight per span, however many terms match.
                        if any(pattern.search(span_text) for pattern in term_patterns):
                            important_pages.add(page_number + 1)
                            page.add_highlight_annot(fitz.Rect(span['bbox']))

        if not important_pages:
            return [], None

        # splitext only touches the final extension, so the output can never
        # collide with the input or rewrite ".pdf" elsewhere in the path.
        root, extension = os.path.splitext(path)
        output_path = root + '_highlighted' + (extension or '.pdf')
        doc.save(output_path)
        return sorted(important_pages), output_path
    finally:
        # Close the document on every exit path, including errors.
        doc.close()

In [13]:
# Fact sheet to scan for index names (March 2022 edition, going by the file name).
file_path = path  + r"\files\factsheet-march-2022.pdf"
file_path

'C:\\Users\\Kaustubh.keny\\OneDrive - Cogencis Information Services Ltd\\Documents\\mywork-repo\\files\\factsheet-march-2022.pdf'

In [14]:

# Returns (matching 1-based page numbers, path of the highlighted copy).
check_indices_and_highlight(final_indices, file_path)
#textual_data_high = add_highlights_to_data(final_indices,non_image_data)

([2,
  3,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44],
 'C:\\Users\\Kaustubh.keny\\OneDrive - Cogencis Information Services Ltd\\Documents\\mywork-repo\\files\\factsheet-march-2022_highlighted.pdf')

In [30]:
"""Open the pdf , get all text data and blocks and draw a boundary along each boundary boxes
    Args:input_pdf_path(str) , output_pdf_path (str)
    Returns: nothing, a new pdf created
"""
def draw_boundaries_on_pdf(input_pdf_path, path):
    """Outline every text block of a PDF in orange and save an annotated copy.

    Args:
        input_pdf_path (str): PDF to read.
        path (str): Path used only to derive the output name: the result is
            saved next to it with ``_block_highlighted`` inserted before the
            extension. (Shadows the notebook-level ``path`` variable inside
            the function.)

    Returns:
        None. The annotated PDF is written to the derived location.
    """
    doc = fitz.open(input_pdf_path)
    try:
        for page in doc:
            for block in page.get_text("blocks"):
                bbox = block[:4]  # the first four items of a block are its bbox
                # Orange outline, drawn with overlay=False (behind existing content).
                page.draw_rect(bbox, color=(1.0, 0.647, 0.0), width=1.5, overlay=False)

        # splitext only touches the final extension, so the output name always
        # differs from `path`, even when it has no ".pdf" suffix.
        root, extension = os.path.splitext(path)
        output_path = root + '_block_highlighted' + (extension or '.pdf')
        doc.save(output_path)
    finally:
        # Close the document on every exit path, including errors.
        doc.close()

# The second argument only seeds the output name: the result is saved as
# dry_run_path with "_block_highlighted" inserted before ".pdf".
# file_path = path + r'\output\DryRun.pdf'
draw_boundaries_on_pdf(no_image_path, dry_run_path)

In [31]:
"""Open the pdf , get all text data and blocks and draw a boundary along each boundary boxes
    Args:input_pdf_path(str) , output_pdf_path (str)
    Returns: nothing, a new pdf created
"""
def draw_boundaries_on_lines(input_pdf_path, path):
    """Outline every text line of a PDF in green and save an annotated copy.

    Args:
        input_pdf_path (str): PDF to read.
        path (str): Path used only to derive the output name: the result is
            saved next to it with ``_line_highlighted`` inserted before the
            extension. (Shadows the notebook-level ``path`` variable inside
            the function.)

    Returns:
        None. The annotated PDF is written to the derived location.
    """
    doc = fitz.open(input_pdf_path)
    try:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:  # e.g. image blocks
                    continue
                for line in block["lines"]:
                    # Each line dict carries its own bbox.
                    page.draw_rect(line["bbox"], color=(.4, 0.647, 0.0), width=1.5, overlay=False)

        # splitext only touches the final extension, so the output name always
        # differs from `path`, even when it has no ".pdf" suffix.
        root, extension = os.path.splitext(path)
        output_path = root + '_line_highlighted' + (extension or '.pdf')
        doc.save(output_path)
    finally:
        # Close the document on every exit path, including errors.
        doc.close()
    
# The second argument only seeds the output name: the result is saved as
# dry_run_path with "_line_highlighted" inserted before ".pdf".
# file_path = path + r'\output\DryRun.pdf'
draw_boundaries_on_lines(no_image_path, dry_run_path)

In [82]:
# Inspect the image-free data of the entry at index 14.
non_image_data['blocks_data'][14]

{'blocks': [{'number': 0,
   'type': 0,
   'bbox': (36.0, 34.68272018432617, 267.84002685546875, 55.882694244384766),
   'lines': [{'spans': [{'size': 8.0,
       'flags': 20,
       'font': 'Swiss721BT-BoldCondensed',
       'color': -14475488,
       'ascender': 0.9629999995231628,
       'descender': -0.2370000034570694,
       'text': '2.\t Securities Transaction Tax (STT)',
       'origin': (36.0, 42.38671875),
       'bbox': (36.0,
        34.68272018432617,
        151.5360107421875,
        44.282718658447266)}],
     'wmode': 0,
     'dir': (1.0, 0.0),
     'bbox': (36.0, 34.68272018432617, 151.5360107421875, 44.282718658447266)},
    {'spans': [{'size': 8.0,
       'flags': 20,
       'font': 'Swiss721BT-BoldCondensed',
       'color': -14475488,
       'ascender': 0.9629999995231628,
       'descender': -0.2370000034570694,
       'text': '\t',
       'origin': (36.0, 53.9866943359375),
       'bbox': (36.0, 46.28269577026367, 38.0, 55.882694244384766)}],
     'wmode': 0,
  

FILTER PAGES BASED ON IMPORTANCE

In [72]:
"""Gets the pages to select based on REGEX passed, assuming first block gives title moslty after sort
    Args:regex(str) , document data (dict)
    Returns: list of important pages"""
    
def get_imp_pages(document_data, regex):
    """Find pages whose top-most block matches ``regex``.

    For each page the block with the smallest (bbox[1], bbox[0]) - i.e. the
    top-most, then left-most block - is taken as the page title. The page is
    selected when the lower-cased text of any span in that block matches.

    Args:
        document_data (dict): ``{"blocks_data": [...]}`` with one dict per
            page, each providing ``"blocks"``.
        regex (str): Pattern searched in each span's lower-cased text, so it
            should itself be written in lower case.

    Returns:
        list: Ascending, duplicate-free indices into
        ``document_data['blocks_data']`` of the selected pages. Pages with no
        blocks are skipped.
    """
    title_pattern = re.compile(regex)
    imp_pages = []

    for pgn, page in enumerate(document_data['blocks_data']):
        blocks = page['blocks']
        if not blocks:
            # A page can be empty (e.g. after image blocks were removed);
            # there is no title block to inspect.
            continue

        # min() picks the same block that sorting and taking [0] would.
        first_block = min(blocks, key=lambda k: (k['bbox'][1], k['bbox'][0]))

        title_spans = (
            span
            for line in first_block.get('lines', ())
            for span in line['spans']
        )
        # any() stops at the first matching span, so a page is added at most once.
        if any(title_pattern.search(span['text'].lower()) for span in title_spans):
            imp_pages.append(pgn)

    return imp_pages
    

In [73]:
# Patterns are matched against lower-cased span text, hence the lower-case regexes.
pageSet1 = get_imp_pages(non_image_data, "^samco|^tata|fund$") #primary set
pageSet2 = get_imp_pages(non_image_data, "^open")#other regexes can select important pages for other kinds of data

In [None]:
# Display the primary set of selected page indices.
pageSet1

In [97]:
"""Extract and sort lines for a two-column layout, returning all data associated with each line without duplicates.
    Args:data (list): The raw extracted data from Fitz.
        split_x_coord (float): The x-coordinate dividing the left and right columns.
    Returns:list: Sorted list of all unique line data in reading order (top-to-bottom, left-to-right).
    """
def sort_data_blocks(document_data, split_x_coord, pages_to_sort):
    """Arrange the text lines of selected pages into two sorted columns.

    Every line that has both a bbox and at least one span is assigned to the
    left column when its bbox x0 is below ``split_x_coord`` and to the right
    column otherwise. Each column is then ordered top-to-bottom and, for equal
    tops, left-to-right. No de-duplication is performed.

    Args:
        document_data (dict): ``{"blocks_data": [...]}`` with one dict per page.
        split_x_coord (float): x-coordinate separating the two columns.
        pages_to_sort: Container of page indices (positions in
            ``blocks_data``) to process; all other pages are skipped.

    Returns:
        list: One dict per processed page, in page order, shaped
        ``{"page": index, "col1": [...], "col2": [...]}``. Column entries are
        ``{"bbox": ..., "spans": ..., "line": <original line dict>}``.
    """
    def reading_order(entry):
        # Sort key: top edge first, then left edge.
        return (entry['bbox'][1], entry['bbox'][0])

    pages = document_data['blocks_data']
    arranged = []

    for page_index, page in enumerate(pages):
        if page_index not in pages_to_sort:
            continue

        left_column = []
        right_column = []

        for block in page.get('blocks', []):
            for line in block.get('lines', []):
                line_box = line.get('bbox', None)
                line_spans = line.get('spans', [])
                if not (line_box and line_spans):
                    continue  # nothing usable on this line
                entry = {'bbox': line_box, 'spans': line_spans, 'line': line}
                target = left_column if line_box[0] < split_x_coord else right_column
                target.append(entry)

        arranged.append({
            "page": page_index,
            "col1": sorted(left_column, key=reading_order),
            "col2": sorted(right_column, key=reading_order)
        })

    return arranged

In [98]:
# Split the lines of the primary pages into two columns at x = 173.
# NOTE(review): 173 is a hard-coded, layout-specific column boundary - confirm per document.
sortedData = sort_data_blocks(non_image_data, 173, pageSet1)

In [None]:
# Inspect the first processed page.
sortedData[0]