In [49]:
import pdfplumber
import fitz
import camelot
import warnings , math, collections , os, re
import pickle

warnings.filterwarnings("ignore", category=UserWarning) 

In [5]:
# Root folder of the working copy; the other paths in this notebook are appended to it.
# NOTE(review): machine-specific absolute path - edit before running on another machine.
#path = r"C:\Users\rando\OneDrive\Documents\mywork-repo"
path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo"

# Input fact sheets, written relative to `path`. They start with a backslash because
# full paths are built by plain string concatenation (path + samco_path).
samco_path = r"\files\SamcoFactSheet2024.pdf"
tata_path = r"\files\TataFactSheet2024.pdf"


IMPORTANT CLEANING FUNCTIONS

In [7]:
# Shared settings used by the cleaning helpers.

# Boilerplate strings to be treated as noise. The whole list is lower-cased further
# down, so comparisons against it are expected to use lower-cased text.
removeContent = [
    'Mutual fund investments are subject to market risks, read all scheme related documents carefully.',
    '(An open-ended scheme investing across large cap, midcap and small cap stocks)',
    "(An open-ended Equity Linked Saving Scheme with a statutory lock-in of 3 years and tax benefit.)",
    "(An open-ended dynamic equity scheme investing across large cap, mid cap, small cap stocks)",
    "(An open-ended equity scheme following momentum theme)",
    "(An open-ended equity scheme following special situations theme)",
    ".",
    "st",
    "Note:",
    "Disclaimer",
    "93.72",
    "risk-o-meter",
    "scheme risk-o-meter",
    "*Investors should consult their financial advisers if in doubt about whether the product is suitable for them.",
    "94.87",
    "(An open-ended dynamic asset allocation fund)",
    "97.11",
    ":",
]

# Every number from 1 up to the page count of the Samco fact sheet is added as well.
document = fitz.open(path + samco_path)
total_pages = document.page_count
document.close()
removeContent.extend(str(page_no) for page_no in range(1, total_pages + 1))

removeContent = list(map(str.lower, removeContent))

# Page indices handed to extract_pdf_blocks as pages to skip; empty means skip nothing.
pagesToIgnore = []

# The four axis-aligned unit vectors.
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
textDirection = [(1.0, 0), (-1.0, 0), (0, 1.0), (0, -1.0)]

#regarding colors

def extract_rgb(color_int):
    """Split a packed integer colour into its (red, green, blue) bytes.

    Red is read from bits 16-23, green from bits 8-15 and blue from bits 0-7;
    any higher bits are ignored.
    """
    return tuple(int((color_int >> shift) & 0xFF) for shift in (16, 8, 0))
    
def adjust_color_if_white(rgb):
    """Return orange (255, 165, 0) when every component of rgb is at least 230.

    Any other colour is returned unchanged.
    """
    white_threshold = 230
    orange = (255, 165, 0)
    near_white = all(component >= white_threshold for component in rgb)
    return orange if near_white else rgb

def is_white_or_shade(color):
    """Tell whether every component of color is at least 240 (white or near white)."""
    threshold = 240
    for component in color:
        if not component >= threshold:
            return False
    return True

#regarding tables

def adjust_bbox(bbox, direction, pixels):
    """Move one edge of a bounding box outward (or inward) and return the new box.

    Args:
        bbox (tuple): The original box as (x0, y0, x1, y1).
        direction (str): Edge to move: 'top', 'bottom', 'left' or 'right'.
        pixels (int): Distance to move the edge. Positive values grow the box,
            negative values shrink it.

    Returns:
        fitz.Rect: The adjusted box.

    Raises:
        ValueError: If direction is not one of the four edge names.
    """
    left, top, right, bottom = bbox

    # Reject unknown edges before any arithmetic is done.
    if direction not in ('top', 'bottom', 'left', 'right'):
        raise ValueError("Invalid direction. Choose from 'top', 'bottom', 'left', 'right'.")

    if direction == 'top':
        top -= pixels
    if direction == 'bottom':
        bottom += pixels
    if direction == 'left':
        left -= pixels
    if direction == 'right':
        right += pixels

    return fitz.Rect(left, top, right, bottom)


def is_table_large_enough(table_bbox, min_width, min_height):
    """Tell whether a (x0, y0, x1, y1) box is strictly wider than min_width and strictly taller than min_height."""
    left, top, right, bottom = table_bbox
    if not (right - left) > min_width:
        return False
    return (bottom - top) > min_height

#other texts

def create_new_file(file_name):
    """Make sure an (initially empty) file exists under the notebook root and return its path.

    file_name is joined onto the module-level `path`. Missing parent folders are
    created. A file that already exists is left untouched.
    """
    target = os.path.join(path, file_name)
    os.makedirs(os.path.dirname(target), exist_ok=True)

    if not os.path.exists(target):
        # Opening in write mode and writing nothing creates the empty file.
        with open(target, 'w') as handle:
            handle.write("")

    return target

def check_if_redundant_text(text, removeContent):
    """Return True when text should be kept, i.e. it equals none of the removeContent entries.

    Note the polarity: despite the name, an exact match with a removable entry
    yields False. The comparison is plain equality, with no case folding or trimming.
    """
    return not any(text == entry for entry in removeContent)

def normalize_rgb_color(color):
    """Convert a colour to a tuple of floats obtained by dividing each channel by 255.

    An int is first unpacked as 0xRRGGBB; any other value is iterated as a sequence
    of channels. If every scaled channel is above 0.9 the colour counts as white
    and orange (1.0, 0.647, 0.0) is returned instead.
    """
    if isinstance(color, int):
        color = tuple((color >> shift) & 255 for shift in (16, 8, 0))

    scaled = tuple(channel / 255 for channel in color)
    looks_white = all(channel > 0.9 for channel in scaled)
    return (1.0, 0.647, 0.0) if looks_white else scaled

PDF DATA SEGREGATION AND EXTRACTION

In [33]:
"""Open the PDF and extract all blocks of text, images, and other content, while collecting examples of text for each font size and color.
Args:input_pdf_path (str): Path to the input PDF.
pagesToIgnore (list): List of page numbers to ignore.
Returns:list: A list of pages, where each page is a dictionary containing blocks of content and examples of text for each color and size."""
def extract_pdf_blocks(input_pdf_path, pagesToIgnore, min_table_width=70,
                       min_table_height=50, table_bottom_trim=6):
    """Collect, page by page, the PyMuPDF blocks and the pdfplumber table boxes of a PDF.

    The file is opened twice: PyMuPDF (fitz) supplies the block dictionaries and the
    page rectangle, pdfplumber supplies the table detection.

    Args:
        input_pdf_path (str): Path of the PDF to read.
        pagesToIgnore (list): Zero-based page indices to skip (they are compared
            against the enumerate() index of the page).
        min_table_width (float): A detected table is kept only if its box is
            strictly wider than this.
        min_table_height (float): A detected table is kept only if its box is
            strictly taller than this.
        table_bottom_trim (float): Amount by which the bottom edge of every kept
            table box is moved upward.

    Returns:
        dict: {"blocks_data": [...], "total_pages": int}. Each "blocks_data" entry
        is a dict with "blocks", "table_bboxes" (fitz.Rect objects) and "page_rect".
        "total_pages" is the page count of the whole document, including skipped pages.
    """
    page_blocks_data = []
    input_doc = fitz.open(input_pdf_path)

    # try/finally so the PyMuPDF handle is released even if a page fails to parse.
    try:
        with pdfplumber.open(input_pdf_path) as pdf:
            for page_number, pdf_page in enumerate(pdf.pages):
                if page_number in pagesToIgnore:
                    continue

                doc_page = input_doc[page_number]

                # Keep only tables above the size thresholds, with the bottom edge trimmed.
                table_bboxes = [
                    adjust_bbox(table.bbox, 'bottom', -table_bottom_trim)
                    for table in pdf_page.find_tables()
                    if is_table_large_enough(table.bbox, min_table_width, min_table_height)
                ]

                page_blocks_data.append({
                    "blocks": doc_page.get_text("dict")["blocks"],
                    "table_bboxes": table_bboxes,
                    "page_rect": doc_page.rect,
                })

        return {
            "blocks_data": page_blocks_data,
            "total_pages": input_doc.page_count,
        }
    finally:
        input_doc.close()

In [10]:
""" Removes image blocks from the provided blocks data and creates a PDF without these blocks.
Args:document_blocks_data (dict): Dictionary holding the per-page data under 'blocks_data' and the page count under 'total_pages'.
Note: the function takes no output path and does not write a PDF; it only splits the block lists.
Returns:tuple: A tuple containing two lists - updated block data without image blocks and data of removed image blocks."""
def seperate_text_image_blocks(document_blocks_data):
    """Split the blocks of every page into image blocks and all other blocks.

    A block counts as an image block when it has an "image" key.

    Args:
        document_blocks_data (dict): Needs "blocks_data" (a list of page dicts with
            "blocks", "page_rect" and "table_bboxes") and "total_pages".

    Returns:
        tuple: (document dict of the same shape holding only the non-image blocks,
        list with one list of image blocks per page).
    """
    pages_without_images = []
    images_per_page = []

    for page_data in document_blocks_data['blocks_data']:
        page_rect = page_data["page_rect"]

        # Single pass: bucket True collects image blocks, bucket False the rest.
        buckets = {True: [], False: []}
        for block in page_data["blocks"]:
            buckets["image" in block.keys()].append(block)

        pages_without_images.append({
            "blocks": buckets[False],
            "page_rect": page_rect,
            "table_bboxes": page_data['table_bboxes'],
        })
        images_per_page.append(buckets[True])

    stripped_document = {
        "blocks_data": pages_without_images,
        "total_pages": document_blocks_data['total_pages'],
    }
    return stripped_document, images_per_page

In [11]:
"""Separates text and tabular data based on bounding boxes (bboxes).
Args:document_blocks_data (dict): Dictionary containing block data for each page.
Returns:tuple: A tuple containing two lists:
- updated block data with non-tabular (text) blocks
- updated block data with tabular block"""
def _span_inside_any_table(span_bbox, table_bboxes):
    """Return True when span_bbox lies fully inside at least one of table_bboxes."""
    return any(
        span_bbox[0] >= table_bbox[0]
        and span_bbox[1] >= table_bbox[1]
        and span_bbox[2] <= table_bbox[2]
        and span_bbox[3] <= table_bbox[3]
        for table_bbox in table_bboxes
    )


def separate_text_and_tabular_blocks(document_blocks_data):
    """Split the blocks of every page into tabular blocks and textual blocks.

    A block is tabular as soon as one of its spans lies completely inside one of
    the page's table boxes; every other block (including blocks without "lines")
    is textual.

    Args:
        document_blocks_data (dict): Needs "blocks_data", a list of page dicts with
            "blocks", "page_rect" and "table_bboxes". "total_pages" is optional.

    Returns:
        tuple: (textual document dict, tabular document dict). Both hold
        "blocks_data" with one entry per input page, and "total_pages" when the
        input provides it.
    """
    tabular_blocks_data = []
    textual_blocks_data = []

    for page_data in document_blocks_data['blocks_data']:
        table_bboxes = page_data['table_bboxes']
        page_tabular_blocks = []
        page_textual_blocks = []

        for block in page_data['blocks']:
            # any() stops at the first span found inside a table.
            is_tabular_block = any(
                _span_inside_any_table(span.get('bbox', [0, 0, 0, 0]), table_bboxes)
                for line in block.get('lines', ())
                for span in line['spans']
            )
            if is_tabular_block:
                page_tabular_blocks.append(block)
            else:
                page_textual_blocks.append(block)

        tabular_blocks_data.append({
            "blocks": page_tabular_blocks,
            "page_rect": page_data['page_rect'],
            "table_bboxes": table_bboxes,
        })
        textual_blocks_data.append({
            "blocks": page_textual_blocks,
            "page_rect": page_data['page_rect'],
            "table_bboxes": table_bboxes,
        })

    final_textual_blocks_data = {"blocks_data": textual_blocks_data}
    final_tabular_blocks_data = {"blocks_data": tabular_blocks_data}

    # Carry the page count through so both results keep the same shape as the input.
    if "total_pages" in document_blocks_data:
        final_textual_blocks_data["total_pages"] = document_blocks_data["total_pages"]
        final_tabular_blocks_data["total_pages"] = document_blocks_data["total_pages"]

    return final_textual_blocks_data, final_tabular_blocks_data


In [None]:
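# Shape of the dictionary returned by extract_pdf_blocks and seperate_text_image_blocks:
#     grand_dict = {
#         "blocks_data": [              # one entry per processed page
#             {
#                 "blocks": [],         # block dictionaries of the page
#                 "table_bboxes": [],   # bounding boxes of the detected tables
#                 "page_rect": (),      # rectangle of the page
#             },
#         ],
#         "total_pages": 0,             # page count of the source PDF
#     }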

In [103]:
""" Creates a pdf for the data
Args:document_data (dict): data to make pdf of.
output_path (str): Path to save the modified PDF.
"""
# Substitutes for the fonts found in the source PDFs: the values are standard PDF
# font names passed to insert_text as `fontname`.
_FONT_SUBSTITUTES = {
    'Heebo-Medium': 'Helvetica',
    'Inter-Black': 'Helvetica',
    'Inter-Bold': 'Times-Bold',
    'Inter-ExtraBold': 'Times-Bold',
    'Inter-ExtraLight': 'Times-Roman',
    'Inter-Light': 'Times-Roman',
    'Inter-Medium': 'Times-Roman',
    'Inter-Regular': 'Times-Roman',
    'Inter-SemiBold': 'Times-Bold',
    'Kailasa': 'Helvetica',
    'MyriadPro-Regular': 'Helvetica',
    'Helvetica': 'Helvetica',
}

# Used for fonts missing from the table above and as the retry font.
_FALLBACK_FONT = "helv"


def _draw_span(output_page, span):
    """Write one text span onto output_page at the top-left corner of its bbox."""
    bbox = span.get("bbox", [0, 0, 0, 0])
    text = span["text"]
    size = round(float(span.get("size", 12)))  # font size rounded to a whole number
    color = normalize_rgb_color(span.get("color", (0, 0, 0)))  # near-white becomes orange
    fontname = _FONT_SUBSTITUTES.get(span.get("font", "Helvetica"), _FALLBACK_FONT)

    # NOTE(review): PyMuPDF documents the insert_text point as the start of the text
    # baseline, while bbox[0:2] is the top-left corner of the span - confirm that the
    # resulting vertical offset is intended.
    anchor = (bbox[0], bbox[1])
    try:
        output_page.insert_text(anchor, text, fontsize=size, fontname=fontname, color=color)
    except Exception:
        # Best effort: retry once with the built-in fallback font.
        output_page.insert_text(anchor, text, fontsize=size, fontname=_FALLBACK_FONT, color=color)


def create_pdf_file(document_data, output_pdf_path):
    """Render the spans and table boxes of document_data into a new PDF file.

    One output page is created per entry of document_data['blocks_data'], sized
    like that entry's page_rect. Every span is written with a substitute font and
    every table bounding box is outlined in red.

    Args:
        document_data (dict): Needs "blocks_data", a list of page dicts with
            "blocks", "page_rect" and "table_bboxes".
        output_pdf_path (str): Path the PDF is saved to.

    Returns:
        None.
    """
    output_doc = fitz.open()

    # try/finally so the document is closed even when drawing or saving fails.
    try:
        for page_data in document_data['blocks_data']:
            page_rect = page_data['page_rect']
            table_bboxes = page_data['table_bboxes']

            output_page = output_doc.new_page(width=page_rect.width, height=page_rect.height)

            for block in page_data['blocks']:
                for line in block.get('lines', ()):
                    for span in line['spans']:
                        _draw_span(output_page, span)

            # Drawing the table bounding boxes
            for table_bbox in table_bboxes:
                output_page.draw_rect(fitz.Rect(table_bbox), color=(.8, 0, 0), width=0.5)

        output_doc.save(output_pdf_path)
    finally:
        output_doc.close()

DRY RUN CODE

In [12]:
# Dry-run output, relative to `path` (the call sites concatenate path + dry_run_path).
dry_run_path = r"\output\DryRun.pdf"

# Samco output files. Unlike dry_run_path these already include `path`.
no_image_path = path + r"\output\NoImgPdf.pdf"
textual_pdf_path = path + r"\output\TextualPdf.pdf"
tabular_pdf_path = path + r"\output\TabularPdf.pdf"

# Tata output files: enable these three lines instead of the Samco ones above.
# no_image_path = path +r"\output\TatanoImgPdf.pdf"
# textual_pdf_path = path + r"\output\TatatextalPdf.pdf"
# tabular_pdf_path = path + r"\output\TatatabularPdf.pdf"

In [118]:
# Read the Samco fact sheet, then split its image blocks away from the other blocks.
blocks_data = extract_pdf_blocks(path + samco_path, pagesToIgnore)
non_image_data, image_data = seperate_text_image_blocks(blocks_data)

In [40]:
# Split the non-image blocks into textual and tabular blocks, then write one PDF
# for each of the three data sets.
textual_data,tabular_data = separate_text_and_tabular_blocks(non_image_data)
create_pdf_file(non_image_data,no_image_path)
create_pdf_file(textual_data, textual_pdf_path)
create_pdf_file(tabular_data, tabular_pdf_path)
print("\n Success !!")


 Success !!


In [42]:
# Pickle file locations, relative to `path`.
# pickle_path_indices is only defined here; it is read by a later cell, not written.
pickle_path_text = r"\output\pkl\textual_data.pkl"
pickle_path_tab = r"\output\pkl\tabular_data.pkl"
pickle_path_nonimg = r"\output\pkl\nonimg_data.pkl"
pickle_path_indices = r"\output\pkl\indices_var.pkl"

# Persist the three extracted data sets.
# NOTE(review): open() does not create folders - presumably output\pkl already exists; confirm.
with open(path + pickle_path_text , 'wb') as file:
    pickle.dump(textual_data, file)
    
with open(path + pickle_path_tab , 'wb') as file:
    pickle.dump(tabular_data, file)
    
with open(path + pickle_path_nonimg , 'wb') as file:
    pickle.dump(non_image_data, file)

In [20]:
"""Open the pdf , get all text data and blocks and draw a boundary along each boundary boxes
    Args:input_pdf_path(str) , output_pdf_path (str)
    Returns: nothing, a new pdf created"""
    
def draw_table_boundaries(input_pdf_path, output_pdf_path):
    """Outline every table pdfplumber detects in a PDF and save the result as a new file.

    Tables are located with pdfplumber; the blue rectangles are drawn on the same
    pages through PyMuPDF.

    Args:
        input_pdf_path (str): PDF to read.
        output_pdf_path (str): Path the annotated copy is saved to.

    Returns:
        None.
    """
    with pdfplumber.open(input_pdf_path) as pdf:
        doc = fitz.open(input_pdf_path)

        # try/finally so the PyMuPDF handle is released even if drawing or saving fails.
        try:
            for page_number, page in enumerate(pdf.pages):
                fitz_page = doc[page_number]
                for table in page.find_tables():
                    rect = fitz.Rect(*table.bbox)
                    fitz_page.draw_rect(rect, color=(0, 0, 1), width=1.5, overlay=False)
            doc.save(output_pdf_path)
        finally:
            doc.close()

# Outline the tables found in the image-free PDF and write the result to the dry-run file.
draw_table_boundaries(no_image_path, path + dry_run_path)

HIGHLIGHT CORE INDEXES

In [None]:
""" Get the indices to be checked from the dumped file, create a grand list to check the content"""
# NOTE(review): no cell visible here writes this pickle - confirm where it is produced.
# pickle.load can execute arbitrary code, so only load files this project wrote itself.
with open(path + pickle_path_indices, 'rb') as file:
    indices = pickle.load(file)

# Flatten the mapping into one list: each key followed by the entries of its value list.
final_indices = [
    term
    for name, variations in indices.items()
    for term in [name] + variations
]

final_indices

In [119]:
def add_highlights_to_data(indices_variations, data):
    """Mark spans whose text contains one of the given terms with span['highlighted'] = True.

    Only spans whose 'flags' value is exactly 20 or 25 are examined. Matching is
    case-insensitive and anchored with word boundaries on both sides of the term.

    Args:
        indices_variations (iterable of str): Terms to look for. It is read once up
            front, so a one-shot iterator works as well as a list.
        data (dict): Document dict with "blocks_data"; it is modified in place.

    Returns:
        dict: The same `data` object that was passed in.
    """
    # Build each pattern once instead of once per span and term.
    term_patterns = [
        re.compile(r'\b' + re.escape(term.lower()) + r'\b')
        for term in indices_variations
    ]

    for page_data in data['blocks_data']:
        for block in page_data['blocks']:
            for line in block.get('lines', ()):
                for span in line['spans']:
                    # NOTE(review): PyMuPDF documents bit 16 of 'flags' as bold; this
                    # exact-value test accepts only 20 and 25 - confirm that is intended.
                    if span['flags'] not in (20, 25):
                        continue
                    span_text = span['text'].lower()
                    # any() stops at the first matching term; one match is enough.
                    if any(pattern.search(span_text) for pattern in term_patterns):
                        span['highlighted'] = True
    return data
def check_indices_and_highlight(indices_variations, path):
    """Highlight spans containing one of the given terms and save a '_highlighted' copy.

    Only spans whose 'flags' value is exactly 20 or 25 are examined. Matching is
    case-insensitive and anchored with word boundaries on both sides of the term.
    A span receives at most one highlight annotation.

    Args:
        indices_variations (iterable of str): Terms to look for.
        path (str): PDF to read. The output file sits next to it, with
            '_highlighted' inserted before the file extension.

    Returns:
        tuple: (sorted list of 1-based page numbers with at least one match, path of
        the saved copy). When nothing matches, no file is written and the result is
        ([], None).
    """
    # Build each pattern once instead of once per span and term.
    term_patterns = [
        re.compile(r'\b' + re.escape(term.lower()) + r'\b')
        for term in indices_variations
    ]
    important_pages = set()

    doc = fitz.open(path)
    # try/finally so the document is closed on every exit path, errors included.
    try:
        for page_number, page in enumerate(doc):
            for block in page.get_text('dict')["blocks"]:
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        if span['flags'] not in (20, 25):
                            continue
                        span_text = span['text'].lower()
                        if any(pattern.search(span_text) for pattern in term_patterns):
                            important_pages.add(page_number + 1)
                            page.add_highlight_annot(fitz.Rect(span['bbox']))

        if not important_pages:
            return [], None

        # splitext touches only the extension; str.replace would rewrite every
        # '.pdf' occurring anywhere in the path.
        root, extension = os.path.splitext(path)
        output_path = root + '_highlighted' + extension
        doc.save(output_path)
        return sorted(important_pages), output_path
    finally:
        doc.close()

In [113]:
# Write the highlighted copy of the textual PDF; the returned (pages, output path) is not kept.
check_indices_and_highlight(final_indices, textual_pdf_path)
# add_highlights_to_data edits its argument in place and returns it, so
# textual_data_high is the same object as textual_data.
textual_data_high = add_highlights_to_data(final_indices,textual_data)

In [116]:
"""Open the pdf , get all text data and blocks and draw a boundary along each boundary boxes
    Args:input_pdf_path(str) , path (str) - the output file name is derived from path by inserting '_block_highlighted' before '.pdf'
    Returns: nothing, a new pdf created
"""
def draw_boundaries_on_pdf(input_pdf_path, path):
    """Outline every block of a PDF in orange and save the result as a new file.

    The blocks are those returned by page.get_text("blocks"); the first four
    values of each block are used as its bounding box.

    Args:
        input_pdf_path (str): PDF to read.
        path (str): Template for the output name. The file is written to this path
            with '_block_highlighted' inserted before the file extension.

    Returns:
        None.
    """
    # splitext touches only the extension; str.replace would rewrite every
    # '.pdf' occurring anywhere in the path.
    root, extension = os.path.splitext(path)
    output_path = root + '_block_highlighted' + extension

    doc = fitz.open(input_pdf_path)
    # try/finally so the document is closed even if drawing or saving fails.
    try:
        for page in doc:
            for block in page.get_text("blocks"):
                bbox = block[:4]
                page.draw_rect(bbox, color=(1.0, 0.647, 0.0), width=1.5, overlay=False)
        doc.save(output_path)
    finally:
        doc.close()

# Input: the highlighted textual PDF. The second argument only serves as the
# template for the output file name (a suffix is inserted before '.pdf').
file_path = path + r"\output\TextualPdf_highlighted.pdf"
draw_boundaries_on_pdf(file_path, path + dry_run_path)

In [117]:
"""Open the pdf , get all text data and blocks and draw a boundary along each boundary boxes
    Args:input_pdf_path(str) , path (str) - the output file name is derived from path by inserting '_line_highlighted' before '.pdf'
    Returns: nothing, a new pdf created
"""
def draw_boundaries_on_lines(input_pdf_path, path):
    """Outline every text line of a PDF and save the result as a new file.

    Lines are taken from the "lines" of each block in page.get_text("dict");
    blocks without "lines" are skipped.

    Args:
        input_pdf_path (str): PDF to read.
        path (str): Template for the output name. The file is written to this path
            with '_line_highlighted' inserted before the file extension.

    Returns:
        None.
    """
    # splitext touches only the extension; str.replace would rewrite every
    # '.pdf' occurring anywhere in the path.
    root, extension = os.path.splitext(path)
    output_path = root + '_line_highlighted' + extension

    doc = fitz.open(input_pdf_path)
    # try/finally so the document is closed even if drawing or saving fails.
    try:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                for line in block.get("lines", ()):
                    page.draw_rect(line["bbox"], color=(.4, 0.647, 0.0), width=1.5, overlay=False)
        doc.save(output_path)
    finally:
        doc.close()
    
# Same input as the block-outline run. The second argument only serves as the
# template for the output file name (a suffix is inserted before '.pdf').
file_path = path + r"\output\TextualPdf_highlighted.pdf"
draw_boundaries_on_lines(file_path, path + dry_run_path)

GETTING WHAT YOU WANTED

In [150]:
def remove_unwanted_keys(data, keys_to_remove):
    """Return a copy of data in which the given keys are dropped from every dict.

    Dicts and lists are walked recursively and rebuilt; any other value (tuples
    included) is returned as is, without descending into it.
    """
    if isinstance(data, list):
        return [remove_unwanted_keys(element, keys_to_remove) for element in data]

    if not isinstance(data, dict):
        return data

    pruned = {}
    for key, value in data.items():
        if key in keys_to_remove:
            continue
        pruned[key] = remove_unwanted_keys(value, keys_to_remove)
    return pruned

# Span/block keys to strip before the data is inspected.
remove_keys = ['bbox','dir','ascender', 'descender','wmode', 'number','type','origin','color']

In [None]:
# Inspect the second entry (index 1) of the highlighted data before the keys are stripped.
textual_data_high['blocks_data'][1]

In [152]:
# Rebuild the highlighted data without the keys listed in remove_keys.
cleanedData = remove_unwanted_keys(textual_data_high, remove_keys)

In [None]:
# Inspect the same entry (index 1) after the keys were stripped.
cleanedData['blocks_data'][1]

In [163]:
# Print the lines of the first block of the first page, then the text of every span
# that carries a 'highlighted' key and whose flags value is 20 or 25.
# NOTE(review): block['lines'] assumes every remaining block has a "lines" key - confirm.
for pgn,blocks in enumerate(cleanedData['blocks_data']): #each page
    if pgn == 0:
       print(blocks['blocks'][0]['lines'])
    for block in blocks['blocks']: #each block in blocks
        for line in block['lines']:
            for span in line['spans']:
                    if 'highlighted' in span and span['flags'] in [20,25]:
                        print(span['text'])

[{'spans': [{'size': 72.0, 'flags': 20, 'font': 'Inter-Bold', 'text': 'Factsheet'}]}]
Fund Manager: 
Minimum Additional Amount
Benchmark:
Modified Duration: 
Average Maturity: 
Yield to Maturity (YTM): 
Standard Deviation: 
Macaulay duration
Sharpe Ratio: 
Beta Ratio (Portfolio Beta): 
Portfolio Turnover Ratio: 
Investment Objective
Inception Date
Benchmark
Total Experience: 
Fund Manager
Average AUM for Month of October 2024
Total Experience: 
Total Experience: 
Portfolio Turnover Ratio: 
Benchmark Risk-o-meter
Investment Objective
Inception Date
Benchmark
Portfolio Turnover Ratio: 
Fund Manager
Average AUM for Month of 
Total Experience: 
Total Experience: 
Total Experience: 
Benchmark Risk-o-meter
Investment Objective
Inception Date
Benchmark
Total Experience: 
Fund Manager
Average AUM for Month of 
Total Experience: 
Total Experience: 
Annualised Portfolio YTM
Macaulay Duration
Modified Duration
Benchmark Risk-o-meter
Investment Objective
Inception Date
Benchmark
Portfolio Turnover

In [160]:
# Print the lines of the first block of every page.
# NOTE(review): index [0] assumes every page has at least one block - confirm.
for blocks in cleanedData['blocks_data']: #each page
    print(blocks['blocks'][0]['lines'])

[{'spans': [{'size': 72.0, 'flags': 20, 'font': 'Inter-Bold', 'text': 'Factsheet'}]}]
[{'spans': [{'size': 10.0, 'flags': 20, 'font': 'Inter-Bold', 'text': '1'}]}]
[{'spans': [{'size': 16.0, 'flags': 4, 'font': 'Inter-Black', 'text': 'HEXASHIELD'}]}, {'spans': [{'size': 16.0, 'flags': 4, 'font': 'Inter-Black', 'text': 'TESTED'}]}, {'spans': [{'size': 16.0, 'flags': 4, 'font': 'Inter-Black', 'text': 'INVESTMENTS'}]}]
[{'spans': [{'size': 10.0, 'flags': 20, 'font': 'Inter-Bold', 'text': '3'}]}]
[{'spans': [{'size': 10.0, 'flags': 20, 'font': 'Inter-Bold', 'text': '4'}]}]
[{'spans': [{'size': 10.0, 'flags': 20, 'font': 'Inter-Bold', 'text': '5'}]}]
[{'spans': [{'size': 10.0, 'flags': 4, 'font': 'Inter-Light', 'text': '.'}]}]
[{'spans': [{'size': 10.0, 'flags': 20, 'font': 'Inter-Bold', 'text': '7'}]}]
[{'spans': [{'size': 10.0, 'flags': 20, 'font': 'Inter-Bold', 'text': '8'}]}]
[{'spans': [{'size': 10.0, 'flags': 20, 'font': 'Inter-Bold', 'text': '9'}]}]
[{'spans': [{'size': 10.0, 'flags'