In [13]:
import pdfplumber
import fitz
import warnings , math, collections , os, re, pprint, json
import pickle
import numpy as np
import pandas as pd



warnings.filterwarnings("ignore", category=UserWarning) 


path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo"
#path = r"C:\Users\rando\OneDrive\Documents\mywork-repo"

In [31]:
canara_path = path + r"\files\factsheet-march-2022.pdf"
dry_run_path  = path + r"\output\DryRun.pdf"
indice_path = path + r"\output\pkl\indices_var.pkl"

In [23]:
def get_financial_indices(path:str):
    final_indices = set()
    with open(path , 'rb') as file:
        indices = pickle.load(file)  
        for k,v in indices.items():
            temp = [k] + v
            for t in temp:
                final_indices.add(t)
    
    return final_indices


""" Highlights important financial indices in the pdf, does other pre
analysis of data.
Args: list of indices, string of pdf path
Returns: dict of pages highlighted, string of output pdf, dict of pages contaiting FUND NAMES
"""
def check_indice_highlight(path:str, indices_variations:list, fund_pattern:str, fund_size:int):
    doc = fitz.open(path)
    page_count = doc.page_count #No of pages
    
    pages = [i for i in range(page_count)]
    important_pages = dict.fromkeys(pages, 0)
    fund_titles = dict.fromkeys(pages, "")


    for page_num, page in enumerate(doc):
        
        text_instances = page.get_text('dict')["blocks"]
        
        #sort for all data in pdf document 
        sorted_text_instances = sorted(text_instances, key=lambda x: (x['bbox'][1], x['bbox'][0]))
        
        # rect = fitz.Rect((35,120,250,765))
        # page.add_highlight_annot(rect)

        for pgn,block in enumerate(sorted_text_instances):     
            if "lines" not in block:
                continue
            
            for line in block["lines"]: 
                for span in line["spans"]:
                    if span['flags'] in [20,25, 16,0 ,4]:  # learn flag logic , rn set for all flags value
                        span_text = span['text'].strip().lower()
                        
                        
                        #FUND PAGE CHECK
                        conditions = [
                            pgn in range(0,15),
                            re.match(fund_pattern, span_text, re.IGNORECASE),
                            span['size'] >fund_size
                        ]
                        if all(conditions):
        
                            fund_titles[page_num] = span_text
                            rect = fitz.Rect(span['bbox']) 
                            page.add_rect_annot(rect)
                            
                            rect = fitz.Rect((30,150,220,760))
                            page.add_rect_annot(rect)  
                        
                        #CHECK IMP FINANCE INDICES  
                        for term in indices_variations:  
                            pattern = r'\b' + re.escape(term.lower()) + r'\b'
                            if re.search(pattern, span_text):

                                #count highlights
                                important_pages[page_num] +=1
                                #mark content
                                rect = fitz.Rect(span['bbox']) 
                                page.add_highlight_annot(rect)
                                break  #optional , one highlight

    
    output_path = None
    if any(important_pages.values()):
        output_path = path.replace('.pdf', '_highlighted.pdf')
        doc.save(output_path)

    doc.close()
    return important_pages, output_path, fund_titles


""" Get the clipped data in the bbox provided and store in nested dict
Args: input path, dryrun path, important pages, bbox coords
Returns: dict { 'page' : int 'block': dict}"""
def get_clipped_data(input:str, output:str, pageSelect:list, bbox:list[set], fund_names:dict):
    
    document = fitz.open(input)
    finalData = []
    
    for pgn in pageSelect:
        #get the page
        page = document[pgn]
        fundName = fund_names[pgn]
    
        blocks = page.get_text('dict', clip = bbox[0])['blocks'] #get all blocks
        
        filtered_blocks = [block for block in blocks if block['type']==0 and 'lines' in block]
        sorted_blocks = sorted(filtered_blocks, key= lambda x: (x['bbox'][1], x['bbox'][0]))
        
        finalData.append({
            "page": pgn,
            "fundname": fundName,
            "block": sorted_blocks,
        })
            
    return finalData

def get_pdf_data(input:str, output:str, pageSelect:list, fund_names:dict):
    
    document = fitz.open(input)
    finalData = []
    
    for pgn in pageSelect:
        #get the page
        page = document[pgn]
        fundName = fund_names[pgn]
    
        blocks = page.get_text('dict')['blocks'] #get all blocks
        
        filtered_blocks = [block for block in blocks if block['type']==0 and 'lines' in block]
        sorted_blocks = sorted(filtered_blocks, key= lambda x: (x['bbox'][1], x['bbox'][0]))
        
        finalData.append({
            "page": pgn,
            "fundname": fundName,
            "block": sorted_blocks,
        })
            
    return finalData

def extract_span_data(data:list, name:list): #all
    final_data = dict()
    for pgn,page in enumerate(data):
        pgn_content = []
        for blocks in page['block']:
            for line in blocks['lines']:
                spans = line.get('spans',[])
                for span in spans:
                    
                    text = span['text'].strip()
                    size = span['size']
                    color = span['color']
                    origin = span['origin']
                    bbox = span['bbox']
                
                    pgn_content.append([size,text,color,origin,bbox])
                    
        final_data[page['fundname']] = pgn_content
    
    return final_data

In [20]:
file_path = canara_path
final_indices = get_financial_indices(indice_path)
fund_pattern = r"^(samco|tata|canara)"
fund_size = 10 #greater than logic
highlight_pages, saved_path, fund_pages =  check_indice_highlight(file_path, final_indices, fund_pattern, fund_size)


In [18]:
pagedf = pd.DataFrame({'title': fund_pages.values(),'highlight_count': highlight_pages.values()})

"""_summary_ fund is located only on certain pages, based on no. of 
highlights we know which pages are imp. automate this content later
"""
pagedf.to_excel(path + r'\output\example.xlsx')
print(pagedf)

                                             title  highlight_count
0                                                                 0
1                                                                 2
2                                                                 1
3                                                                 0
4                                                                 2
5                                                                19
6              canara robeco flexicap fund (crfcf)               12
7     canara robeco blue chip equity fund (crbcef)               13
8           canara robeco emerging equities (cree)               12
9             canara robeco small cap fund (crscf)               14
10              canara robeco infrastructure (cri)               11
11      canara robeco consumer trends fund (crctf)               11
12    canara robeco equity tax saver fund (cretsf)               12
13       canara robeco focused equity fund (crfe

In [39]:
def create_matrix_structure(data: list, header_font: float, content_font: float):
    # Step 1: collect font sizes and coordinates
    coordinates = []
    fonts = set()

    for block in data['block']:
        for line in block['lines']:
            for span in line['spans']:
                origin = tuple(span['origin'])  # Top-left coordinates
                coordinates.append(origin)
                fonts.add(span['size'])

    coordinates = sorted(set(coordinates), key=lambda c: (c[1], c[0]))  # Sort by y, then x
    fonts = sorted(fonts, reverse=True)  # Descending order of font size

    # Step 2: create the matrix
    coord_to_index = {coord: idx for idx, coord in enumerate(coordinates)}  # (x,y) at pos 0 etc. ROWS
    font_to_index = {font: idx for idx, font in enumerate(fonts)}  # COLUMNS
    matrix = np.zeros((len(coordinates), len(fonts)), dtype=object)  # Set all to zeros initially

    # Step 3: matrix populate and nested dict add
    nested_dict = {}
    current_subheader = None

    for block in data['block']:
        for line in block['lines']:
            for span in line['spans']:
                origin = tuple(span['origin'])  # Top-left x,y
                font = span['size']
                text_preview = span['text']  # Get the first two words of the text

                # Populate the matrix with text preview
                if origin in coord_to_index and font in font_to_index:
                    row = coord_to_index[origin]
                    col = font_to_index[font]
                    if matrix[row, col] == 0:
                        matrix[row, col] = "na"
                    matrix[row, col] = text_preview

                # Build the nested dictionary
                if font == header_font:
                    current_subheader = span
                    nested_dict[current_subheader['text']] = []
                elif font <= content_font and current_subheader:
                    nested_dict[current_subheader['text']].append(span)

    matrix_df = pd.DataFrame(matrix, index=coordinates, columns=fonts)

    return nested_dict, matrix_df
def generate_pdf_from_data(data:list, output_path:str):
    pdf_doc = fitz.open()
    
    for section, spans in data.items():
    
        page = pdf_doc.new_page()
        text_position = 72  # for title initalize something

        #section title
        title_font_size = 14 
        try:
            page.insert_text(
                (72, text_position), #initalizor
                section,
                fontsize=title_font_size,
                fontname="helv",
                color=(0, 0, 1),
            )        
        except Exception as e:
            print(f"The error is {e}")
            
        #content title
        for span in spans:
            bbox = span.get("bbox", [0, 0, 0, 0])  # default 

            #Errror in fitz font 
            try:
                page.insert_text(
                    (bbox[0], bbox[1]),
                    span["text"],
                    fontsize=span["size"],
                    fontname="helv",
                    color=tuple(int(span["color"] & 0xFFFFFF) for _ in range(3)))#unsigned int value so (0,0,0)
                
            except Exception:
                page.insert_text(
                    (bbox[0], bbox[1]),
                    span["text"],
                    fontsize=span["size"],
                    fontname="helv",
                    color=(1, 0, 0),
                )

    # Save the created PDF
    pdf_doc.save(output_path)
    pdf_doc.close()
    print(f"  PDF generated to: {output_path}")


In [24]:
from collections import defaultdict

def clean_block_data(blocks):
    
    remove_text = ['Purchase','Amount','thereafter','.','. ',',',':','st',";","-",'st ',' ','th', 'th ', 'rd', 'rd ', 'nd', 'nd ','','`','(Date of Allotment)']
    
    sorted_blocks = sorted(blocks, key=lambda x: (x[3][1],x[3][0]))
    
    cleaned_blocks = []
    for block in sorted_blocks:
        size, text, color, origin, bbox = block
        if text not in remove_text:
            cleaned_blocks.append(block)
 
    processed_blocks = []
    # adjust size based on color and size
    for block in cleaned_blocks:
        size, text, color, origin, bbox = block
        text = text.strip()
        if size in [9.0,8.0] and color == -1:
            size = 20.0  # Update size to 20.0
        processed_blocks.append([size, text, color, origin, bbox])
                

    # group blocks by rounded y-coordinate
    grouped_blocks = defaultdict(list)
    for block in processed_blocks:
        y_coord = math.ceil(block[3][1])# Extract and round the y-coordinate
        size = block[0]
        grouped_blocks[(y_coord,size)].append(block)

    # Combine blocks with the same y-coordinate
    combined_blocks = []
    for key, group in grouped_blocks.items():
        
        if key[1] == 20:
            combined_text = " ".join(item[1] for item in group).strip()
            if combined_text:  # Ignore whitespace-only text
                size, color, origin, bbox = group[0][0], group[0][2], group[0][3],group[0][4]
                combined_blocks.append([size, combined_text, color, origin,bbox])
        
        else:
            for item in group:
                combined_blocks.append(item)

    return combined_blocks

def process_text_data(text_data):
    
    updated_text_data = {}

    for fund, data in text_data.items():
        blocks = data
        cleaned_blocks = clean_block_data(blocks)
        updated_text_data[fund] = cleaned_blocks

    return updated_text_data

In [37]:
pages = [ i for i in range(6,26)]
bbox = [(30,150,215,760)]

data = get_clipped_data(canara_path, dry_run_path, pages, bbox, fund_pages)

In [27]:
text_data  = extract_span_data(data,[])
cleaned_data = process_text_data(text_data)

In [None]:
data

In [None]:
text_data['canara robeco value fund (crvf)']

In [None]:
cleaned_data['canara robeco value fund (crvf)']

In [29]:
header_size = 20
content_size = 10 #anything less than
final_text_data = dict()
final_matrix = dict()

for fund, items in cleaned_data.items(): #ech fund
    
    #step 1 extract size, coord
    coordinates = list()
    sizes = set()
    
    for item in items: #size,text,color,origin
        origin = tuple(item[3])
        coordinates.append(origin)
        sizes.add(item[0])
    
    coordinates = sorted(set(coordinates), key=lambda c: (c[1], c[0]))  # Sort by y, then x
    sizes = sorted(sizes, reverse=True)  
    
    #step 2 create matrix
    coord_to_index = {coord: idx for idx, coord in enumerate(coordinates)}  # (x,y) at pos 0 etc. ROWS
    size_to_index = {font: idx for idx, font in enumerate(sizes)}  # COLUMNS
    matrix = np.zeros((len(coordinates), len(sizes)), dtype=object)
    
    
    #step 3
    nested_dict = {}
    current_header = None
    for item in items:
        origin = tuple(item[3])
        size = item[0]
        text = item[1].strip()
        
        #populate the matrix
        if origin in coord_to_index and size in size_to_index:
            row = coord_to_index[origin]
            col = size_to_index[size]
            
            if matrix[row,col] == 0:
                matrix[row,col] ==r"nil"
            matrix[row,col] == text
        
        #build nested dict
        if size == header_size:
            current_header = "_".join([i for i in text.split(" ") if i != '']).lower()
            nested_dict[current_header] = []
        elif size<= content_size and current_header:
            nested_dict[current_header].append(item)
            
    final_text_data[fund] = nested_dict        
    matrix_df = pd.DataFrame(matrix, index=coordinates, columns=sizes)
    final_matrix[fund] = matrix_df

In [30]:
def generate_pdf_from_data(data:list, output_path:str):
    
    pdf_doc = fitz.open()
    
    for header, items in data.items():
        
        page = pdf_doc.new_page()
        text_position = 72  # for title initalize something

        #section title
        title_font_size = 24
        try:
            page.insert_text(
                (72, text_position), #initalizor
                header,
                fontsize=title_font_size,
                fontname="helv",
                color=(0, 0, 1),
            )        
        except Exception as e:
            print(f"Error while parsing fund {e}")
            
        for item in items:
            
            bbox = item[3] #origin coords
            text = item[1]
            size = item[0]
            color = item[2]
   
            #Errror in fitz font 
            try:
                page.insert_text(
                    (bbox[0], bbox[1]),
                    text,
                    fontsize=size,
                    fontname="helv",
                    color=tuple(int(color & 0xFFFFFF) for _ in range(3)))#unsigned int value so (0,0,0)
                
            except Exception:
                page.insert_text(
                    (bbox[0], bbox[1]),
                    text,
                    fontsize=size,
                    fontname="helv",
                    color=(1, 0, 0),
                )

    # Save the created PDF
    pdf_doc.save(output_path)
    pdf_doc.close()
    print(f" PDF generated to: {output_path}")

def extract_data_from_pdf(path:str):
    
    # def replace_main_key(string: str):
    #     replace_key = string
    #     if re.match(r'^NAV.*as on', string, re.IGNORECASE):
    #         replace_key = "NAV" 
    #     elif "market" in string.lower():
    #         replace_key = "Market Cap"
    #     elif re.match(r"Assets Under Management", string, re.IGNORECASE):
    #         replace_key = "Assets Under Management"   
    #     return replace_key
    
    with pdfplumber.open(path) as pdf:
        final_data = []
        final_data_generated = {}
        
        for page in pdf.pages:
            # extract text from the page
            text = page.extract_text()
            final_data.append(text)
        
        #store them in a dict for each page
        for data in final_data[1:]:
            content = data.split('\n')
            main_key = content[0]
            values = content[1:]
        
            final_data_generated[main_key] = values

        #sort the headers in lex order
        sorted_final_generated = {key: final_data_generated[key] for key in sorted(final_data_generated)}

    return sorted_final_generated

In [34]:
final_extracted_text = dict()
for fund, items in final_text_data.items():
    print(fund)
    generate_pdf_from_data(items, dry_run_path)
    extract_data = extract_data_from_pdf(dry_run_path)
    final_extracted_text[fund] = extract_data

canara robeco flexicap fund (crfcf)
 PDF generated to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\output\DryRun.pdf
canara robeco blue chip equity fund (crbcef)
 PDF generated to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\output\DryRun.pdf
canara robeco emerging equities (cree)
 PDF generated to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\output\DryRun.pdf
canara robeco small cap fund (crscf)
 PDF generated to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\output\DryRun.pdf
canara robeco infrastructure (cri)
 PDF generated to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\output\DryRun.pdf
canara robeco consumer trends fund (crctf)
 PDF generated to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\output\DryRun.pdf
can

ValueError: cannot save with zero pages