In [1]:
import pdfplumber
import fitz
import warnings , math, collections , os, re, pprint, json
import pickle
import numpy as np
import pandas as pd



warnings.filterwarnings("ignore", category=UserWarning)

# Root folder of the working repository; every input/output path below is
# derived from it. Set the MYWORK_REPO_PATH environment variable to run the
# notebook on another machine without editing this cell; when it is unset the
# original local default is used.
path = os.environ.get(
    "MYWORK_REPO_PATH",
    r"C:\Users\rando\OneDrive\Documents\mywork-repo",
)

In [2]:
# Locations derived from the repository root `path`:
#   canara_path - the factsheet PDF that is scanned and clipped below
#   dry_path    - PDF path handed to get_clipped_data as its `output` argument
#   indice_path - pickle file read by get_financial_indices
# os.path.join inserts the platform separator (the same strings as before on
# Windows) and copes with a root that already ends in a separator.
canara_path = os.path.join(path, "files", "factsheet-march-2022.pdf")
dry_path = os.path.join(path, "output", "DryRun.pdf")
indice_path = os.path.join(path, "output", "pkl", "indices_var.pkl")

In [17]:
def get_financial_indices(path:str):
    """Flatten the pickled index table stored at `path` into a single set.

    The pickle is expected to hold a mapping whose values are lists; every key
    and every element of its list ends up in the returned set.

    Args:
        path: Location of the pickle file.

    Returns:
        set: All keys plus all list elements found in the mapping.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # pickle files you produced yourself.
    with open(path, 'rb') as handle:
        table = pickle.load(handle)

    collected = set()
    for name, variants in table.items():
        # `variants` must be a list, otherwise the concatenation raises TypeError.
        collected.update([name] + variants)
    return collected


""" Highlights important financial indices in the pdf, does other pre
analysis of data.
Args: list of indices, string of pdf path
Returns: dict of pages highlighted, string of output pdf, dict of pages contaiting FUND NAMES
"""
def check_indice_highlight(path:str, indices_variations:list, fund_pattern:str, fund_size:int):
    """Highlight financial-index terms in a PDF and collect per-page statistics.

    Each text span whose `flags` value is in the accepted set is stripped and
    lower-cased, then checked twice:
      * as a possible fund title (pattern match, font size, position on page);
      * against every term in `indices_variations` using a whole-word regex;
        the span is highlighted once if any term matches.

    Args:
        path: Path of the PDF to scan.
        indices_variations: Iterable of index names / spelling variations.
        fund_pattern: Regex matched case-insensitively at the start of the
            span text to recognise a fund title.
        fund_size: A title span must have a font size strictly greater than this.

    Returns:
        tuple: `(important_pages, output_path, fund_titles)` where
            important_pages maps page index -> number of highlighted spans,
            output_path is the path of the saved highlighted copy, or None
            when nothing was highlighted (no file is written in that case),
            fund_titles maps page index -> lower-cased text of the last span
            recognised as a fund title ("" when none was found).
    """
    # Span flag values that are inspected; spans with any other value are skipped.
    # NOTE(review): the whitelist was chosen empirically ("learn flag logic" in
    # the original) - confirm against the PyMuPDF span-flag documentation.
    accepted_flags = (20, 25, 16, 0, 4)

    # Compile everything once up front instead of rebuilding one regex per term
    # for every span in the document.
    term_patterns = [
        re.compile(r'\b' + re.escape(term.lower()) + r'\b')
        for term in indices_variations
    ]
    fund_regex = re.compile(fund_pattern, re.IGNORECASE)

    doc = fitz.open(path)
    try:
        page_count = doc.page_count
        important_pages = dict.fromkeys(range(page_count), 0)
        fund_titles = dict.fromkeys(range(page_count), "")

        for page_num, page in enumerate(doc):
            blocks = page.get_text('dict')["blocks"]
            # Order blocks by the y then x coordinate of their bbox corner.
            ordered_blocks = sorted(blocks, key=lambda b: (b['bbox'][1], b['bbox'][0]))

            for block_idx, block in enumerate(ordered_blocks):
                if "lines" not in block:
                    continue

                for line in block["lines"]:
                    for span in line["spans"]:
                        if span['flags'] not in accepted_flags:
                            continue

                        span_text = span['text'].strip().lower()

                        # Fund title: must sit in one of the first six ordered
                        # blocks, be larger than `fund_size` and match the pattern.
                        if (
                            block_idx < 6
                            and span['size'] > fund_size
                            and fund_regex.match(span_text)
                        ):
                            fund_titles[page_num] = span_text

                        # Index term: one highlight per span, however many terms match.
                        if any(rx.search(span_text) for rx in term_patterns):
                            important_pages[page_num] += 1
                            page.add_highlight_annot(fitz.Rect(span['bbox']))

        output_path = None
        if any(important_pages.values()):
            # Insert the suffix before the real extension only; str.replace
            # would also rewrite ".pdf" inside directory names and would leave
            # an upper-case ".PDF" path unchanged (i.e. equal to the input).
            root, ext = os.path.splitext(path)
            output_path = f"{root}_highlighted{ext}"
            doc.save(output_path)
    finally:
        # Release the file handle even if scanning or saving fails.
        doc.close()

    return important_pages, output_path, fund_titles


""" Get the clipped data in the bbox provided and store in nested dict
Args: input path, dryrun path, important pages, bbox coords
Returns: dict { 'page' : int 'block': dict}"""
def get_clipped_data(input:str, output:str, pageSelect:list, bbox:list[set], fund_names:dict):
    """Collect the text blocks found inside a clip rectangle on selected pages.

    Args:
        input: Path of the PDF to read.
        output: Not used by this function; kept so existing calls still work.
        pageSelect: Page indices to process, in the order they should appear
            in the result.
        bbox: Sequence of clip rectangles; only the first entry (`bbox[0]`)
            is applied, and it is applied to every page.
        fund_names: Mapping page index -> fund name; must contain every index
            in `pageSelect` (a missing index raises KeyError).

    Returns:
        list[dict]: One record per selected page with the keys
            "page" (page index), "fundname" and "block" (the page's text
            blocks, i.e. type 0 with a "lines" entry, sorted by bbox y then x).
    """
    finalData = []

    document = fitz.open(input)
    try:
        for pgn in pageSelect:
            page = document[pgn]
            fundName = fund_names[pgn]

            # Everything the clip rectangle captures on this page.
            blocks = page.get_text('dict', clip = bbox[0])['blocks']

            # Keep text blocks only and order them by their bbox corner.
            text_blocks = [
                block for block in blocks
                if block['type'] == 0 and 'lines' in block
            ]
            text_blocks.sort(key=lambda b: (b['bbox'][1], b['bbox'][0]))

            finalData.append({
                "page": pgn,
                "fundname": fundName,
                "block": text_blocks,
            })
    finally:
        # The original never closed the document, leaking the file handle.
        document.close()

    return finalData

def extract_span_data(data:list, name:str): #all
    """Pull one attribute out of every span, grouped per page record.

    Args:
        data: List of page records, each holding a 'block' list of blocks
            with 'lines' (a line without 'spans' contributes nothing).
        name: Span key to collect, e.g. 'text' or 'size'.

    Returns:
        dict: "Page: N" -> list of the requested span values in reading
            order of the input, where N is the 1-based position of the record
            in `data` (not the PDF page number).
    """
    return {
        f"Page: {position + 1}": [
            span[name]
            for block in record['block']
            for line in block['lines']
            for span in line.get('spans', [])
        ]
        for position, record in enumerate(data)
    }

In [4]:
# Scan the factsheet: highlight index terms and record fund titles per page.
file_path = canara_path
final_indices = get_financial_indices(indice_path)
fund_pattern = r"^(samco|tata|canara)"  # span text must start with one of these names
fund_size = 10                          # title spans must be larger than this font size
highlight_pages, saved_path, fund_pages = check_indice_highlight(
    file_path,
    final_indices,
    fund_pattern,
    fund_size,
)

# One row per page: detected fund title next to the number of highlights.
pagedf = pd.DataFrame(
    {
        'title': fund_pages.values(),
        'highlight_count': highlight_pages.values(),
    }
)

# Funds only appear on certain pages; the highlight count shows which pages
# matter. TODO: automate the page selection instead of reading it off this table.
print(pagedf)

                                             title  highlight_count
0                                                                 0
1                                                                 2
2                                                                 1
3                                                                 0
4                                                                 2
5                                                                19
6              canara robeco flexicap fund (crfcf)               12
7     canara robeco blue chip equity fund (crbcef)               13
8           canara robeco emerging equities (cree)               12
9             canara robeco small cap fund (crscf)               14
10              canara robeco infrastructure (cri)               11
11      canara robeco consumer trends fund (crctf)               11
12    canara robeco equity tax saver fund (cretsf)               12
13       canara robeco focused equity fund (crfe

In [5]:
# Page indices 6..25 are processed.
pages = list(range(6, 26))
# Single rectangle handed to get_clipped_data, which passes it as the `clip`
# argument of page.get_text for every selected page.
bbox = [(30, 150, 220, 760)]

data = get_clipped_data(canara_path, dry_path, pages, bbox, fund_pages)

# Scratch notes kept from earlier experiments:
#   text_data = return_span_data(data, 'text')
#   data[2]['block']
#   clean_data = clean_data_combine(data, -1, 12.0)   # set font whose color is __ to size __
# Author's caveat: "bad logic here" - list indexes 0,1,2,3 were noted as
# corresponding to pages 3,5,7,9.

In [6]:
def create_matrix_structure(data: list, header_font: float, content_font: float):
    """Build a coordinate-by-font-size text grid and a header -> spans mapping.

    Despite the `list` annotation, `data` is indexed as `data['block']`, so it
    must be a single page record whose 'block' entry lists blocks with
    'lines' and 'spans'.

    Args:
        data: Page record with a 'block' list.
        header_font: Span size that marks a sub-header (exact equality).
        content_font: Spans with size <= this value are filed under the most
            recent sub-header; spans seen before any sub-header are ignored.

    Returns:
        tuple: `(nested_dict, matrix_df)` where
            nested_dict maps sub-header text -> list of content span dicts
            (a repeated sub-header text starts a fresh list), and
            matrix_df is an object DataFrame with one row per distinct span
            origin (sorted by y, then x) and one column per distinct font
            size (descending); a cell holds the span text, or 0 when no span
            has that origin/size pair.
    """
    # Flatten the block/line/span nesting once; every step below walks the
    # spans in this same order.
    page_spans = [
        span
        for block in data['block']
        for line in block['lines']
        for span in line['spans']
    ]

    # Axes of the grid: unique origins (rows) and unique sizes (columns).
    coordinates = sorted(
        {tuple(span['origin']) for span in page_spans},
        key=lambda xy: (xy[1], xy[0]),
    )
    fonts = sorted({span['size'] for span in page_spans}, reverse=True)

    row_of = {xy: row for row, xy in enumerate(coordinates)}
    col_of = {size: col for col, size in enumerate(fonts)}

    # Fill the grid; when several spans share a cell the last one wins.
    matrix = np.zeros((len(coordinates), len(fonts)), dtype=object)
    for span in page_spans:
        matrix[row_of[tuple(span['origin'])], col_of[span['size']]] = span['text']

    # Group content spans under the sub-header that precedes them.
    nested_dict = {}
    active_header = None
    for span in page_spans:
        size = span['size']
        if size == header_font:
            active_header = span
            nested_dict[span['text']] = []
        elif size <= content_font and active_header:
            nested_dict[active_header['text']].append(span)

    matrix_df = pd.DataFrame(matrix, index=coordinates, columns=fonts)

    return nested_dict, matrix_df
def _srgb_int_to_rgb(color: int):
    """Convert a packed 0xRRGGBB integer into an (r, g, b) tuple of floats in 0..1."""
    color &= 0xFFFFFF  # keep the low 24 bits; also normalises negative (signed) values
    return (
        ((color >> 16) & 0xFF) / 255,
        ((color >> 8) & 0xFF) / 255,
        (color & 0xFF) / 255,
    )


def generate_pdf_from_data(data:list, output_path:str):
    """Render a header -> spans mapping into a new PDF, one page per header.

    Despite the `list` annotation, `data` is iterated with `.items()`, so it
    must be a mapping of section title -> list of span dicts (the shape
    returned as `nested_dict` by create_matrix_structure).

    Each page gets the section title in blue at (72, 72), followed by every
    span's text drawn at the first two values of its 'bbox', using the span's
    own size and colour.

    Args:
        data: Mapping section title -> list of span dicts with 'text', 'size',
            'color' and optionally 'bbox'.
        output_path: Where the generated PDF is written.
    """
    pdf_doc = fitz.open()
    try:
        for section, spans in data.items():
            page = pdf_doc.new_page()

            # Section title; a failure here is reported but does not stop the run.
            try:
                page.insert_text(
                    (72, 72),
                    section,
                    fontsize=14,
                    fontname="helv",
                    color=(0, 0, 1),
                )
            except Exception as e:
                print(f"The error is {e}")

            for span in spans:
                bbox = span.get("bbox", [0, 0, 0, 0])  # default when the span has no bbox

                try:
                    page.insert_text(
                        (bbox[0], bbox[1]),
                        span["text"],
                        fontsize=span["size"],
                        fontname="helv",
                        # The span colour is a packed integer but insert_text
                        # expects float components in 0..1. The original passed
                        # the raw integer three times, so every non-black span
                        # failed and was drawn by the red fallback below.
                        color=_srgb_int_to_rgb(span["color"]),
                    )
                except Exception:
                    # Best-effort fallback: draw the text in red so the problem
                    # span is visible in the output instead of being lost.
                    page.insert_text(
                        (bbox[0], bbox[1]),
                        span["text"],
                        fontsize=span["size"],
                        fontname="helv",
                        color=(1, 0, 0),
                    )

        pdf_doc.save(output_path)
    finally:
        # Always release the in-memory document, even when drawing or saving fails.
        pdf_doc.close()
    print(f"  PDF generated to: {output_path}")

def extract_data_from_pdf(path:str):
    """Read a PDF page by page into a header -> lines mapping.

    The first text line of each page becomes the key (normalised by
    `replace_main_key`) and the remaining lines become its value. Pages that
    produce the same key overwrite each other, so the last one wins.

    Args:
        path: Path of the PDF to read.

    Returns:
        dict: Normalised first line -> list of the page's remaining lines,
            with keys in lexicographic order.
    """

    def replace_main_key(string: str):
        """Collapse known header variants to one canonical label."""
        if re.match(r'^NAV.*as on', string, re.IGNORECASE):
            return "NAV"
        if "market" in string.lower():
            return "Market Cap"
        if re.match(r"Assets Under Management", string, re.IGNORECASE):
            return "Assets Under Management"
        return string

    final_data_generated = {}

    # Open the file named by the argument. The original opened the notebook
    # global `file_path` here, silently ignoring `path`.
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            # Treat a page without extractable text as empty rather than
            # crashing on None.
            text = page.extract_text() or ""
            content = text.split('\n')
            final_data_generated[replace_main_key(content[0])] = content[1:]

    # Return the headers in lexicographic order.
    return {key: final_data_generated[key] for key in sorted(final_data_generated)}

In [19]:
def clean_data_combine(data: list, check_color: int, replace_size, match_size: float = 7.5):
    """Drop noise spans and re-size marker spans, modifying `data` in place.

    For every line of every block of every page record:
      1. spans whose stripped text is punctuation, an ordinal suffix
         ('st', 'nd', 'rd', 'th') or empty are removed;
      2. spans with colour `check_color` and size `match_size` get their size
         set to `replace_size`, so they can later be told apart by size alone.

    Args:
        data: List of page records, each with a 'block' list.
        check_color: Span colour value that identifies the spans to re-size.
        replace_size: New size written to the matching spans.
        match_size: Size a span must currently have to be re-sized. Defaults
            to 7.5, the value that was previously hard-coded.

    Returns:
        list: The same `data` object that was passed in (mutated).
    """
    # Compared against the *stripped* text, so padded variants such as '. ' or
    # 'st ' are unnecessary. '' covers whitespace-only spans: the original list
    # contained ' ' but could never match it, because the text is stripped
    # before the lookup.
    remove_text = {'', '.', ',', ':', ';', '-', '`', 'st', 'nd', 'rd', 'th'}

    for page in data:
        for block in page['block']:
            for line in block.get('lines', []):
                # Remove the noise spans (a missing or None text counts as empty).
                kept = [
                    span for span in line.get('spans', [])
                    if (span.get('text') or '').strip() not in remove_text
                ]

                # Give the marker spans a distinctive size.
                for span in kept:
                    if span['color'] == check_color and span['size'] == match_size:
                        span['size'] = replace_size

                line['spans'] = kept

                # A step merging adjacent spans with equal flags/size/colour
                # was drafted here but is disabled.

    return data


In [20]:
# Remove noise spans and set the size of spans coloured -12371562 (and
# currently size 7.5) to 20. clean_data_combine edits `data` in place and
# returns it, so `cleaned_data` and `data` are the same object afterwards.
cleaned_data = clean_data_combine(data,-12371562, 20)

In [33]:
# Inspect the cleaned blocks of the first selected page record.
cleaned_data[0]['block']

[{'number': 0,
  'type': 0,
  'bbox': (41.5260009765625,
   153.5399932861328,
   112.81690979003906,
   162.5399932861328),
  'lines': [{'spans': [{'size': 9.0,
      'flags': 4,
      'font': 'Taz-SemiLight',
      'color': -1,
      'ascender': 0.800000011920929,
      'descender': -0.20000000298023224,
      'text': 'FUND INFORMATION',
      'origin': (41.5260009765625, 160.739990234375),
      'bbox': (41.5260009765625,
       153.5399932861328,
       112.81690979003906,
       162.5399932861328)}],
    'wmode': 0,
    'dir': (1.0, 0.0),
    'bbox': (41.5260009765625,
     153.5399932861328,
     112.81690979003906,
     162.5399932861328)}]},
 {'number': 1,
  'type': 0,
  'bbox': (41.52101135253906,
   167.56500244140625,
   204.89340209960938,
   227.03997802734375),
  'lines': [{'spans': [{'size': 20,
      'flags': 4,
      'font': 'Taz-SemiLight',
      'color': -12371562,
      'ascender': 0.800000011920929,
      'descender': -0.20000000298023224,
      'text': 'CATEGORY/T

In [56]:
# Sub-header regex: group 1 is a run of capitals, spaces and '/' before a
# colon; group 2 is the text after the colon, captured lazily up to a
# ",<capital>" sequence or the end of the string.
pattern = r"([A-Z /]+):\s*(.+?)(?=,[A-Z]|$)"

# Quick check against one sample line taken from the factsheet.
sample_line = r'CATEGORY/TYPE: Flexi Cap Fund - An open-ended equity'
match = re.match(pattern, sample_line)
print(match.group(1), match.group(2))

CATEGORY/TYPE Flexi Cap Fund - An open-ended equity


In [57]:
# Collect the 'text' of every span per page record; keys are "Page: N" with N
# counting the records in cleaned_data from 1 (not the PDF page number).
text = extract_span_data(cleaned_data,'text')

In [63]:
# Print every page's span texts, splitting "HEADER: value" spans into two
# lines and remembering each header label that the regex recognises.
subheaders = []
skip_tokens = [':', '`', "#"]  # stray single-character spans that carry no content

for page_label, span_texts in text.items():
    print(f"-------------{page_label}------------\n")
    for raw_text in span_texts:
        txt = raw_text.strip()
        if txt in skip_tokens:
            continue

        match = re.match(pattern, txt)
        if not match:
            # Ordinary content line: print unchanged.
            print(txt)
            continue

        subheaders.append(match.group(1).strip())
        print(match.group(1)+":")
        print(match.group(2))
    

-------------Page: 1------------

FUND INFORMATION
CATEGORY/TYPE:
Flexi Cap Fund - An open-ended equity
scheme investing across large cap, mid cap, small cap
stocks
SCHEME OBJECTIVE:
To generate capital appreciation by
investing in equity and equity related securities. However,
there can be no assurance that the investment objective
of the scheme will be realized.
Monthend AUM
7,256.26
Crores
Monthly AVG AUM
6,893.04   Crores
NAV:
(as on March 31, 2022)
Direct Plan - Growth Option
239.1200
Regular Plan - Growth Option
221.8500
Regular Plan - IDCW (payout/reinvestment)
45.6900
Direct Plan - IDCW (payout/reinvestment)
66.7500
DATE OF ALLOTMENT:
September 16, 2003
ASSET ALLOCATION:
Equity & Equity Related Instruments - 65% to 100% (Risk-
High)
Debt and money- market instruments  - 0% to 35% (Risk-
Low to medium)
Reits/Invits- 0% to 10% (Risk- Medium to High)
MINIMUM INVESTMENT:
5000 and in multiples of
1 thereafter
Subsequent purchases: Minimum amount of
1000
and multiples of
1 thereafter

In [64]:
# Distinct sub-header labels collected by the loop above.
set(subheaders)

{'ASSET ALLOCATION',
 'BENCHMARK',
 'CATEGORY/TYPE',
 'DAT E OF ALLOTMENT',
 'DATE OF ALLOTMENT',
 'ENTRY LOAD',
 'EXIT LOAD',
 'FUND MANAGER',
 'MANAGING THIS FUND',
 'NAV',
 'PLANS / OPTIONS',
 'PLANS/OPTIONS',
 'SCHEME OBJECTIVE',
 'SIP',
 'STP',
 'SWP',
 'TOTAL EXPERIENCE'}