In [1]:
import pdfplumber
import fitz
import warnings , math, collections , os, re, pprint, json
import pickle
import numpy as np
import pandas as pd



# Silence every UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)


# Root folder of the working repository. Set the MYWORK_REPO environment
# variable to point at another checkout instead of editing this cell per
# machine; the literal below is only the fallback when the variable is unset.
path = os.environ.get(
    "MYWORK_REPO",
    r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo",
)
#path = r"C:\Users\rando\OneDrive\Documents\mywork-repo"

In [6]:
# Locations of the input factsheet, the dry-run PDF and the pickled index
# variations, all resolved against the repository root `path`.
icici_path = rf"{path}\files\icici-factsheet.pdf"
dry_path = rf"{path}\output\DryRun.pdf"
indice_path = rf"{path}\output\pkl\indices_var.pkl"

In [56]:
def get_financial_indices(path:str):
    """Load the pickled index mapping stored at ``path`` and flatten it.

    The pickle must contain a dict whose values are lists. Every key and
    every element of its list is collected into one set.

    NOTE: ``pickle.load`` can execute arbitrary code while loading, so this
    must only be pointed at trusted files.

    Args:
        path: location of the pickle file.
    Returns:
        set with all keys and all list elements of the mapping.
    """
    with open(path, 'rb') as handle:
        mapping = pickle.load(handle)

    flattened = set()
    for name, variations in mapping.items():
        # list concatenation keeps the "values are lists" requirement explicit
        flattened.update([name] + variations)
    return flattened


""" Highlights important financial indices in the pdf, does other pre
analysis of data.
Args: list of indices, string of pdf path
Returns: dict of pages highlighted, string of output pdf, dict of pages contaiting FUND NAMES
"""
def check_indice_highlight(path:str, indices_variations:list, fund_pattern:str, fund_size:int):
    """Highlight financial-index terms in a PDF and gather per-page statistics.

    Args:
        path: path of the PDF opened with ``fitz``.
        indices_variations: iterable of index names / variations; each one is
            searched for in lower case, delimited by word boundaries.
        fund_pattern: regex applied with ``match`` (ignoring case) to a span's
            stripped, lower-cased text to recognise a fund title.
        fund_size: a span is only accepted as a fund title when its font size
            is strictly greater than this value.

    Returns:
        ``(important_pages, output_path, fund_titles)``:
        * important_pages -- dict page index -> number of highlighted spans.
        * output_path -- path of the saved copy, named
          ``<root>_highlighted<ext>``, or ``None`` when nothing was
          highlighted (no file is written in that case).
        * fund_titles -- dict page index -> text of the last span on the page
          that qualified as a fund title (``""`` when none did).
    """
    # Build every term's regex once; they do not depend on the span.
    term_patterns = [
        re.compile(r'\b' + re.escape(term.lower()) + r'\b')
        for term in indices_variations
    ]
    title_regex = re.compile(fund_pattern, re.IGNORECASE)
    # Span 'flags' values that are inspected; spans with other values are skipped.
    accepted_flags = (20, 25, 16, 0, 4)

    doc = fitz.open(path)
    try:
        pages = range(doc.page_count)
        important_pages = dict.fromkeys(pages, 0)
        fund_titles = dict.fromkeys(pages, "")

        for page_num, page in enumerate(doc):
            text_instances = page.get_text('dict')["blocks"]

            # Order blocks by their top edge, then by their left edge.
            sorted_text_instances = sorted(text_instances, key=lambda x: (x['bbox'][1], x['bbox'][0]))

            for pgn, block in enumerate(sorted_text_instances):
                if "lines" not in block:
                    continue

                for line in block["lines"]:
                    for span in line["spans"]:
                        if span['flags'] not in accepted_flags:
                            continue
                        span_text = span['text'].strip().lower()

                        # FUND PAGE CHECK: the span must sit in one of the
                        # first six sorted blocks, be larger than fund_size
                        # and match the fund pattern.
                        if pgn < 6 and span['size'] > fund_size and title_regex.match(span_text):
                            fund_titles[page_num] = span_text

                        # CHECK IMP FINANCE INDICES: a span is highlighted (and
                        # counted) once, however many terms it contains.
                        if any(pattern.search(span_text) for pattern in term_patterns):
                            important_pages[page_num] += 1
                            page.add_highlight_annot(fitz.Rect(span['bbox']))

        output_path = None
        if any(important_pages.values()):
            # Insert the suffix in front of the real extension only, so that a
            # ".pdf" elsewhere in the path or an upper-case ".PDF" cannot
            # produce a wrong name or collide with the input file.
            root, ext = os.path.splitext(path)
            output_path = root + '_highlighted' + ext
            doc.save(output_path)
    finally:
        # Release the file handle even when extraction or saving fails.
        doc.close()

    return important_pages, output_path, fund_titles


""" Get the clipped data in the bbox provided and store in nested dict
Args: input path, dryrun path, important pages, bbox coords
Returns: dict { 'page' : int 'block': dict}"""
def get_clipped_data(input:str, output:str, pageSelect:list, bbox:list[set], fund_names:dict):
    """Collect the sorted text blocks inside a clip rectangle for chosen pages.

    Args:
        input: path of the PDF to read.
        output: not used by this function; kept so existing calls keep working.
        pageSelect: page indices to extract, processed in the given order.
        bbox: sequence of clip rectangles; only ``bbox[0]`` is applied.
        fund_names: mapping page index -> fund name; must contain every
            index in ``pageSelect``.

    Returns:
        list with one dict per selected page:
        ``{"page": index, "fundname": name, "block": text blocks}``, the
        blocks ordered by their top edge, then by their left edge.
    """
    finalData = []

    document = fitz.open(input)
    try:
        for pgn in pageSelect:
            page = document[pgn]
            fundName = fund_names[pgn]

            # Restrict extraction to the first clip rectangle.
            blocks = page.get_text('dict', clip = bbox[0])['blocks']

            # Keep text blocks only (type 0 that actually carry lines).
            text_blocks = [block for block in blocks if block['type']==0 and 'lines' in block]
            text_blocks.sort(key=lambda x: (x['bbox'][1], x['bbox'][0]))

            finalData.append({
                "page": pgn,
                "fundname": fundName,
                "block": text_blocks,
            })
    finally:
        # The original never closed the document; release it on every path.
        document.close()

    return finalData

def extract_span_data(data:list, name:str): #all
    """Pull one span attribute out of every entry's blocks.

    Args:
        data: list of dicts, each holding its blocks under the ``'block'`` key.
        name: span key to collect, e.g. ``'text'``.

    Returns:
        dict mapping ``"Page: <n>"`` to the list of collected values, where
        ``n`` is the 1-based position of the entry in ``data`` (not the value
        stored under its ``'page'`` key).
    """
    final_data = dict()
    for pgn, page in enumerate(data):
        # Blocks without 'lines' (e.g. image blocks) and lines without 'spans'
        # contribute nothing instead of raising KeyError.
        final_data[f"Page: {pgn + 1}"] = [
            span[name]
            for block in page['block']
            for line in block.get('lines', [])
            for span in line.get('spans', [])
        ]

    return final_data

In [83]:
# Highlight the index terms in the ICICI factsheet and collect, per page, the
# highlight count and the detected fund title.
file_path = icici_path
final_indices = get_financial_indices(indice_path)
fund_size = 14  # a fund title must be set in a font size above this
fund_pattern = r"^(samco|tata|canara|ICICI|BHARAT)" #.*(fund|ETF|\)|FOF|Plan|-)$

highlight_pages, saved_path, fund_pages = check_indice_highlight(
    path=file_path,
    indices_variations=final_indices,
    fund_pattern=fund_pattern,
    fund_size=fund_size,
)


In [92]:
"""_summary_ fund is located only on certain pages, based on no. of 
highlights we know which pages are imp. automate this content later
"""
# One row per page of the analysed PDF. The page numbers are taken from the
# keys of fund_pages rather than a hard-coded range(172), so the frame also
# builds for documents with a different page count.
pagedf = pd.DataFrame({
    'page_num': list(fund_pages.keys()),
    'title': list(fund_pages.values()),
    'highlight_count': list(highlight_pages.values()),
})
pagedf.to_excel(path+r'\output\example.xlsx')
# Column name -> {row index -> value}; consumed by the page-selection cell.
fund_dict = pagedf.to_dict()
#print(pagedf)

In [99]:
# Keep only pages that have a detected fund title and more than two highlights.
selected = [
    (num, fund)
    for num, fund, count in zip(
        fund_dict['page_num'].values(),
        fund_dict['title'].values(),
        fund_dict['highlight_count'].values(),
    )
    if count > 2 and fund != ""
]
pages = [num for num, _ in selected]
fund_names = [fund for _, fund in selected]
        
#pages

In [142]:
# Extract the text blocks of every selected fund page.
# NOTE: `document` is left open on purpose so it stays usable in later cells;
# call document.close() once it is no longer needed.
document = fitz.open(icici_path)
finalData = []

for pgn, fund in zip(pages, fund_names):
    #get the page
    page = document[pgn]

    blocks = page.get_text('dict') # whole-page dict; the block list sits under 'blocks'

    # Keep text blocks only (type 0 that carry lines) and sort THOSE by top
    # edge, then left edge. The original sorted the unfiltered list, which left
    # filtered_blocks unused and let blocks without 'lines' through.
    filtered_blocks = [block for block in blocks['blocks'] if block['type']==0 and 'lines' in block]
    sorted_blocks = sorted(filtered_blocks, key= lambda x: (x['bbox'][1],x['bbox'][0]))

    finalData.append({
        "page": pgn,
        "fund": fund,
        "block": sorted_blocks,
        "only_block": blocks  # unfiltered, unsorted extraction result
    })

In [139]:
# Inspect the raw extraction stored under 'only_block' for the first selected page.
finalData[0]['only_block']

[{'number': 0,
  'type': 0,
  'bbox': (558.4761962890625,
   760.5255737304688,
   566.3641357421875,
   768.5255737304688),
  'lines': [{'spans': [{'size': 8.0,
      'flags': 20,
      'font': 'ZurichBT-BoldCondensed',
      'color': -13159371,
      'ascender': 0.7599999904632568,
      'descender': 0.23999999463558197,
      'text': '16',
      'origin': (558.4761962890625, 772.2178955078125),
      'bbox': (558.4761962890625,
       760.5255737304688,
       566.3641357421875,
       768.5255737304688)}],
    'wmode': 0,
    'dir': (1.0, 0.0),
    'bbox': (558.4761962890625,
     760.5255737304688,
     566.3641357421875,
     768.5255737304688)}]},
 {'number': 1,
  'type': 0,
  'bbox': (59.05839920043945,
   84.94588470458984,
   90.9573974609375,
   91.94588470458984),
  'lines': [{'spans': [{'size': 7.0,
      'flags': 20,
      'font': 'Arial-BoldMT',
      'color': -65794,
      'ascender': 0.7279999852180481,
      'descender': -0.20999999344348907,
      'text': 'Style Box'

In [136]:
# Collect the 'text' value of every span, grouped per entry of finalData.
text_data = extract_span_data(finalData,'text')

In [137]:
# Span texts of the first finalData entry; the key is its 1-based position, not the PDF page number.
text_data['Page: 1']

['ICICI Prudential Bluechip Fund',
 'Large Cap Fund',
 'Category',
 '(An open ended equity scheme predominantly investing in large cap stocks.)',
 'Returns of ICICI Prudential Bluechip Fund - Growth Option as on October 31, 2021 ',
 'Style Box',
 '1 Year',
 '3 Years',
 '5 Years',
 'Since inception',
 'Particulars',
 'Current ',
 'Value of',
 ' Investment ',
 'of Rs. 10000',
 'Current ',
 'Value of',
 ' Investment ',
 'of Rs. 10000',
 'Current ',
 'Value of',
 ' Investment ',
 'of Rs. 10000',
 'Current ',
 'Value of',
 ' Investment ',
 'of Rs. 10000',
 'CAGR ',
 'CAGR ',
 'CAGR ',
 'CAGR ',
 '(%)',
 '(%)',
 '(%)',
 '(%)',
 'Style',
 '57.59 ',
 '15739.86 ',
 '18.87 ',
 '16789.71 ',
 '15.34 ',
 '20430.61 ',
 '15.06 ',
 '65950.00',
 '54.01 ',
 '15382.80 ',
 '20.41 ',
 '17447.37 ',
 '16.37 ',
 '21354.84 ',
 '11.77 ',
 '44654.60',
 '53.72 ',
 '15354.21 ',
 '20.82 ',
 '17625.49 ',
 '16.81 ',
 '21761.33 ',
 '11.28 ',
 '42056.47',
 'Scheme',
 'Value  Blend  Growth',
 'Size',
 'Nifty 100 TRI (Be