In [1]:
import pdfplumber
import fitz
import warnings , math, collections , os, re
import pickle
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=UserWarning) 

In [2]:
# Root folder of the local working copy. The commented line is an alternate root kept for reference.
path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo"
# path = r"C:\Users\rando\OneDrive\Documents\mywork-repo"

# Files used throughout the notebook, all located under `path`.
tata_path = rf"{path}\files\TataFactSheet2024.pdf"      # source fact sheet
dry_run_path = rf"{path}\output\DryRun.pdf"             # scratch PDF rewritten on every dry run
indice_path = rf"{path}\output\pkl\indices_var.pkl"     # pickled index-name variations

In [130]:
def get_financial_indices(path:str):
    """Load the pickled index mapping stored at ``path`` and flatten it.

    The pickle is expected to hold a dict whose values are lists; every key
    and every element of every value list is collected.

    Args:
        path: location of the pickle file.

    Returns:
        set containing all keys and all list entries of the mapping.
    """
    with open(path, 'rb') as handle:
        # NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
        mapping = pickle.load(handle)

    flattened = set()
    for name, variations in mapping.items():
        flattened.update([name] + variations)
    return flattened


""" Highlights important financial indices in the pdf, does other pre
analysis of data.
Args: list of indices, string of pdf path
Returns: dict of pages highlighted, string of output pdf, dict of pages containing FUND NAMES
"""
def check_indice_highlight(path:str, indices_variations:list, fund_pattern:str, fund_size:int):
    """Highlight financial-index mentions in a PDF and mark candidate fund titles.

    Every text span whose ``flags`` value is in the accepted set is stripped,
    lower-cased and checked twice: against ``fund_pattern`` (fund-title
    detection) and against each entry of ``indices_variations`` (whole-word
    match, at most one highlight per span).

    Args:
        path: PDF file to scan.
        indices_variations: iterable of index names / variations to look for.
        fund_pattern: regex matched case-insensitively at the start of a span
            to recognise a fund title.
        fund_size: reference font size; a title span must have a size in
            ``[fund_size - 4, fund_size + 4)``.

    Returns:
        Tuple ``(important_pages, output_path, fund_titles)``:
        ``important_pages`` maps page number -> number of highlighted spans,
        ``output_path`` is the annotated copy that was saved (``None`` when
        nothing was highlighted), and ``fund_titles`` maps page number -> the
        last matching title text on that page (``""`` when none matched).
    """
    # Loop-invariant work: compile the patterns once instead of once per span.
    title_regex = re.compile(fund_pattern, re.IGNORECASE)
    index_regexes = [
        re.compile(r'\b' + re.escape(term.lower()) + r'\b')
        for term in indices_variations
    ]
    # TODO: learn flag logic, for now set for all flag values seen so far
    accepted_flags = (20, 25, 16, 0, 4)

    doc = fitz.open(path)
    try:
        page_count = doc.page_count  # No of pages
        important_pages = dict.fromkeys(range(page_count), 0)
        fund_titles = dict.fromkeys(range(page_count), "")

        for page_num, page in enumerate(doc):
            text_instances = page.get_text('dict')["blocks"]

            # Order blocks top-to-bottom, then left-to-right.
            sorted_text_instances = sorted(text_instances, key=lambda x: (x['bbox'][1], x['bbox'][0]))

            for block_idx, block in enumerate(sorted_text_instances):
                if "lines" not in block:
                    continue

                for line in block["lines"]:
                    for span in line["spans"]:
                        if span['flags'] not in accepted_flags:
                            continue

                        span_text = span['text'].strip().lower()
                        size = span['size']
                        color = span['color']

                        # FUND PAGE CHECK
                        # Font sizes are floats, so compare numerically: `size in range(...)`
                        # is False for every non-integral size such as 14.5.
                        is_fund_title = (
                            block_idx < 20
                            and title_regex.match(span_text)
                            and fund_size - 4 <= size < fund_size + 4
                            and color == -1
                        )
                        if is_fund_title:
                            fund_titles[page_num] = span_text
                            page.add_rect_annot(fitz.Rect(span['bbox']))
                            # Also outline the fixed region (0, 50, 160, 765) on this page.
                            page.add_rect_annot(fitz.Rect([0, 50, 160, 765]))

                        # CHECK IMP FINANCE INDICES
                        if any(regex.search(span_text) for regex in index_regexes):
                            important_pages[page_num] += 1  # count highlights
                            page.add_highlight_annot(fitz.Rect(span['bbox']))  # mark content

        output_path = None
        if any(important_pages.values()):
            # Only touch the extension; str.replace would rewrite every '.pdf' in the path.
            root, ext = os.path.splitext(path)
            output_path = root + '_highlighted' + ext
            doc.save(output_path)
    finally:
        # Release the file handle even when scanning or saving fails.
        doc.close()

    return important_pages, output_path, fund_titles


""" Get the clipped data in the bbox provided and store in nested dict
Args: input path, dryrun path, important pages, bbox coords
Returns: list of dicts {'page': int, 'fundname': str, 'block': list}"""
def get_clipped_data(input:str, output:str, pageSelect:list, bbox:list[set], fund_names:dict):
    """Collect the text blocks found inside the given clip rectangles on selected pages.

    Args:
        input: path of the PDF to read.
        output: unused by this function; kept so existing callers keep working.
        pageSelect: page numbers to process.
        bbox: clip rectangles passed to ``page.get_text(..., clip=box)``, one at a time.
        fund_names: mapping page number -> fund name; must contain every
            page number in ``pageSelect`` (a missing key raises ``KeyError``).

    Returns:
        list with one dict per selected page:
        ``{"page": page number, "fundname": fund name, "block": text blocks}``.
        Blocks are sorted top-to-bottom then left-to-right within each clip
        rectangle, and rectangles are concatenated in the order given.
    """
    document = fitz.open(input)
    finalData = []
    try:
        for pgn in pageSelect:
            page = document[pgn]
            fundName = fund_names[pgn]

            final_blocks = []
            for box in bbox:
                blocks = page.get_text('dict', clip=box)['blocks']
                # Keep only text blocks (type 0) that actually carry lines.
                text_blocks = [block for block in blocks if block['type'] == 0 and 'lines' in block]
                text_blocks.sort(key=lambda x: (x['bbox'][1], x['bbox'][0]))
                final_blocks.extend(text_blocks)

            finalData.append({
                "page": pgn,
                "fundname": fundName,
                "block": final_blocks,
            })
    finally:
        # The original never closed the document; release the handle here.
        document.close()

    return finalData


def get_pdf_data(input:str, pageSelect:list, fund_names:dict):
    """Collect every text block of the selected pages, without clipping.

    Args:
        input: path of the PDF to read.
        pageSelect: page numbers to process.
        fund_names: mapping page number -> fund name. When it is empty/falsy
            every page is labelled ``"dummy"``.

    Returns:
        list with one dict per selected page:
        ``{"page": page number, "fundname": fund name, "block": text blocks}``
        where the blocks are sorted top-to-bottom then left-to-right.
    """
    document = fitz.open(input)
    finalData = []
    try:
        for pgn in pageSelect:
            page = document[pgn]
            fundName = fund_names[pgn] if fund_names else "dummy"

            blocks = page.get_text('dict')['blocks']
            # Keep only text blocks (type 0) that actually carry lines.
            text_blocks = [block for block in blocks if block['type'] == 0 and 'lines' in block]
            text_blocks.sort(key=lambda x: (x['bbox'][1], x['bbox'][0]))

            finalData.append({
                "page": pgn,
                "fundname": fundName,
                "block": text_blocks,
            })
    finally:
        # The original never closed the document; release the handle here.
        document.close()

    return finalData


In [22]:
# Raw block dump of pages 0-99 with the placeholder fund name.
# NOTE(review): get_pdf_data indexes document[pgn] directly, so this presumably
# needs the fact sheet to have at least 100 pages - confirm.
test_data = get_pdf_data(tata_path, list(range(100)), {})

In [131]:
# Inputs for the highlighting pass over the fact sheet.
file_path = tata_path
fund_pattern = r"^(TATA|tata)"
fund_size = 14 #greater than condn
financial_indices = get_financial_indices(indice_path)

# Per-page highlight counts, path of the annotated copy, per-page fund titles.
highlight_pages, saved_path, fund_pages = check_indice_highlight(
    file_path,
    financial_indices,
    fund_pattern,
    fund_size,
)

In [37]:
# One row per page: detected fund title and number of highlighted index mentions.
pagedf = pd.DataFrame({
    'title': fund_pages.values(),
    'highlight_count': highlight_pages.values(),
})

# Summary: a fund is located only on certain pages; the number of highlights
# tells us which pages are important. TODO: automate this selection later.
pagedf.to_excel(path + r'\output\example.xlsx')

In [None]:
# Hand-picked page numbers to process: 16-59, 61, 63 and 65-81.
pages = [*range(16, 60), 61, 63, *range(65, 82)]
pages

In [122]:
# Single clip rectangle (x0, y0, x1, y1) applied to every selected page.
bbox = [(0,50,160,750)]

# Blocks inside the clip rectangle for each selected page, tagged with the fund title.
data = get_clipped_data(
    input=tata_path,
    output=dry_run_path,
    pageSelect=pages,
    bbox=bbox,
    fund_names=fund_pages,
)

In [44]:
def extract_span_data(data:list, name:list): #all
    """Flatten each page's blocks into span records keyed by the page's fund name.

    Args:
        data: list of page dicts carrying ``'fundname'`` and ``'block'`` keys.
        name: not used by this function.

    Returns:
        dict mapping fund name -> list of ``[size, text, color, origin, bbox]``
        records, with ``text`` stripped. Pages sharing a fund name overwrite
        each other; the last page wins.
    """
    spans_by_fund = {}
    for page in data:
        spans_by_fund[page['fundname']] = [
            [span['size'], span['text'].strip(), span['color'], span['origin'], span['bbox']]
            for block in page['block']
            for line in block['lines']
            for span in line.get('spans', [])
        ]
    return spans_by_fund

In [108]:
from collections import defaultdict

def clean_block_data(blocks):
    """Drop noise spans, promote section headers and merge header fragments.

    Args:
        blocks: list of ``[size, text, color, origin, bbox]`` span records.

    Returns:
        list of records in reading order (by origin y, then x) where
        - records whose text is in the noise list are removed,
        - spans of size 5.0/6.0/8.0 with color -15570765 get size 20.0,
        - size-20 spans sharing the same rounded-up y origin are merged into
          one record whose text is the space-joined fragments (other fields
          come from the first fragment).
    """
    noise_texts = ['Purchase','Amount','thereafter','.','. ',',',':','st',";","-",'st ',' ','th', 'th ', 'rd', 'rd ', 'nd', 'nd ','','`','(Date of Allotment)']
    header_source_sizes = [5.0, 6.0, 8.0]

    reading_order = sorted(blocks, key=lambda rec: (rec[3][1], rec[3][0]))

    # Bucket the surviving spans by (rounded-up y origin, effective size).
    buckets = defaultdict(list)
    for size, text, color, origin, bbox in reading_order:
        if text in noise_texts:
            continue
        if size in header_source_sizes and color == -15570765:
            size = 20.0  # promote header-colored spans to the header size
        buckets[(math.ceil(origin[1]), size)].append([size, text.strip(), color, origin, bbox])

    merged = []
    for (_, bucket_size), members in buckets.items():
        if bucket_size != 20:
            merged.extend(members)
            continue
        joined = " ".join(member[1] for member in members).strip()
        if joined:  # ignore whitespace-only text
            first = members[0]
            merged.append([first[0], joined, first[2], first[3], first[4]])

    return merged

def process_text_data(text_data):
    """Run ``clean_block_data`` on every fund's span records.

    Args:
        text_data: dict mapping fund name -> list of span records.

    Returns:
        new dict with the same keys and the cleaned record lists as values.
    """
    return {fund: clean_block_data(records) for fund, records in text_data.items()}

In [123]:
# Span records per fund, then the cleaned/merged version of the same records.
text_data = extract_span_data(data=data, name=[])
cleaned_data = process_text_data(text_data=text_data)

In [125]:
text_data['tata large cap fund']

[[7.0,
  'As on 30th November 2024',
  -14475488,
  (43.5171012878418, 69.326171875),
  (43.5171012878418,
   61.99017333984375,
   134.23011779785156,
   71.22317504882812)],
 [6.0,
  'INVESTMENT STYLE',
  -15570765,
  (24.534700393676758, 88.05267333984375),
  (24.534700393676758,
   82.27467346191406,
   73.0027084350586,
   89.47467041015625)],
 [6.0,
  'Primarily invests in equity and equity related instruments',
  -14475488,
  (24.5, 98.263671875),
  (24.5, 92.48567199707031, 158.34201049804688, 99.67967224121094)],
 [6.0,
  'of large market cap companies.',
  -14475488,
  (24.5, 105.46368408203125),
  (24.5, 99.68568420410156, 99.31399536132812, 106.87968444824219)],
 [6.0,
  '',
  -14475488,
  (157.00869750976562, 100.44866943359375),
  (157.00869750976562,
   94.67066955566406,
   158.50270080566406,
   101.86466979980469)],
 [6.0,
  'INVESTMENT OBJECTIVE',
  -15570765,
  (24.534698486328125, 117.24267578125),
  (24.534698486328125,
   111.46467590332031,
   84.07270050048828,

In [111]:
header_size = 20    # size given to section headers by clean_block_data
content_size = 10   # spans up to this size count as content under the current header
final_text_data = dict()
final_matrix = dict()

for fund, items in cleaned_data.items():  # each fund

    # step 1: collect the distinct origins (rows) and font sizes (columns)
    coordinates = sorted({tuple(item[3]) for item in items}, key=lambda c: (c[1], c[0]))  # sort by y, then x
    sizes = sorted({item[0] for item in items}, reverse=True)

    # step 2: create the matrix; cells that never receive text stay 0
    coord_to_index = {coord: idx for idx, coord in enumerate(coordinates)}  # (x, y) -> row
    size_to_index = {font: idx for idx, font in enumerate(sizes)}           # size -> column
    matrix = np.zeros((len(coordinates), len(sizes)), dtype=object)

    # step 3: fill the matrix and group content under its header
    nested_dict = {}
    current_header = None
    for item in items:
        origin = tuple(item[3])
        size = item[0]
        text = item[1]

        # Populate the matrix. The original used `==` here (a comparison whose
        # result was discarded), so no text was ever written into the matrix.
        matrix[coord_to_index[origin], size_to_index[size]] = text

        # Build the nested dict: a header opens a new section, smaller spans join it.
        if size == header_size:
            current_header = text
            nested_dict[current_header] = []
        elif size <= content_size and current_header:
            nested_dict[current_header].append(item)

    matrix_df = pd.DataFrame(matrix, index=coordinates, columns=sizes)
    final_matrix[fund] = matrix_df
    final_text_data[fund] = nested_dict

In [None]:
# pagedf = pd.DataFrame(final_matrix['tata large cap fund'])
# pagedf.to_excel(path + r'\output\example.xlsx')

In [101]:
def generate_pdf_from_data(data:list, output_path:str):
    """Write one PDF page per section header, re-placing each span at its origin.

    Args:
        data: despite the annotation this is used as a mapping
            (``data.items()``): header text -> list of
            ``[size, text, color, origin, ...]`` records.
        output_path: where the generated PDF is saved.

    Returns:
        None. The PDF is written to ``output_path`` and a confirmation line is printed.
    """
    title_position = (72, 22)  # insertion point of the section title
    title_font_size = 24

    pdf_doc = fitz.open()
    try:
        for header, items in data.items():
            page = pdf_doc.new_page()

            # section title
            try:
                page.insert_text(
                    title_position,
                    header,
                    fontsize=title_font_size,
                    fontname="helv",
                    color=(0, 0, 1),
                )
            except Exception as e:
                print(f"Error while parsing fund {e}")

            for item in items:
                size, text, color, origin = item[0], item[1], item[2], item[3]
                point = (origin[0], origin[1])

                try:
                    # Span colors are sRGB integers; insert_text expects floats in 0..1.
                    # The original passed the masked integer three times, which is out
                    # of that range, so every span ended up in the red fallback below.
                    # NOTE(review): white spans (color -1) now render white - confirm acceptable.
                    rgb_int = color & 0xFFFFFF
                    rgb = (
                        ((rgb_int >> 16) & 0xFF) / 255,
                        ((rgb_int >> 8) & 0xFF) / 255,
                        (rgb_int & 0xFF) / 255,
                    )
                    page.insert_text(point, text, fontsize=size, fontname="helv", color=rgb)
                except Exception:
                    # Best-effort fallback kept from the original: draw the span in red.
                    page.insert_text(point, text, fontsize=size, fontname="helv", color=(1, 0, 0))

        # Save the created PDF
        pdf_doc.save(output_path)
    finally:
        # Release the in-memory document even if insertion or saving fails.
        pdf_doc.close()
    print(f" PDF generated to: {output_path}")

def extract_data_from_pdf(path:str):
    """Read a generated PDF back with pdfplumber and key each page by its first line.

    The first page of the file is skipped. For every other page the first
    text line becomes the key and the remaining lines the value; a repeated
    first line overwrites the earlier entry.

    Args:
        path: PDF file to read.

    Returns:
        dict mapping first line -> list of remaining lines, with keys in
        sorted (lexicographic) order.
    """
    # Planned header normalisation, currently disabled:
    # def replace_main_key(string: str):
    #     replace_key = string
    #     if re.match(r'^NAV.*as on', string, re.IGNORECASE):
    #         replace_key = "NAV" 
    #     elif "market" in string.lower():
    #         replace_key = "Market Cap"
    #     elif re.match(r"Assets Under Management", string, re.IGNORECASE):
    #         replace_key = "Assets Under Management"   
    #     return replace_key

    with pdfplumber.open(path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]

    lines_by_header = {}
    for page_text in page_texts[1:]:
        header, *body = page_text.split('\n')
        lines_by_header[header] = body

    # Return the sections with headers in lexicographic order.
    return {header: lines_by_header[header] for header in sorted(lines_by_header)}

In [113]:
# Round-trip every fund through the scratch PDF: write its sections, then read
# them back as plain text lines. The same dry-run file is overwritten each time.
final_extracted_text = {}
for fund, items in final_text_data.items():
    print(fund)
    generate_pdf_from_data(items, dry_run_path)
    extract_data = extract_data_from_pdf(dry_run_path)
    final_extracted_text[fund] = extract_data

tata large cap fund
 PDF generated to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\output\DryRun.pdf
tata flexi cap fund
 PDF generated to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\output\DryRun.pdf
tata large & mid cap fund
 PDF generated to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\output\DryRun.pdf
tata equity p/e fund
 PDF generated to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\output\DryRun.pdf
tata mid cap growth fund
 PDF generated to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\output\DryRun.pdf
tata elss tax saver fund
 PDF generated to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\output\DryRun.pdf
tata small cap fund
 PDF generated to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Service

In [None]:
final_extracted_text

In [57]:
def set_correct_headers(data:dict):
    """Print a normalised label for every section header of every fund.

    The first matching rule decides the label; headers matching no rule are
    printed unchanged. Nothing is renamed in ``data``.

    Args:
        data: dict mapping fund name -> dict keyed by section header.

    Returns:
        the same ``data`` object, unmodified.
    """
    # (pattern, label) pairs, tried in order.
    label_rules = (
        (r'^(ADDI|MINI).*/$', "DUMMY/"),
        (r'^(MUL).*(TORS)$', "MULTIPLE INVESTORS"),
        (r'^(MUL).*(MENT)$', "MULTIPLE INVESTMENT"),
        (r'^NAV', "NAV"),
        (r'^EXPENSE', "EXPENSE RATIO"),
    )

    for sections in data.values():
        for key in sections.keys():
            label = next(
                (name for pattern, name in label_rules if re.match(pattern, key)),
                key,
            )
            print(label)

    return data

In [90]:
class HeaderRegex:
    """Per-header extractors that turn a section's text lines into one value."""

    def __init__(self, integer):
        # Stored as given; none of the methods below read it.
        self.integer = integer

    @staticmethod
    def _join_lines(data):
        """Format each entry as text, join with single spaces and strip the ends."""
        return " ".join(f"{piece}" for piece in data).strip()

    def extract_inv_sty(self, data:list):
        """Return the investment-style lines as one space-joined string."""
        return self._join_lines(data)

    def extract_inv_obj(self, data:list):
        """Return the investment-objective lines as one space-joined string."""
        return self._join_lines(data)

    def extract_date_of_all(self, data:list):
        """Return the first entry of ``data`` (raises IndexError when empty)."""
        return data[0]

    def extract_benchmark_ind(self, data:list):
        """Return the first entry of ``data`` (raises IndexError when empty)."""
        return data[0]

In [93]:
len(final_text_data)

63

In [119]:
import pprint

# Print each fund name followed by a pretty-printed view of its extracted
# sections (header -> list of text lines).
for fund, value in final_extracted_text.items():
    print(fund)
    pprint.pprint(value)

tata large cap fund
{'ADDITIONAL INVESTMENT/': [],
 'BENCHMARK': ['Nifty 100 TRI'],
 'DATE OF ALLOTMENT': ['May 07,1998'],
 'EXPENSE RATIO**': ['Direct 1.02',
                     'Regular 2.03',
                     '**Note: The rates specified are actual month end '
                     'expenses charged',
                     'as on Nov 30, 2024. The above ratio includes the Service '
                     'tax on',
                     'Investment Management Fees. The above ratio excludes, '
                     'borrowing',
                     'cost, wherever applicable.'],
 'FUND MANAGER': ['Abhinav Sharma (Managing Since 05-Apr-23 and overall',
                  'experience of 22 years), Kapil Malhotra (Managing Since',
                  '19-Dec-23 and overall experience of 14 years)'],
 'FUND SIZE': ['Rs. 2435.52 (Rs. in Cr.)'],
 'INVESTMENT OBJECTIVE': ['To provide income distribution and / or medium to '
                          'long',
                          'term capita