In [1]:
import pprint, json, math, os, sys, camelot
import re
from collections import defaultdict

import fitz, pdfplumber
import numpy as np
import ocrmypdf
import pandas as pd

# dir_path = "C:\\Users\\Kaustubh.keny\\OneDrive - Cogencis Information Services Ltd\\Documents\\mywork-repo\\"
# fund_path = "C:\\Users\\Kaustubh.keny\\OneDrive - Cogencis Information Services Ltd\\Documents\\Jan 25\\"

# Machine-specific absolute paths: the project checkout and the folder holding the fund PDFs.
# NOTE(review): these only resolve on the author's machine; consider reading them from an
# environment variable or a config file.
dir_path = "C:\\Users\\rando\\OneDrive\\Documents\\mywork-repo"
fund_path =  "C:\\Users\\rando\\OneDrive\\Documents\\Jan 25"
# Must run before the `app` imports below so the project package can be found.
sys.path.append(os.path.abspath(dir_path))

from app.utils import Helper
# NOTE(review): star import hides which names the notebook takes from app.parse_regex.
from app.parse_regex import *

# Both paths start with a backslash because they are concatenated onto dir_path with `+`.
dry_path = r'\data\output\DryRun.pdf'
# fin_path is not referenced by any other cell visible in this notebook.
fin_path = r'\data\input\financial_indices.xlsx'
# Indexed by fund-house name further down; presumably maps name -> PDF path (confirm in Helper).
mutual_fund = Helper.get_fund_paths(fund_path)

In [None]:
def extract_clipped_data(input:str, pages:list, bboxes:list):
    """Collect the text blocks found inside each clip rectangle on the given pages.

    Args:
        input: Path of the PDF to open with ``fitz``.
        pages: Page indices used to index the opened document.
        bboxes: Clip rectangles passed to ``page.get_text('dict', clip=...)``.

    Returns:
        One dict per requested page: ``{"pgn": <page index>, "block": [...]}``
        where ``block`` holds one list per bbox (in ``bboxes`` order) containing
        the unique text blocks of that clip sorted by top edge, then left edge.
    """
    final_list = []

    # The context manager closes the document even when a page lookup raises.
    with fitz.open(input) as document:
        for pgn in pages:
            page = document[pgn]
            all_blocks = []  # one list of blocks per bbox

            for bbox in bboxes:
                blocks, seen_blocks = [], set()

                for block in page.get_text('dict', clip=bbox)['blocks']:
                    # type 0 is a text block; anything else is skipped
                    if block['type'] != 0 or 'lines' not in block:
                        continue

                    # De-duplication key: block bbox plus the first span text of
                    # every line. A line without spans contributes '' instead of
                    # raising IndexError.
                    first_span_texts = tuple(
                        line['spans'][0]['text'] if line['spans'] else ''
                        for line in block['lines']
                    )
                    block_key = (tuple(block['bbox']), first_span_texts)
                    if block_key in seen_blocks:
                        continue
                    seen_blocks.add(block_key)
                    blocks.append(block)

                blocks.sort(key=lambda x: (x['bbox'][1], x['bbox'][0]))
                all_blocks.append(blocks)

            final_list.append({
                "pgn": pgn,
                "block": all_blocks  # list[list, list, ...]
            })

    return final_list
    
def extract_data_relative_line(path: str, line_x: float, side: str):
    """Return, per page, the blocks that have a span on one side of a vertical line.

    A block is kept when at least one of its spans has an origin x-coordinate
    strictly left (``side == "left"``) or strictly right (``side == "right"``)
    of ``line_x``. Any other ``side`` value selects nothing.

    Args:
        path: Path of the PDF to open with ``fitz``.
        line_x: x-coordinate of the dividing line.
        side: ``"left"`` or ``"right"``.

    Returns:
        One dict per page of the document: ``{"pgn": <page index>, "blocks": [...]}``
        with the kept blocks sorted by top edge, then left edge.
    """
    if side == "left":
        on_side = lambda x0: x0 < line_x
    elif side == "right":
        on_side = lambda x0: x0 > line_x
    else:
        # Unknown side: keep the original behaviour of selecting no block.
        on_side = lambda x0: False

    final_list = []

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(path) as doc:
        for pgn in range(doc.page_count):
            blocks = doc[pgn].get_text("dict")["blocks"]
            sorted_blocks = sorted(blocks, key=lambda x: (x["bbox"][1], x["bbox"][0]))

            # Each block is visited once, so any() both de-duplicates and stops
            # scanning spans as soon as one qualifies.
            extracted_blocks = [
                block
                for block in sorted_blocks
                if any(
                    on_side(span["origin"][0])
                    for line in block.get("lines", [])
                    for span in line.get("spans", [])
                )
            ]

            final_list.append({
                "pgn": pgn,
                "blocks": extracted_blocks
            })

    return final_list
  
def get_clipped_data(input:str, bboxes:list[tuple], *args):
    """Extract text blocks and their plain text from clip rectangles.

    Args:
        input: Path of the PDF to open with ``fitz``.
        bboxes: Clip rectangles passed to ``page.get_text('dict', clip=...)``.
        *args: Page indices to process. When omitted, every page of the
            document is processed.

    Returns:
        One dict per processed page with keys ``"pgn"`` (page index),
        ``"block"`` (text blocks from all bboxes, in extraction order) and
        ``"text"`` (one string per block: spans joined by a space, lines
        joined by a newline).
    """
    final_list = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(input) as document:
        # page_count is an int, so it has to go through range() to be iterated.
        pages = list(args) if args else list(range(document.page_count))

        for pgn in pages:
            page = document[pgn]

            blocks = []
            for bbox in bboxes:
                blocks.extend(page.get_text('dict', clip=bbox)['blocks'])

            # Keep text blocks only (type 0 with a 'lines' entry).
            filtered_blocks = [
                block for block in blocks
                if block['type'] == 0 and 'lines' in block
            ]

            extracted_text = [
                "\n".join(
                    " ".join(span['text'] for span in line['spans'])
                    for line in block['lines']
                )
                for block in filtered_blocks
            ]

            final_list.append({
                "pgn": pgn,
                "block": filtered_blocks,
                "text": extracted_text
            })

    return final_list
    
def get_clipped_text(input:str, bboxes:list[tuple],*args):
    """Extract the plain-text lines found inside clip rectangles.

    Args:
        input: Path of the PDF to open with ``fitz``.
        bboxes: Clip rectangles passed to ``page.get_text('text', clip=...)``.
        *args: Page indices to process. When omitted, every page of the
            document is processed.

    Returns:
        One dict per processed page: ``{"pgn": <page index>, "block": [...]}``
        where ``block`` is the list of text lines from every bbox, in
        ``bboxes`` order.
    """
    final_list = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(input) as document:
        # page_count is an int, so it has to go through range() to be iterated.
        pages = list(args) if args else list(range(document.page_count))

        for pgn in pages:
            page = document[pgn]
            blocks = []
            for bbox in bboxes:
                # extend (not assign) so earlier bboxes are not discarded
                blocks.extend(page.get_text('text', clip=bbox).split('\n'))
            final_list.append({
                "pgn": pgn,
                "block": blocks
            })

    return final_list

In [9]:
# Select the PDF registered under this fund house and echo the path as the cell output.
# NOTE(review): the LIC and Nippon cells further down reuse `sample_path`; presumably this
# key is edited by hand before running them — confirm, or give each section its own variable.
sample_path  = mutual_fund["ICICI Prudential Mutual Fund"]
sample_path

'C:\\Users\\rando\\OneDrive\\Documents\\Jan 25\\ICICI Prudential Mutual Fund\\14_31-Jan-2025_1_FS.pdf'

In [6]:
# Guide lines and clip rectangles handed to Helper.draw_lines_on_pdf; the helper's
# recorded output reports the path of the PDF it saved.
lines = [
    ((165, 0), (165, 812)),  # vertical line at x = 165
    ((0, 500), (812, 500)),  # horizontal line at y = 500
]
bboxes = [(0, 0, 560, 150)]
# The earlier `pages = [12, 14,16]` assignment was overwritten before use, so only
# the effective range is kept.
pages = list(range(1, 110))
Helper.draw_lines_on_pdf(sample_path, lines, bboxes, pages, dir_path + dry_path)


Modified PDF saved to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\\data\output\DryRun.pdf


In [10]:
# Load the helper's extraction for the whole sample PDF; the next cell indexes it by position.
data = Helper.get_all_pdf_data(sample_path)

In [12]:
# Inspect entry 16; the recorded output shows a dict with 'pgn': 16 and a list of block dicts.
data[16]

{'pgn': 16,
 'blocks': [{'number': 3,
   'type': 0,
   'bbox': (27.91790008544922,
    13.343345642089844,
    292.4304504394531,
    34.6783447265625),
   'lines': [{'spans': [{'size': 17.0,
       'flags': 20,
       'font': 'Mulish-Regular',
       'color': -65794,
       'ascender': 1.0049999952316284,
       'descender': -0.25,
       'text': 'ICICI Prudential BSE Sensex ETF',
       'origin': (27.91790008544922, 30.4283447265625),
       'bbox': (27.91790008544922,
        13.343345642089844,
        292.4304504394531,
        34.6783447265625)}],
     'wmode': 0,
     'dir': (1.0, 0.0),
     'bbox': (27.91790008544922,
      13.343345642089844,
      292.4304504394531,
      34.6783447265625)}]},
  {'number': 75,
   'type': 0,
   'bbox': (497.0616149902344,
    24.999317169189453,
    539.7313232421875,
    46.547119140625),
   'lines': [{'spans': [{'size': 8.0,
       'flags': 4,
       'font': 'Mulish-Regular',
       'color': -65794,
       'ascender': 1.0049999952316284,
   

In [None]:
def get_proper_fund_names(path: str):
    """Map page index to the first "Navi ... Fund" title found in the page header area.

    Only text inside the clip rectangle ``(0, 0, 500, 200)`` of each page is
    searched. Characters other than letters, digits, whitespace and ``-().,``
    are removed before matching, and matching is case-insensitive.

    Args:
        path: Path of the PDF to open with ``fitz``.

    Returns:
        Dict of ``{page index: matched title}``; pages without a match are absent.
    """
    # "fund of fund" must come before plain "fund": after the lazy `.*?` the
    # alternation is tried in order at the first possible end position, so the
    # shorter alternative would otherwise always win and cut the name short.
    pattern = r"(Navi.*?(?:fund\s+of\s+fund|fund))"
    title = {}

    with fitz.open(path) as doc:
        for pgn, page in enumerate(doc):
            header_text = page.get_text("text", clip=(0, 0, 500, 200))
            text = " ".join(header_text.split("\n"))
            text = re.sub("[^A-Za-z0-9\\s\\-\\(\\).,]+", "", text).strip()
            if matches := re.findall(pattern, text, re.IGNORECASE):
                title[pgn] = matches[0]
    return title


LIC DATA

In [None]:
import fitz
import ocrmypdf
import re

# Canonical LIC MF scheme name -> regex for a title string as it was read from the PDF.
# The `[^\S\r\n]*` fragments match any run of whitespace other than CR/LF.
# NOTE(review): nothing in the visible cells reads this dict; process_pdf_and_correct_funds
# below returns the raw matches without applying it — confirm where it is meant to be used.
replacement_dict = {
    "LIC MF Large Cap Fund": r"LIC MF LARGE CAP FUND",
    "LIC MF Large & Mid Cap Fund": r"LIC MF LARGE[^\S\r\n]*&[^\S\r\n]*MID[^\S\r\n]*CAP[^\S\r\n]*FUND",
    "LIC MF Multi Cap Fund": r"LIC MF MULTICAP FUND",
    "LIC MF Value Cap Fund": r"LIC MF V[^\S\r\n]*'?T[^\S\r\n]*CAP[^\S\r\n]*FUND",
    "LIC MF Small Cap Fund": r"SMALLCAP FUND",
    "LIC MF Dividend Yield Fund": r"LIC MF DIV[^\S\r\n]*DEND[^\S\r\n]*YIELD[^\S\r\n]*FUND",
    "LIC MF Value Fund": r"LIC MF VALUE FUND",
    "LIC MF Focused Fund": r"LIC MF FOCUSED FUND",
    "LIC MF Infrastructure Fund": r"LIC MF INFRASTRUCTURE FUND",
    "LIC MF Point Manufacturing Fund": r"LIC MF Poin[^\S\r\n]*MANGFACTURING[^\S\r\n]*FUND",
    "LIC MF Banking & Financial Services Fund": r"BANKING[^\S\r\n]*&[^\S\r\n]*FINANC[^\S\r\n]*IAL[^\S\r\n]*SERVICES[^\S\r\n]*FUND",
    "LIC MF Healthcare Fund": r"HEALTHCARE FUND",
    "LIC MF ELSS Tax Saver Fund": r"LiC MF EL[^\S\r\n]*SS[^\S\r\n]*TAX[^\S\r\n]*SAVER?",
    "LIC MF Aggressive Hybrid Fund": r"LIC MF AGGRESSIVE HYBR[^\S\r\n]*D[^\S\r\n]*FUND",
    "LIC MF Balanced Advantage Fund": r"LIC MF B[^\S\r\n]*'?L[^\S\r\n]*\*[^\S\r\n]*NCED[^\S\r\n]*ADVANTAGE[^\S\r\n]*FUND",
    "LIC MF Equity Savings Fund": r"LIC MF EQUITY SAVINGS FUND",
    "LIC MF Conservative Hybrid Fund": r"LIC MF CONSER[^\S\r\n]*VATIVE[^\S\r\n]*nip?!?\)?[^\S\r\n]*FUND",
    "LIC MF Arbitrage Fund": r"LIC MF ARBITRAGE FUND",
    "LIC MF Overnight Fund": r"LIC MF OVERNIGHT FUND",
    "LIC MF Liquid Fund": r"LIC MF LIQUID FUND",
    "LIC MF Ultra Short Duration Fund": r"LIC MF ULTRA SHORT DURATION FUND",
    "LIC MF Low Duration Fund": r"LIC MF LOW[^\S\r\n]*'?DURATION[^\S\r\n]*FUND",
    "LIC MF Medium to Long Duration Fund": r"LIC MF MEDIUM[^\S\r\n]*[Ff]2[^\S\r\n]*LONG[^\S\r\n]*DURATION[^\S\r\n]*FUND",
    "LIC MF Banking & PSU Fund": r"LIC MF BANK[^\S\r\n]*&[^\S\r\n]*ONG[^\S\r\n]*&[^\S\r\n]*PSU[^\S\r\n]*FUND",
    "LIC MF Short Duration Fund": r"LIC MF[^\S\r\n]*SHORT[^\S\r\n]*DURATION[^\S\r\n]*FUND",
    "LIC MF Gilt Fund": r"LIC MGI FUND",
    "LIC MF Children's Fund": r"LIC MF[^\S\r\n]*l[^\S\r\n]*HILDRENS[^\S\r\n]*FUND",
    "LIC MF BSE Sensex ETF": r"BSE SENSEX ETF",
    "LIC MF Nifty 50 ETF": r"LIC MF NIFTY 50 ETF",
    "LIC MF Nifty 100 ETF": r"LIC MF NIFTY 100 ETF",
    "LIC MF Nifty Midcap 100 ETF": r"LIC MF NIFTY MIDCAP 100 ETF",
    "LIC MF Nifty 8-13 Yr G-Sec ETF": r"LIC MF NIFTY 8[^\S\r\n]*13[^\S\r\n]*YR[^\S\r\n]*G[^\S\r\n]*SECETF",
    "LIC MF BSE Sensex Index Fund": r"LIC MF BSE SENSEX INDEX FUND",
    "LIC MF Nifty 50 Index Fund": r"LIC MF NIFTY 50 INDEX FUND",
    "LIC MF Nifty Next 50 Index Fund": r"LIC MF NIFTY NEXT 50 INDEX FUND",
    "LIC MF Gold Exchange Traded Fund": r"LIC MF G[^\S\r\n]*2LD[^\S\r\n]*EXCHANGE[^\S\r\n]*TRADED[^\S\r\n]*FUND",
    "LIC MF Gold ETF Fund of Fund": r"LIC MF GSLD ETF",
}

def process_pdf_and_correct_funds(input_pdf, bbox = (0, 0, 400, 100)):
    """OCR the header region of every page and return the first fund title per page.

    Steps:
      1. Copy the ``bbox`` region of each page of ``input_pdf`` into a new PDF
         saved next to the input as ``<stem>_clipped.pdf``.
      2. Run ``ocrmypdf`` (forced OCR, deskew) on it, producing ``<stem>_ocr.pdf``.
      3. Search each OCR'd page for the first string that starts with one of the
         known prefixes and ends at the nearest fund-type keyword.

    Both intermediate PDFs are left on disk. The returned titles are the raw OCR
    matches; this function does not apply ``replacement_dict``.

    Args:
        input_pdf: Path of the source PDF.
        bbox: ``(x0, y0, x1, y1)`` region clipped from every page.

    Returns:
        Dict of ``{page index: matched title}``; pages without a match are absent.
    """
    import os  # local import: this cell itself only imports fitz, ocrmypdf and re

    # Derive the output names from the file stem. str.replace(".pdf", ...) would
    # rewrite every ".pdf" in the path and, for ".PDF" or extension-less inputs,
    # return the input path unchanged (so the clipped file would target the source).
    stem, _ext = os.path.splitext(input_pdf)
    clipped_pdf = f"{stem}_clipped.pdf"
    ocr_pdf = f"{stem}_ocr.pdf"

    clip_width = bbox[2] - bbox[0]
    clip_height = bbox[3] - bbox[1]

    with fitz.open(input_pdf) as doc, fitz.open() as new_doc:
        for page_num in range(len(doc)):
            # New page sized to the clip, filled with that region of the source page.
            new_page = new_doc.new_page(width=clip_width, height=clip_height)
            new_page.show_pdf_page(new_page.rect, doc, page_num, clip=bbox)
        new_doc.save(clipped_pdf)

    ocrmypdf.ocr(clipped_pdf, ocr_pdf, deskew=True, force_ocr=True)

    pattern = r"((?:LI?i?C|BSE|BANK|SMALL|HEALTH).*?(?:FUND|Path|ETF|FTF|EOF|FOF|PLAN|SAVER|FUND\s*OF\s*FUND))"
    extracted_titles = {}

    with fitz.open(ocr_pdf) as doc:
        for page_num, page in enumerate(doc):
            text = " ".join(page.get_text("text").split("\n"))
            if matches := re.findall(pattern, text, re.IGNORECASE):
                extracted_titles[page_num] = matches[0]
    return extracted_titles

# Runs the clip + OCR pipeline on the sample PDF; the result maps page index -> raw OCR title.
corrected_titles = process_pdf_and_correct_funds(sample_path)


In [None]:
{14: 'LIC MF LARGE CAP FUND',
 15: 'LIC MF LARGE& MID CAPFUND',
 17: 'LIC MF MULTICAP FUND',
 18: "LIC MF V't CAP FUND",
 19: 'SMALLCAP FUND',
 21: 'LIC MF DIV_DEND YIELD FUND',
 22: 'LIC MF VALUE FUND',
 23: 'LIC MF FOCUSED FUND',
 24: 'LIC MF INFRASTRUCTURE FUND',
 25: 'LIC MF Poin MANGFACTURING FUND',
 26: 'BANKING & FINANC-AL SERVICES FUND',
 27: 'HEALTHCARE FUND',
 28: 'LiC MF EL_SS TAX SAVER',
 29: 'LIC MF AGGRESSIVE HYBRiD FUND',
 30: 'LIC MF B’L*NCED ADVANTAGE FUND',
 31: 'LIC MF EQUITY SAVINGS FUND',
 32: 'LIC MF CONSERWATIVE nip?!) FUND',
 33: 'LIC MF ARBITRAGE FUND',
 36: 'LIC MF OVERNIGHT FUND',
 37: 'LIC MF LIQUID FUND',
 38: 'LIC MF ULTRA SHORT DURATION FUND',
 40: 'LIC MF LOW’ DURATION FUND',
 41: 'LIC MF MEDIUM-F2°LONG DURATION FUND',
 42: 'LIC MF BANK&ONG & PSU FUND',
 43: 'LIC MF_. SHORT DURATION FUND',
 45: 'LIC MGI FUND',
 46: 'LIC MF l HILDRENS FUND',
 47: 'BSE SENSEX ETF',
 48: 'LIC MF NIFTY 50ETF',
 49: 'LIC MF NIFTY 100 ETF',
 50: 'LIC MF NIFTY MIDCAP 100 ETF',
 51: 'LIC MF NIFTY 8-13 YR G-SECETF',
 52: 'LIC MF BSE SENSEX INDEX FUND',
 53: 'LIC MF NIFTY 50 INDEX FUND',
 54: 'LIC MF NIFTY NEXT 50 INDEX FUND',
 55: 'LIC MF G2LD EXCHANGE TRADED FUND',
 56: 'LIC MF GSLD ETF',
 57: 'LIC MF Large Cap Fund',
 58: 'LIC MF Mid cap Fund',
 59: 'LIC MF Focused Fund',
 60: 'Lic MF ELSs Tax Saver',
 61: 'LIC MF Arbitrage Fund',
 63: 'LIC MF Nifty Midcap 100 ETF',
 64: 'LIC MF Large Cap Fund',
 65: 'LIC MF Dividend Yield Fund',
 66: 'LIC MEF Healthcare Fund',
 67: 'LIC MF Unit Linked Insurance Scheme __LIC MF Overnight Fund',
 68: 'LIC MF Short Duration Fund',
 69: 'LIC MF BSE Sensex ETF',
 70: 'LIC MF BSE Sensex Index Fund',
 73: 'LIC MF Healthcare Fund',
 74: 'LIC MF Liquid Fund',
 75: 'LIC MF Nifty 50 ETF',
 76: 'LIC Mutual Fund'}

In [26]:
import camelot
def get_something(path: str):
    """List the pages that carry more than four of the known section headings.

    A text line counts when, after stripping, it is exactly one of the headings
    in the pattern below. Every matching line is counted, including repeats.

    Args:
        path: Path of the PDF to open with ``fitz``.

    Returns:
        1-based page numbers as strings (camelot numbers pages from 1), in
        document order.
    """
    heading_pattern = r"^(REDEMPTION PROCEEDS|FEATURES|ASSET ALLOCATION|FUND MANAGER|SCHEME|Sr\. No\.)$"
    selected_pages = []

    with fitz.open(path) as doc:
        for page_index, page in enumerate(doc):
            heading_hits = sum(
                1
                for raw_line in page.get_text().split("\n")
                if re.match(heading_pattern, raw_line.strip())
            )
            if heading_hits > 4:
                selected_pages.append(str(page_index + 1))

    return selected_pages

# 1-based page numbers (as strings) of the pages that passed the heading-count filter.
pages = get_something(sample_path)

# Comma-separated page list, passed as `pages=` to camelot.read_pdf in the next cell.
imp_pages = ",".join(pages)                

In [None]:
# Stream-flavour extraction limited to the single table area "30,50,612,812" on the selected pages.
tables = camelot.read_pdf(sample_path,pages=imp_pages, flavor="stream", table_areas= ["30,50,612,812"],column_tol = 4, split_text = True) #["30,50,612,812"]   

In [None]:
import camelot
import pandas as pd
import numpy as np

def clean_column_name(x):
    """Turn a header cell into an identifier-like column name.

    The value is converted to ``str``, every character that is not a letter,
    digit or whitespace is removed, and the remaining words are joined with
    underscores.
    """
    alphanumeric_only = re.sub(r"[^A-Za-z0-9\s]", "", str(x), flags=re.IGNORECASE)
    words = alphanumeric_only.split()
    return "_".join(words)

# Clean every camelot table and write each one to its own sheet of cleaned_tables.xlsx.
# Layout assumed by the slicing below: row 0 is discarded, row 1 is the header row,
# rows 2+ are data whose first column is forward-filled.
# NOTE(review): an earlier draft merged the forward-filled first-column cells with
# worksheet.merge_range; that step was disabled and is not performed here.
with pd.ExcelWriter("cleaned_tables.xlsx", engine="xlsxwriter") as writer:
    for i, table in enumerate(tables):
        # Collapse in-cell line breaks, then mark blank cells as missing.
        df = table.df.map(lambda x: " ".join(str(x).split("\n")).strip())
        df = df.replace("", np.nan)

        # Without a header row (row 1) there is nothing to name the columns with.
        if len(df) < 2:
            continue

        df.iloc[2:, 0] = df.iloc[2:, 0].ffill()
        header = df.iloc[1, :].apply(clean_column_name).tolist()

        df = df.iloc[2:, :].copy()
        df.columns = header

        # The frame was previously built but never written, leaving the workbook
        # without the cleaned tables.
        sheet_name = f"Table_{i+1}"
        df.to_excel(writer, sheet_name=sheet_name, index=False)

print("Cleaned tables saved with merged spanning rows!")


NIPPON DATA

In [None]:
def _iter_sorted_spans(page, limit=None):
    """Yield the spans of a page's blocks, blocks ordered by top edge then left edge.

    ``limit`` restricts the scan to the first ``limit`` sorted blocks; blocks
    without a ``"lines"`` entry are skipped.
    """
    page_blocks = page.get_text("dict")["blocks"]
    sorted_blocks = sorted(page_blocks, key=lambda x: (x["bbox"][1], x["bbox"][0]))
    for block in sorted_blocks[:limit]:
        if "lines" not in block:
            continue
        for line in block["lines"]:
            yield from line["spans"]


def via_block(path:str):
    """Find the "FUNDS AT A GLANCE" pages and the scheme names printed on them.

    A page qualifies when a span inside its first 10 sorted blocks starts with
    ``FUNDS AT A GLANCE``. On qualifying pages, every span whose stripped text
    matches ``amc_pattern`` and whose ``color`` equals ``-1`` is collected as a
    scheme name.

    Args:
        path: Path of the PDF to open with ``fitz``.

    Returns:
        ``(imp_pages, amc_fund)``: the 0-based indices of qualifying pages in
        document order, and a dict mapping page index to its scheme names in
        reading order (only pages with at least one name appear).
    """
    pattern = r"FUNDS AT A GLANCE"
    amc_pattern = "^(Nippon India|CPSE).*(?=Plan|Next 50|Sensex|Fund|Path|ETF|FOF|EOF|Funds|$)"
    imp_pages = []
    amc_fund = defaultdict(list)

    with fitz.open(path) as doc:
        for pgn, page in enumerate(doc):
            # any() records the page once even if the heading occurs in several
            # spans; appending per span processed the page repeatedly below and
            # duplicated its scheme names.
            if any(
                re.match(pattern, span["text"].strip())
                for span in _iter_sorted_spans(page, limit=10)
            ):
                imp_pages.append(pgn)

        for pgn in imp_pages:
            for span in _iter_sorted_spans(doc[pgn]):
                text = span["text"].strip()
                if re.match(amc_pattern, text) and span["color"] == -1:
                    amc_fund[pgn].append(text)

    return imp_pages, dict(amc_fund)

In [None]:
# via_block returns 0-based page indices plus the scheme names found on each of those pages.
pages,amc = via_block(sample_path)
# Shift to 1-based numbers and stringify so they can be joined into camelot's `pages=` argument.
pages = list(map(str,[x+1 for x in pages]))

In [None]:
# For every page, build two header rows: 'Scheme Name' followed by the first four
# scheme names, and 'Scheme Name' followed by the remaining ones. Keys are shifted
# to 1-based page numbers.
final_scheme = {
    page_index + 1: [
        ['Scheme Name'] + scheme_names[:4],
        ['Scheme Name'] + scheme_names[4:],
    ]
    for page_index, scheme_names in amc.items()
}

In [None]:
# Comma-separated 1-based page list for camelot.
imp_pages = ",".join(pages)
# Lattice-flavour extraction on those pages; the commented table_areas value is an unused alternative.
tables = camelot.read_pdf(sample_path,pages=imp_pages, flavor="lattice", line_scale = 40)  #table_areas = ["0,0,580,690"]            

In [None]:
import pandas as pd
import numpy as np

# Clean each lattice table, prepend its 'Scheme Name' header row from final_scheme,
# and write it to its own sheet of merged_tables.xlsx.
with pd.ExcelWriter("merged_tables.xlsx", engine="openpyxl") as writer:
    # Number of tables already handled per page. This replaces a single global
    # 0/1 toggle, which paired tables with the wrong header row for every later
    # page as soon as one page did not yield exactly two tables.
    tables_done_on_page = {}

    for i, table in enumerate(tables):
        df = table.df
        if df.shape[1] < 3:
            continue

        def _normalise(cell):
            """Collapse in-cell line breaks; blank cells become NaN."""
            text = " ".join(cell.split("\n")).strip()
            return text if text else np.nan

        df = df.map(_normalise)
        df = df.set_index(df.columns[0])

        # Drop the label rows that the header row from final_scheme replaces.
        label_rows = [
            label for label in ("Scheme Name", "Market Capitalization")
            if label in df.index
        ]
        df = df.drop(index=label_rows)

        # Rows without a first-column label carry no data; blanks are already NaN.
        df_cleaned = df[~df.index.isna()].reset_index()
        df_fill = df_cleaned.ffill(axis=1)

        position = tables_done_on_page.get(table.page, 0)
        tables_done_on_page[table.page] = position + 1

        # Pages missing from final_scheme, or with more tables than header rows,
        # get no header instead of raising KeyError / IndexError.
        page_headers = final_scheme.get(table.page, [])
        sch_vals = page_headers[position] if position < len(page_headers) else []

        # A header row is 'Scheme Name' + 4 schemes; only insert it when the table
        # is also 5 columns wide, otherwise the row assignment would raise.
        if len(sch_vals) == 5 == df_fill.shape[1]:
            df_fill.loc[-1] = sch_vals
            # Index -1 sorts first, which moves the header row to the top.
            df_fill = df_fill.sort_index().reset_index(drop=True)

        # Write to a new sheet
        df_fill.to_excel(writer, sheet_name=f"Table_{i+1}", index=False)

print("All tables saved in separate sheets in 'merged_tables.xlsx' 🚀")
