In [12]:
import pprint, json, math, os, sys
import fitz, pdfplumber, ocrmypdf
import pandas as pd
import numpy as np
from collections import defaultdict

# Root of the project repository; it is appended to sys.path below so that the
# `app` package can be imported from this notebook.
dir_path = "C:\\Users\\Kaustubh.keny\\Projects\\office-work\\mywork-repo"
# Folder handed to Helper.get_fund_paths at the bottom of this cell.
fund_path = "C:\\Users\\Kaustubh.keny\\Projects\\Jan 25"

# Alternative locations for a second machine; swap the comments to switch.
# dir_path = "C:\\Users\\rando\\OneDrive\\Documents\\mywork-repo"
# fund_path =  "C:\\Users\\rando\\OneDrive\\Documents\\Jan 25"
sys.path.append(os.path.abspath(dir_path))

from app.utils import Helper
# NOTE(review): later cells call `re.*` although `re` is never imported explicitly
# in this notebook; presumably it arrives through this star import - confirm.
from app.parse_regex import *

# Path suffixes with a leading backslash: a later cell builds the full output path
# by plain string concatenation (`dir_path + dry_path`).
dry_path = r'\data\output\DryRun.pdf'
# Not referenced by any other cell visible in this notebook.
fin_path = r'\data\input\financial_indices.xlsx'
# Per the displayed cell output, this is a dict mapping AMC name -> factsheet PDF path.
mutual_fund = Helper.get_fund_paths(fund_path)

In [3]:
def extract_clipped_data(input:str, pages:list, bboxes:list):
    """Collect de-duplicated, position-sorted text blocks for each clip rectangle.

    Args:
        input: Path of the PDF to open with ``fitz.open``.
        pages: Page indices used to index the opened document.
        bboxes: Clip rectangles passed one by one to ``page.get_text('dict', clip=...)``.

    Returns:
        One dict per requested page: ``{"pgn": <page index>, "block": [...]}``
        where ``"block"`` holds one list of blocks per entry of ``bboxes``,
        each sorted by ``(bbox[1], bbox[0])``.
    """

    def _fingerprint(blk):
        # Identity of a block: its bbox plus the text of the first span of every line.
        first_span_texts = tuple(ln['spans'][0]['text'] for ln in blk['lines'])
        return (tuple(blk['bbox']), first_span_texts)

    def _unique_text_blocks(raw_blocks):
        # Keep the first occurrence of every text block (type 0 with a 'lines' entry).
        kept = {}
        for blk in raw_blocks:
            if blk['type'] != 0 or 'lines' not in blk:
                continue
            kept.setdefault(_fingerprint(blk), blk)
        return list(kept.values())

    document = fitz.open(input)
    results = []

    for pgn in pages:
        page = document[pgn]
        per_bbox = []  # one list of blocks per clip rectangle

        for clip_rect in bboxes:
            candidates = _unique_text_blocks(page.get_text('dict', clip=clip_rect)['blocks'])
            candidates.sort(key=lambda blk: (blk['bbox'][1], blk['bbox'][0]))
            per_bbox.append(candidates)

        results.append({"pgn": pgn, "block": per_bbox})

    document.close()
    return results
    
def extract_data_relative_line(path: str, line_x: float, side: str):
    """Return, per page, the blocks that have a span on one side of a vertical line.

    Args:
        path: Path of the PDF to open with ``fitz.open``.
        line_x: X coordinate of the dividing line.
        side: ``"left"`` keeps blocks with a span origin x below ``line_x``;
            ``"right"`` keeps blocks with a span origin x above it. Any other
            value selects nothing.

    Returns:
        One dict per page: ``{"pgn": <page index>, "blocks": [...]}`` with the
        selected blocks ordered by ``(bbox[1], bbox[0])``; each block appears
        at most once.
    """

    def _on_requested_side(x0):
        if side == "left":
            return x0 < line_x
        if side == "right":
            return x0 > line_x
        return False

    doc = fitz.open(path)
    results = []

    for pgn in range(doc.page_count):
        ordered = sorted(
            doc[pgn].get_text("dict")["blocks"],
            key=lambda blk: (blk["bbox"][1], blk["bbox"][0]),
        )
        # A block qualifies as soon as one of its spans starts on the requested side.
        selected = [
            blk
            for blk in ordered
            if any(
                _on_requested_side(span["origin"][0])
                for line in blk.get("lines", [])
                for span in line.get("spans", [])
            )
        ]
        results.append({"pgn": pgn, "blocks": selected})

    doc.close()

    return results
  
def get_clipped_data(input:str, bboxes:list[set], *args):
    """Return the text blocks (and their joined text) found inside clip rectangles.

    Args:
        input: Path of the PDF to open with ``fitz.open``.
        bboxes: Clip rectangles passed one by one to ``page.get_text('dict', clip=...)``.
        *args: Optional page indices. When omitted, every page of the document
            is processed.

    Returns:
        One dict per processed page with the keys
        ``"pgn"`` (page index), ``"block"`` (text blocks from all rectangles, in
        the order returned by PyMuPDF) and ``"text"`` (one string per block:
        spans joined by a space, lines joined by a newline).
    """
    final_list = []

    # The context manager closes the document even if a page lookup fails.
    with fitz.open(input) as document:
        # page_count is an int, so it has to go through range() to be iterated.
        pages = list(args) if args else list(range(document.page_count))

        for pgn in pages:
            page = document[pgn]

            blocks = []
            for bbox in bboxes:
                blocks.extend(page.get_text('dict', clip=bbox)['blocks'])

            # type 0 marks a text block; blocks without 'lines' carry no text.
            filtered_blocks = [block for block in blocks if block['type'] == 0 and 'lines' in block]

            extracted_text = [
                "\n".join(
                    " ".join(span['text'] for span in line['spans'])
                    for line in block['lines']
                )
                for block in filtered_blocks
            ]

            final_list.append({
                "pgn": pgn,
                "block": filtered_blocks,
                "text": extracted_text
            })

    return final_list
    
def get_clipped_text(input:str, bboxes:list[set],*args):
    """Return the plain-text lines found inside clip rectangles, page by page.

    Args:
        input: Path of the PDF to open with ``fitz.open``.
        bboxes: Clip rectangles passed one by one to ``page.get_text('text', clip=...)``.
        *args: Optional page indices. When omitted, every page of the document
            is processed.

    Returns:
        One dict per processed page: ``{"pgn": <page index>, "block": [...]}``
        where ``"block"`` is the newline-split text of every rectangle, in the
        order of ``bboxes``.
    """
    final_list = []

    # The context manager closes the document even if a page lookup fails.
    with fitz.open(input) as document:
        # page_count is an int, so it has to go through range() to be iterated.
        pages = list(args) if args else list(range(document.page_count))

        for pgn in pages:
            page = document[pgn]
            blocks = []
            for bbox in bboxes:
                # Accumulate: assigning here would keep only the last rectangle's text.
                blocks.extend(page.get_text('text', clip=bbox).split('\n'))
            final_list.append({
                "pgn": pgn,
                "block": blocks
            })

    return final_list

def get_proper_fund_names(path: str, pages: list):
    """Extract an LIC-style fund title from the first four blocks of each page.

    NOTE: a later cell in this notebook defines another ``get_proper_fund_names``
    that takes only ``path``; whichever cell ran last is the active definition.

    Args:
        path: Path of the PDF to open with ``fitz.open``.
        pages: Page indices to inspect.

    Returns:
        Dict mapping every requested page index to the first title match, or to
        ``""`` when nothing on that page matches.
    """
    title_pattern = r"((?:LIC\s*MF|BLNCED|LOW)\s+.*?(?:FUND|ETF|FTF|FOF|PLAN|SAVER))"
    title = {}

    # The context manager releases the file handle; it was previously left open.
    with fitz.open(path) as doc:
        for pgn in pages:
            blocks = doc[pgn].get_text("dict")['blocks']
            # Only the first four blocks are searched for the title.
            text_all = " ".join(
                span["text"].strip()
                for block in blocks[:4]
                for line in block.get("lines", [])
                for span in line.get("spans", [])
                if span["text"].strip()
            )

            # Drop everything except letters, digits and whitespace before matching.
            text_all = re.sub(r'[^A-Za-z0-9\s]+', '', text_all).strip()
            matches = re.findall(title_pattern, text_all, re.IGNORECASE)

            title[pgn] = matches[0] if matches else ""

    return title

In [13]:
mutual_fund

{'360 ONE Mutual Fund': 'C:\\Users\\Kaustubh.keny\\Projects\\Jan 25\\360 ONE Mutual Fund\\18_31-Jan-25_FS.pdf',
 'Aditya Birla Sun Life Mutual Fund': 'C:\\Users\\Kaustubh.keny\\Projects\\Jan 25\\Aditya Birla Sun Life Mutual Fund\\3_31-Jan-25_FS.pdf',
 'Axis Mutual Fund': 'C:\\Users\\Kaustubh.keny\\Projects\\Jan 25\\Axis Mutual Fund\\1_31-Jan-2025_1_FS.pdf',
 'Bajaj finserv Mutual Fund': 'C:\\Users\\Kaustubh.keny\\Projects\\Jan 25\\Bajaj finserv Mutual Fund\\59_31-Jan-25_FS.pdf',
 'Bandhan Mutual Fund': 'C:\\Users\\Kaustubh.keny\\Projects\\Jan 25\\Bandhan Mutual Fund\\16_31-Jan-25_FS.pdf',
 'Bank of India Mutual Fund': 'C:\\Users\\Kaustubh.keny\\Projects\\Jan 25\\Bank of India Mutual Fund\\5_31-Jan-25_FS.pdf',
 'Baroda BNP Paribas Mutual Fund': 'C:\\Users\\Kaustubh.keny\\Projects\\Jan 25\\Baroda BNP Paribas Mutual Fund\\2_31-Jan-25_FS.pdf',
 'Canara Robeco Mutual Fund': 'C:\\Users\\Kaustubh.keny\\Projects\\Jan 25\\Canara Robeco Mutual Fund\\6_31-Jan-25_FS.pdf',
 'DSP Mutual Fund': 'C:\\

In [None]:
# Factsheet PDF used by the exploratory cells below; the key must be present in
# `mutual_fund`, otherwise this raises KeyError.
sample_path = mutual_fund["ITI Mutual Fund"]

In [196]:
# Dry run: hand guide lines and candidate clip rectangles to
# Helper.draw_lines_on_pdf, which reports the path of the PDF it saved.
lines = [
    ((180, 0), (180, 812)),  # vertical guide at x = 180
    ((0, 40), (812, 40)),    # horizontal guide at y = 40
]
bboxes = [(390, 105, 596, 812)] #[(0, 85, 180, 812),(180, 85, 360, 812),(0,100,270,812),(0,100,350,812)]
pages = list(range(1, 110))
Helper.draw_lines_on_pdf(sample_path, lines, bboxes, pages, dir_path +dry_path)

Modified PDF saved to: C:\Users\Kaustubh.keny\Projects\office-work\mywork-repo\data\output\DryRun.pdf


In [None]:
def get_proper_fund_names(path: str):
    """Find the Invesco fund title printed in the header strip of every page.

    NOTE: this redefines ``get_proper_fund_names`` from an earlier cell (which
    also takes a ``pages`` argument); the definition executed last wins.

    Args:
        path: Path of the PDF to open with ``fitz.open``.

    Returns:
        Dict mapping page index to the first title match. Pages without a match
        are omitted. Every match is also printed.
    """
    # "Fund" may be followed by " of Fund(s)"; the whitespace has to be part of
    # the optional group, otherwise "Fund of Fund" titles are cut at "Fund".
    pattern = r"(Invesco India.*?(?:Fund(?:\s+of\s+Funds?)?|ETF|FOF|Path))"

    title = {}
    with fitz.open(path) as doc:
        for pgn, page in enumerate(doc):
            # Only the rectangle (180, 0, 590, 40) of each page is read.
            text = " ".join(page.get_text("text", clip=(180, 0, 590, 40)).split("\n"))
            # Keep letters, digits, whitespace and - ( ) . , only.
            text = re.sub("[^A-Za-z0-9\\s\\-\\(\\).,]+", "", text).strip()
            if matches := re.findall(pattern, text, re.DOTALL):
                title[pgn] = matches[0]
                print(matches[0])
    return title

In [None]:
# Page index -> fund title for the sample PDF. The single-argument call matches the
# `get_proper_fund_names(path)` definition, which also prints each match.
title = get_proper_fund_names(sample_path)

In [None]:
# r"(360 ONE.*)$" 0,0,520,50
# "(Aditya Birla.*(?:Plan\\*?\\#?\\'?|Sensex|Fund|Path|ETF|FOF\\*?|Scheme|EOF|Funds\\*?|Yojna)?)$" 0,0,470,25
# "(Bajaj.*(?:Fund|Path|ETF|FOF|EOF|Growth))$" 0,0,470,50
# "(Bank of India.*?(?:Plan|Funds?|ETF|FOF|FTF))"  0,55,280,105
# "(Baroda BNP.*?(?:Fund|Path|ETF|FTF|FOF|Index|Fund of Fund))" 0,0,220,120
# "CANARA.*?\\)" 0,0,400,55
# "((?:DSP|Bharat).*?(?:Fund\\s*(?:of Fund)?|FUND|ETF|FTF|FOF))" 0,0,500,40
# "(Edelweiss\s*.+?(?:Fund|Path|ETF|FOF|Path))" 0,0,150,100
# "((?:Franklin|Templeton).*?(?:Fund\\s*(?:of Funds)?|Plan))" 0,0,470,80
# "(GROWW.*?FUND)" 0,0,470,60
# "(HDFC.*?(?:FUND|Fund\\s*(?:of Funds?)?|ETF\\s*(?:Fund of Funds?)?))" 0,0,400,60
# "(HSBC.*?(?:FUND|Fund\\s*(?:of Funds?)?|ETF\\s*(?:Fund of Funds?)?))" 0,0,600,45
# "(Helios.*)" 0,0,600,40
# "((?:ICICI|BHARAT).*)" 0,0,490,30
# "(Invesco India.*?(?:Fund(?:of Funds?)?|ETF|FOF|Path))" 180,0,590,40

In [11]:
# Helper.get_all_pdf_data lives in app.utils (not shown in this notebook), so the
# structure of `data` cannot be stated from here.
data = Helper.get_all_pdf_data(sample_path)

In [1]:
def check_and_highlight(path: str, indices_path: str = r"C:\Users\Kaustubh.keny\Projects\office-work\mywork-repo\data\input\financial_indices.xlsx"):
    """Highlight every span that mentions a known financial index and save a copy.

    Args:
        path: Path of the PDF to open with ``fitz.open``.
        indices_path: Workbook handed to ``Helper._get_financial_indices``.
            Defaults to the location this notebook used before the parameter
            existed.

    Returns:
        Path of the highlighted copy: ``path`` with ``_hltd`` inserted before
        the file extension.
    """
    # Split off only the final extension so that ".pdf" inside a directory name
    # (or an upper-case ".PDF") cannot produce a wrong or unchanged output path.
    root, ext = os.path.splitext(path)
    output_path = f"{root}_hltd{ext}"

    with fitz.open(path) as doc:
        indices = Helper._get_financial_indices(indices_path)
        # Build each whole-word pattern once instead of once per span.
        # NOTE(review): span text is lower-cased before matching, so this assumes
        # the index names are already lower-case - confirm against the helper.
        compiled = [(indice, re.compile(rf"\b{re.escape(indice)}\b")) for indice in indices]

        # Per-page bookkeeping; it is filled in but not part of the return value.
        data = [{"title": "", "highlights": 0, "detect_idx": []} for _ in range(doc.page_count)]

        for dpgn, page in enumerate(doc):
            page_blocks = page.get_text("dict")["blocks"]
            sorted_blocks = sorted(page_blocks, key=lambda x: (x["bbox"][1], x["bbox"][0]))

            for block in sorted_blocks:
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"].strip().lower()

                        for indice, matcher in compiled:
                            if not matcher.search(text):
                                continue
                            if indice not in data[dpgn]['detect_idx']:
                                data[dpgn]['detect_idx'].append(indice)
                                data[dpgn]['highlights'] += 1
                            page.add_highlight_annot(fitz.Rect(span["bbox"]))
                            break  # one highlight per span is enough

        doc.save(output_path)
    return output_path

In [26]:
import camelot
def get_something(path: str):
    """List the pages that contain more than four of the known section headings.

    Args:
        path: Path of the PDF to open with ``fitz.open``.

    Returns:
        1-based page numbers, as strings, in document order (camelot numbers
        pages from 1).
    """
    heading = re.compile(r"^(REDEMPTION PROCEEDS|FEATURES|ASSET ALLOCATION|FUND MANAGER|SCHEME|Sr\. No\.)$")

    selected = []
    with fitz.open(path) as doc:
        for pgn, page in enumerate(doc):
            # Count the stripped text lines that consist of exactly one heading.
            hits = sum(1 for raw in page.get_text().split("\n") if heading.match(raw.strip()))
            if hits > 4:
                selected.append(str(pgn + 1))

    return selected

# 1-based page numbers (strings) of the pages selected by get_something.
pages = get_something(sample_path)

# Joined into the comma-separated string that is passed as `pages=` to camelot.read_pdf.
imp_pages = ",".join(pages)

In [None]:
# Stream-flavor extraction on the selected pages, restricted to a single table area.
# NOTE(review): an empty `imp_pages` string is passed through unchanged - confirm camelot accepts it.
tables = camelot.read_pdf(sample_path,pages=imp_pages, flavor="stream", table_areas= ["30,50,612,812"],column_tol = 4, split_text = True) #["30,50,612,812"]

In [None]:
import camelot
import pandas as pd
import numpy as np

def clean_column_name(x):
    """Turn a header cell into an underscore-separated identifier.

    The value is converted to ``str``, every character that is neither an
    ASCII letter/digit nor whitespace is removed, and the remaining
    whitespace-separated words are joined with ``_``.
    """
    words = re.sub(r"[^A-Za-z0-9\s]", "", str(x), flags=re.IGNORECASE).split()
    return "_".join(words)

# Clean every camelot table and write it to its own sheet of cleaned_tables.xlsx.
with pd.ExcelWriter("cleaned_tables.xlsx", engine="xlsxwriter") as writer:
    for i, table in enumerate(tables):
        # Collapse in-cell line breaks, then treat empty cells as missing values.
        df = table.df.map(lambda x: " ".join(str(x).split("\n")).strip())
        df = df.replace("", np.nan)

        # Row 0 is discarded and row 1 becomes the header; with fewer than two
        # rows there is no header row to promote, so the table is skipped.
        if len(df) < 2:
            continue

        # Fill the first column downwards so spanning labels cover their rows.
        df.iloc[2:, 0] = df.iloc[2:, 0].ffill()
        df = df.iloc[1:, :].copy()
        df.columns = df.iloc[0, :].apply(clean_column_name)
        df = df.iloc[1:, :]
        sheet_name = f"Table_{i+1}"

        # Without this call nothing was ever written to the workbook.
        df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Cell merging for the spanning first column is still disabled:
        # workbook = writer.book
        # worksheet = writer.sheets[sheet_name]

        # row = 1
        # last_value = None
        # start_row = None

        # for j, value in enumerate(df.iloc[:, 0], start=1):
        #     if pd.notna(value):  # New group found
        #         if last_value is not None and start_row is not None:
        #             worksheet.merge_range(start_row, 0, row - 1, 0, last_value)  # Merge previous block
        #         last_value = value
        #         start_row = row
        #     row += 1

        # # Merge last group
        # if last_value is not None and start_row is not None:
        #     worksheet.merge_range(start_row, 0, row - 1, 0, last_value)

print("Cleaned tables saved with merged spanning rows!")


In [None]:
# {14: 'LIC MF LARGE CAP FUND',
#  15: 'LIC MF LARGE& MID CAPFUND',
#  17: 'LIC MF MULTICAP FUND',
#  18: "LIC MF V't CAP FUND",
#  19: 'SMALLCAP FUND',
#  21: 'LIC MF DIV_DEND YIELD FUND',
#  22: 'LIC MF VALUE FUND',
#  23: 'LIC MF FOCUSED FUND',
#  24: 'LIC MF INFRASTRUCTURE FUND',
#  25: 'LIC MF Poin MANGFACTURING FUND',
#  26: 'BANKING & FINANC-AL SERVICES FUND',
#  27: 'HEALTHCARE FUND',
#  28: 'LiC MF EL_SS TAX SAVER',
#  29: 'LIC MF AGGRESSIVE HYBRiD FUND',
#  30: 'LIC MF B’L*NCED ADVANTAGE FUND',
#  31: 'LIC MF EQUITY SAVINGS FUND',
#  32: 'LIC MF CONSERWATIVE nip?!) FUND',
#  33: 'LIC MF ARBITRAGE FUND',
#  36: 'LIC MF OVERNIGHT FUND',
#  37: 'LIC MF LIQUID FUND',
#  38: 'LIC MF ULTRA SHORT DURATION FUND',
#  40: 'LIC MF LOW’ DURATION FUND',
#  41: 'LIC MF MEDIUM-F2°LONG DURATION FUND',
#  42: 'LIC MF BANK&ONG & PSU FUND',
#  43: 'LIC MF_. SHORT DURATION FUND',
#  45: 'LIC MGI FUND',
#  46: 'LIC MF l HILDRENS FUND',
#  47: 'BSE SENSEX ETF',
#  48: 'LIC MF NIFTY 50ETF',
#  49: 'LIC MF NIFTY 100 ETF',
#  50: 'LIC MF NIFTY MIDCAP 100 ETF',
#  51: 'LIC MF NIFTY 8-13 YR G-SECETF',
#  52: 'LIC MF BSE SENSEX INDEX FUND',
#  53: 'LIC MF NIFTY 50 INDEX FUND',
#  54: 'LIC MF NIFTY NEXT 50 INDEX FUND',
#  55: 'LIC MF G2LD EXCHANGE TRADED FUND',
#  56: 'LIC MF GSLD ETF',
#  57: 'LIC MF Large Cap Fund',
#  58: 'LIC MF Mid cap Fund',
#  59: 'LIC MF Focused Fund',
#  60: 'Lic MF ELSs Tax Saver',
#  61: 'LIC MF Arbitrage Fund',
#  63: 'LIC MF Nifty Midcap 100 ETF',
#  64: 'LIC MF Large Cap Fund',
#  65: 'LIC MF Dividend Yield Fund',
#  66: 'LIC MEF Healthcare Fund',
#  67: 'LIC MF Unit Linked Insurance Scheme __LIC MF Overnight Fund',
#  68: 'LIC MF Short Duration Fund',
#  69: 'LIC MF BSE Sensex ETF',
#  70: 'LIC MF BSE Sensex Index Fund',
#  73: 'LIC MF Healthcare Fund',
#  74: 'LIC MF Liquid Fund',
#  75: 'LIC MF Nifty 50 ETF',
#  76: 'LIC Mutual Fund'}

NIPPON DATA

In [None]:
def via_block(path:str):
    """Locate the "FUNDS AT A GLANCE" pages and collect the scheme names on them.

    Args:
        path: Path of the PDF to open with ``fitz.open``.

    Returns:
        Tuple ``(imp_pages, amc_fund)``: ``imp_pages`` lists each 0-based page
        index whose first ten position-sorted blocks contain a span starting
        with the heading, once per page; ``amc_fund`` maps those page indices
        to the stripped text of every span that matches the scheme pattern and
        has ``color == -1`` (pages without such spans are omitted).
    """
    pattern = r"FUNDS AT A GLANCE"
    amc_pattern = "^(Nippon India|CPSE).*(?=Plan|Next 50|Sensex|Fund|Path|ETF|FOF|EOF|Funds|$)"

    def _spans(page, max_blocks=None):
        # Yield the spans of a page's text blocks in (y, x) block order,
        # optionally limited to the first `max_blocks` blocks.
        blocks = sorted(page.get_text("dict")["blocks"], key=lambda x: (x["bbox"][1], x["bbox"][0]))
        for block in blocks[:max_blocks]:
            for line in block.get("lines", []):
                yield from line["spans"]

    imp_pages = []
    amc_fund = defaultdict(list)

    with fitz.open(path) as doc:
        for pgn, page in enumerate(doc):
            # any() stops at the first hit, so a page is recorded once even when
            # the heading occurs in several spans.
            if any(re.match(pattern, span["text"].strip()) for span in _spans(page, 10)):
                imp_pages.append(pgn)

        for pgn in imp_pages:
            for span in _spans(doc[pgn]):
                text = span["text"].strip()
                # NOTE(review): -1 is presumably the colour of the scheme-name
                # headers in this factsheet - confirm against the PDF.
                if span['color'] == -1 and re.match(amc_pattern, text):
                    amc_fund[pgn].append(text)

    return imp_pages, dict(amc_fund)

In [None]:
# via_block reports 0-based page indices; camelot expects 1-based page strings.
pages, amc = via_block(sample_path)
pages = [str(pgn + 1) for pgn in pages]

In [None]:
# For every page (re-keyed to 1-based numbering) build two header rows:
# the first four scheme names and the remaining ones, each prefixed with
# the 'Scheme Name' label.
final_scheme = {}
for key, value in amc.items():
    final_scheme[key + 1] = [
        ['Scheme Name'] + value[:4],
        ['Scheme Name'] + value[4:],
    ]

In [None]:
# `pages` holds the 1-based page strings built from via_block's result; camelot takes
# them as one comma-separated string. Lattice flavor is used for this extraction.
imp_pages = ",".join(pages)
tables = camelot.read_pdf(sample_path,pages=imp_pages, flavor="lattice", line_scale = 40)  #table_areas = ["0,0,580,690"]

In [None]:
import pandas as pd
import numpy as np

# Clean every lattice table, prepend its scheme-name header row and write it to
# its own sheet of merged_tables.xlsx.
with pd.ExcelWriter("merged_tables.xlsx", engine="openpyxl") as writer:
    # Number of usable tables already handled per page. A per-page counter keeps
    # the header lookup aligned even when a page yields one or three tables;
    # a single global 0/1 toggle would drift for every following page.
    tables_seen_on_page = {}

    for i, table in enumerate(tables):
        df = table.df
        if df.shape[1] < 3:
            continue

        # Collapse in-cell line breaks and turn blank cells into NaN.
        df = df.map(lambda x: " ".join(x.split("\n")).strip())
        df = df.map(lambda x: np.nan if not x.strip() else x)
        df = df.set_index(df.columns[0])

        # The extracted header rows are replaced by the names from final_scheme.
        for check in ["Scheme Name", "Market Capitalization"]:
            if check in df.index:
                df = df.drop(check)

        # Drop rows without a label, then spread values rightwards over gaps.
        df_cleaned = df[~df.index.isna()]
        df_cleaned = df_cleaned[df_cleaned.index != ""]
        df_fill = df_cleaned.reset_index().ffill(axis=1)

        position = tables_seen_on_page.get(table.page, 0)
        tables_seen_on_page[table.page] = position + 1

        # Pages or positions without prepared scheme names simply get no header row.
        page_schemes = final_scheme.get(table.page, [])
        sch_vals = page_schemes[position] if position < len(page_schemes) else []

        if len(sch_vals) == 5:
            # Label -1 sorts before 0, which moves the new row to the top.
            df_fill.loc[-1] = sch_vals
            df_fill = df_fill.sort_index().reset_index(drop=True)

        # Write to a new sheet
        df_fill.to_excel(writer, sheet_name=f"Table_{i+1}", index=False)

print("All tables saved in separate sheets in 'merged_tables.xlsx' 🚀")


In [None]:
# Placeholder text block using the same keys the extraction functions in this
# notebook read from a block ("type", "bbox", "lines" -> "spans" -> "text",
# "origin", "color"). No trailing comma after the closing brace: with one,
# `hello` would be a 1-tuple wrapping the dict instead of the dict itself.
hello = {
    "number": 0,
    "type": 0,
    "bbox": (0,0,0,0), #406.72119140625, 439.4930419921875, 565.697265625, 484.5830383300781
    "lines": [
        {
            "spans": [
                {
                    "size": 30.0,
                    "flags": 20,
                    "font": "Montserrat-Regular", #set this
                    "color": -1, #set this
                    "ascender": 1.0429999828338623,
                    "descender": -0.2619999945163727,
                    "text": "DUMMYDUMMYDUMMYDUMMY",
                    "origin": (406.72119140625, 458.26702880859375),
                    "bbox": (0,0,0,0), #406.72119140625,439.4930419921875,565.697265625,462.9830322265625,
                }
            ],
            "wmode": 0,
            "dir": (1.0, 0.0),
            "bbox": (0,0,0,0), #406.72119140625,439.4930419921875,565.697265625,462.9830322265625,
        },
    ],
}

   