In [31]:
import pprint, json, math, os, sys, camelot
import fitz, pdfplumber
import pandas as pd
import numpy as np
from collections import defaultdict

# Absolute, machine-specific locations: `dir_path` is appended to sys.path below
# and `fund_path` is handed to Helper.get_fund_paths.
# NOTE(review): these paths only exist on one workstation; consider reading them
# from an environment variable or a config file.
dir_path = "C:\\Users\\Kaustubh.keny\\OneDrive - Cogencis Information Services Ltd\\Documents\\mywork-repo\\"
fund_path = "C:\\Users\\Kaustubh.keny\\OneDrive - Cogencis Information Services Ltd\\Documents\\Dec 24\\"

# Alternative locations for a second machine (kept commented out).
# dir_path = "C:\\Users\\rando\\OneDrive\\Documents\\mywork-repo"
# fund_path =  "C:\\Users\\rando\\OneDrive\\Documents\\Dec 24"
# Put the repo root on sys.path; presumably this is what makes the `app.*`
# imports below resolvable - confirm.
sys.path.append(os.path.abspath(dir_path))

from app.utils import Helper
# NOTE(review): later cells call `re.*` although this notebook never imports `re`
# explicitly; presumably this star import supplies it - confirm, or add `import re`.
from app.fund_regex import *

# Both relative paths start with a backslash and are joined to `dir_path` by plain
# string concatenation (see the Helper.draw_lines_on_pdf call further down).
dry_path = r'\data\output\DryRun.pdf'
fin_path = r'\data\input\financial_indices.xlsx'
# Indexed by fund-house name in a later cell (e.g. "Zerodha Mutual Fund") to obtain a PDF path.
mutual_fund = Helper.get_fund_paths(fund_path)

In [4]:
def extract_clipped_data(input:str, pages:list, bboxes:list):
    """Collect the text blocks found inside each clip rectangle on the given pages.

    Args:
        input: Path of the PDF to open with ``fitz.open``.
        pages: Page indices used to index the opened document.
        bboxes: Clip rectangles, each passed as ``clip=`` to ``page.get_text('dict')``.

    Returns:
        One dict per requested page: ``{"pgn": <page index>, "block": [...]}``.
        ``"block"`` holds one list per bbox (in ``bboxes`` order); each inner list
        contains the de-duplicated text blocks sorted by ``bbox[1]`` then ``bbox[0]``.
    """
    final_list = []

    # The context manager closes the document even when a page index or clip raises.
    with fitz.open(input) as document:
        for pgn in pages:
            page = document[pgn]
            all_blocks = []  # one sorted block list per bbox

            for bbox in bboxes:
                blocks, seen_blocks = [], set()  # uniqueness is tracked per bbox

                for block in page.get_text('dict', clip=bbox)['blocks']:
                    # type 0 means text block
                    if block['type'] != 0 or 'lines' not in block:
                        continue

                    # Hash key: block bbox plus the first span's text of every line.
                    # A line without spans contributes '' instead of raising IndexError.
                    block_key = (
                        tuple(block['bbox']),
                        tuple(
                            line['spans'][0]['text'] if line['spans'] else ''
                            for line in block['lines']
                        ),
                    )
                    if block_key in seen_blocks:
                        continue
                    seen_blocks.add(block_key)
                    blocks.append(block)

                all_blocks.append(sorted(blocks, key=lambda x: (x['bbox'][1], x['bbox'][0])))

            final_list.append({
                "pgn": pgn,
                "block": all_blocks  # list[list, list, ...]
            })

    return final_list
    
def extract_data_relative_line(path: str, line_x: float, side: str):
    """Return, per page, the blocks that have a span starting left or right of ``line_x``.

    Args:
        path: Path of the PDF to open with ``fitz.open``.
        line_x: X coordinate that span origins are compared against.
        side: ``"left"`` keeps blocks with any span whose origin x is ``< line_x``;
            ``"right"`` keeps blocks with any span whose origin x is ``> line_x``.
            Any other value selects nothing (every page gets an empty list).

    Returns:
        One dict per page of the document: ``{"pgn": <page index>, "blocks": [...]}``.
        Blocks are ordered by ``bbox[1]`` then ``bbox[0]`` and appear at most once.
    """
    final_list = []

    # The context manager closes the document even if text extraction raises.
    with fitz.open(path) as doc:
        for pgn in range(doc.page_count):
            page = doc[pgn]

            blocks = page.get_text("dict")["blocks"]
            sorted_blocks = sorted(blocks, key=lambda x: (x["bbox"][1], x["bbox"][0]))
            extracted_blocks = []

            for block in sorted_blocks:
                # x coordinate of every span origin in this block (lazy generator)
                origin_xs = (
                    span["origin"][0]
                    for line in block.get("lines", [])
                    for span in line.get("spans", [])
                )

                # One any() per block: the block is appended once, on the first hit.
                if side == "left":
                    selected = any(x0 < line_x for x0 in origin_xs)
                elif side == "right":
                    selected = any(x0 > line_x for x0 in origin_xs)
                else:
                    selected = False

                if selected:
                    extracted_blocks.append(block)

            final_list.append({
                "pgn": pgn,
                "blocks": extracted_blocks
            })

    return final_list
  
def get_clipped_data(input:str, bboxes:list[set], *args):
    """Extract text blocks and their plain text from clip rectangles.

    Args:
        input: Path of the PDF to open with ``fitz.open``.
        bboxes: Clip rectangles, each passed as ``clip=`` to ``page.get_text('dict')``.
        *args: Page indices to process. When omitted, every page of the document
            is processed.

    Returns:
        One dict per processed page with the keys ``"pgn"`` (page index),
        ``"block"`` (text blocks of all bboxes, in extraction order, not re-sorted)
        and ``"text"`` (one string per block: spans joined with a space, lines
        joined with a newline).
    """
    final_list = []

    # The context manager closes the document even if a page index is invalid.
    with fitz.open(input) as document:
        # page_count is an int, so it has to go through range() to be iterable.
        pages = list(args) if args else list(range(document.page_count))

        for pgn in pages:
            page = document[pgn]

            blocks = []
            for bbox in bboxes:
                blocks.extend(page.get_text('dict', clip=bbox)['blocks'])  # all blocks of this clip

            # keep text blocks only (type 0) that actually carry lines
            filtered_blocks = [block for block in blocks if block['type'] == 0 and 'lines' in block]

            extracted_text = [
                "\n".join(
                    " ".join(span['text'] for span in line['spans'])
                    for line in block['lines']
                )
                for block in filtered_blocks
            ]

            final_list.append({
                "pgn": pgn,
                "block": filtered_blocks,
                "text": extracted_text
            })

    return final_list
    
def get_clipped_text(input:str, bboxes:list[set],*args):
    """Extract the plain-text lines found inside clip rectangles.

    Args:
        input: Path of the PDF to open with ``fitz.open``.
        bboxes: Clip rectangles, each passed as ``clip=`` to ``page.get_text('text')``.
        *args: Page indices to process. When omitted, every page of the document
            is processed.

    Returns:
        One dict per processed page: ``{"pgn": <page index>, "block": [...]}``,
        where ``"block"`` is the newline-split text of all bboxes, concatenated
        in ``bboxes`` order.
    """
    final_list = []

    # The context manager closes the document even if a page index is invalid.
    with fitz.open(input) as document:
        # page_count is an int, so it has to go through range() to be iterable.
        pages = list(args) if args else list(range(document.page_count))

        for pgn in pages:
            page = document[pgn]
            blocks = []
            for bbox in bboxes:
                # extend, not assign: otherwise only the last bbox's text survives
                blocks.extend(page.get_text('text', clip=bbox).split('\n'))
            final_list.append({
                "pgn": pgn,
                "block": blocks
            })

    return final_list

def get_proper_fund_names(path: str, pages: list):
    """Map each page index to the first fund name found in its leading blocks.

    The span texts of the first four blocks returned by ``page.get_text("dict")``
    are joined with spaces, every character that is neither ASCII alphanumeric
    nor whitespace is removed, and the fund-name regex is applied to the result.

    Args:
        path: Path of the PDF to open with ``fitz.open``.
        pages: Page indices used to index the opened document.

    Returns:
        ``{page index: first regex match}``; the value is ``""`` when nothing matches.
    """
    title = {}

    # The original never closed the document; the context manager does.
    with fitz.open(path) as doc:
        for pgn in pages:
            blocks = doc[pgn].get_text("dict")['blocks']

            # Non-empty, stripped span texts of the first four blocks.
            pieces = []
            for block in blocks[:4]:
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        stripped = span["text"].strip()
                        if stripped:
                            pieces.append(stripped)

            text_all = re.sub(r'[^A-Za-z0-9\s]+', '', " ".join(pieces)).strip()
            matches = re.findall(r"((?:LIC\s*MF|BLNCED|LOW)\s+.*?(?:FUND|ETF|FTF|FOF|PLAN|SAVER))", text_all, re.IGNORECASE)

            title[pgn] = matches[0] if matches else ""

    return title

In [52]:
# Pick the PDF path registered under the "Zerodha Mutual Fund" key and echo it.
# Every later cell in this notebook reads `sample_path`.
sample_path  = mutual_fund["Zerodha Mutual Fund"]
sample_path

'C:\\Users\\Kaustubh.keny\\OneDrive - Cogencis Information Services Ltd\\Documents\\Dec 24\\Zerodha Mutual Fund\\71_31-Dec-24_FS.pdf'

In [54]:
# Guide geometry passed to Helper.draw_lines_on_pdf together with the clip boxes.
lines = [
    ((340, 0), (340, 812)),  # Vertical line (x fixed at 340)
    ((0, 500), (812, 500)),  # second line: y fixed at 500
]
bboxes = [(0, 100, 270, 812), (270, 100, 410, 812)]

# An earlier `pages = [12, 14, 16]` selection was overwritten on the very next
# statement and never used, so only the effective page list is kept.
pages = list(range(1, 110))

# Output path is built by plain concatenation of the repo root and `dry_path`.
Helper.draw_lines_on_pdf(sample_path, lines, bboxes, pages, dir_path + dry_path)


Modified PDF saved to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\\data\output\DryRun.pdf


In [40]:
# Full extraction of the sample PDF; the echoed result in the next cell is a list
# of dicts carrying 'pgn' and 'blocks' keys.
data = Helper.get_all_pdf_data(sample_path)

In [41]:
# NOTE(review): this echoes the whole extraction result for every page; consider
# previewing a slice (e.g. `data[:1]`) to keep the saved notebook small.
data

[{'pgn': 0,
  'blocks': [{'number': 6,
    'type': 0,
    'bbox': (37.96339416503906,
     29.195236206054688,
     231.86839294433594,
     151.90122985839844),
    'lines': [{'spans': [{'size': 34.0,
        'flags': 20,
        'font': 'Montserrat-Bold',
        'color': -14475488,
        'ascender': 1.0759999752044678,
        'descender': -0.26600000262260437,
        'text': 'Invest',
        'origin': (37.97119903564453, 65.77923583984375),
        'bbox': (37.97119903564453,
         29.195236206054688,
         146.4652099609375,
         74.82323455810547)},
       {'size': 31.0,
        'flags': 4,
        'font': 'Montserrat-Regular',
        'color': -14475488,
        'ascender': 1.0429999828338623,
        'descender': -0.2619999945163727,
        'text': ' with ',
        'origin': (146.46339416503906, 65.77923583984375),
        'bbox': (146.46339416503906,
         33.44623565673828,
         231.86839294433594,
         73.90123748779297)}],
      'wmode': 0,
      

In [30]:
def crop_pdf_top(sample_path, output_path, crop_height=100):
    """Save a copy of a PDF whose pages are cropped to their top strip.

    For every page the crop box is set to ``(rect.x0, rect.y0, rect.x1, crop_height)``,
    where ``rect`` is ``page.rect``; ``crop_height`` is therefore used as the
    absolute bottom coordinate of the new box.

    Args:
        sample_path: Path of the PDF to open with ``fitz.open``.
        output_path: Destination passed to ``doc.save``.
        crop_height: Bottom y coordinate of the crop box (default 100).
            NOTE(review): presumably must lie inside the page for
            ``set_cropbox`` to accept it - confirm against PyMuPDF docs.
    """
    # The context manager closes the document even if set_cropbox or save raises.
    with fitz.open(sample_path) as doc:
        for page in doc:
            bounds = page.rect
            page.set_cropbox(fitz.Rect(bounds.x0, bounds.y0, bounds.x1, crop_height))

        doc.save(output_path)

# Writes the cropped copy to "output.pdf" (a relative path) using the default crop_height of 100.
crop_pdf_top(sample_path,"output.pdf")

In [8]:
# Fund-name lookup for page indices 0..83.
# NOTE(review): 84 is hard-coded; presumably the page count of this particular PDF - confirm.
dataf = get_proper_fund_names(sample_path,list(range(84)))

In [None]:
# Echo the {page index: fund name} mapping built in the previous cell.
dataf

In [26]:
import camelot
def get_something(path: str):
    """List the pages that contain more than four of the known section headings.

    Each page's plain text is split on newlines; a stripped line counts as a hit
    when it matches one of the headings in ``pattern`` exactly.

    Args:
        path: Path of the PDF to open with ``fitz.open``.

    Returns:
        1-based page numbers, as strings, of the pages with more than four hits,
        in document order.
    """
    pattern = r"^(REDEMPTION PROCEEDS|FEATURES|ASSET ALLOCATION|FUND MANAGER|SCHEME|Sr\. No\.)$"
    selected = []

    with fitz.open(path) as doc:
        for pgn, page in enumerate(doc):
            hits = sum(
                1
                for raw_line in page.get_text().split("\n")
                if re.match(pattern, raw_line.strip())
            )
            if hits > 4:
                selected.append(str(pgn + 1))  # camelot starts pages from 1

    return selected

# 1-based page numbers (as strings) selected by get_something.
# NOTE(review): `pages` was a list of ints in the draw-lines cell above; this
# rebinds the same name to a list of strings.
pages = get_something(sample_path)

# Comma-separated page list, in the form handed to camelot.read_pdf(pages=...).
imp_pages = ",".join(pages)

In [None]:
# Stream-flavour table extraction limited to the selected pages and one table area.
# NOTE(review): `tables` is assigned again by the lattice extraction in the
# NIPPON section, so cell execution order decides which result later cells see.
tables = camelot.read_pdf(sample_path,pages=imp_pages, flavor="stream", table_areas= ["30,50,612,812"],column_tol = 4, split_text = True) #["30,50,612,812"]   

In [None]:
import camelot
import pandas as pd
import numpy as np

def clean_column_name(x):
    """Turn a header cell into an ASCII identifier-like column name.

    The value is converted with ``str()``, every character that is neither an
    ASCII letter, an ASCII digit nor whitespace is removed, and the remaining
    whitespace-separated words are joined with underscores.

    Args:
        x: Any value; it is stringified first (so ``None`` becomes ``"None"``).

    Returns:
        The cleaned name, e.g. ``"Fund Name (%)"`` -> ``"Fund_Name"``.
    """
    # No re.IGNORECASE here: the class already lists both cases, and with the flag
    # the ranges also cover four non-ASCII letters (U+0130, U+0131, U+017F, U+212A),
    # which would then survive this ASCII-only filter.
    ascii_only = re.sub(r"[^A-Za-z0-9\s]", "", str(x))
    return "_".join(ascii_only.split())

# Clean every extracted table and write each one to its own sheet of
# "cleaned_tables.xlsx". Per table: newlines inside cells are flattened, empty
# cells become NaN, the first column is forward-filled from the third row on,
# the first row is dropped and the second row becomes the (cleaned) header.
with pd.ExcelWriter("cleaned_tables.xlsx", engine="xlsxwriter") as writer:
    for i, table in enumerate(tables):
        df = table.df

        # Row 0 is discarded and row 1 supplies the header, so anything shorter
        # than two rows has no header to take and is skipped.
        if df.shape[0] < 2:
            continue

        df = df.map(lambda x: " ".join(str(x).split("\n")).strip())
        df = df.replace("", np.nan)
        # Spanning labels in the first column only appear on their first row.
        df.iloc[2:, 0] = df.iloc[2:, 0].ffill()
        df = df.iloc[1:, :]
        df.columns = df.iloc[0, :].apply(clean_column_name)
        df = df.iloc[1:, :]

        sheet_name = f"Table_{i+1}"
        # The cleaned frame was previously built but never written, leaving the
        # workbook without any table sheets.
        df.to_excel(writer, sheet_name=sheet_name, index=False)

        # TODO: the disabled experiment that used to live here merged the
        # forward-filled first-column cells per group via
        # writer.sheets[sheet_name].merge_range(...); re-introduce it here if
        # merged cells are wanted in the output.

print("Cleaned tables saved with merged spanning rows!")


NIPPON DATA

In [None]:
def via_block(path:str):
    """Find the "FUNDS AT A GLANCE" pages and the scheme names printed on them.

    A page qualifies when any span in its first ten blocks (blocks ordered by
    ``bbox[1]`` then ``bbox[0]``) starts with "FUNDS AT A GLANCE". On qualifying
    pages, every span whose stripped text matches ``amc_pattern`` and whose
    ``color`` equals -1 is collected as a scheme name.

    Args:
        path: Path of the PDF to open with ``fitz.open``.

    Returns:
        ``(imp_pages, amc_fund)``: the 0-based indices of qualifying pages (each
        page listed once, in document order) and a dict mapping page index to
        the list of collected names. Pages without names are absent from the dict.
    """
    pattern = r"FUNDS AT A GLANCE"
    amc_pattern = "^(Nippon India|CPSE).*(?=Plan|Next 50|Sensex|Fund|Path|ETF|FOF|EOF|Funds|$)"

    def _spans(page, limit=None):
        # Yield the spans of the page's blocks in (top, left) block order,
        # optionally restricted to the first `limit` blocks.
        page_blocks = page.get_text("dict")["blocks"]
        ordered = sorted(page_blocks, key=lambda x: (x["bbox"][1], x["bbox"][0]))
        for block in ordered[:limit]:
            for line in block.get("lines", []):
                yield from line["spans"]

    imp_pages = []
    amc_fund = defaultdict(list)

    with fitz.open(path) as doc:
        for pgn, page in enumerate(doc):
            # any() stops at the first hit, so a page is recorded once even when
            # several spans match (repeat entries would double the names below).
            if any(re.match(pattern, span["text"].strip()) for span in _spans(page, 10)):
                imp_pages.append(pgn)

        for pgn in imp_pages:
            for span in _spans(doc[pgn]):
                text = span["text"].strip()
                # NOTE(review): -1 presumably denotes white text in the signed colour
                # encoding seen in this notebook's span dumps - confirm.
                if span['color'] == -1 and re.match(amc_pattern, text):
                    amc_fund[pgn].append(text)

    return imp_pages, dict(amc_fund)

In [None]:
# Qualifying page indices and the scheme names found on them.
pages, amc = via_block(sample_path)
# Shift to 1-based numbers and stringify so they can be joined for camelot.
pages = [str(page_idx + 1) for page_idx in pages]

In [None]:
# Per page (keyed by page index + 1), two header rows: the first four scheme
# names and the remaining ones, each prefixed with the 'Scheme Name' label.
final_scheme = {
    page_idx + 1: [
        ['Scheme Name'] + names[:4],
        ['Scheme Name'] + names[4:],
    ]
    for page_idx, names in amc.items()
}

In [None]:
# Comma-separated 1-based page list built from the via_block result.
imp_pages = ",".join(pages)
# Lattice-flavour extraction; this rebinds `tables`, replacing the stream-flavour
# result from the earlier cell.
tables = camelot.read_pdf(sample_path,pages=imp_pages, flavor="lattice", line_scale = 40)  #table_areas = ["0,0,580,690"]            

In [None]:
import pandas as pd
import numpy as np

# Clean every lattice table, prepend the matching scheme-name header row where
# one is available, and write each table to its own sheet of "merged_tables.xlsx".
with pd.ExcelWriter("merged_tables.xlsx", engine="openpyxl") as writer:
    # Number of usable tables already consumed per page. This replaces a global
    # 0/1 toggle, which drifted out of step whenever a page did not contribute
    # exactly two usable tables.
    tables_seen = {}

    for i, table in enumerate(tables):
        df = table.df
        if df.shape[1] < 3:
            continue

        # Flatten multi-line cells, then turn blank cells into NaN.
        df = df.map(lambda x: " ".join(x.split("\n")).strip())
        df = df.map(lambda x: np.nan if not x.strip() else x)
        df = df.set_index(df.columns[0])

        # Drop the header-like rows that the scheme-name row replaces.
        df = df.drop(index=[
            label for label in ("Scheme Name", "Market Capitalization")
            if label in df.index
        ])

        # Rows without a label in the first column are discarded. Blank labels
        # were already converted to NaN above, so one filter covers both cases.
        df_cleaned = df[df.index.notna()].reset_index()
        # Fill empty cells from the cell to their left within each row.
        df_fill = df_cleaned.ffill(axis=1)

        # Header row for the n-th usable table of this page, if one was collected.
        page_schemes = final_scheme.get(table.page, [])
        slot = tables_seen.get(table.page, 0)
        tables_seen[table.page] = slot + 1
        sch_vals = page_schemes[slot] if slot < len(page_schemes) else []

        # Only a full five-value header is inserted, and only when the frame is
        # five columns wide (assigning a row of another length raises ValueError).
        if len(sch_vals) == 5 and df_fill.shape[1] == 5:
            df_fill.loc[-1] = sch_vals
            df_fill = df_fill.sort_index().reset_index(drop=True)

        # Write to a new sheet
        df_fill.to_excel(writer, sheet_name=f"Table_{i+1}", index=False)

print("All tables saved in separate sheets in 'merged_tables.xlsx' 🚀")
