In [2]:
import json
import math
import os
import pprint
import re
import sys
from collections import defaultdict

import fitz
import pandas as pd
import pdfminer

# Absolute, machine-specific locations of the project checkout (dir_path)
# and of the folder handed to Helper.get_fund_paths (fund_path).
# NOTE(review): hard-coded user-profile paths make this cell non-portable —
# consider a configurable base directory.
dir_path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo"
fund_path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\Dec 24"
# Must run before the `app.helper` import below so the project package resolves.
sys.path.append(os.path.abspath(dir_path))


from app.helper import Helper


# Paths relative to dir_path. dry_path is appended to dir_path by plain string
# concatenation further down, so the leading backslash is required.
dry_path = r'\data\output\DryRun.pdf'
fin_path = r'\data\input\financial_indices.xlsx'


# Lookup indexed by fund-house name later in the notebook
# (presumably name -> PDF path; verify against Helper.get_fund_paths).
mutual_fund = Helper.get_fund_paths(fund_path)

In [3]:
def extract_data_relative_line(path: str, line_x: float, side: str):
    """Collect, per page, the blocks that lie on one side of a vertical line.

    A block is selected when at least one of its spans has an origin
    x-coordinate strictly less than ``line_x`` (``side == "left"``) or
    strictly greater than ``line_x`` (``side == "right"``). Each selected
    block appears once, in the order given by sorting on bbox ``y0`` and
    then bbox ``x0``. Blocks without a ``"lines"`` entry are never selected.

    Args:
        path: PDF file passed to ``fitz.open``.
        line_x: x-coordinate of the dividing vertical line.
        side: ``"left"`` or ``"right"``. Any other value selects nothing.

    Returns:
        One dict per page: ``{"pgn": <page index>, "blocks": [<block dict>, ...]}``.
    """

    def _on_requested_side(x0: float) -> bool:
        # Strict comparisons: a span starting exactly on the line is on neither side.
        if side == "left":
            return x0 < line_x
        if side == "right":
            return x0 > line_x
        return False

    final_list = []
    doc = fitz.open(path)
    try:
        for pgn in range(doc.page_count):
            blocks = doc[pgn].get_text("dict")["blocks"]
            sorted_blocks = sorted(blocks, key=lambda b: (b["bbox"][1], b["bbox"][0]))

            # any() stops at the first qualifying span, so each block is
            # considered (and added) at most once.
            extracted_blocks = [
                block
                for block in sorted_blocks
                if any(
                    _on_requested_side(span["origin"][0])
                    for line in block.get("lines", [])
                    for span in line.get("spans", [])
                )
            ]

            final_list.append({"pgn": pgn, "blocks": extracted_blocks})
    finally:
        # Release the file handle even if a page fails to parse.
        doc.close()

    return final_list

def extract_spans(data, block_range=range(3, 10)):
    """Print and collect the spans of a window of blocks on every page.

    For each page entry in ``data``, a ``___<index>___`` header is printed,
    followed by the text and size of every span belonging to a block whose
    position in the page's ``"blocks"`` list falls inside ``block_range``.
    The same spans are collected and returned (previously the per-page list
    was created but never filled, so the result was always empty lists).

    Args:
        data: Iterable of page dicts carrying a ``"blocks"`` list, e.g. the
            output of ``extract_data_relative_line``.
        block_range: Block positions to include; defaults to positions 3-9.

    Returns:
        A list with one entry per page, each a list of the selected span dicts.
    """
    final_span = []
    for pgn, page in enumerate(data):
        print(f"___{pgn}___")
        spans = []
        for num, block in enumerate(page.get('blocks', [])):
            # Decide once per block instead of once per span.
            if num not in block_range:
                continue
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    print(span['text'], span['size'])
                    spans.append(span)
        final_span.append(spans)
    return final_span

def get_proper_fund_names(path: str, pages: list,
                          pattern: str = r"MIRAE ASSET.*?\b(?:ETF|EOF|FOF|FTF|FUND)\b"):
    """Extract a fund name from the first block of each requested page.

    For every page index in ``pages`` the block that sorts first by bbox
    ``(y0, x0)`` is taken, its span texts are stripped and joined with
    spaces, the joined text is printed, and the first match of ``pattern``
    in it is recorded. Pages not listed in ``pages``, and pages without a
    match, map to an empty string.

    Args:
        path: PDF file passed to ``fitz.open``.
        pages: Zero-based page indices to inspect.
        pattern: Regular expression (matched with ``re.DOTALL``) whose whole
            match is taken as the fund name. Defaults to the Mirae Asset
            naming pattern.

    Returns:
        Dict mapping every page index of the document to the matched name or ``""``.
    """
    name_re = re.compile(pattern, re.DOTALL)  # compiled once, reused for every page
    final_fund_names = dict()

    doc = fitz.open(path)
    try:
        for pgn in range(doc.page_count):
            text_all = ''
            if pgn in pages:
                blocks = doc[pgn].get_text("dict")['blocks']
                # Only the first block in (y0, x0) order is used; min() returns
                # the same element a stable sort would place first.
                first_block = min(
                    blocks,
                    key=lambda k: (k['bbox'][1], k['bbox'][0]),
                    default=None,
                )
                if first_block is not None:
                    text_all = "".join(
                        f" {span['text'].strip()}"
                        for line in first_block.get("lines", [])
                        for span in line.get("spans", [])
                    )
                print(text_all)

            match = name_re.search(text_all.strip())
            final_fund_names[pgn] = match.group(0) if match else ""
    finally:
        # The document was previously never closed.
        doc.close()

    return final_fund_names
  
def get_clipped_data(input: str, bboxes: list[set]):
    """Collect the text blocks found inside the given clip rectangles on every page.

    Each page is queried once per entry of ``bboxes`` via
    ``page.get_text('dict', clip=bbox)``; the resulting blocks are
    concatenated in ``bboxes`` order and reduced to those with
    ``type == 0`` that carry a ``'lines'`` key.

    Args:
        input: PDF file passed to ``fitz.open``.
        bboxes: Clip regions, each forwarded unchanged as the ``clip`` argument.

    Returns:
        One dict per page: ``{"pgn": <page index>, "block": [<block dict>, ...]}``.
    """
    final_list = []
    document = fitz.open(input)
    try:
        for pgn in range(document.page_count):
            page = document[pgn]

            blocks = []
            for bbox in bboxes:
                blocks.extend(page.get_text('dict', clip=bbox)['blocks'])

            filtered_blocks = [
                block for block in blocks
                if block['type'] == 0 and 'lines' in block
            ]

            final_list.append({
                "pgn": pgn,
                "block": filtered_blocks,
            })
    finally:
        # Release the file handle even if a page query raises.
        document.close()

    return final_list
    
def get_clipped_text(input: str, bboxes: list[set]):
    """Collect the plain-text lines found inside the given clip rectangles on every page.

    Each page is queried once per entry of ``bboxes`` via
    ``page.get_text('text', clip=bbox)``; each result is split on newlines
    and the pieces from all rectangles are concatenated in ``bboxes`` order.
    (Previously each rectangle overwrote the previous one, so only the last
    rectangle's text was returned.)

    Args:
        input: PDF file passed to ``fitz.open``.
        bboxes: Clip regions, each forwarded unchanged as the ``clip`` argument.

    Returns:
        One dict per page: ``{"pgn": <page index>, "block": [<line>, ...]}``.
        Empty strings produced by the split are kept.
    """
    final_list = []
    document = fitz.open(input)
    try:
        for pgn in range(document.page_count):
            page = document[pgn]

            blocks = []
            for bbox in bboxes:
                # extend, not assign: accumulate across all rectangles.
                blocks.extend(page.get_text('text', clip=bbox).split('\n'))

            final_list.append({
                "pgn": pgn,
                "block": blocks,
            })
    finally:
        # Release the file handle even if a page query raises.
        document.close()

    return final_list

In [7]:
# Select the 'Edelweiss Mutual Fund' entry of the fund lookup; this is the PDF
# the following cells draw on and extract from.
sample_path  = mutual_fund['Edelweiss Mutual Fund']

In [None]:
# Guide geometry for the dry-run overlay: one vertical line at x=220 and three
# column rectangles given as (x0, y0, x1, y1).
# A second guide, ((420, 0), (420, 1000)), is currently disabled.
vertical_guide = ((220, 0), (220, 812))
lines = [vertical_guide]

rectangles = [
    (0, 50, 200, 812),
    (200, 50, 380, 812),
    (380, 50, 580, 812),
]

# Page indices 1..109 receive the overlay.
pages = list(range(1, 110))

# The output path is dir_path and dry_path joined by plain concatenation.
dry_run_output = dir_path + dry_path
Helper.draw_lines_on_pdf(sample_path, lines, rectangles, pages, dry_run_output)


Modified PDF saved to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\data\output\DryRun.pdf


In [3]:
# Folder under the project checkout that is handed to Helper.extract_common_keys
# (presumably it holds previously exported JSON files — verify against Helper).
# NOTE(review): this cell's execution count (3) is lower than the cells around
# it, so it was run out of order; it only needs dir_path and Helper.
json_folder = dir_path + r'\data\output'
common_keys = Helper.extract_common_keys(json_folder)

In [8]:
# Load the extracted content of the selected PDF; the displayed entry below is a
# dict with 'pgn' and 'blocks' keys, so this is presumably one such dict per page.
data = Helper.get_all_pdf_data(sample_path)

In [9]:
# Inspect the entry at index 15 (its 'pgn' is 15) to see the raw block/line/span structure.
data[15]

{'pgn': 15,
 'blocks': [{'number': 2,
   'type': 0,
   'bbox': (14.665200233459473,
    17.717891693115234,
    153.00440979003906,
    58.14990234375),
   'lines': [{'spans': [{'size': 16.0,
       'flags': 20,
       'font': 'Roboto-Bold',
       'color': -16298334,
       'ascender': 1.055999994277954,
       'descender': -0.2709999978542328,
       'text': 'Edelweiss Recently',
       'origin': (14.665200233459473, 34.6138916015625),
       'bbox': (14.665200233459473,
        17.717891693115234,
        153.00440979003906,
        38.94989013671875)}],
     'wmode': 0,
     'dir': (1.0, 0.0),
     'bbox': (14.665200233459473,
      17.717891693115234,
      153.00440979003906,
      38.94989013671875)},
    {'spans': [{'size': 16.0,
       'flags': 20,
       'font': 'Roboto-Bold',
       'color': -16298334,
       'ascender': 1.055999994277954,
       'descender': -0.2709999978542328,
       'text': 'Listed IPO Fund',
       'origin': (14.665200233459473, 53.81390380859375),
    