In [180]:
import pdfplumber
import fitz
import warnings , math, collections , os, re, pprint, json
import pickle
import numpy as np
import pandas as pd



# Suppress every warning of category UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning) 


# Root folder of the working repository; later cells append the \files and
# \output sub-paths to it. The commented line is an alternative root kept for
# switching between machines (presumably — confirm before deleting).
#path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo"
path = r"C:\Users\rando\OneDrive\Documents\mywork-repo"

In [183]:
# Input/output locations, all relative to the repository root `path`.
# Built with os.path.join (as the SamcoDryRun output path is further down)
# instead of gluing hard-coded backslashes onto the root with `+`.
samco_path = os.path.join(path, "files", "58_31-Dec-24_FS.pdf")  # source factsheet
dry_path = os.path.join(path, "output", "DryRun.pdf")  # dry-run output PDF
indice_path = os.path.join(path, "output", "pkl", "indices_var.pkl")  # pickled index variations

In [125]:
def get_financial_indices(path:str):
    """Load the pickled index-variation mapping and flatten it into one set.

    The pickle at ``path`` must hold a mapping whose keys are index names and
    whose values are collections of alternative spellings for that name.

    Args:
        path: location of the pickle file.

    Returns:
        A set containing every key together with every variation.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point this at pickle files produced by this project.
    with open(path, 'rb') as file:
        indices = pickle.load(file)

    final_indices = set()
    for name, variations in indices.items():
        final_indices.add(name)
        if isinstance(variations, str):
            # A bare string is one variation, not a sequence of characters.
            final_indices.add(variations)
        else:
            # Any iterable works (list, tuple, set); None means "no variations".
            final_indices.update(variations or ())

    return final_indices


def check_indice_highlight(path:str, indices_variations:list, fund_pattern:str):
    """Highlight financial index terms in a PDF and record fund-title pages.

    Every text span whose ``flags`` value is 20 or 25 is inspected:

    * if it sits in one of the first six blocks of the page (blocks ordered
      top-to-bottom, then left-to-right), is larger than 18pt and matches
      ``fund_pattern`` (case-insensitive, anchored at the start), its
      lower-cased text is stored as the fund title of that page;
    * if it contains any term of ``indices_variations`` as a whole word
      (case-insensitive), the span is highlighted once and the page's
      highlight counter is incremented.

    Args:
        path: PDF file to analyse.
        indices_variations: iterable of index names/variations to look for.
        fund_pattern: regular expression identifying a fund title.

    Returns:
        ``(important_pages, output_path, fund_titles)`` where
        ``important_pages`` maps page index -> number of highlights,
        ``output_path`` is the saved ``*_highlighted`` copy (``None`` when
        nothing was highlighted) and ``fund_titles`` maps page index -> fund
        title (``""`` when none was found).
    """
    # Compile once up front instead of rebuilding a pattern per term per span.
    term_patterns = [
        re.compile(r'\b' + re.escape(term.lower()) + r'\b')
        for term in indices_variations
    ]
    title_pattern = re.compile(fund_pattern, re.IGNORECASE)

    doc = fitz.open(path)
    try:
        page_count = doc.page_count
        important_pages = dict.fromkeys(range(page_count), 0)
        fund_titles = dict.fromkeys(range(page_count), "")

        for page_num, page in enumerate(doc):
            blocks = page.get_text('dict')["blocks"]
            # Reading order: by top edge, then by left edge.
            ordered_blocks = sorted(blocks, key=lambda b: (b['bbox'][1], b['bbox'][0]))

            for block_idx, block in enumerate(ordered_blocks):
                if "lines" not in block:
                    continue

                for line in block["lines"]:
                    for span in line["spans"]:
                        # NOTE(review): 20 and 25 are the only span flag values
                        # considered; presumably the bold styles of this
                        # factsheet — confirm against PyMuPDF's flag bits.
                        if span['flags'] not in (20, 25):
                            continue
                        span_text = span['text'].strip().lower()

                        # Fund title: large text near the top of the page.
                        if (
                            block_idx < 6
                            and span['size'] > 18
                            and title_pattern.match(span_text)
                        ):
                            fund_titles[page_num] = span_text

                        # At most one highlight per span, however many terms hit.
                        if any(p.search(span_text) for p in term_patterns):
                            important_pages[page_num] += 1
                            page.add_highlight_annot(fitz.Rect(span['bbox']))

        output_path = None
        if any(important_pages.values()):
            # splitext only touches the real extension, so ".PDF" or a ".pdf"
            # elsewhere in the path no longer yields a wrong output name.
            root, ext = os.path.splitext(path)
            output_path = f"{root}_highlighted{ext}"
            doc.save(output_path)
    finally:
        # Release the file handle even when parsing or saving fails.
        doc.close()

    return important_pages, output_path, fund_titles


def get_clipped_data(input:str, output:str, pageSelect:list, bbox:list[set], fund_names:dict):
    """Collect the text blocks found inside a clip rectangle on selected pages.

    Args:
        input: path of the PDF to read.
        output: dry-run path; accepted for interface compatibility but not
            used by this function.
        pageSelect: page indices to extract.
        bbox: sequence of clip rectangles; only ``bbox[0]`` is applied, to
            every selected page.
        fund_names: mapping of page index -> fund name.

    Returns:
        A list with one dict per selected page:
        ``{"page": int, "fundname": str, "block": list of text blocks}``,
        the blocks ordered top-to-bottom and then left-to-right.
    """
    document = fitz.open(input)
    try:
        finalData = []

        for pgn in pageSelect:
            page = document[pgn]
            fundName = fund_names[pgn]

            blocks = page.get_text('dict', clip=bbox[0])['blocks']

            # Keep text blocks only (type 0 that actually carry lines).
            text_blocks = [
                block for block in blocks
                if block['type'] == 0 and 'lines' in block
            ]
            text_blocks.sort(key=lambda b: (b['bbox'][1], b['bbox'][0]))

            finalData.append({
                "page": pgn,
                "fundname": fundName,
                "block": text_blocks,
            })

        return finalData
    finally:
        # The original never closed the document, leaking the file handle.
        document.close()

def clean_data_combine(data: list, check_color: int, replace_size):
    """Clean the spans of every line in place and merge look-alike neighbours.

    For each line of each block of each page, three passes run in order:

    1. spans whose stripped text is one of the junk tokens are dropped;
    2. spans coloured ``check_color`` and larger than 7 get ``replace_size``;
    3. consecutive spans sharing flags, size and colour are merged into the
       first one, their texts joined by a single space.

    The input structure is mutated and also returned.
    """
    # Compared against the *stripped* span text.
    junk_tokens = ['.','. ',',',':','st',";","-",'st ',' ','th', 'th ', 'rd', 'rd ', 'nd', 'nd ']
    style_keys = ("flags", "size", "color")

    for page in data:
        for block in page['block']:
            for line in block.get('lines', []):

                # Pass 1: discard junk spans.
                kept = []
                for span in line.get("spans", []):
                    if span.get("text").strip() in junk_tokens:
                        continue
                    kept.append(span)
                line['spans'] = kept

                # Pass 2: normalise the size of spans in the chosen colour.
                for span in kept:
                    if span['color'] == check_color and span['size'] > 7:
                        span['size'] = replace_size

                # Pass 3: fold each span into its predecessor when styles agree.
                merged = []
                for span in kept:
                    previous = merged[-1] if merged else None
                    same_style = bool(previous) and all(
                        previous.get(k) == span.get(k) for k in style_keys
                    )
                    if same_style:
                        previous["text"] += " " + span["text"]
                    else:
                        merged.append(span)
                line["spans"] = merged

    return data

def return_span_data(data:list, name:str): #all
    """Gather one span attribute per page.

    Returns a dict keyed ``"Page: <n>"`` where ``n`` is the 1-based position
    of the page inside ``data`` (not the PDF page number); each value lists
    ``span[name]`` for every span of that page in block/line/span order.
    """
    collected = {}
    for position, page in enumerate(data, start=1):
        collected[f"Page: {position}"] = [
            span[name]
            for block in page['block']
            for line in block['lines']
            for span in line.get('spans', [])
        ]
    return collected

In [184]:
# Highlight the index terms in the Samco factsheet and find the fund-title pages.
file_path = samco_path
final_indices = get_financial_indices(indice_path)
fund_pattern = r"^(samco|tata).*fund$"
highlight_pages, saved_path, fund_pages = check_indice_highlight(
    file_path, final_indices, fund_pattern
)

# One row per page: detected fund title and number of highlights.
# Fund data is located only on certain pages; the highlight count shows which
# pages are important. TODO: automate that page selection later.
pagedf = pd.DataFrame(
    {
        'title': fund_pages.values(),
        'highlight_count': highlight_pages.values(),
    }
)
print(pagedf)

                                  title  highlight_count
0                                                      0
1                                                     11
2                                                      0
3            samco active momentum fund                9
4            samco active momentum fund                1
5   samco dynamic asset allocation fund               12
6   samco dynamic asset allocation fund                1
7                  samco flexi cap fund                9
8                  samco flexi cap fund                1
9                  samco multi cap fund                8
10                 samco multi cap fund                1
11     samco special opportunities fund                8
12     samco special opportunities fund                1
13            samco elss tax saver fund                8
14            samco elss tax saver fund                1
15    samco multi asset allocation fund               11
16    samco multi asset allocat

In [185]:
# Page indices handed to get_clipped_data (used there as document[pgn]).
# Presumably picked by hand from the highlight counts in pagedf — confirm.
pages = [3,5,7,9,11,13,15,17,18]
# Clip rectangle(s); get_clipped_data applies only bbox[0], to every page.
bbox = [(35,120,250,765)]

data = get_clipped_data(samco_path, dry_path, pages, bbox, fund_pages)
#text_data = return_span_data(data, 'text')
#data[2]['block']
# Spans whose color == -1 and size > 7 are resized to 12.0, junk spans are
# dropped and same-style neighbours are merged (see clean_data_combine).
clean_data = clean_data_combine(data, -1,12.0)
# Known weakness: positions in `data` do not equal PDF page numbers —
# data[0], data[1], data[2], data[3] hold pages 3, 5, 7, 9 respectively.

In [162]:
def create_matrix_structure(data: list, header_font: float, content_font: float):
    """Lay the spans of one page out by position and font size.

    ``data`` is indexed as ``data['block']`` (one page entry), despite the
    ``list`` annotation.

    Two results are built from the same spans:

    * a DataFrame with one row per distinct span origin (ordered by y, then
      x) and one column per distinct font size (largest first); each cell
      holds the text of the span at that origin/size, or 0 where none exists;
    * a dict mapping the text of every span whose size equals
      ``header_font`` to the list of following spans whose size is at most
      ``content_font``. A repeated header text starts a fresh list.

    Returns:
        ``(nested_dict, matrix_df)``
    """
    # Flatten the block/line/span hierarchy once, keeping document order.
    spans = [
        span
        for block in data['block']
        for line in block['lines']
        for span in line['spans']
    ]

    # Row and column labels.
    coordinates = sorted(
        {tuple(span['origin']) for span in spans},
        key=lambda c: (c[1], c[0]),
    )
    fonts = sorted({span['size'] for span in spans}, reverse=True)

    row_of = {coord: i for i, coord in enumerate(coordinates)}
    col_of = {size: j for j, size in enumerate(fonts)}
    matrix = np.zeros((len(coordinates), len(fonts)), dtype=object)

    nested_dict = {}
    current_header = None

    for span in spans:
        size = span['size']

        # A later span at the same origin and size overwrites the earlier one.
        row = row_of.get(tuple(span['origin']))
        col = col_of.get(size)
        if row is not None and col is not None:
            matrix[row, col] = span['text']

        # Header spans open a section; small spans are filed under it.
        if size == header_font:
            current_header = span
            nested_dict[span['text']] = []
        elif size <= content_font and current_header:
            nested_dict[current_header['text']].append(span)

    matrix_df = pd.DataFrame(matrix, index=coordinates, columns=fonts)
    return nested_dict, matrix_df
def generate_pdf_from_data(data:list, output_path:str):
    """Write each section of ``data`` onto its own page of a new PDF.

    ``data`` is iterated with ``.items()`` (section title -> list of span
    dicts), despite the ``list`` annotation. The title is drawn in blue at
    (72, 72); every span is drawn at the top-left corner of its ``bbox``
    with its own font size and colour.

    Args:
        data: mapping of section title -> spans (dicts with "text", "size",
            "color" and optionally "bbox").
        output_path: where the generated PDF is saved.
    """
    pdf_doc = fitz.open()
    try:
        for section, spans in data.items():
            page = pdf_doc.new_page()

            # Section title.
            try:
                page.insert_text(
                    (72, 72),
                    section,
                    fontsize=14,
                    fontname="helv",
                    color=(0, 0, 1),
                )
            except Exception as e:
                print(f"The error is {e}")

            # Section content.
            for span in spans:
                bbox = span.get("bbox", [0, 0, 0, 0])  # default position

                try:
                    # Span colours are packed sRGB integers (0xRRGGBB), while
                    # insert_text wants three floats in 0..1. The original
                    # passed the raw integer three times, which is only valid
                    # for black; any other colour fell through to the red
                    # fallback below.
                    srgb = int(span["color"]) & 0xFFFFFF
                    rgb = (
                        (srgb >> 16) / 255,
                        ((srgb >> 8) & 0xFF) / 255,
                        (srgb & 0xFF) / 255,
                    )
                    page.insert_text(
                        (bbox[0], bbox[1]),
                        span["text"],
                        fontsize=span["size"],
                        fontname="helv",
                        color=rgb,
                    )
                except Exception:
                    # Best effort: retry in red so the failing span is visible.
                    page.insert_text(
                        (bbox[0], bbox[1]),
                        span["text"],
                        fontsize=span["size"],
                        fontname="helv",
                        color=(1, 0, 0),
                    )

        pdf_doc.save(output_path)
    finally:
        # Close the document even if drawing or saving raised.
        pdf_doc.close()

    print(f"  PDF generated to: {output_path}")

def extract_data_from_pdf(path:str):
    """Read a generated PDF back as ``{section heading: list of text lines}``.

    The first text line of every page is taken as the section heading
    (normalised by ``replace_main_key``); the remaining lines become its
    value. Pages without extractable text are skipped. When two pages share
    a heading, the later page wins.

    Args:
        path: PDF file to read.

    Returns:
        A dict ordered by heading (lexicographically).
    """

    def replace_main_key(string: str):
        """Map heading variants onto the canonical section names."""
        replace_key = string
        if re.match(r'^NAV.*as on', string, re.IGNORECASE):
            replace_key = "NAV"
        elif "market" in string.lower():
            replace_key = "Market Cap"
        elif re.match(r"Assets Under Management", string, re.IGNORECASE):
            replace_key = "Assets Under Management"
        return replace_key

    sections = {}

    # Open the file named by the argument; the original ignored `path` and
    # read the global `file_path`, which only worked when both were equal.
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            # extract_text may give None/"" for a page with no text layer.
            text = page.extract_text() or ""
            if not text.strip():
                continue

            heading, *values = text.split('\n')
            sections[replace_main_key(heading)] = values

    # Headings in lexicographic order.
    return {heading: sections[heading] for heading in sorted(sections)}

In [186]:
# Font sizes that drive create_matrix_structure: a span whose size equals
# header_font starts a section; spans whose size is <= content_font are
# stored as that section's content. Inspect clean_data to choose the values.
header_font = 12 #set by user
content_font = 8 #sizes at or below this are content
#check clean_data content to determine this font sizes

grand_final_data = {}  # fund name -> {section heading: text lines}
matrix_final = {}      # fund name -> position/font-size DataFrame

for counter, page in enumerate(clean_data):
    
    fund_name = page['fundname'].title()
    
    result, matrix  = create_matrix_structure(page, header_font, content_font)
    

    # NOTE(review): keep the name `file_path` — extract_data_from_pdf may read
    # this global rather than its argument; verify before renaming.
    file_path = os.path.join(path, 'output', f"SamcoDryRun{counter + 1}.pdf")
    print(fund_name)
    
    # Write the sections to a scratch PDF, then read it back as text lines.
    generate_pdf_from_data(result, file_path)
    page_data =  extract_data_from_pdf(file_path)
                
    
    # Funds sharing a title overwrite each other in both dictionaries.
    master_key = fund_name
    matrix_final[master_key] = matrix
    grand_final_data[master_key] = page_data

Samco Active Momentum Fund
  PDF generated to: C:\Users\rando\OneDrive\Documents\mywork-repo\output\SamcoDryRun1.pdf
Samco Dynamic Asset Allocation Fund
  PDF generated to: C:\Users\rando\OneDrive\Documents\mywork-repo\output\SamcoDryRun2.pdf
Samco Flexi Cap Fund
  PDF generated to: C:\Users\rando\OneDrive\Documents\mywork-repo\output\SamcoDryRun3.pdf
Samco Multi Cap Fund
  PDF generated to: C:\Users\rando\OneDrive\Documents\mywork-repo\output\SamcoDryRun4.pdf
Samco Special Opportunities Fund
  PDF generated to: C:\Users\rando\OneDrive\Documents\mywork-repo\output\SamcoDryRun5.pdf
Samco Elss Tax Saver Fund
  PDF generated to: C:\Users\rando\OneDrive\Documents\mywork-repo\output\SamcoDryRun6.pdf
Samco Multi Asset Allocation Fund
  PDF generated to: C:\Users\rando\OneDrive\Documents\mywork-repo\output\SamcoDryRun7.pdf
Samco Overnight Fund
  PDF generated to: C:\Users\rando\OneDrive\Documents\mywork-repo\output\SamcoDryRun8.pdf
Samco Arbitrage Fund
  PDF generated to: C:\Users\rando\OneDr

In [218]:
#REGEX FUNCTIONS

def return_invest_data(key:str,data:list):
    """Join the lines of a section into one space-separated string.

    Returns ``{key: joined_text}``.
    """
    return {key: " ".join(data)}

def return_scheme_data(key:str,data:list):
    """Parse the "Scheme Details" lines into ``{key: {field: value}}``.

    Per line, the first rule that applies wins:

    * ``<label ending in "date"> DD-Mon-YYYY`` -> ``{label: date}``
    * ``Benchmark <name>``                     -> ``{"Benchmark": name}``
    * contains the word "min" or "application" -> ``min_appl_amt`` amounts
    * contains "additional ... and in multiples of" -> ``additional_amt``

    Amounts are every ``<digits>/-`` token of the line with the leading
    "·" marker removed. Lines matching no rule are ignored.
    """
    flags = re.IGNORECASE
    date_line = re.compile(r"^(.*?date)\s(\d{2}-[A-Za-z]{3}-\d{4})$", flags)
    benchmark_line = re.compile(r"^(Benchmark)\s+(.*)$", flags)
    amount_token = re.compile(r"(?:·)?\d+(?:,\d{3})*(?:\.\d+)?/-", flags)
    minimum_hint = re.compile(r"\b(min|application)\b", flags)
    additional_hint = re.compile(r"\b(additional.* and in multiples of)\b", flags)

    details = {}
    for line in data:
        # "label value" style lines: dates first, then the benchmark.
        labelled = date_line.match(line) or benchmark_line.match(line)
        if labelled:
            details[labelled.group(1)] = labelled.group(2)
            continue

        # Amount lines: choose the target field, then pull out the amounts.
        if minimum_hint.search(line):
            field = "min_appl_amt"
        elif additional_hint.search(line):
            field = "additional_amt"
        else:
            continue

        amounts = amount_token.findall(line)
        if amounts:
            details[field] = [amount.replace('·', '') for amount in amounts]

    return {key: details}

def return_fund_data(key:str,data:list):
    """Parse the "Fund Manager" lines into ``{key: [manager, ...]}``.

    A line starting with ``Mr.`` or ``Ms.`` opens a new manager entry
    (``name`` = text before the first comma, ``designation`` = the rest with
    commas removed, both lower-cased). Following lines fill that entry:

    * a line starting with ``(`` sets ``managing_since`` to ``"inception"``
      when it mentions inception, otherwise to the first ``Month D, YYYY``
      date found in it (``None`` when there is none);
    * ``Total Experience: <text>`` sets ``total_experience``.

    Detail lines that appear before any manager line are ignored instead of
    raising ``TypeError`` on a missing entry.
    """
    name_pattern = re.compile(r'^(Ms\.|Mr\.)')
    manage_pattern = re.compile(r'^\(|\)$')
    date_pattern = re.compile(r'\b\w+ \d{1,2}, \d{4}\b')
    experience_pattern = re.compile(r'^Total Experience: (.+)$')

    managers = []
    current_entry = None

    for line in data:
        if name_pattern.match(line):
            # A new manager starts: flush the previous one.
            if current_entry:
                managers.append(current_entry)
            name, *rest = line.split(",")
            current_entry = {
                'name': name.strip().lower(),
                'designation': "".join(rest).strip().lower(),
            }
            continue

        if current_entry is None:
            # Nothing to attach this detail line to yet.
            continue

        if manage_pattern.match(line):
            if "inception" in line.lower():
                current_entry['managing_since'] = 'inception'
            else:
                found = date_pattern.search(line)
                current_entry['managing_since'] = found.group() if found is not None else None
            continue

        experience = experience_pattern.match(line)
        if experience:
            # Keep the whole remainder, even if it contains another colon.
            current_entry['total_experience'] = experience.group(1).strip().lower()

    if current_entry:  # append the last entry
        managers.append(current_entry)

    return {key: managers}

def return_nav_data(key:str,data:list):
    """Extract ``<plan name>: ·<value>`` pairs from the NAV lines.

    Returns ``{key: {plan name (stripped, lower-case): value string}}``;
    when a plan name repeats, the last value wins.
    """
    plan_value = re.compile(r"([\w\s]+):\s*·([\d.]+)")

    navs = {}
    for row in data:
        navs.update(
            (label.strip().lower(), amount)
            for label, amount in plan_value.findall(row)
        )

    return {key: navs}

def return_quant_data(key:str,data:list):
    """Parse the "Quantitative Data" lines into ``{key: {metric: value}}``.

    A line is a metric when it mentions one of the known terms (ratio,
    turnover, annualised, YTM, macaulay duration, residual maturity,
    modified duration) and contains a colon; the text before the first
    colon becomes the lower-cased metric name and the text after it the
    lower-cased value. All other lines are concatenated, in order and
    without a separator, under the ``"comment"`` entry.

    Unlike the original, a leading non-metric line no longer raises
    ``UnboundLocalError`` and a metric-looking line without a colon is
    treated as comment text instead of raising ``IndexError``.
    """
    metric_patterns = (
        r"\b(ratio|turnover)\b",
        r'\b(annualised|YTM)\b',
        r"\b(macaulay.*duration)\b",
        r"\b(residual.*maturity)\b",
        r"\b(modified.*duration)\b",
    )

    metrics = {}
    comment_parts = []

    for line in data:
        is_metric = any(
            re.search(pattern, line, re.IGNORECASE) for pattern in metric_patterns
        )
        label, colon, value = line.partition(":")

        if is_metric and colon:
            metrics[label.lower().strip()] = value.lower().strip()
        else:
            comment_parts.append(line)

    metrics['comment'] = "".join(comment_parts)

    return {key: metrics}

def return_aum_data(key:str,data:list):
    """Extract the ``<amount> Crs`` figures from the AUM lines.

    A line carrying an amount is stored as ``avg_aum`` when it mentions
    "average" (any case) and as ``aum`` otherwise; lines without an amount
    are skipped. Returns ``{key: {"aum": ..., "avg_aum": ...}}`` holding
    whichever entries were found.
    """
    amount_in_crores = re.compile(r"\b\d{1,3}(?:,\d{3})*(?:\.\d+)? Crs\b")

    figures = {}
    for line in data:
        mentions_average = re.search(r'average', line, re.IGNORECASE)
        found = amount_in_crores.search(line)
        if found is None:
            continue

        label = 'avg_aum' if mentions_average else "aum"
        figures[label] = found.group()

    return {key: figures}

def return_mar_data(key:str,data:list):
    """Placeholder parser: ignores ``data`` and returns ``{key: {}}``."""
    empty_section = {}
    empty_section[key] = {}
    return empty_section


In [187]:
# Fund names processed so far (iterating a dict yields its keys).
list(grand_final_data)

['Samco Active Momentum Fund',
 'Samco Dynamic Asset Allocation Fund',
 'Samco Flexi Cap Fund',
 'Samco Multi Cap Fund',
 'Samco Special Opportunities Fund',
 'Samco Elss Tax Saver Fund',
 'Samco Multi Asset Allocation Fund',
 'Samco Overnight Fund',
 'Samco Arbitrage Fund']

In [195]:
# Union of the section headings found across all funds; "Market Cap" is
# always included, then the result is sorted alphabetically.
imp_indices = set().union(*(sections.keys() for sections in grand_final_data.values()))
imp_indices.add("Market Cap")
modified_indices = sorted(imp_indices)

modified_indices

['Assets Under Management',
 'Fund Manager',
 'Investment Objective',
 'Market Cap',
 'NAV',
 'Quantitative Data',
 'Scheme Details']

In [213]:
# Section heading -> parser for that section.
# The mapping is spelled out explicitly. Pairing the sorted headings with a
# list of functions by position silently attaches the wrong parser to every
# heading as soon as one unexpected heading shows up in a PDF.
function_map = {
    'Assets Under Management': return_aum_data,
    'Fund Manager': return_fund_data,
    'Investment Objective': return_invest_data,
    'Market Cap': return_mar_data,
    'NAV': return_nav_data,
    'Quantitative Data': return_quant_data,
    'Scheme Details': return_scheme_data,
}

# Kept for backward compatibility: the parsers in heading order.
function_indices = list(function_map.values())


In [214]:
def perform_operation(operation, data):
    """Run the parser registered for ``operation`` in ``function_map``.

    Args:
        operation: section heading used to look up the parser; it is also
            passed to the parser as its ``key`` argument.
        data: the section's text lines.

    Returns:
        The parser's result; ``"Invalid operation"`` when no parser is
        registered; an ``"Error executing function ..."`` string when the
        parser raises.
    """
    func = function_map.get(operation)
    if not func:
        return "Invalid operation"

    try:
        # Every registered parser takes (key, data); the original called
        # func(data) and therefore always ended up in the except branch.
        return func(operation, data)
    except Exception as e:
        return f"Error executing function for {operation}: {e}"



# Run every section of every fund through its parser.
# Result: fund name -> list of parsed section dicts.
grand_dictionary = {}
for master_k, page_data in grand_final_data.items():
    print(master_k)
    page_content = []

    for main_k, main_content in page_data.items():
        func = function_map.get(main_k)
        if not func:
            print(f"Warning: No function mapped for {main_k}")
            continue

        # A failing parser is reported and skipped; the other sections go on.
        try:
            result = func(main_k, main_content)
        except Exception as e:
            print(f"Error processing -> {main_k}: {e}")
        else:
            page_content.append(result)

    grand_dictionary[master_k] = page_content
    print("Done for _________________")

Samco Active Momentum Fund
Done for _________________
Samco Dynamic Asset Allocation Fund
Done for _________________
Samco Flexi Cap Fund
Done for _________________
Samco Multi Cap Fund
Done for _________________
Samco Special Opportunities Fund
Done for _________________
Samco Elss Tax Saver Fund
Done for _________________
Samco Multi Asset Allocation Fund
Done for _________________
Samco Overnight Fund
Done for _________________
Samco Arbitrage Fund
Done for _________________


In [217]:
# Persist the parsed factsheet as UTF-8 JSON under <repo>/output.
# os.path.join matches how the SamcoDryRun paths are built, instead of
# appending a hard-coded backslash path to the root with `+`.
with open(os.path.join(path, "output", "dump58_31-Dec-24_FS.json"), "w", encoding="utf-8") as file:
    json.dump(grand_dictionary, file, ensure_ascii=False, indent=4)
    print("JSON CREATED\n")

JSON CREATED

