In [59]:
import pdfplumber
import fitz
import camelot
import warnings , math, collections , os, re, pprint
import pickle
import numpy as np, pandas as pd

warnings.filterwarnings("ignore", category=UserWarning) 

In [6]:
# Root folder of the local working copy. Every other path in this notebook is built by
# string-concatenating a Windows-style suffix onto `path`, so this absolute, machine-specific
# value has to be edited before the notebook runs anywhere else.
#path = r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo"
path = r"C:\Users\rando\OneDrive\Documents\mywork-repo"

# Input: the fact-sheet PDF that the cells below read.
samco_path = path + r"\files\SamcoFactSheet2024.pdf"

# Scratch output location; it is passed to get_data_clipped as `output`, which does not use it.
dry_run_path = path + r"\output\DryRun.pdf"


In [7]:
#SAMCO PATHS
# Output PDF locations, already prefixed with `path`.
# NOTE(review): none of these three names is referenced again in the visible cells.
no_image_path = path + r"\output\sam\NoImgPdf.pdf"
textual_pdf_path = path + r"\output\sam\TextualPdf.pdf"
tabular_pdf_path = path + r"\output\sam\TabularPdf.pdf"

# Pickle locations. Unlike the paths above these are only suffixes: they are NOT prefixed
# with `path`, so callers must write `path + pickle_...` (as the index-loading cell does).
pickle_text = r"\output\pkl\sam\textual_data.pkl"
pickle_tab = r"\output\pkl\sam\tabular_data.pkl"
pickle_nonimg = r"\output\pkl\sam\nonimg_data.pkl"
pickle_indices = r"\output\pkl\indices_var.pkl"

In [53]:
# Load the pickled index mapping and flatten it into a de-duplicated list of terms.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load a file
# that was produced locally by a trusted process.
with open(path + pickle_indices , 'rb') as file:
    indices = pickle.load(file)  
# Each key is emitted together with every entry of its value.
# assumes each value is a list (it is concatenated onto [k]) — TODO confirm against the producer
final_indices = []
for k,v in indices.items():
   temp = [k] + v
   for t in temp:
      final_indices.append(t)
      
# De-duplicate via a set; the order of the resulting list is therefore not guaranteed.
final_indices = list(set(final_indices))

In [43]:
def check_indice_highlight(indices_variations, path):
    """Highlight every flagged span that mentions one of the given terms.

    Opens the PDF at ``path`` and walks the text blocks of each page from top to
    bottom. Spans whose ``flags`` value is 20 or 25 are lower-cased and searched
    for each term as a whole word; the first hit highlights the span and
    increments that page's counter.

    Args:
        indices_variations: iterable of term strings to look for.
        path: path of the PDF to scan.

    Returns:
        A tuple ``(important_pages, output_path, first_blocks)``.
        ``important_pages`` maps a zero-based page number to its highlight count.
        ``output_path`` is the path of the saved ``*_highlighted`` copy, or
        ``None`` when nothing matched (no file is written in that case).
        ``first_blocks`` is always an empty list: fund-title spans are only
        printed, never collected.
    """
    # Compiled once up front; the original rebuilt every pattern for every span.
    fund_regex = re.compile(r"^(samco|tata).*fund$", re.IGNORECASE)
    term_regexes = [
        re.compile(r'\b' + re.escape(term.lower()) + r'\b')
        for term in indices_variations
    ]

    important_pages = {}
    first_blocks = []  # NOTE(review): never filled; callers receive an empty list

    doc = fitz.open(path)
    try:
        for page_number, page in enumerate(doc):
            text_instances = page.get_text('dict')["blocks"]

            # Reading order: top edge first, then left edge.
            sorted_text_instances = sorted(text_instances, key=lambda x: (x['bbox'][1], x['bbox'][0]))

            for block_index, block in enumerate(sorted_text_instances):
                if "lines" not in block:  # blocks without text lines are skipped
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        # 20 = 16 + 4 and 25 = 16 + 8 + 1, so both values carry bit 16.
                        # presumably bit 16 marks bold text — confirm against the PyMuPDF span flags
                        if span['flags'] not in (20, 25):
                            continue
                        span_text = span['text'].lower()

                        # Topmost block of the page that reads like a fund name.
                        if block_index == 0 and fund_regex.match(span_text):
                            print(span_text, page_number)

                        # One highlight per span, however many terms it contains.
                        if any(regex.search(span_text) for regex in term_regexes):
                            important_pages[page_number] = important_pages.get(page_number, 0) + 1
                            page.add_highlight_annot(fitz.Rect(span['bbox']))

        if not important_pages:
            return important_pages, None, first_blocks

        # Only the extension is touched; str.replace would also rewrite a '.pdf'
        # inside a directory name and leave the path unchanged for '.PDF'.
        root, extension = os.path.splitext(path)
        output_path = root + '_highlighted' + extension
        doc.save(output_path)
        return important_pages, output_path, first_blocks
    finally:
        # Release the file handle even when scanning or saving raises.
        doc.close()

In [None]:
# Scan the fact sheet for the index terms. `dummy` receives the path of the highlighted copy
# (or None), and `fund_pages` receives the third return value, which the function never fills.
highlighted_pages,dummy, fund_pages =  check_indice_highlight(final_indices, samco_path)

# Uncomment to inspect the per-page highlight counts.
#pprint.pprint(highlighted_pages)

In [46]:
""" fund is located only on certain pages, based on no. of highlights
    we know which pages are imp. automate this content later
"""

# Zero-based page indices to extract (compared against enumerate() positions in get_data_clipped).
pages = [3,5,7,9,11,13,15]
# Two clip rectangles passed to page.get_text(clip=...): index 0 is the header area, index 1 the content area.
# presumably (x0, y0, x1, y1) in PDF points — confirm against the PyMuPDF clip documentation
bbox = [(31,15,575,115),(35,120,250,765)] #for header and other for content in that order

def get_data_clipped(input:str, output:str, pageSelect:list, bbox:list[tuple]):
    """Extract the text blocks of selected pages from two clipped regions.

    Args:
        input: path of the PDF to read.
        output: unused; kept so existing calls keep working.
        pageSelect: zero-based page indices to extract.
        bbox: two clip rectangles; ``bbox[0]`` is the title area and ``bbox[1]``
            the content area.

    Returns:
        A list with one ``{"page": index, "block": blocks}`` dict per selected
        page, in document order. ``blocks`` holds the ``type == 0`` blocks of
        both regions, sorted by top edge and then left edge.
    """
    wanted_pages = set(pageSelect)
    title_clip, content_clip = bbox[0], bbox[1]
    finalData = []

    document = fitz.open(input)
    try:
        # Use the page yielded by the iterator instead of loading it a second time by index.
        for pgn, page in enumerate(document):
            if pgn not in wanted_pages:
                continue

            title_blocks = page.get_text('dict', clip = title_clip)['blocks']
            content_blocks = page.get_text('dict', clip = content_clip)['blocks']

            # Keep only blocks whose type is 0, title region first.
            text_blocks = [
                block
                for block in title_blocks + content_blocks
                if block['type'] == 0
            ]
            text_blocks.sort(key=lambda block: (block['bbox'][1], block['bbox'][0]))

            finalData.append({
                "page": pgn,
                "block": text_blocks
            })
    finally:
        # The original never closed the document.
        document.close()

    return finalData

In [55]:
# Clipped text blocks of the selected fund pages: one {"page", "block"} dict per page.
# NOTE(review): a later cell reuses the name `data` as a loop variable, which overwrites this list.
data = get_data_clipped(samco_path, dry_run_path, pages, bbox)
#data[2]['block']

In [50]:
# Flatten the extracted blocks into plain text: one list of line strings per page.
final_text_data = []
for page in data:
    page_content = []
    for blocks in page['block']:
        if 'lines' in blocks:
            for line in blocks['lines']:
                spans = line.get('spans',[])
                # A line's text is its span texts concatenated with no separator.
                text = "".join(span['text'] for span in spans)
                page_content.append(text)
    
    final_text_data.append(page_content)                      
# Uncomment to inspect; the value is a list (one entry per page) of lists of strings.
#final_text_data    #returns a list of strings

In [61]:
#to remove keys not required
def remove_keys(data:list, keys_to_remove:set):
    """Return a copy of a nested list/dict structure without the given keys.

    Dicts are rebuilt without any key found in ``keys_to_remove`` and lists are
    rebuilt element by element, both recursively. Any other value is returned
    as it is (not copied). The input structure is not modified.
    """
    if isinstance(data, dict):
        pruned = {}
        for name, child in data.items():
            if name in keys_to_remove:
                continue
            pruned[name] = remove_keys(child, keys_to_remove)
        return pruned

    if isinstance(data, list):
        rebuilt = []
        for element in data:
            rebuilt.append(remove_keys(element, keys_to_remove))
        return rebuilt

    # Leaf value: neither a dict nor a list.
    return data

#IMPORTANT FUNCTION 
def clean_data_combine_spans(data:list, check_color:int,replace_size):
    """Drop filler spans, normalise header sizes and merge look-alike spans.

    For every line of every block of every page entry, in order:

    1. Spans whose stripped text is one of the filler tokens (``:``, ``;``,
       ``-``, ``st``) or is empty / whitespace-only are removed.
    2. Spans whose ``color`` equals ``check_color`` and whose ``size`` is
       greater than 7 get ``size`` set to ``replace_size``.
    3. Consecutive spans with equal ``flags``, ``size`` and ``color`` are merged
       into the first one, joining their texts with a single space. Only
       ``text`` is merged; the surviving span keeps its other fields.

    Args:
        data: list of page dicts with a ``"block"`` list, as returned by
            ``get_data_clipped``.
        check_color: span colour value that marks text to be resized.
        replace_size: size assigned to those spans.

    Returns:
        The same ``data`` object; it is modified in place.
    """
    # Candidates are compared after .strip(), so the tokens are stripped as well.
    # The original compared against the raw ' ' and 'st ' entries, which can never
    # equal a stripped string, so whitespace-only spans were never removed.
    removable = {token.strip() for token in (':', 'st', ';', '-', 'st ', ' ')}
    merge_keys = ("flags", "size", "color")

    for page_entry in data:
        for block in page_entry.get("block", []):
            for line in block.get("lines", []):
                merged = []
                for span in line.get("spans", []):
                    # `or ""` tolerates a span whose text is missing or None.
                    if (span.get("text") or "").strip() in removable:
                        continue

                    if span['color'] == check_color and span['size'] > 7:
                        span['size'] = replace_size

                    same_style = merged and all(
                        merged[-1].get(field) == span.get(field) for field in merge_keys
                    )
                    if same_style:
                        merged[-1]["text"] += " " + span["text"]
                    else:
                        merged.append(span)
                line["spans"] = merged

    return data
# NOTE(review): these two statements are repeated verbatim at the top of the next cell.
# `{}` is an empty dict, not a set; with nothing to remove, remove_keys only deep-copies the
# lists and dicts of `data`.
keys_to_remove = {}
cleaned_data = remove_keys(data, keys_to_remove)
#cleaned_data[3]['block']

In [62]:
# Copy `data` first: clean_data_combine_spans edits the structure it is given in place,
# so running it on a copy leaves the list/dict structure of `data` untouched.
keys_to_remove = {}
cleaned_data = remove_keys(data, keys_to_remove)
#cleaned_data[3]['block']

# Spans with color -1 and size above 7 are set to size 9.0, then look-alike spans are merged.
# combined_data and cleaned_data are the same object (the function returns its argument).
combined_data = clean_data_combine_spans(cleaned_data, -1, 9.0) 
#set font whose color is __ to size __ 
# Author's note: identifying headers by colour and size is acknowledged as fragile.
#Bad Logic here btw

In [None]:
#combined_data[3]['block'] #Page 0,1,2,3 indexes-> 3,5,7,9 pages 

In [90]:
#IMPORTANT FUNCTION
def create_matrix_structure(data: dict, title_font: float, subheader_font: float, content_max_font: float):
    """Build a title -> subheader -> spans hierarchy and a debug matrix for one page.

    Spans are classified purely by font size, in block/line/span order:
    a span of size ``title_font`` opens a new title, a span of size
    ``subheader_font`` opens a new subheader under the current title, and a
    span of size ``<= content_max_font`` is appended to the current subheader.
    Spans that fit none of these (or arrive before a title/subheader exists)
    are left out of the hierarchy.

    Args:
        data: one page entry with a ``"block"`` list of text blocks.
        title_font: exact font size that marks a title.
        subheader_font: exact font size that marks a subheader.
        content_max_font: largest font size treated as content.

    Returns:
        ``(nested_dict, matrix_df)``. ``nested_dict`` maps title text to a dict
        of subheader text -> list of content span dicts. ``matrix_df`` is a
        DataFrame with one row per distinct span origin (sorted by y, then x)
        and one column per distinct font size (descending); a cell holds the
        list of span texts at that origin/size, or 0 when there is none.
    """
    # Step 1: flatten the spans once; blocks without a "lines" key contribute nothing.
    spans = [
        span
        for block in data['block']
        for line in block.get('lines', [])
        for span in line['spans']
    ]

    coordinates = sorted(
        {tuple(span['origin']) for span in spans},
        key=lambda c: (c[1], c[0]),  # sort by y, then x
    )
    fonts = sorted({span['size'] for span in spans}, reverse=True)

    # Step 2: origins index the rows, font sizes index the columns.
    coord_to_index = {coord: idx for idx, coord in enumerate(coordinates)}
    font_to_index = {font: idx for idx, font in enumerate(fonts)}
    matrix = np.zeros((len(coordinates), len(fonts)), dtype=object)

    # Step 3: fill the matrix and build the hierarchy in a single pass.
    nested_dict = {}
    current_title = None
    current_subheader = None

    for span in spans:
        font = span['size']
        row = coord_to_index[tuple(span['origin'])]
        col = font_to_index[font]
        if matrix[row, col] == 0:
            matrix[row, col] = []
        matrix[row, col].append(span['text'])  # the full span text is stored

        if font == title_font:
            current_title = span
            nested_dict[current_title['text']] = {}
            # A subheader belongs to the title it was found under. Without this reset,
            # content following a new title was filed under the previous title's
            # subheader key, which does not exist in the new dict (KeyError).
            current_subheader = None
        elif font == subheader_font and current_title:
            current_subheader = span
            nested_dict[current_title['text']][current_subheader['text']] = []
        elif font <= content_max_font and current_subheader:
            nested_dict[current_title['text']][current_subheader['text']].append(span)

    matrix_df = pd.DataFrame(matrix, index=coordinates, columns=fonts)

    return nested_dict, matrix_df


""" FOR DEV PURPOSE ONLY
Generates a PDF from the nested dictionary data structure.
Params:
    data (dict): The nested dictionary containing sections and fitz spans.
    output_path (str): The file path where the PDF will be saved.
"""
def _srgb_int_to_rgb(srgb):
    """Split a packed 0xRRGGBB integer into an (r, g, b) tuple of floats in 0..1."""
    packed = int(srgb) & 0xFFFFFF
    return (
        ((packed >> 16) & 0xFF) / 255,
        ((packed >> 8) & 0xFF) / 255,
        (packed & 0xFF) / 255,
    )


def generate_pdf_from_data(data:dict, output_path:str):
    """Write one PDF page per section, re-drawing its spans at their bbox corner.

    For development use only. Each key of ``data`` becomes a new page with the
    key drawn as a blue title at (72, 72); every span of that section is then
    drawn at the top-left corner of its ``bbox`` using its own ``size``.

    Args:
        data: mapping of section title -> list of span dicts (each with
            ``text``, ``size``, ``color`` and optionally ``bbox``).
        output_path: file path the PDF is saved to.

    Prints a confirmation line once the file has been saved.
    """
    pdf_doc = fitz.open()
    try:
        for section, spans in data.items():
            page = pdf_doc.new_page()

            # Section title; a failure here is reported but does not stop the page.
            try:
                page.insert_text(
                    (72, 72),
                    section,
                    fontsize=14,
                    fontname="helv",
                    color=(0, 0, 1),
                )
            except Exception as e:
                print(f"The error is {e}")

            for span in spans:
                bbox = span.get("bbox", [0, 0, 0, 0])  # default when the span has no bbox
                position = (bbox[0], bbox[1])

                try:
                    # The original repeated the whole packed integer three times, which is
                    # only a valid colour for 0 (black); anything else failed and fell back to red.
                    # assumes span["color"] is a packed sRGB integer — see the PyMuPDF span dictionary docs
                    page.insert_text(
                        position,
                        span["text"],
                        fontsize=span["size"],
                        fontname="helv",
                        color=_srgb_int_to_rgb(span["color"]),
                    )
                except Exception:
                    # Best-effort fallback kept from the original: retry in red.
                    page.insert_text(
                        position,
                        span["text"],
                        fontsize=span["size"],
                        fontname="helv",
                        color=(1, 0, 0),
                    )

        pdf_doc.save(output_path)
    finally:
        # Close the in-memory document even if drawing or saving raised.
        pdf_doc.close()
    print(f"PDF successfully generated to: {output_path}")

In [91]:
title_font = 24.0  #Samco Title
subheader_font = 9.0  # Subheader Title 
content_max_font = 7.0  #max content

# Scratch PDF that is overwritten for every fund page and read straight back.
samco_dry_run_path = path + r"\output\samcoDryRun.pdf"

grand_final_data = {}
matrix_final = []

for page_data in combined_data: #each page data in doc  
    result, matrix = create_matrix_structure(page_data, title_font, subheader_font, content_max_font)
    matrix_final.append(matrix)

    if not result:
        # No span of title_font size on this page, so there is no fund name to file it under.
        continue
    fund_name = next(iter(result))      # the first title found on the page
    fund_sections = result[fund_name]   # subheader -> spans for that fund

    # Round-trip: draw each section on its own page, then let pdfplumber rebuild the text lines.
    generate_pdf_from_data(fund_sections, samco_dry_run_path)
    with pdfplumber.open(samco_dry_run_path) as pdf:
        page_texts = [pdf_page.extract_text() for pdf_page in pdf.pages]

    # The first extracted line of a generated page is its section header, the rest its content.
    # The loop variables are deliberately not called `data`/`page`/`content`: the original
    # reused `data` here and overwrote the global list returned by get_data_clipped.
    sections_by_header = {}
    for page_text in page_texts:
        if not page_text:
            continue  # nothing could be extracted from this page
        header, *body_lines = page_text.split('\n')
        sections_by_header[header] = body_lines

    # Store the sections with their headers in lexicographic order.
    grand_final_data[fund_name] = {
        header: sections_by_header[header] for header in sorted(sections_by_header)
    }
    

PDF successfully generated to: C:\Users\rando\OneDrive\Documents\mywork-repo\output\samcoDryRun.pdf
PDF successfully generated to: C:\Users\rando\OneDrive\Documents\mywork-repo\output\samcoDryRun.pdf
PDF successfully generated to: C:\Users\rando\OneDrive\Documents\mywork-repo\output\samcoDryRun.pdf
PDF successfully generated to: C:\Users\rando\OneDrive\Documents\mywork-repo\output\samcoDryRun.pdf
PDF successfully generated to: C:\Users\rando\OneDrive\Documents\mywork-repo\output\samcoDryRun.pdf
PDF successfully generated to: C:\Users\rando\OneDrive\Documents\mywork-repo\output\samcoDryRun.pdf
PDF successfully generated to: C:\Users\rando\OneDrive\Documents\mywork-repo\output\samcoDryRun.pdf


In [93]:
# Preview the origin-by-font-size matrix of the first processed page (first 30 rows).
matrix_final[0].head(30)

Unnamed: 0,24.000000,9.000000,8.000000,7.000000,6.634068,6.500000,6.123724,6.000000,5.878776,5.692100
"(36.273399353027344, 93.87298583984375)",[Samco Active Momentum Fund],0,0,0,0,0,0,0,0,0
"(37.22850036621094, 107.4208984375)",0,0,[(An open-ended equity scheme following moment...,0,0,0,0,0,0,0
"(43.30080032348633, 131.7001953125)",0,[Investment Objective],0,0,0,0,0,0,0,0
"(43.3916015625, 151.3095703125)",0,0,0,[The investment objective of the Scheme is to ...,0,0,0,0,0,0
"(43.3916015625, 159.7095947265625)",0,0,0,[generate long-term capital appreciation by in...,0,0,0,0,0,0
"(43.3916015625, 168.109619140625)",0,0,0,[stocks showing strong momentum. Momentum stoc...,0,0,0,0,0,0
"(43.3916015625, 176.5096435546875)",0,0,0,[such that exhibit positive price momentum – b...,0,0,0,0,0,0
"(43.3916015625, 184.90966796875)",0,0,0,[phenomenon that stocks which have performed w...,0,0,0,0,0,0
"(43.3916015625, 193.3096923828125)",0,0,0,[past relative to other stocks (winners) conti...,0,0,0,0,0,0
"(43.3916015625, 201.709716796875)",0,0,0,"[well in the future, and stocks that have perf...",0,0,0,0,0,0


In [None]:
# Fund names collected so far (the keys of grand_final_data).
list(grand_final_data.keys())

REGEX FUNCTIONS

In [15]:
def return_invest_data(key:str,data:list):
    """Join the given text lines with single spaces and return them under ``key``.

    Returns a one-entry dict ``{key: joined_text}``.
    """
    joined_text = " ".join(data)
    return {key: joined_text}

def return_scheme_data(key:str,data:list):
    """Parse scheme-detail lines into a dict stored under ``key``.

    Each line is tested in this order and handled by the first rule that fits:

    * ``<label ending in "date"> <DD-Mon-YYYY>`` -> ``{label: date}``
    * ``Benchmark <text>``                       -> ``{"Benchmark": text}``
    * contains the word "min" or "application"   -> ``min_appl_amt``
    * contains "additional ... and in multiples of" -> ``additional_amt``

    The two amount rules store the list of every ``<number>/-`` amount found
    on the line, with the leading ``·`` removed; a line without any amount
    stores nothing. Matching is case-insensitive.
    """
    labelled_regexes = (
        re.compile(r"^(.*?date)\s(\d{2}-[A-Za-z]{3}-\d{4})$", re.IGNORECASE),
        re.compile(r"^(Benchmark)\s+(.*)$", re.IGNORECASE),
    )
    amount_regex = re.compile(r"(?:·)?\d+(?:,\d{3})*(?:\.\d+)?/-", re.IGNORECASE)

    details = {}
    for entry in data:
        # Label/value lines: the date rule is tried before the benchmark rule.
        labelled = None
        for regex in labelled_regexes:
            labelled = regex.match(entry)
            if labelled:
                break
        if labelled:
            details[labelled.group(1)] = labelled.group(2)
            continue

        # Amount lines: decide the target field, then collect the amounts.
        if re.search(r"\b(min|application)\b", entry, re.IGNORECASE):
            field = "min_appl_amt"
        elif re.search(r"\b(additional.* and in multiples of)\b", entry, re.IGNORECASE):
            field = "additional_amt"
        else:
            continue

        amounts = amount_regex.findall(entry)
        if amounts:
            details[field] = [amount.replace('·', '') for amount in amounts]

    return {key: details}

def return_fund_data(key:str,data:list):
    """Group fund-manager lines into one dict per manager, stored under ``key``.

    A line starting with ``Mr.`` or ``Ms.`` opens a new manager entry: the text
    before the first comma becomes ``name`` and the remaining comma-separated
    pieces are concatenated (the commas themselves are dropped) into
    ``designation``; both are lower-cased. Following lines fill the open entry:

    * a line starting with ``(`` sets ``managing_since`` to ``'inception'`` when
      it mentions inception, otherwise to the first ``Month DD, YYYY`` date
      found (``None`` when there is none);
    * ``Total Experience: <text>`` sets ``total_experience`` (lower-cased).

    Detail lines that appear before any manager name are ignored.

    Returns:
        ``{key: [manager_dict, ...]}`` in input order.
    """
    name_pattern = r'^(Ms\.|Mr\.)'
    manage_pattern = r'^\(|\)$'
    date_pattern = r'\b\w+ \d{1,2}, \d{4}\b'
    experience_pattern = r'^Total Experience: (.+)$'

    managers = []
    current_entry = None

    for line in data:
        if re.match(name_pattern, line):
            if current_entry:
                managers.append(current_entry)
            pieces = line.split(",")
            current_entry = {
                'name': pieces[0].strip().lower(),
                'designation': "".join(pieces[1:]).strip().lower()
            }
            continue

        if current_entry is None:
            # No manager is open yet; the original subscripted None here (TypeError).
            continue

        if re.match(manage_pattern, line):
            if "inception" in line.lower():
                current_entry['managing_since'] = 'inception'
            else:
                found_date = re.search(date_pattern, line)
                current_entry['managing_since'] = found_date.group() if found_date is not None else None
        elif re.match(experience_pattern, line):
            current_entry['total_experience'] = line.split(":")[1].strip().lower()

    if current_entry:  # the last manager is still open
        managers.append(current_entry)

    return {key: managers}

def return_nav_data(key:str,data:list):
    """Collect every ``<label>: ·<number>`` pair from the lines, stored under ``key``.

    Labels are stripped and lower-cased; the number is kept as the matched
    string. A label that occurs again overwrites its earlier value.
    """
    nav_regex = re.compile(r"([\w\s]+):\s*·([\d.]+)")

    nav_values = {}
    for entry in data:
        for hit in nav_regex.finditer(entry):
            label, amount = hit.groups()
            nav_values[label.strip().lower()] = amount

    return {key: nav_values}

def return_quant_data(key:str,data:list):
    """Split quantitative-data lines into ``label: value`` metrics and a comment.

    A line counts as a metric when it contains a colon and mentions one of the
    known metric words (ratio, turnover, annualised, YTM, Macaulay duration,
    residual maturity, modified duration; case-insensitive). The text before
    the first colon becomes the key and everything after it the value, both
    stripped and lower-cased. Every other line is appended, without a
    separator, to the ``comment`` entry.

    Returns:
        ``{key: {metric_label: value, ..., "comment": text}}``.
    """
    metric_patterns = (
        r"\b(ratio|turnover)\b",
        r'\b(annualised|YTM)\b',
        r"\b(macaulay.*duration)\b",
        r"\b(residual.*maturity)\b",
        r"\b(modified.*duration)\b",
    )

    metrics = {}
    comment = ""

    for line in data:
        label, separator, value = line.partition(":")
        is_metric = bool(separator) and any(
            re.search(pattern, line, re.IGNORECASE) for pattern in metric_patterns
        )
        if is_metric:
            # Stored only for metric lines. The original stored after every line, which
            # raised UnboundLocalError when the first line was a comment and re-stored a
            # stale pair otherwise. partition also keeps values that contain a colon.
            metrics[label.lower().strip()] = value.lower().strip()
        else:
            comment += line

    metrics['comment'] = comment

    return {key: metrics}

def return_aum_data(key:str,data:list):
    """Extract the ``<number> Crs`` amounts from AUM lines, stored under ``key``.

    The amount of a line containing "average" (case-insensitive) is stored as
    ``avg_aum``; the amount of any other line is stored as ``aum``. Lines
    without a ``<number> Crs`` amount are skipped.

    Returns:
        ``{key: {"aum": "...", "avg_aum": "..."}}`` with only the entries found.
    """
    amount_regex = re.compile(r"\b\d{1,3}(?:,\d{3})*(?:\.\d+)? Crs\b")

    figures = {}
    for line in data:
        found = amount_regex.search(line)
        if found is None:
            # The original called .group() on None here (AttributeError).
            continue
        field = 'avg_aum' if re.search(r'average', line, re.IGNORECASE) else 'aum'
        figures[field] = found.group()

    return {key: figures}

def return_mar_data(key:str,data:list):
    """Placeholder parser: ignores ``data`` and returns ``{key: None}``."""
    placeholder = dict.fromkeys([key])
    return placeholder


In [None]:
# Pick one fund's parsed sections (header -> list of text lines) to try the parsers on.
sample = grand_final_data['Samco Active Momentum Fund']
sample

In [14]:
# Print the raw line lists of each section of the sample fund, one section per call.
print(sample['Assets Under Management (AUM)'])
print(sample['Fund Manager'])
print(sample['Investment Objective'])
print(sample['NAV as on 31 October 2024 (· per unit)'])
print(sample['Quantitative Data'])
print(sample['Scheme Details'])

['AUM as on October 31, 2024 · 850.06 Crs', 'Average AUM for Month of October 2024 · 852.46 Crs']
['Mr. Paras Matalia, Fund Manager & Head - Research Equity', '(Managing this scheme since inception)', 'Total Experience: Around 9 years', 'Mr. Umeshkumar Mehta, Director, CIO & Fund Manager', '(Managing the scheme since August 01, 2023)', 'Total Experience: Over 20 years', 'Mr. Dhawal Ghanshyam Dhanani', '(Dedicated Fund Manager for Overseas investments since inception)', 'Total Experience: Around 6 years']
['The investment objective of the Scheme is to seek to', 'generate long-term capital appreciation by investing in', 'stocks showing strong momentum. Momentum stocks are', 'such that exhibit positive price momentum · based on the', 'phenomenon that stocks which have performed well in the', 'past relative to other stocks (winners) continue to perform', 'well in the future, and stocks that have performed', 'relatively poorly (losers) continue to perform poorly.', 'However, there can be no

In [16]:
# Spot-check the AUM parser on the sample fund.
return_aum_data("Assets Under Management (AUM)",sample['Assets Under Management (AUM)'])

{'Assets Under Management (AUM)': {'aum': '850.06 Crs',
  'avg_aum': '852.46 Crs'}}

In [17]:
# Spot-check the fund-manager parser on the sample fund.
return_fund_data("Fund Manager",sample['Fund Manager'])

{'Fund Manager': [{'name': 'mr. paras matalia',
   'designation': 'fund manager & head - research equity',
   'managing_since': 'inception',
   'total_experience': 'around 9 years'},
  {'name': 'mr. umeshkumar mehta',
   'designation': 'director cio & fund manager',
   'managing_since': 'August 01, 2023',
   'total_experience': 'over 20 years'},
  {'name': 'mr. dhawal ghanshyam dhanani',
   'designation': '',
   'managing_since': 'inception',
   'total_experience': 'around 6 years'}]}

In [18]:
# Every section header that occurs in any fund, in lexicographic order.
# The loop variable is not called `indices`: the original reused that name and overwrote
# the global `indices` mapping loaded from the pickle, which broke re-running that cell.
section_names = set()
for fund_sections in grand_final_data.values():
    section_names.update(fund_sections.keys())

imp_indices = sorted(section_names)

# Parsers listed in the order their section headers are expected to sort.
function_indices = [return_aum_data,
        return_fund_data,
        return_invest_data,
        return_mar_data,
        return_nav_data,
        return_quant_data,
        return_scheme_data]

# Headers and parsers are paired purely by position.
# NOTE(review): zip stops at the shorter sequence and nothing checks that the sorted headers
# line up with the parser order, so an extra or missing header silently shifts the pairing.
function_map = dict(zip(imp_indices, function_indices))

#print(function_map)

def perform_operation(operation, data):
    """Run the parser registered for a section header.

    Args:
        operation: section header to look up in the module-level ``function_map``.
        data: the section's lines, handed to the parser.

    Returns:
        The parser's result, or the string ``"Invalid operation"`` when no
        parser is registered for ``operation``.
    """
    handler = function_map.get(operation)
    if handler is None:
        return "Invalid operation"
    # Every parser in this notebook takes (key, data); the original passed only `data`,
    # which raised TypeError for every registered header.
    return handler(operation, data)

In [29]:
# NOTE(review): `json` is first imported here rather than in the import cell at the top,
# and the cell that writes the JSON dump relies on this cell having run.
import pprint, json

# Run every section of every fund through the parser registered for its header.
# Result shape: {fund name: [parser output for each section, in section order]}.
grand_dictionary = {}
for master_k in grand_final_data.keys():

    # NOTE(review): this rebinds the global `sample` used by the spot-check cells.
    sample = grand_final_data[master_k]
    
    page_content = list()
    
    for main_k, main_content in sample.items():
        
        # Raises KeyError when a header has no entry in function_map.
        hello = function_map[main_k]
        page_content.append(hello(main_k, main_content))
    
    grand_dictionary[master_k] = page_content

In [33]:
# Write the parsed result as UTF-8 JSON; ensure_ascii=False keeps non-ASCII characters as they are.
with open(path +r"\output\dump.json", "w", encoding="utf-8") as file:
    json.dump(grand_dictionary, file, ensure_ascii=False, indent=4)