In [123]:
import pdfplumber
import fitz
import camelot
import warnings , math, collections , os, re
import pickle
import numpy as np

# Suppress every UserWarning for the remainder of the kernel session.
warnings.filterwarnings("ignore", category=UserWarning) 

In [124]:
# Root of the working repository. Set the MYWORK_REPO environment variable to
# run the notebook from another checkout; when it is unset the original
# author's local directory is used, so existing behaviour is unchanged.
# The value stays a plain str because other cells build file names with
# `path + r"\..."` string concatenation.
path = os.environ.get(
    "MYWORK_REPO",
    r"C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo",
)

#file data paths
samco_path = path + r"\files\SamcoFactSheet2024.pdf"
tata_path = path + r"\files\TataFactSheet2024.pdf"

#dry run paths
dry_run_path = path + r"\output\DryRun.pdf"


In [125]:
# Tata output PDFs: absolute paths rooted at `path`.
no_image_path = rf"{path}\output\tata\TatanoImgPdf.pdf"
textual_pdf_path = rf"{path}\output\tata\TatatextalPdf.pdf"
tabular_pdf_path = rf"{path}\output\tata\TatatabularPdf.pdf"

# Tata pickle files: these are suffixes only and must be prefixed with `path`
# by the caller (e.g. `path + pickle_all`).
pickle_text = r"\output\pkl\tata\textual_data.pkl"
pickle_tab = r"\output\pkl\tata\tabular_data.pkl"
pickle_nonimg = r"\output\pkl\tata\nonimg_data.pkl"
pickle_all = r"\output\pkl\tata\all_data.pkl"

In [18]:
# Load the cached "all data" pickle located at `path + pickle_all` into `data`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open(path + pickle_all , 'rb') as file:
    data = pickle.load(file)

In [241]:
# Page indices to extract. get_data_clipped compares them with the index from
# enumerate(document), so they are 0-based.
pages = [3,5,7,9,11,13,15]
# Two clip rectangles: bbox[0] is used for the page title/header, bbox[1] for the content.
# NOTE(review): presumably (x0, y0, x1, y1) in page coordinates — confirm.
bbox = [(31,15,575,115),(35,120,250,765)] #for header and other for content


def get_data_clipped(input, output, pageSelect, bbox):
    """Extract the text blocks of selected pages, restricted to two clip rectangles.

    Args:
        input: Path of the PDF to read (passed to ``fitz.open``).
        output: Unused; kept so existing callers keep working.
        pageSelect: Container of 0-based page indices to process.
        bbox: Sequence whose first item is the clip rectangle for the page
            title/header and whose second item is the clip rectangle for the
            page content.

    Returns:
        list[dict]: One ``{"page": index, "block": [...]}`` entry per selected
        page, in page order. ``"block"`` holds the text blocks from both clip
        rectangles, sorted top-to-bottom and then left-to-right by the block's
        bbox.
    """
    finalData = []

    # The context manager closes the document even if extraction fails;
    # previously the handle was never closed.
    with fitz.open(input) as document:
        for pgn, page in enumerate(document):
            if pgn not in pageSelect:
                continue

            title_blocks = page.get_text('dict', clip=bbox[0])['blocks']  # header area
            content_blocks = page.get_text('dict', clip=bbox[1])['blocks']  # content area

            # Keep only blocks with type 0 (text blocks), header blocks first.
            text_blocks = [
                block for block in title_blocks + content_blocks
                if block['type'] == 0
            ]
            # Reading order: by top edge (y0), then by left edge (x0).
            text_blocks.sort(key=lambda block: (block['bbox'][1], block['bbox'][0]))

            finalData.append({
                "page": pgn,
                "block": text_blocks
            })

    return finalData

In [242]:
# Extract the clipped text blocks of the selected Samco fact-sheet pages.
# dry_run_path is passed as `output`, a parameter get_data_clipped accepts but does not use.
data = get_data_clipped(samco_path, dry_run_path, pages, bbox)

In [None]:
# Inspect the sorted text blocks of the first extracted page.
data[0]['block']

In [244]:
# One list of line strings per extracted page: the spans of each line are
# concatenated without a separator; blocks that have no 'lines' key are skipped.
final_text_data = [
    [
        "".join(span['text'] for span in line.get('spans', []))
        for block in page['block']
        if 'lines' in block
        for line in block['lines']
    ]
    for page in data
]

final_text_data

[['Samco Active Momentum Fund',
  '(An open-ended equity scheme following momentum theme)',
  'Investment Objective',
  'The investment objective of the Scheme is to seek to ',
  'generate long-term capital appreciation by investing in ',
  'stocks showing strong momentum. Momentum stocks are ',
  'such that exhibit positive price momentum – based on the ',
  'phenomenon that stocks which have performed well in the ',
  'past relative to other stocks (winners) continue to perform ',
  'well in the future, and stocks that have performed ',
  'relatively poorly (losers) continue to perform poorly. ',
  'However, there can be no assurance or guarantee that the ',
  'investment objective of the scheme would be achieved.',
  'Scheme Details',
  'Inception Date',
  '(Date of Allotment)',
  '05-Jul-2023',
  'Nifty 500 TRI',
  'Benchmark',
  '₹5000/- and in multiples of ₹1/- ',
  'thereafter ',
  'Min.Application',
  'Amount',
  'Additional ',
  'Purchase',
  '₹500/- and in multiples of ₹1/- t

In [160]:
# Flatten the text of every span (all pages, blocks and lines) into one list.
extracted_text = [
    span['text']
    for page in data
    for block in page['block']
    for line in block['lines']
    for span in line['spans']
]

In [202]:
def remove_keys(data, keys_to_remove):
    """Return a copy of ``data`` with the given keys dropped at every nesting level.

    Dictionaries and lists are walked recursively; any other value (including
    tuples and their contents) is returned unchanged.

    Args:
        data (dict | list): Nested structure of dictionaries and lists.
        keys_to_remove (set): Dictionary keys to drop wherever they occur.

    Returns:
        dict | list: New structure without the specified keys.
    """
    if isinstance(data, dict):
        pruned = {}
        for key, value in data.items():
            if key in keys_to_remove:
                continue  # drop this key together with its whole subtree
            pruned[key] = remove_keys(value, keys_to_remove)
        return pruned

    if isinstance(data, list):
        return [remove_keys(element, keys_to_remove) for element in data]

    # Leaf value: nothing to strip.
    return data

In [203]:
# No keys are stripped yet; add span/block keys here to slim the structure.
# set() is used because `{}` is an empty dict, while remove_keys documents
# its second argument as a set.
keys_to_remove = set()
cleaned_data = remove_keys(data, keys_to_remove)

In [None]:
# Print text, font size and rounded bbox of every span on the first extracted
# page, with a separator line after each block.
# cleaned_data is a list of page dicts (it mirrors the list `data` passed to
# remove_keys), so a page has to be selected before reading its 'block'
# entry; indexing the list with 'block' raises TypeError.
for block in cleaned_data[0]['block']:
    for line in block['lines']:
        for span in line['spans']:
            x0, y0, x1, y1 = (round(coord, 2) for coord in span['bbox'])
            print(span['text'], span['size'], (x0, y0, x1, y1))
    print("\n------------------------------------------\n")

In [192]:
def create_matrix_structure(data, title_font, subheader_font, content_max_font):
    """Group one page's spans into ``{title: {subheader: [content spans]}}``.

    Spans are visited top-to-bottom, then left-to-right by their ``origin``,
    and, for spans sharing an origin, by ascending font size. Each span is
    classified by its ``size``:

    * ``size == title_font``: starts a new title entry (an empty dict).
    * ``size == subheader_font``: starts a new subheader list under the
      current title; ignored until a title has been seen.
    * ``size <= content_max_font``: the span dict is appended to the current
      subheader's list; ignored until a subheader has been seen under the
      current title.

    Spans matching none of the rules are dropped.

    Args:
        data (dict): Page dict with a ``'block'`` list; each block holds
            ``'lines'``, each line holds ``'spans'`` carrying ``'origin'``,
            ``'size'`` and ``'text'``.
        title_font: Font size identifying titles (exact equality).
        subheader_font: Font size identifying subheaders (exact equality).
        content_max_font: Largest font size treated as content.

    Returns:
        dict: ``{title text: {subheader text: [span dict, ...]}}``.
    """
    # Collect the spans in document order; blocks without 'lines' contribute nothing.
    spans = [
        span
        for block in data['block']
        for line in block.get('lines', [])
        for span in line['spans']
    ]

    # A stable sort by (y, x, size) yields the same visiting order as a
    # coordinate-by-font grid, without allocating one cell per combination.
    spans.sort(key=lambda span: (span['origin'][1], span['origin'][0], span['size']))

    nested_dict = {}
    current_title = None
    current_subheader = None

    for span in spans:
        font = span['size']
        if font == title_font:
            current_title = span['text']
            # A new title starts with no subheader. Keeping the previous one
            # would make later content look up a key that does not exist
            # under the new title (KeyError).
            current_subheader = None
            nested_dict[current_title] = {}
        elif font == subheader_font and current_title is not None:
            current_subheader = span['text']
            nested_dict[current_title][current_subheader] = []
        elif font <= content_max_font and current_subheader is not None:
            nested_dict[current_title][current_subheader].append(span)

    return nested_dict



In [204]:
# Font sizes used to classify spans by their 'size' value.
# create_matrix_structure compares title and subheader sizes with ==, so these
# must equal the extracted sizes exactly; content is anything <= content_max_font.
title_font = 24.0  # Example title font size
subheader_font = 9.0  # Example subheader font size
content_max_font = 8.0  # Example maximum content font size

# Build the title -> subheader -> spans mapping for the first extracted page.
result = create_matrix_structure(cleaned_data[0], title_font, subheader_font, content_max_font)

In [None]:
# Display the current value of `result`.
result

In [195]:
# Sections (subheader text -> list of spans) found under the fund's title.
# Requires `result` to be the dict returned by create_matrix_structure.
content = result['Samco Active Momentum Fund']

In [208]:
# Text of every span in the "Scheme Details" section.
# Written as a comprehension so that no loop variable named `data` overwrites
# the notebook-level `data`; the former "".join(text) on a single string was
# a no-op and has been dropped.
extracted_text = [span['text'] for span in content['Scheme Details']]
    

INVESTMENT OBJECTIVE


In [206]:
# Investment objective: concatenate the section's span texts into one string.
# The spans are joined with no separator.
text = "".join(block['text'] for block in content['Investment Objective'])
text  

'The investment objective of the Scheme is to seek to generate long-term capital appreciation by investing in stocks showing strong momentum. Momentum stocks are such that exhibit positive price momentum – based on the phenomenon that stocks which have performed well in the past relative to other stocks (winners) continue to perform well in the future, and stocks that have performed relatively poorly (losers) continue to perform poorly. However, there can be no assurance or guarantee that the investment objective of the scheme would be achieved.'

FUND MANAGER

In [None]:
# Regex patterns
name_pattern = r'^Mr\. [A-Za-z\s]+(?:,[\s\S]+)?$'  # "Mr. <name>" optionally followed by ", <designation>"
experience_pattern = r'^Total Experience: (.+)$'   # group 1: experience text
managing_since_pattern = r'\((Managing.*?since (inception|\w+ \d{1,2}, \d{4})).*\)'  # group 2: "inception" or a date

# Build one dict per fund manager from the section's spans. A name line opens
# a new entry; experience / managing-since lines fill in the entry that is open.
# NOTE: this rebinds `result`, which other cells use for the dict returned by
# create_matrix_structure.
result = []
current_entry = {}

for block in content['Fund Manager']:
    line = block['text']

    # Name and designation: flush the previous entry, then start a new one.
    if re.match(name_pattern, line):
        if current_entry:
            result.append(current_entry)
        name, _, desig = line.partition(",")
        current_entry = {"name": name.strip(), "desig": desig.strip()}
        continue

    # Experience
    experience_match = re.match(experience_pattern, line)
    if experience_match:
        current_entry["exp"] = experience_match.group(1).strip()
        continue

    # Managing since
    managing_since_match = re.search(managing_since_pattern, line)
    if managing_since_match:
        managing_since = managing_since_match.group(2).strip()
        is_inception = managing_since.lower() == "inception"
        current_entry["manage_since"] = "Inception" if is_inception else managing_since

# Append the last entry
if current_entry:
    result.append(current_entry)

result


NAV as on 31st October

In [26]:
# Stripped, lower-cased text of every span in the NAV section.
NAV = [block['text'].strip().lower() for block in content['NAV as on 31st October 2024 (₹ per unit)']]
# Pair the spans positionally: item 0 is the key for item 1, item 2 the key for item 3.
# NOTE(review): assumes the section yields at least four spans in
# label, value, label, value order — confirm; fewer spans raise IndexError.
NAV_dict = {
    NAV[0]: NAV[1],
    NAV[2]: NAV[3]
}

AAUM

In [135]:
# Print the text of each span in the AUM section, one per line.
for span in content['Assets Under Management (AUM)']:
    print(span['text'])

AUM as on October 31, 2024
₹ 850.06 Crs
₹ 852.46 Crs
Average AUM for Month of October 2024


SCHEME DETAILS

In [120]:

# Display the spans of the "Scheme Details" section.
# `content` must be a dict keyed by section name; the TypeError recorded for
# this cell ("list indices must be integers or slices, not str") means
# `content` was bound to a list when the cell ran.
content['Scheme Details']

TypeError: list indices must be integers or slices, not str

In [89]:
# ((rounded x, rounded y) of the span origin, span text) for each "Scheme Details" span.
x0all = [
    (tuple(round(coord) for coord in _unpack_xy), span['text'])
    for span in content['Scheme Details']
    for _unpack_xy in [tuple(span['origin'])]
]
    
    

from collections import Counter

# Frequency of each rounded x origin, most common first; the dominant x
# positions are used to tell the columns apart.
x_positions = [position[0] for position, _text in x0all]
counter = Counter(x_positions).most_common() #get the line for distinction
counter

[(42, 13), (105, 12), (107, 6), (165, 2), (78, 1)]

In [90]:
# x threshold separating the two columns: the second most common x origin
# (105 in the recorded output) minus a margin of 5, i.e. 100.
# NOTE(review): presumably the second most common x is the start of the right-hand column — confirm.
divider = counter[1][0] - 5
divider

100

In [92]:
# Split the entries by their rounded x origin: below the divider goes left,
# everything else goes right.
left = [entry for entry in x0all if entry[0][0] < divider]
right = [entry for entry in x0all if not entry[0][0] < divider]
        

In [99]:
# Fields to fill from the "Scheme Details" section; every field starts as None.
keys = [
    'inception_date',
    'benchmark_index',
    'min_application_amt',
    'additional_purchase',
    'entry_load',
    'exit_load',
    'total_expense_ratio',
]
scheme_details = {key: None for key in keys}

In [None]:
# Date such as 05-Jul-2023: two digits, three letters, four digits,
# separated by '-', '_' or '/'.
date_pattern = r'\b\d{2}[-_/][A-Za-z]{3}[-_/]\d{4}\b'

# Whole-word, case-sensitive match for any of these benchmark keywords.
words_to_check = ['Index', 'Nifty', 'Sensex']
index_pattern = r'\b(?:' + '|'.join(words_to_check) + r')\b'#check if the item is second in list 

# Matches any one of three alternatives: digits followed by "/-",
# the phrase "and in multiples of", or the literal "1/-".
amount_pattern = r'\b\d+/-|and in multiples of|1/-'


In [67]:
import fitz  # PyMuPDF

def _srgb_int_to_rgb(color):
    """Split a packed 0xRRGGBB integer into an (r, g, b) tuple of floats in 0..1."""
    packed = int(color) & 0xFFFFFF
    return (
        ((packed >> 16) & 0xFF) / 255,
        ((packed >> 8) & 0xFF) / 255,
        (packed & 0xFF) / 255,
    )


def generate_pdf_from_data(data, output_path):
    """
    Generates a PDF from a mapping of section names to fitz spans.

    Each section gets its own page: the section name is written in blue at
    (72, 72) and every span is written at the top-left corner of its bbox with
    its own font size and colour, always using the built-in "helv" font.

    Parameters:
        data (dict): Maps a section name to a list of span dicts with
            "text", "size", "color" and (optionally) "bbox".
        output_path (str): The file path where the PDF will be saved.
    """
    # Create a new PDF document
    pdf_document = fitz.open()

    try:
        for section, spans in data.items():
            # Add a new page for each section
            page = pdf_document.new_page()
            text_position = 72  # Vertical position of the section title

            # Add section title
            title_font_size = 14
            page.insert_text(
                (72, text_position),
                section,
                fontsize=title_font_size,
                fontname="helv",
                color=(0, 0, 1),
            )

            # Iterate through each span in the section
            for span in spans:
                bbox = span.get("bbox", [0, 0, 0, 0])
                position = (bbox[0], bbox[1])  # Top-left corner of the span's bbox

                try:
                    page.insert_text(
                        position,
                        span["text"],
                        fontsize=span["size"],
                        fontname="helv",
                        # Span colours are packed sRGB integers, while
                        # insert_text expects three floats in 0..1. Repeating
                        # the packed integer three times was only valid for
                        # black and sent other colours to the fallback below.
                        color=_srgb_int_to_rgb(span["color"]),
                    )
                except Exception:
                    # Best-effort fallback: retry in red so problem spans stand out.
                    page.insert_text(
                        position,
                        span["text"],
                        fontsize=span["size"],
                        fontname="helv",
                        color=(1, 0, 0),  # Fallback color for errors
                    )

        # Save the created PDF
        pdf_document.save(output_path)
    finally:
        # Release the document even when inserting or saving fails.
        pdf_document.close()

    print(f"PDF successfully generated and saved to: {output_path}")


In [249]:
# Re-render the fund's sections (one page per section) into the dry-run PDF.
# Requires `result` to be the dict returned by create_matrix_structure.
generate_pdf_from_data(result['Samco Active Momentum Fund'], dry_run_path)

PDF successfully generated and saved to: C:\Users\Kaustubh.keny\OneDrive - Cogencis Information Services Ltd\Documents\mywork-repo\output\DryRun.pdf


In [None]:
# Pretty-print the current value of `result`.
import pprint
pprint.pprint(result)

In [None]:

# Read the dry-run PDF back with pdfplumber: one extracted text string per page.
with pdfplumber.open(dry_run_path) as pdf:
    final_data = [page.extract_text() for page in pdf.pages]

'Scheme Details\nInception Date 05-Jul-2023\n(Date of Allotment)\nBenchmark Nifty 500 TRI\nMin.Application ·5000/- and in multiples of ·1/-\nAmount thereafter\nAdditional ·500/- and in multiples of ·1/- thereafter\nPurchase\nEntry Load NIL\nExit Load : 1.00% If the investment is redeemed\nor switched out on or before 365 days\nfrom the date of allotment of units.\nNo Exit Load will be charged if\ninvestment is redeemed or switched\nout after 365 days from the date of\nallotment of units.\n(With effect from October 03, 2024)\nTotal Expense Regular Plan Direct Plan\nRatio (TER)\n2.26% 0.86%\nas on October 31,\n2024 Including Goods and Service Tax on\nManagement Fees.'

In [253]:
# Lines of the second entry of final_data (index 1), split on newlines.
final_data[1].split("\n")

['Scheme Details',
 'Inception Date 05-Jul-2023',
 '(Date of Allotment)',
 'Benchmark Nifty 500 TRI',
 'Min.Application ·5000/- and in multiples of ·1/-',
 'Amount thereafter',
 'Additional ·500/- and in multiples of ·1/- thereafter',
 'Purchase',
 'Entry Load NIL',
 'Exit Load : 1.00% If the investment is redeemed',
 'or switched out on or before 365 days',
 'from the date of allotment of units.',
 'No Exit Load will be charged if',
 'investment is redeemed or switched',
 'out after 365 days from the date of',
 'allotment of units.',
 '(With effect from October 03, 2024)',
 'Total Expense Regular Plan Direct Plan',
 'Ratio (TER)',
 '2.26% 0.86%',
 'as on October 31,',
 '2024 Including Goods and Service Tax on',
 'Management Fees.']

In [247]:
# Plain-text extraction of the Samco fact sheet with PyMuPDF: one string per page.
# `doc` is left open and bound, so it remains available after this cell.
doc = fitz.open(samco_path)

final_data = [page.get_text() for page in doc]

In [248]:
# Lines of the page at index 3 of the plain-text extraction.
# NOTE: this rebinds `data`, which other cells use for the list of page dicts
# returned by get_data_clipped.
data = final_data[3].split("\n")
data

['3',
 'Investment Objective',
 'Portfolio as on October 31, 2024',
 'Inception Date',
 '(Date of Allotment)',
 'Issuer',
 'Industry',
 'Mr. Paras Matalia, Fund Manager & Head - Research Equity',
 '05-Jul-2023',
 'Nifty 500 TRI',
 '₹500/- and in multiples of ₹1/- thereafter ',
 '₹5000/- and in multiples of ₹1/- ',
 'thereafter ',
 'NIL',
 'Benchmark',
 'Additional ',
 'Purchase',
 'Indian Equity and Equity Related Total',
 ' ',
 'Min.Application',
 'Amount',
 'Entry Load',
 'Exit Load',
 '2.26%',
 ': ',
 '₹ 14.53',
 '₹ 14.81',
 'Total Expense ',
 'Total Experience: Around 9 years ',
 'Ratio (TER)',
 'as on October 31, ',
 '2024',
 'Regular Plan',
 '0.86%',
 'Direct Plan',
 'Regular Growth',
 'Direct Growth',
 'Scheme Details',
 'Fund Manager',
 'NAV as on 31st October 2024 (₹ per unit)',
 'Market Capitalisation (% of allocation)',
 '₹ 850.06 Crs',
 '₹ 852.46 Crs',
 'AUM as on October 31, 2024',
 'Average AUM for Month of October 2024',
 'Assets Under Management (AUM)',
 '(Managing this