In [53]:
import fitz  # PyMuPDF
from PIL import Image ,ImageDraw,ImageFont
import json
import os

In [54]:
# PyMuPDF provides the `fitz` module; if it is not installed, run: %pip install PyMuPDF

In [55]:
def convert_pdf_to_text(pdf_file):
    """Extract the plain text of every page of a PDF as a single string.

    Args:
        pdf_file: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        The ``"text"``-mode output of ``page.get_text`` for each page,
        concatenated in page order with no separator added between pages.
    """
    # The context manager closes the document even when extraction fails
    # part-way through; a trailing doc.close() would be skipped in that case.
    with fitz.open(pdf_file) as doc:
        # join() builds the result once instead of re-copying it per page.
        return "".join(page.get_text("text") for page in doc)
   


In [56]:
def create_line_bounding_boxes(text, line_height=100, char_width=100):
    """Split text into lines and assign each line a synthetic bounding box.

    The boxes are computed, not measured from any rendering: line ``i``
    (0-based) gets a rectangle whose left edge is at x=0, whose width is
    ``len(line) * char_width`` and which spans ``i * line_height`` to
    ``(i + 1) * line_height`` vertically.

    Args:
        text: Text to split. Leading/trailing whitespace is stripped before
            splitting on ``'\\n'``.
        line_height: Height given to every line (default 100).
        char_width: Width given to every character of a line (default 100).

    Returns:
        A ``(lines, bounding_boxes)`` tuple. ``lines`` is the list of line
        strings; ``bounding_boxes`` maps ``"box1"``, ``"box2"``, ... to a dict
        with ``top_left``, ``top_right``, ``bottom_left`` and ``bottom_right``
        ``[x, y]`` pairs. Blank or whitespace-only text yields ``([], {})``.
    """
    stripped = text.strip()
    if not stripped:
        # ''.split('\n') would return [''] and create a box for a line that
        # does not exist.
        return [], {}

    lines = stripped.split('\n')
    bounding_boxes = {}
    for idx, line in enumerate(lines):
        top = idx * line_height
        bottom = (idx + 1) * line_height
        right = len(line) * char_width

        bounding_boxes[f"box{idx+1}"] = {
            "top_left": [0, top],
            "top_right": [right, top],
            "bottom_left": [0, bottom],
            "bottom_right": [right, bottom]
        }
    return lines, bounding_boxes

In [61]:
def save_lines_as_images(
    lines,
    image_folder,
    font_path="/kaggle/input/nato-sansdev/NotoSansDevanagari-VariableFont_wdthwght.ttf",
    font_size=40,
):
    """Render each line of text to its own white-background JPEG.

    Line ``i`` (0-based) is drawn in black at the top-left corner of an image
    that is ``len(line) * 100`` pixels wide and 100 pixels high, and saved as
    ``line_{i + 1}.jpg`` inside ``image_folder``.

    Args:
        lines: Sequence of strings to render, one image per entry.
        image_folder: Output directory; created if it does not exist.
        font_path: TrueType font file used for drawing. Defaults to the
            Noto Sans Devanagari font at its Kaggle input location.
        font_size: Font size passed to ``ImageFont.truetype`` (default 40).
    """
    os.makedirs(image_folder, exist_ok=True)
    if not lines:
        # Nothing to draw, so do not require the font file to be present.
        return

    # The font is the same for every line; load it once rather than
    # re-reading the file on each iteration.
    font = ImageFont.truetype(font_path, font_size)
    image_height = 100

    for idx, line in enumerate(lines):
        # An empty line would give a zero-width image, which Pillow refuses
        # to write as JPEG; clamp to 1px so file numbering stays aligned
        # with the line numbering.
        image_width = max(len(line) * 100, 1)
        image = Image.new('RGB', (image_width, image_height), color='white')
        draw = ImageDraw.Draw(image)

        # Black text on the white canvas, anchored at the top-left corner.
        draw.text((0, 0), line, font=font, fill='black')

        image.save(os.path.join(image_folder, f"line_{idx + 1}.jpg"))

In [62]:
def save_bounding_boxes_to_json(bounding_boxes, json_path):
    """Write the bounding-box mapping to ``json_path`` as JSON.

    Args:
        bounding_boxes: JSON-serialisable object to store.
        json_path: Destination file; opened in text write mode, so any
            existing content is replaced.
    """
    with open(json_path, 'w') as out:
        # Four-space indentation keeps the file human-readable.
        out.write(json.dumps(bounding_boxes, indent=4))

In [63]:
def main(pdf_file, image_folder, json_path):
    """Run the whole pipeline: PDF -> text -> line images + bounding-box JSON.

    Args:
        pdf_file: PDF to read, handed to ``convert_pdf_to_text``.
        image_folder: Directory that receives one image per text line.
        json_path: File that receives the bounding boxes as JSON.
    """
    # Extract the text and split it into lines with their boxes in one step.
    page_lines, boxes = create_line_bounding_boxes(convert_pdf_to_text(pdf_file))

    # Persist both artefacts: images first, then the box metadata.
    save_lines_as_images(page_lines, image_folder)
    save_bounding_boxes_to_json(boxes, json_path)

In [64]:
if __name__ == "__main__":
    # Source document to process.
    pdf_file = "/kaggle/input/sanskrit-text/Sanskrit_Text (1).pdf"
    # Destinations for the per-line images and the bounding-box JSON.
    output_image_folder = "/kaggle/working/images"
    output_json_file = "/kaggle/working/bounding_boxes.json"

    main(
        pdf_file=pdf_file,
        image_folder=output_image_folder,
        json_path=output_json_file,
    )