# Imports and Models

In [1]:
#--------------------------------
# imports
#--------------------------------

import os
import cv2
import time
import math
import numpy as np
import gdown


from apsisocr import ApsisBNOCR
from apsisocr.utils import download
from bs4 import BeautifulSoup
from bs4.formatter import HTMLFormatter
from ultralytics import YOLO
from pathlib import Path
from shapely.geometry import box

#-----------------------------
# models
#-----------------------------

# --- Layout analysis (YOLO) ---
# Google Drive id of the YOLO layout-analysis weights
YOLO_DLA_GID = "1n-XbOwUwgMjaFPFzEJ59Avrl9Nc5xsx8"
# Location of the locally cached weight file
yolo_model_weight_path = "weights/best.pt"
# Fetch the weights only when no local copy exists yet
if not Path(yolo_model_weight_path).is_file():
    download(YOLO_DLA_GID, yolo_model_weight_path)
# Layout-analysis model used by the inference cell
dla_model = YOLO(yolo_model_weight_path)

# --- OCR ---
ocr = ApsisBNOCR()



[32m#LOG     :[0m[34mLoaded Bangla Model[0m
[INFO] fastdeploy/runtime/runtime.cc(273)::CreatePaddleBackend	Runtime initialized with Backend::PDINFER in Device::CPU.
[32m#LOG     :[0m[34mLoaded Paddle detector[0m


W0501 01:40:28.529585  4068 analysis_config.cc:971] It is detected that mkldnn and memory_optimize_pass are enabled at the same time, but they are not supported yet. Currently, memory_optimize_pass is explicitly disabled


[INFO] fastdeploy/runtime/runtime.cc(273)::CreatePaddleBackend	Runtime initialized with Backend::PDINFER in Device::CPU.


# HTMLGenerator Object and Related Helpers
- This object loads the HTML template at `reconstruction/templates/index.html` and populates it with one positioned element per detected layout region (paragraph, text box or image), together with that region's content.

In [2]:
class HtmlGenerator:
    """
    Builds the reconstructed HTML page for one input image.

    The page skeleton is read from "reconstruction/templates/index.html".
    Every detected element is rendered from its own template file
    ("paragraph", "text_box" or "image") and appended to the skeleton's
    <div id="root">. All files are read and written as UTF-8 so that
    non-ASCII OCR text does not depend on the platform's default encoding.
    """

    def __init__(self, filename="default"):
        """
        Load the page skeleton and locate its root container.
        Args:
        filename: Name of the html file to create (without extension)
        Raises:
        ValueError: if the index template has no <div id="root">
        """
        self.index_template = self.read_html_template("index")
        self.index_template_root_div = self.index_template.find("div", {"id": "root"})
        if self.index_template_root_div is None:
            # Fail here with a clear message instead of an AttributeError on
            # None at the first insert_* call.
            raise ValueError(
                'reconstruction/templates/index.html has no <div id="root"> element'
            )
        self.filename = filename

    def read_html_template(self, template_name):
        """
        Read and parse one html template.
        Args:
        template_name: Name of the template (file name without ".html")
        Returns:
        soup_template: Parsed template (BeautifulSoup object)
        """
        with open(
            f"reconstruction/templates/{template_name}.html", "r", encoding="utf-8"
        ) as f:
            template = f.read()
        return BeautifulSoup(template, "html.parser")

    def get_styles(self, dict):
        """
        Build the inline CSS that positions and sizes an element.
        Args:
        dict: Element info with the keys "top", "left", "elem_height" and
              "elem_width" (the parameter name shadows the builtin; it is
              kept unchanged for backward compatibility)
        Returns:
        styles: Inline style string, using vh for vertical and vw for
                horizontal values
        """
        styles = f'top: {dict["top"]}vh; left: {dict["left"]}vw; height: {dict["elem_height"]}vh; width: {dict["elem_width"]}vw;'
        return styles

    def _insert_text_element(self, template_name, element_info):
        """
        Render a text-bearing element from the given template and append it
        to the page root. Shared by insert_paragraph and insert_text_box.
        """
        template = self.read_html_template(template_name)

        # The text goes into the template's first <p>, the positioning style
        # onto its first <div>.
        template.find("p").append(template.new_string(element_info["text"]))
        template.find("div")["style"] = self.get_styles(element_info)

        self.index_template_root_div.append(template)

    def insert_paragraph(self, paragraph_info):
        """
        Insert a paragraph into the page.
        Args:
        paragraph_info: Information about the paragraph ("text" plus the
                        keys required by get_styles)
        """
        self._insert_text_element("paragraph", paragraph_info)

    def insert_text_box(self, text_box_info):
        """
        Insert a text box into the page.
        Args:
        text_box_info: Information about the text box ("text" plus the keys
                       required by get_styles)
        """
        self._insert_text_element("text_box", text_box_info)

    def insert_image(self, img_info):
        """
        Insert an image into the page.
        Args:
        img_info: Information about the image ("img_src" plus the keys
                  required by get_styles)
        """
        image_template = self.read_html_template("image")

        img_div = image_template.find("div")
        img_div["style"] = self.get_styles(img_info)

        # The <img> stretches to fill the positioned container exactly.
        img_tag = image_template.new_tag("img")
        img_tag["src"] = img_info["img_src"]
        img_tag["style"] = "width: 100%; height: 100%; object-fit: fill;"

        img_div.append(img_tag)

        self.index_template_root_div.append(image_template)

    def create_html_file(self):
        """
        Write the populated page to "<root>/reconstruction/<filename>.html",
        where <root> is the parent of the module-level `img_src_save_dir`.

        NOTE: this reads the notebook global `img_src_save_dir`; it must be
        defined before this method is called.
        """
        global img_src_save_dir
        html_path = Path(img_src_save_dir).parent
        target = html_path / f"reconstruction/{self.filename}.html"
        # Create the output directory on first use instead of failing.
        target.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8: the OCR text is Bangla and cannot be encoded with
        # legacy default encodings such as cp1252.
        with open(target, "w", encoding="utf-8") as f:
            f.write(
                str(self.index_template.prettify(formatter=HTMLFormatter(indent=2)))
            )

# Helper function that creates an HtmlGenerator and passes each detected element to the matching insert method

def generate_html(detected_elements_info, file_name):
    """
    Generate the reconstructed html file for one image.

    Each detected element is handed to the HtmlGenerator method that matches
    its "class"; elements of any other class (e.g. "table") are skipped.
    Args:
    detected_elements_info: Information about the detected elements
                            (list of dicts with at least a "class" key)
    file_name: Name of the source image file; the html file is named after
               it with the final extension removed
    """
    # splitext removes only the last extension, so names with several dots
    # ("scan.v2.png") or none at all no longer raise on tuple unpacking.
    page_name = os.path.splitext(file_name)[0]

    gen = HtmlGenerator(page_name)

    inserters = {
        "paragraph": gen.insert_paragraph,
        "text_box": gen.insert_text_box,
        "image": gen.insert_image,
    }

    for element_info in detected_elements_info:
        insert = inserters.get(element_info["class"])
        if insert is not None:
            insert(element_info)

    gen.create_html_file()

# Helper function to get proper coordinate information and padding

def get_normalized_coordinates(xyxy_tensor, height, width):
    """
    Scale the first box of an xyxy tensor by the image size.
    Args:
    xyxy_tensor: Indexable whose first row holds x_min, y_min, x_max, y_max
                 as scalars exposing .item()
    height: Height of the image
    width: Width of the image
    Returns:
    coordinates: [x_min / width, y_min / height, x_max / width, y_max / height]
    """
    corners = xyxy_tensor[0]
    # x values are divided by the width, y values by the height
    divisors = (width, height, width, height)
    return [corners[k].item() / divisors[k] for k in range(4)]


def get_original_coordinates(normalized_coordinates, image_width, image_height):
    """
    Convert normalized box coordinates back to integer pixel coordinates.
    Args:
    normalized_coordinates: [x_min, y_min, x_max, y_max] as fractions of the
                            image size
    image_width: Width of the image
    image_height: Height of the image
    Returns:
    orig_coordinates: [x_min, y_min, x_max, y_max] in pixels; the minimum
                      corner is rounded down and the maximum corner rounded up
    """
    scales = (image_width, image_height, image_width, image_height)
    # floor for the top-left corner, ceil for the bottom-right corner
    roundings = (math.floor, math.floor, math.ceil, math.ceil)
    return [
        rounding(normalized_coordinates[k] * scales[k])
        for k, rounding in enumerate(roundings)
    ]




def top_bottom_padding(cropped_text_region):
    """
    Place the cropped text region on a white canvas 1.5x its height, with the
    extra rows split between the top and the bottom.

    The previous version computed the top offset as if the canvas were twice
    the crop height, which put all of the padding above the text and none
    below it. The canvas size is unchanged; only the vertical placement is
    corrected so the crop is centred.
    Args:
    cropped_text_region: Image array of shape (h, w) or (h, w, channels)
    Returns:
    padded_image: uint8 array of shape (int(h * 1.5), w[, channels]) filled
                  with 255 outside the pasted region
    """
    h, w = cropped_text_region.shape[:2]
    padded_height = int(h * 1.5)

    # Keep any trailing (channel) axes of the input so grayscale and
    # multi-channel crops both work.
    canvas_shape = (padded_height, w) + tuple(cropped_text_region.shape[2:])
    padded_image = np.full(canvas_shape, 255, dtype=np.uint8)

    # Centre the crop vertically: half of the extra rows go above it.
    top_padding = (padded_height - h) // 2
    padded_image[top_padding:top_padding + h] = cropped_text_region

    return padded_image

# Inference
- This cell takes care of all the inference: layout detection first (`res = dla_model(image)[0]`), then line segmentation and word segmentation, and finally it returns all detected regions with their appropriate coordinates.

In [3]:

# Class id predicted by the layout model -> element class name
names = {0: "paragraph", 1: "text_box", 2: "image", 3: "table"}

# Two boxes are treated as duplicates when their intersection covers more than
# this fraction of the smaller box.
OVERLAP_DISCARD_THRESHOLD = 0.5


def _find_redundant_indices(elements, threshold=OVERLAP_DISCARD_THRESHOLD):
    """Return the indices of elements that are mostly covered by another element."""
    shapes = [box(*element["coordinates"][:4]) for element in elements]
    redundant = set()
    for i, bb1 in enumerate(shapes):
        for j in range(i + 1, len(shapes)):
            bb2 = shapes[j]
            # The smaller box of the pair is the discard candidate; on equal
            # areas the later one (j) is chosen, as before.
            if bb1.area < bb2.area:
                smaller_index, smaller_area = i, bb1.area
            else:
                smaller_index, smaller_area = j, bb2.area
            if smaller_area <= 0:
                # Degenerate box: the overlap ratio is undefined, skip the
                # pair instead of dividing by zero.
                continue
            if bb1.intersection(bb2).area / smaller_area > threshold:
                redundant.add(smaller_index)
    return redundant


def run_inference(image_path, file_name, img_src_save_directory):
    """
    Run layout detection and text recognition on one image.

    For every detected box an info dict is built with its class, its
    normalized coordinates and its position/size in percent. Paragraph and
    text-box regions are cropped, padded and passed to the OCR model; image
    regions are cropped and written to "reconstruction/image/". Finally,
    boxes that are mostly covered by a larger box are dropped.
    Args:
    image_path: Path of the image
    file_name: Name of the image file (used to name the saved outputs)
    img_src_save_directory: Directory prefix used to build each image
                            element's "img_src"
    Returns:
    region_of_interests: List of info dicts for the remaining elements
    Raises:
    ValueError: if the file at image_path cannot be read as an image
    """
    # splitext keeps names with several dots (or none) working; `suffix`
    # includes the leading dot.
    stem, suffix = os.path.splitext(file_name)

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"Could not read image: {image_path}")

    # The annotated page and the cropped images are written here.
    os.makedirs("reconstruction/image", exist_ok=True)

    res = dla_model(image)[0]
    res.save(f"reconstruction/image/{stem}{suffix}")

    region_of_interests = []
    for i in range(len(res.boxes)):
        detection = res.boxes[i]
        cls = detection.cls.item()
        img_height, img_width = detection.orig_shape
        normalized_coordinates = get_normalized_coordinates(
            detection.xyxy, img_height, img_width
        )
        x0, y0, x1, y1 = normalized_coordinates

        info_dict = {
            # None for a class id that is not listed in `names`
            "class": names.get(int(cls)),
            "coordinates": normalized_coordinates,
            # position and size as percentages of the page
            "left": x0 * 100,
            "top": y0 * 100,
            "elem_height": (y1 - y0) * 100,
            "elem_width": (x1 - x0) * 100,
            "img_height": img_height,
            "img_width": img_width,
            "text": None,
            "single-line": False,
            "img_src": None,
        }

        if info_dict["class"] in ("paragraph", "text_box", "image"):
            x_min, y_min, x_max, y_max = get_original_coordinates(
                normalized_coordinates, img_width, img_height
            )
            cropped_region = image[y_min:y_max, x_min:x_max]

            if info_dict["class"] == "image":
                info_dict["img_src"] = f"{img_src_save_directory}\\{stem}_{i}{suffix}"
                cv2.imwrite(
                    f"reconstruction/image/{stem}_{i}{suffix}",
                    cropped_region,
                )
            else:
                ocr_result = ocr(top_bottom_padding(cropped_region))
                info_dict["text"] = ocr_result["text"].replace("\n", "")

        region_of_interests.append(info_dict)

    # Filter in a single pass. The previous in-place deletion assumed the
    # discard indices were sorted ascending, which they are not in general,
    # so it could remove the wrong elements.
    redundant = _find_redundant_indices(region_of_interests)
    return [
        element
        for index, element in enumerate(region_of_interests)
        if index not in redundant
    ]

This function drives the whole pipeline: for every image file in the directory it runs inference and passes the result to HTML generation.

In [4]:
def reconstruct(directory, img_src_save_dir):
    """
    Reconstruct every image file found directly inside a directory.

    For each file the layout/OCR inference is run and the result is turned
    into an html page; the time taken by both stages is printed.
    Args:
    directory: Directory containing the images
    img_src_save_dir: Directory prefix passed to run_inference for building
                      the "img_src" of image elements
    """
    # The arguments are used as given; they were previously overwritten with
    # the hard-coded value "image/", which made both parameters dead.
    # Sorting makes the processing order deterministic.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            # Sub-directories are skipped.
            continue

        print(
            "----------------------------------------------------------------------------"
        )
        print("File name:", file_name)

        start_time = time.time()
        roi = run_inference(file_path, file_name, img_src_save_dir)
        print(
            "Execution Time for Layout Prediction and Text Recognition:",
            round(time.time() - start_time, 2),
            "seconds",
        )

        start_time = time.time()
        print(roi)
        generate_html(roi, file_name)
        print(
            "Execution Time for Reconstruction:",
            round(time.time() - start_time, 2),
            "seconds",
        )

Modify these paths to suit your needs.

In [5]:
# Directory passed to reconstruct() as the folder of input images.
test_image_directory = "image/"
# Passed to reconstruct() as the image-source directory. It must also stay a
# module-level variable with exactly this name, because
# HtmlGenerator.create_html_file reads it through `global img_src_save_dir`
# to decide where the html file is written.
img_src_save_dir = "image/"
# Run the full pipeline (inference + html generation).
reconstruct(test_image_directory, img_src_save_dir)

----------------------------------------------------------------------------
File name: Prothom_Alo.png

0: 352x640 5 paragraphs, 3 text_boxs, 267.3ms
Speed: 3.9ms preprocess, 267.3ms inference, 295.9ms postprocess per image at shape (1, 3, 352, 640)
Execution Time for Layout Prediction and Text Recognition: 19.84 seconds
[{'class': 'paragraph', 'coordinates': [0.003550624729090234, 0.5779537525866941, 0.9953124091287996, 0.7196987156550831], 'left': 0.3550624729090234, 'top': 57.79537525866941, 'elem_height': 14.174496306838902, 'elem_width': 99.17617843997094, 'img_height': 857, 'img_width': 1612, 'text': " তবে ধর্মভিত্তিক দলগুলোর অন্যতম ইসলামী আন্দোলন সংসদ নির্বাচনের মতো উপজেলা নির্বাচনেও না যাওয়ার সিদ্ধান্ত নিয়েছে। এই' .সদদ্ধান্ত কার্যকর করতে ইতিমধ্যে তারা মাঠপর্যায়ে নির্দেশনা পাঠিয়েছে যে দলের দায়িত্বশীল কেউ যাতে এই' 'নির্বাচনে অংশনা নেন। যদিও ইসলামী আন্দোলন স্থানীয় সরকারের আগের উপজেলা ও ইউনিয়ন পরিষদের নির্বাচনে অংশ নিয়েছিল।", 'single-line': False, 'img_src': None}, {'class': 'para