Jumpstart? How?

- Create a folder called "reconstruction"
- Create another folder called "templates" inside reconstruction
- Populate it with the HTML templates the notebook loads: "index.html" (it must contain a div whose id is "root"), "paragraph.html", "text_box.html" and "image.html". They cannot all be embedded in this notebook, so ask me (Istiak Shihab) for them. I will also provide a snapshot of my current directory structure; copy over the files you need from it.
- Create another folder called "img_src" inside reconstruction
- Create another folder called "image" inside reconstruction
- Create another folder called "html_output" inside reconstruction
- Now leave the reconstruction folder and, next to it, create yet another folder called "image". This is where you keep the PNG images to run inference on.
- Make sure the folder structure is like this: image/   best.pt    make-html.ipynb  environment.yml    reconstruction/...
- Run this command `conda env create -f environment.yml`
- You should be good to go? Hopefully.
- Good Luck!

In [None]:
import os
import cv2
import time
import copy
import math
import numpy as np
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from bs4.formatter import HTMLFormatter
from ultralytics import YOLO
from paddleocr import PaddleOCR
from pathlib import Path
from shapely.geometry import box
from IPython.display import HTML, display

Path to the YOLO model weights. Currently this assumes the weights file sits in the root directory (i.e. next to this notebook) and that its name is "best.pt".

In [None]:
# Relative path of the YOLO weights file; it is passed to YOLO(...) when the models are initialized.
yolo_model_weight_path = "best.pt"

Initialize Line Segmentation, Word Segmentation, OCR, and Layout Detection (YOLO) models

In [None]:
# PaddleOCR instance used by line_segmentation (angle classifier disabled).
line = PaddleOCR(use_angle_cls=False, lang="en", use_gpu=True)
# PaddleOCR instance used by word_segmentation (angle classifier disabled).
# NOTE(review): built with lang="ar" while the other two instances use "en" — presumably deliberate for word-level boxes; confirm.
word = PaddleOCR(use_angle_cls=False, lang="ar", use_gpu=True)
# PaddleOCR instance used by recognize_word (angle classifier enabled).
# NOTE(review): use_gpu is not passed here, unlike the two instances above — confirm the library default is intended.
ocr = PaddleOCR(use_angle_cls=True, lang="en")
# Layout-detection model loaded from yolo_model_weight_path.
model = YOLO(yolo_model_weight_path)

Inference helper functions for Line, Word segmentation and Word Recognition

In [None]:
def line_segmentation(image):
    """
    Detect text lines with the module-level ``line`` PaddleOCR instance
    Args:
    image: Image to run detection on
    Returns:
    First entry of the detector output (recognition and angle classification are switched off)
    """
    detections = line.ocr(image, rec=False, cls=False)
    return detections[0]


def word_segmentation(image):
    """
    Detect words with the module-level ``word`` PaddleOCR instance
    Args:
    image: Image to run detection on
    Returns:
    First entry of the detector output (recognition and angle classification are switched off)
    """
    detections = word.ocr(image, rec=False, cls=False)
    return detections[0]

def recognize_word(image, result_word):
    """
    Recognize the text of an image with the module-level ``ocr`` PaddleOCR instance
    Args:
    image: Image to run recognition on (detection is switched off, angle classification is on)
    result_word: Not used by this function; kept so existing callers keep working
    Returns:
    Element [0][0][0] of the recognizer output
    """
    recognition = ocr.ocr(image, det=False, rec=True, cls=True)
    first_entry = recognition[0][0]
    return first_entry[0]

Helper Functions for Inference run

In [None]:
def padWordImage(img, pad_loc, pad_dim, pad_val):
    """
    Pad an image on the right or at the bottom up to a target size
    Args:
    img: Image array of shape (height, width, channels)
    pad_loc: "lr" pads on the right until the width equals pad_dim; any other value pads at the bottom until the height equals pad_dim
    pad_dim: Target width ("lr") or target height (otherwise)
    pad_val: Value the added pixels are filled with
    Returns:
    img: Padded image cast to uint8, or the input returned unchanged when it is already larger than the target
    """
    h, w, d = img.shape
    if pad_loc == "lr":
        # A wider image cannot be padded; hand it back instead of asking numpy
        # for a negative-sized block.
        if w > pad_dim:
            return img
        pad_shape, axis = (h, pad_dim - w, d), 1
    else:
        if h >= pad_dim:
            return img
        pad_shape, axis = (pad_dim - h, w, d), 0

    # The padding block follows the channel count of the input image.
    pad = np.ones(pad_shape) * pad_val
    return np.concatenate([img, pad], axis=axis).astype("uint8")

In [None]:
def correctPadding(img, dim, pvalue=255):
    """
    Fit an image into a fixed (height, width) canvas by resizing and padding
    Args:
    img: Image array of shape (height, width, channels)
    dim: Target size as (height, width)
    pvalue: Value the padded pixels are filled with
    Returns:
    img: Image resized to exactly (height, width)
    mask: Target width when the image had to be shrunk, the resized content width when it was padded on the right, 0 when the width already matched
    """
    target_h, target_w = dim
    mask = 0

    # First bring the image to the target height, keeping its aspect ratio.
    src_h, src_w, _ = img.shape
    scaled_w = int(target_h * src_w / src_h)
    img = cv2.resize(img, (scaled_w, target_h))

    cur_h, cur_w, _ = img.shape
    if cur_w > target_w:
        # Too wide: shrink to the target width, then fill the missing rows below.
        shrunk_h = int(target_w * cur_h / cur_w)
        img = cv2.resize(img, (target_w, shrunk_h))
        img = padWordImage(img, pad_loc="tb", pad_dim=target_h, pad_val=pvalue)
        mask = target_w
    elif cur_w < target_w:
        # Too narrow: fill the missing columns on the right.
        img = padWordImage(img, pad_loc="lr", pad_dim=target_w, pad_val=pvalue)
        mask = cur_w

    return cv2.resize(img, (target_w, target_h)), mask

In [None]:
def word_horizontal_dilation(boxes, image):
    """
    Close the horizontal gaps between neighbouring word boxes and crop every word

    For each pair of consecutive boxes separated by a positive gap, the right
    edge of the first box and the left edge of the second box are each moved
    by half of the gap (floor division). ``boxes`` is updated in place.
    Args:
    boxes: Word boxes, expected in left-to-right order; each box is four [x, y] points read as [top-left, top-right, bottom-right, bottom-left]
    image: Image array of shape (height, width, channels) the boxes refer to
    Returns:
    crops: One crop per box, in input order; zero-sized crops are skipped. Empty when boxes is empty
    """
    crops = []
    box_count = len(boxes)
    for i, current in enumerate(boxes):
        if i + 1 < box_count:
            following = boxes[i + 1]
            gap = following[3][0] - current[2][0]
            if gap > 0:
                half_gap = gap // 2
                new_right = current[2][0] + half_gap
                new_left = following[3][0] - half_gap
                current[1][0], current[2][0] = new_right, new_right
                following[0][0], following[3][0] = new_left, new_left

        # Crop every box, including the last one and boxes that touch or
        # overlap their neighbour, so no word is dropped or duplicated.
        x_min, x_max = current[3][0], current[2][0]
        y_min, y_max = current[1][1], current[3][1]
        crop = image[int(y_min) : int(y_max), int(x_min) : int(x_max)]
        if crop.shape[0] != 0 and crop.shape[1] != 0:
            crops.append(crop)

    return crops

Declare the HtmlGenerator class. It loads the page template from "reconstruction/templates/index.html" and populates it with one element per detected layout region, using the template that matches the region's class (paragraph, text box or image) together with the region's content.

In [None]:
class HtmlGenerator:
    """
    Builds one reconstructed HTML page from the templates in "reconstruction/templates"

    The page skeleton comes from "index.html". Every inserted element is rendered
    from its own template file and appended to the skeleton's div whose id is "root".
    """

    def __init__(self, filename="default"):
        """
        Load the page skeleton and remember the output file name
        Args:
        filename: Name of the html file to create, without the ".html" extension
        """
        # Explicit UTF-8 so reading does not depend on the platform's default encoding.
        with open("reconstruction/templates/index.html", "r", encoding="utf-8") as f:
            index_template = f.read()

        self.index_template = BeautifulSoup(index_template, "html.parser")
        self.index_template_root_div = self.index_template.find("div", {"id": "root"})
        self.filename = filename

    def read_html_template(self, template_name):
        """
        Read and parse one element template
        Args:
        template_name: Name of the template file without the ".html" extension
        Returns:
        soup_template: Parsed template
        """
        template_path = f"reconstruction/templates/{template_name}.html"
        with open(template_path, "r", encoding="utf-8") as f:
            return BeautifulSoup(f.read(), "html.parser")

    def get_styles(self, dict):
        """
        Build the inline position/size style of an element
        Args:
        dict: Mapping with the keys "top", "left", "elem_height" and "elem_width" (the parameter name shadows the builtin; kept for backward compatibility)
        Returns:
        styles: CSS declarations using vh for top/height and vw for left/width
        """
        styles = f'top: {dict["top"]}vh; left: {dict["left"]}vw; height: {dict["elem_height"]}vh; width: {dict["elem_width"]}vw;'
        return styles

    def _insert_text_element(self, template_name, element_info):
        """
        Render a text template and append it to the page
        Args:
        template_name: Template to render ("paragraph" or "text_box")
        element_info: Mapping with "text" plus the keys read by get_styles
        """
        template = self.read_html_template(template_name)

        # The text goes into the template's first <p>, the geometry onto its first <div>.
        template.find("p").append(template.new_string(element_info["text"]))
        template.find("div")["style"] = self.get_styles(element_info)

        self.index_template_root_div.append(template)

    def insert_paragraph(self, paragraph_info):
        """
        Insert a paragraph into the page
        Args:
        paragraph_info: Information about the paragraph ("text" and the style keys)
        """
        self._insert_text_element("paragraph", paragraph_info)

    def insert_text_box(self, text_box_info):
        """
        Insert a text box into the page
        Args:
        text_box_info: Information about the text box ("text" and the style keys)
        """
        self._insert_text_element("text_box", text_box_info)

    def insert_image(self, img_info):
        """
        Insert an image into the page
        Args:
        img_info: Information about the image ("img_src" and the style keys)
        """
        image_template = self.read_html_template("image")

        img_div = image_template.find("div")
        img_div["style"] = self.get_styles(img_info)

        # The <img> fills the positioned <div> completely.
        img_tag = image_template.new_tag("img")
        img_tag["src"] = img_info["img_src"]
        img_tag["style"] = "width: 100%; height: 100%; object-fit: fill;"
        img_div.append(img_tag)

        self.index_template_root_div.append(image_template)

    def create_html_file(self, base_dir=None):
        """
        Write the page to "<base_dir>/reconstruction/<filename>.html"
        Args:
        base_dir: Directory that contains the "reconstruction" folder. When omitted, the parent of the module-level ``img_src_save_dir`` is used, as before
        """
        if base_dir is None:
            # Backward-compatible default: relies on the notebook-level variable.
            base_dir = Path(img_src_save_dir).parent
        target = Path(base_dir) / "reconstruction" / f"{self.filename}.html"

        # Explicit UTF-8 so recognized text with non-ASCII characters can always be written.
        with open(target, "w", encoding="utf-8") as f:
            f.write(self.index_template.prettify(formatter=HTMLFormatter(indent=2)))

Helper function that initializes HtmlGenerator and passes each detected element to the matching insert method

In [None]:
def generate_html(detected_elements_info, file_name):
    """
    Generate the html file for one image
    Args:
    detected_elements_info: Information about the detected elements; each entry needs a "class" key plus the keys the matching HtmlGenerator insert method reads
    file_name: Name of the source image file; the html file is named after it with the extension removed
    """
    # Split at the last dot only, so names with several dots (or none) do not raise.
    base_name, dot, _ = file_name.rpartition(".")
    if not dot:
        base_name = file_name

    gen = HtmlGenerator(base_name)

    inserters = {
        "paragraph": gen.insert_paragraph,
        "text_box": gen.insert_text_box,
        "image": gen.insert_image,
    }
    for element_info in detected_elements_info:
        # Elements of any other class (e.g. "table") are skipped.
        insert = inserters.get(element_info["class"])
        if insert is not None:
            insert(element_info)

    gen.create_html_file()

Helper function to get proper coordinate information and padding

In [None]:
def get_normalized_coordinates(xyxy_tensor, height, width):
    """
    Scale the first box of an xyxy tensor to fractions of the image size
    Args:
    xyxy_tensor: Indexable whose first row holds [x_min, y_min, x_max, y_max]; every entry must provide .item()
    height: Height of the image
    width: Width of the image
    Returns:
    coordinates: [x_min / width, y_min / height, x_max / width, y_max / height]
    """
    first_box = xyxy_tensor[0]
    # x values are divided by the width, y values by the height.
    divisors = (width, height, width, height)
    return [first_box[k].item() / divisor for k, divisor in enumerate(divisors)]


def get_original_coordinates(normalized_coordinates, image_width, image_height):
    """
    Convert normalized box coordinates back to integer pixel coordinates
    Args:
    normalized_coordinates: [x_min, y_min, x_max, y_max] as fractions of the image size
    image_width: Width of the image
    image_height: Height of the image
    Returns:
    orig_coordinates: [x_min, y_min, x_max, y_max] in pixels; the minimum corner is rounded down and the maximum corner is rounded up
    """
    norm = normalized_coordinates
    return [
        math.floor(norm[0] * image_width),
        math.floor(norm[1] * image_height),
        math.ceil(norm[2] * image_width),
        math.ceil(norm[3] * image_height),
    ]


def get_coordinates_from_segmentation(result_word):
    """
    Convert four-point detection boxes to integer [x_min, y_min, x_max, y_max] lists
    Args:
    result_word: Sequence of boxes, each made of four [x, y] points (list or array), or None when nothing was detected
    Returns:
    words_xyxy: One [x_min, y_min, x_max, y_max] list per box; empty when result_word is None or empty
    """
    # Identity check: also safe when the detector output is a numpy array.
    if result_word is None:
        return []

    words_xyxy = []
    for quad in result_word:
        # x_min / y_max come from the 4th point, y_min from the 2nd and x_max from the 3rd.
        (_, _), (_, y_min), (x_max, _), (x_min, y_max) = quad
        words_xyxy.append([int(x_min), int(y_min), int(x_max), int(y_max)])

    return words_xyxy

def top_bottom_padding(cropped_text_region):
    """
    Add white padding above and below the cropped text region
    Args:
    cropped_text_region: Cropped text region of shape (height, width[, channels])
    Returns:
    padded_image: uint8 image whose height is int(1.5 * height), with the region vertically centered on a white (255) background
    """
    h, w = cropped_text_region.shape[:2]
    padded_height = int(h * 1.5)

    # The canvas keeps whatever channel layout the input has.
    canvas_shape = (padded_height, w) + tuple(cropped_text_region.shape[2:])
    padded_image = np.full(canvas_shape, 255, dtype=np.uint8)

    # Split the extra rows between top and bottom; the offset is derived from
    # the real canvas height so the region is centered rather than
    # bottom-aligned.
    top_padding = (padded_height - h) // 2
    padded_image[top_padding : top_padding + h] = cropped_text_region

    return padded_image

This function takes care of all the inference: layout detection first (`res = model(image)[0]`), then line segmentation, word segmentation and text recognition. It finally returns every detected element together with its coordinates.

In [None]:

# Maps the class index predicted by the layout model to the element type name used in this notebook.
names = {0: "paragraph", 1: "text_box", 2: "image", 3: "table"}

def _split_file_name(file_name):
    """Split "name.ext" at its last dot into (name, ext); ext is "" when there is no dot."""
    stem, dot, extension = file_name.rpartition(".")
    if not dot:
        return file_name, ""
    return stem, extension


def _read_region_text(text_region):
    """Run line detection, word detection and recognition on one cropped text region.

    Returns (text, is_single_line): the recognized lines joined top to bottom
    with single spaces, and whether exactly one line was detected.
    """
    padded_region = top_bottom_padding(text_region)
    line_boxes = get_coordinates_from_segmentation(line_segmentation(padded_region))
    line_boxes.sort(key=lambda line_box: line_box[1])  # top to bottom

    line_texts = []
    for x_min, y_min, x_max, y_max in line_boxes:
        line_crop = padded_region[y_min:y_max, x_min:x_max]
        if line_crop.size == 0:
            continue

        # Recognition only runs on lines in which the word detector found something.
        word_boxes = word_segmentation(line_crop)
        if word_boxes is None or len(word_boxes) == 0:
            continue
        first_box_points = sorted(word_boxes[0], key=lambda point: point[0])
        if len(first_box_points) == 0:
            continue

        line_texts.append(recognize_word(line_crop, [first_box_points]))

    return " ".join(line_texts), len(line_boxes) == 1


def _drop_contained_regions(regions, threshold=0.5):
    """Drop the smaller box of every pair whose intersection covers more than `threshold` of the smaller box."""
    shapes = [box(*region["coordinates"]) for region in regions]

    discarded = set()
    for i, first in enumerate(shapes):
        for j in range(i + 1, len(shapes)):
            second = shapes[j]
            overlap = first.intersection(second).area
            if first.area < second.area:
                smaller_index, smaller_area = i, first.area
            else:
                smaller_index, smaller_area = j, second.area
            # Zero-area boxes are left alone instead of dividing by zero.
            if smaller_area > 0 and overlap / smaller_area > threshold:
                discarded.add(smaller_index)

    # Filtering by index is independent of the order in which indices were collected.
    return [region for k, region in enumerate(regions) if k not in discarded]


def run_inference(image_path, file_name, img_src_save_directory):
    """
    Detect the layout of one page image and read the text of its text regions
    Args:
    image_path: Path of the image
    file_name: Name of the image file; it is split at its last dot into base name and extension
    img_src_save_directory: Directory used as prefix of the "img_src" value of image elements
    Returns:
    region_of_interests: One dict per kept detection with the keys "class",
    "coordinates" (normalized xyxy), "left"/"top"/"elem_width"/"elem_height"
    (percent of the page), "img_height", "img_width", "text", "single-line"
    and "img_src". Of two strongly overlapping detections only the larger is kept
    Raises:
    ValueError: If the image cannot be read
    """
    file_name, extension = _split_file_name(file_name)

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable file by returning None, not by raising.
        raise ValueError(f"Could not read image: {image_path}")

    res = model(image)[0]
    res.save(f"reconstruction/image/{file_name}.{extension}")

    region_of_interests = []
    for index in range(len(res.boxes)):
        detection = res.boxes[index]
        img_height, img_width = detection.orig_shape
        normalized_coordinates = get_normalized_coordinates(
            detection.xyxy, img_height, img_width
        )
        x_start, y_start, x_end, y_end = normalized_coordinates

        info_dict = {
            # Unknown class indices map to None, exactly like an unmatched if/elif chain.
            "class": names.get(detection.cls.item()),
            "coordinates": normalized_coordinates,
            "left": x_start * 100,
            "top": y_start * 100,
            "elem_height": (y_end - y_start) * 100,
            "elem_width": (x_end - x_start) * 100,
            "img_height": img_height,
            "img_width": img_width,
            "text": None,
            "single-line": False,
            "img_src": None,
        }

        if info_dict["class"] in ("paragraph", "text_box"):
            x_min, y_min, x_max, y_max = get_original_coordinates(
                normalized_coordinates, img_width, img_height
            )
            text, is_single_line = _read_region_text(image[y_min:y_max, x_min:x_max])
            info_dict["text"] = text
            if is_single_line:
                info_dict["single-line"] = True
                # Single-line elements get a taller box than the detected one.
                info_dict["elem_height"] *= 1.4
        elif info_dict["class"] == "image":
            x_min, y_min, x_max, y_max = get_original_coordinates(
                normalized_coordinates, img_width, img_height
            )
            # NOTE(review): the src uses a backslash separator and the caller's
            # directory, while the crop is written to "reconstruction/image/" —
            # confirm this resolves correctly from the generated HTML file.
            info_dict["img_src"] = (
                f"{img_src_save_directory}\\{file_name}_{index}.{extension}"
            )
            cv2.imwrite(
                f"reconstruction/image/{file_name}_{index}.{extension}",
                image[y_min:y_max, x_min:x_max],
            )

        region_of_interests.append(info_dict)

    return _drop_contained_regions(region_of_interests)

This function drives the whole pipeline: for every image in a directory it runs inference and passes the result to HTML generation.

In [None]:
def reconstruct(directory, img_src_save_dir):
    """
    Run inference and HTML generation for every file in a directory
    Args:
    directory: Directory containing the images; sub-directories are skipped
    img_src_save_dir: Image source directory, forwarded to run_inference
    """
    # The arguments are used as given; they are no longer replaced by
    # hard-coded paths. Sorting makes the processing order deterministic.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            continue

        print(
            "----------------------------------------------------------------------------"
        )
        print("File name:", file_name)

        start_time = time.time()
        roi = run_inference(file_path, file_name, img_src_save_dir)
        print(
            "Execution Time for Layout Prediction and Text Recognition:",
            round(time.time() - start_time, 2),
            "seconds",
        )

        start_time = time.time()
        print(roi)
        generate_html(roi, file_name)
        print(
            "Execution Time for Reconstruction:",
            round(time.time() - start_time, 2),
            "seconds",
        )

Modify this to your need.

In [None]:
# Folder containing the page images to process.
test_image_directory = "image/"
# Image-source directory passed to reconstruct(); HtmlGenerator.create_html_file also reads this
# module-level name to decide where the html file is written.
img_src_save_dir = "image/"
reconstruct(test_image_directory, img_src_save_dir)