In [1]:
import os
import json
import pandas as pd
import numpy as np
import json
from tqdm import tqdm

from shapely.geometry import Polygon
import glob
from PIL import Image
from pytesseract import pytesseract
from lxml import etree
import ast

from sklearn.model_selection import train_test_split
# Location of the Tesseract executable. Set the TESSERACT_CMD environment
# variable to override it on machines where Tesseract is installed elsewhere;
# the fallback is the path this notebook was originally written against.
pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

In [3]:
# Load the Label Studio export. The context manager closes the file handle
# (a bare open() left it dangling) and the explicit encoding keeps the read
# independent of the platform's default code page.
with open('labeling_result.json', encoding='utf-8') as f:
    label_studio_data = json.load(f)
label_studio_data

[{'id': 13,
  'annotations': [{'id': 13,
    'completed_by': 1,
    'result': [{'original_width': 879,
      'original_height': 1295,
      'image_rotation': 0,
      'value': {'x': 1.0974041519163262,
       'y': 0.37243947858473,
       'width': 45.26792126654845,
       'height': 62.56983240223464,
       'rotation': 0,
       'rectanglelabels': ['image']},
      'id': 'oAG_FRxy1n',
      'from_name': 'label',
      'to_name': 'image',
      'type': 'rectanglelabels',
      'origin': 'manual'},
     {'original_width': 879,
      'original_height': 1295,
      'image_rotation': 0,
      'value': {'x': 1.6461062278744891,
       'y': 63.50093109869647,
       'width': 44.71921919059029,
       'height': 3.351955307262557,
       'rotation': 0,
       'rectanglelabels': ['caption-image']},
      'id': 'UmHbH1FUNa',
      'from_name': 'label',
      'to_name': 'image',
      'type': 'rectanglelabels',
      'origin': 'manual'},
     {'original_width': 879,
      'original_height': 1295,

In [4]:
# Folder that later receives the hOCR files written by Tesseract.
# exist_ok=True makes re-running this cell harmless when the folder already exists.
os.makedirs('./data/layoutlmv3_hocr_output', exist_ok=True)

In [5]:
def calculate_iou(box_1, box_2):
    """Return the share of the smaller box that is covered by the other box.

    Despite the name this is not a true intersection-over-union: the
    intersection area is divided by the area of the smaller polygon, so a
    small box lying completely inside a large one scores 1.0.

    Args:
        box_1: Sequence of (x, y) vertices, passed to ``Polygon``.
        box_2: Sequence of (x, y) vertices, passed to ``Polygon``.

    Returns:
        Intersection area divided by the smaller polygon's area, or 0.0 when
        either polygon has zero area (nothing can overlap an empty box).
    """
    poly_1 = Polygon(box_1)
    poly_2 = Polygon(box_2)

    min_area = min(poly_1.area, poly_2.area)
    if min_area == 0:
        # Degenerate box (zero width or height): avoid ZeroDivisionError.
        return 0.0

    return poly_1.intersection(poly_2).area / min_area
    
    
def hocr_to_dataframe(fp):
    """Convert an .hocr file into a dataframe of words, coords and confidence.

    Args:
        fp: Source of the hOCR document, handed to ``etree.parse``.

    Returns:
        pd.DataFrame with one row per ``ocrx_word`` element and the columns
        ``word`` (element text, ``'-'`` when there is none), ``coords``
        (``[x1, y1, x2, y2]`` ints taken from the ``bbox`` property) and
        ``confidence`` (int taken from ``x_wconf``, 0 when absent).

    Raises:
        ValueError: If a word element has no usable four-value ``bbox``.
    """
    doc = etree.parse(fp)
    words, wordConf, coords_list = [], [], []

    for path in doc.xpath('//*'):
        if 'ocrx_word' not in path.values():
            continue

        # Example: title="bbox 36 148 215 173; x_wconf 95"
        title = path.attrib.get('title', '')
        word_coord, conf = None, 0
        for prop in title.split(';'):
            # split() without a separator tolerates repeated or odd whitespace
            fields = prop.split()
            if not fields:
                continue
            if fields[0] == 'bbox':
                word_coord = [int(value) for value in fields[1:]]  # [x1, y1, x2, y2]
            elif fields[0] == 'x_wconf' and len(fields) > 1:
                # int(float(...)) also accepts a decimal value such as "95.5"
                conf = int(float(fields[1]))

        if word_coord is None or len(word_coord) != 4:
            raise ValueError(f"hOCR word without a valid bbox: title={title!r}")

        # The text can sit inside child markup, in which case path.text is
        # empty; fall back to the concatenated text of the whole element.
        text = path.text
        if not (text and text.strip()):
            text = ''.join(path.itertext()).strip()

        wordConf.append(conf)
        coords_list.append(word_coord)
        words.append(text if text else '-')

    return pd.DataFrame({'word': words, 'coords': coords_list, 'confidence': wordConf})

In [6]:
# Turn every Label Studio task into a file name plus its labelled pixel boxes.
document_data = {'file_name': [], 'labelled_bbox': []}

for row in label_studio_data:
    annotations = row.get('annotations') or []
    if not annotations:
        # Task without annotations: annotations[0] would raise IndexError.
        continue

    file_name = os.path.basename(row['data']['image'])
    label_list = []

    # Only the first annotation of a task is used.
    for label_ in annotations[0]['result']:
        label_value = label_['value']
        if 'rectanglelabels' not in label_value:
            # Results without a rectangle label cannot be turned into a box.
            continue

        x, y, w, h = label_value['x'], label_value['y'], label_value['width'], label_value['height']
        original_w, original_h = label_['original_width'], label_['original_height']

        # x / y / width / height are percentages of the original image size;
        # convert them to integer pixel coordinates.
        x1 = int((x * original_w) / 100)
        y1 = int((y * original_h) / 100)
        x2 = x1 + int(original_w * w / 100)
        y2 = y1 + int(original_h * h / 100)

        label = label_value['rectanglelabels']
        label_list.append((label, (x1, y1, x2, y2), original_h, original_w))

    document_data['file_name'].append(file_name)
    document_data['labelled_bbox'].append(label_list)

custom_dataset = pd.DataFrame(document_data)

In [7]:
# Show the assembled table (file_name, labelled_bbox) as a visual sanity check.
custom_dataset

Unnamed: 0,file_name,labelled_bbox
0,e2a55870-13.png,"[([image], (9, 4, 406, 814), 1295, 879), ([cap..."
1,800bbe52-14.png,"[([image], (7, 4, 397, 331), 1301, 863), ([cap..."
2,13a1e382-25.png,"[([image], (6, 8, 398, 207), 1103, 835), ([cap..."
3,e1253a39-26.png,"[([image], (11, 20, 375, 234), 1239, 821), ([c..."
4,58cf7bf4-27.png,"[([image], (431, 100, 800, 570), 1232, 840), (..."
5,90d8d74d-32.png,"[([image], (0, 315, 375, 647), 1146, 784), ([c..."
6,3b6c5c88-41.png,"[([image], (32, 17, 365, 221), 1171, 798), ([c..."
7,3ea3794d-42.png,"[([image], (132, 6, 732, 584), 1227, 832), ([c..."
8,e3d40692-44.png,"[([image], (102, 222, 716, 563), 1174, 826), (..."
9,cdd9c1fd-49.png,"[([image], (38, 590, 394, 1158), 1211, 928), (..."


In [8]:
# Class name -> integer tag, numbered in the order the names are listed.
label2id = {name: idx for idx, name in enumerate(("image", "caption-image"))}
# Reverse lookup: integer tag -> class name.
id2label = {idx: name for name, idx in label2id.items()}

In [9]:
%%time

final_list = []

# Index the dataset folder once instead of re-globbing it for every row.
image_paths = {os.path.basename(p): p for p in glob.glob('./data/dataset/*.png')}

for idx, row in tqdm(custom_dataset.iterrows(), total=custom_dataset.shape[0]):
    file_name = row['file_name']
    image = image_paths.get(file_name)
    if image is None:
        # No matching image on disk: the row produces no example.
        continue

    # Run Tesseract and let it write <base_name>.hocr into the output folder.
    base_name = os.path.join('./data/layoutlmv3_hocr_output', os.path.splitext(file_name)[0])
    pytesseract.run_tesseract(image, base_name, extension='hocr', lang=None, config="hocr")

    hocr_file = base_name + '.hocr'
    if not os.path.exists(hocr_file):
        raise FileNotFoundError(f"Tesseract failed to create {hocr_file}")

    hocr_df = hocr_to_dataframe(hocr_file)
    # Materialise the OCR words once; they are scanned again for every label box.
    ocr_words = list(zip(hocr_df['word'], hocr_df['coords']))

    word_list, ner_tags_list, bboxes_list = [], [], []
    for label_coord in row['labelled_bbox']:
        (x1, y1, x2, y2) = label_coord[1]
        box1 = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
        label = label_coord[0][0]

        for text, coords in ocr_words:
            if text == '-':
                # '-' is the placeholder for words without text; never keep it.
                continue
            (x1df, y1df, x2df, y2df) = coords
            box2 = [[x1df, y1df], [x2df, y1df], [x2df, y2df], [x1df, y2df]]
            # Keep a word when more than 80% of the smaller box is covered.
            if calculate_iou(box1, box2) > 0.80:
                word_list.append(text)
                bboxes_list.append(coords)
                ner_tags_list.append(label2id[label])

    final_list.append({
        'id': idx,
        'file_name': file_name,
        'tokens': word_list,
        'bboxes': bboxes_list,
        'ner_tags': ner_tags_list,
    })

100%|██████████| 14/14 [00:41<00:00,  2.93s/it]

CPU times: total: 4.23 s
Wall time: 41.1 s





In [None]:
# with open('test.json','w') as file:
#     json.dump(final_list, file, indent=2)

In [16]:
train, test = train_test_split(final_list, random_state=21, test_size=0.2)

# Each file holds one record per line, written as the Python repr of the dict.
# Mode 'w' instead of 'a': re-running the cell replaces the files, whereas
# appending added another full copy of the records on every run. The file is
# also opened once rather than once per record.
with open('final_list_text.txt', 'w') as f:
    for detail in final_list:
        f.write(str(detail))
        f.write('\n')

with open('train.txt', 'w') as f:
    for detail in train:
        f.write(str(detail))
        f.write('\n')

In [17]:
# One record per line, written as the Python repr of the dict.
# Mode 'w' instead of 'a': re-running the cell replaces test.txt, whereas
# appending added another copy of the records on every run. The file is also
# opened once rather than once per record.
with open('test.txt', 'w') as f:
    for detail in test:
        f.write(str(detail))
        f.write('\n')