In [3]:
import xml.etree.ElementTree as ET
import cv2

def normalize_bbox(bbox, width, height, scale=1000):
    """
    Rescale a pixel bounding box to the integer range [0, scale].

    Args:
        bbox: Sequence whose first four items are [x1, y1, x2, y2] in pixels.
        width: Width the x coordinates are divided by.
        height: Height the y coordinates are divided by.
        scale: Upper end of the target range (default 1000).

    Returns:
        A list of four integers; each coordinate is divided by the matching
        dimension, multiplied by scale and truncated with int().
    """
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    # x coordinates are relative to the width, y coordinates to the height.
    pairs = zip((left, top, right, bottom), (width, height, width, height))
    return [int(scale * (coord / extent)) for coord, extent in pairs]

def _alto_int(value):
    """Convert an ALTO coordinate attribute to an integer pixel value.

    Position and size attributes may be written as decimals (e.g. "123.0"),
    which a plain int() rejects, so the value is parsed as a float first and
    then truncated.
    """
    return int(float(value))


def load_and_process_alto(alto_file):
    """
    Loads and processes an ALTO file, returning two lists:
        tokens: a list of strings, where each string is the CONTENT of a <String> element
        bboxes: a list of lists of integers, where each inner list represents the bounding box
                of the corresponding token in the tokens list. The format is [X1, Y1, X2, Y2]

    Args:
        alto_file: Path or file object accepted by xml.etree.ElementTree.parse.

    Only <String> elements nested in a <TextLine> inside a <Page> are collected,
    in document order. Coordinates are taken as-is from HPOS/VPOS/WIDTH/HEIGHT
    and are not clamped, so they can be negative if the file says so.

    Raises:
        KeyError: if a <String> lacks CONTENT, HPOS, VPOS, WIDTH or HEIGHT.
        ValueError: if a coordinate attribute is not numeric.
    """
    tokens = []
    bboxes = []
    root = ET.parse(alto_file).getroot()

    # ElementTree spells namespaced tags as "{uri}Tag". Reuse the root's
    # "{uri}" prefix for lookups, or no prefix when the file is un-namespaced.
    tag = root.tag
    xmlns = tag[: tag.index("}") + 1] if tag.startswith("{") else ""

    for page in root.findall(".//" + xmlns + "Page"):
        for textline in page.findall(".//" + xmlns + "TextLine"):
            for string in textline.findall(".//" + xmlns + "String"):
                attrs = string.attrib
                content = attrs["CONTENT"]
                h = _alto_int(attrs["HEIGHT"])
                w = _alto_int(attrs["WIDTH"])
                y1 = _alto_int(attrs["VPOS"])
                x1 = _alto_int(attrs["HPOS"])

                tokens.append(content)
                # Convert (position, size) into corner coordinates.
                bboxes.append([x1, y1, x1 + w, y1 + h])

    return tokens, bboxes

In [4]:
def overlap(box1, box2):
  """
  Compute how much of the first bounding box is covered by the second.

  Note that this is NOT intersection-over-union: the intersection area is
  divided by the area of box1 only, so the result is not symmetric in its
  arguments.

  Args:
      box1: A list containing the coordinates of the first bounding box (xmin, ymin, xmax, ymax)
      box2: A list containing the coordinates of the second bounding box (xmin, ymin, xmax, ymax)

  Returns:
      The fraction [0.0-1.0] of the area of the first bounding box that is
      overlapped by the second bounding box. A box1 with zero area, or boxes
      that merely touch, yield 0.0.

  Raises:
      ValueError: if either box has xmin > xmax or ymin > ymax.
  """
  # Validate explicitly instead of with assert: asserts vanish under
  # `python -O`, which would let bad input reach the division below.
  for name, box in (("box1", box1), ("box2", box2)):
    if box[0] > box[2] or box[1] > box[3]:
      raise ValueError(f"{name} has inverted coordinates: {box}")

  box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
  if box1_area == 0:
    # A degenerate box has no area that could be covered.
    return 0.0

  # Corners of the intersection rectangle (axis-aligned, like its inputs).
  x_left = max(box1[0], box2[0])
  y_top = max(box1[1], box2[1])
  x_right = min(box1[2], box2[2])
  y_bottom = min(box1[3], box2[3])

  if x_right <= x_left or y_bottom <= y_top:
    return 0.0

  intersection_area = (x_right - x_left) * (y_bottom - y_top)
  return intersection_area / box1_area


def is_inside(box1, box2):
  """
  Tell whether box1 lies entirely within box2 (shared edges count as inside).

  Args:
      box1: The candidate inner box as (xmin, ymin, xmax, ymax).
      box2: The candidate outer box as (xmin, ymin, xmax, ymax).

  Returns:
      True if no edge of box1 extends past the matching edge of box2,
      False otherwise.
  """
  inner_left, inner_top, inner_right, inner_bottom = box1
  outer_left, outer_top, outer_right, outer_bottom = box2

  # Top-left corner must not lie before the outer box's top-left corner.
  if not (outer_left <= inner_left and outer_top <= inner_top):
    return False
  # Bottom-right corner must not lie past the outer box's bottom-right corner.
  return inner_right <= outer_right and inner_bottom <= outer_bottom

In [6]:
alto_file = "in.xml"
tokens, bounding_boxes = load_and_process_alto(alto_file)

print("Tokens:", tokens)
print("Bounding boxes:", bounding_boxes)

# Draw every OCR token box onto the scan and save the result for inspection.
image_file = "1.jpg"
image = cv2.imread(image_file)
# cv2.imread does not raise on a missing or unreadable file; it returns None,
# which would otherwise surface as an opaque error on `image.shape`.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_file}")
# shape is (rows, columns, channels), i.e. height comes first.
height, width, _ = image.shape
print(height, width)
for bbox in bounding_boxes:
    x1, y1, x2, y2 = bbox
    # Token boxes may start at a negative offset; clamp the top-left corner
    # to the image origin before drawing.
    x1 = max(x1, 0)
    y1 = max(y1, 0)
    cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 3)

cv2.imwrite("out.jpg", image)

Tokens: ['Josef', 'Příjmení', 'a', 'jméno', 'András', 'desatník', 'býv.', 'honv.', 'pl.', 'čís.', '15', '1889', 'Datum', 'a', 'místo', 'narození', '(polit.', 'okres.', 'země)', 'Malé', 'Zalužice', 'Michalovce,', 'Slovensko', 'Hodnost', 'a', 'pluk', 'příslušnost', '(polit.', 'okres,', 'země)', 'Datum', 'pohřbu', '12.', 'IX.', '1918', 'Datum', 'a', 'místo', 'úmrtí', '(polit.', 'okres,', 'země)', 'býv.', 'pol.', 'nem.', 'č.', '605', 'Palba', 'v', 'Tiroly', 'v', '13.', 'IX.', '1918', 'Označení', 'hřbitova', 'a', 'místo', 'na', 'voj.', 'hřbitově', '(polit.', 'okres,', 'země)', 'v', 'Palla', 'v', 'Tiroly', 'Oddělení,', 'číslo', 'hrobu', 'Opsáno', 'podle.', 'býv.', 'honv.', 'pluků', 'ul.', 'u', 'Z.', 'J.', 'V.', 'Košice', 'Úmrtní', 'kniha:', 'tom.', 'fol.', 'tom.', 'I.', 'fol.', '112', 'Tiskárna', 'MNO.', '—', '1163-36.']
Bounding boxes: [[851, -8, 1026, 80], [86, 123, 247, 176], [260, 127, 279, 176], [297, 123, 408, 175], [502, 134, 921, 252], [570, 238, 899, 323], [444, 324, 573, 402], [587

True

In [7]:
import pandas as pd
from pathlib import Path

# Root folder for data files, relative to the notebook's working directory.
DATA_DIR = Path("data")
# Folder of the "VHA" dataset used throughout this notebook.
DATASET = DATA_DIR / "VHA"
# ALTO XML of sample 1; a later cell passes it to load_and_process_alto.
TEST_FILE_XML = DATASET / "alto" / "1.xml"
# Annotation table; its columns include "ocr" (image path), "label" and
# "transcription", which later cells read.
dataset = pd.read_csv(DATASET / "csv.csv")
# Preview the first rows.
dataset.head()

Unnamed: 0,annotation_id,annotator,bbox,created_at,id,label,lead_time,ocr,poly,transcription,updated_at
0,3535,21,"[{""x"":9.037937301226734,""y"":5.105633802816902,...",2024-04-17T17:24:34.224433Z,60188,"[{""x"":9.037937301226734,""y"":5.105633802816902,...",7898.954,/data/upload/14/8634cacb-1.jpg,,"[""Příjmení a jméno"",""Petr Efimov"",""Hodnost a p...",2024-04-24T14:29:08.202063Z
1,3536,22,"[{""x"":5.812827870040286,""y"":6.199460916442049,...",2024-04-17T22:27:27.896988Z,60189,"[{""x"":5.812827870040286,""y"":6.199460916442049,...",3218.212,/data/upload/14/b53d630f-2.jpg,,"[""Příjmení a jméno"",""Ivan Josefovič Tskorostin...",2024-04-24T14:29:03.211568Z
2,3786,23,"[{""x"":6.44465698634901,""y"":5.840071877807719,""...",2024-04-24T13:51:49.487793Z,60190,"[{""x"":6.44465698634901,""y"":5.840071877807719,""...",124.145,/data/upload/14/74ac1cc0-3.jpg,,"[""Příjmení a jméno"",""Hodnost a pluk"",""Datum a ...",2024-04-24T13:51:49.487799Z
3,3792,23,"[{""x"":6.933212176283508,""y"":5.28169014084507,""...",2024-04-24T13:55:04.214951Z,60191,"[{""x"":6.933212176283508,""y"":5.28169014084507,""...",146.15,/data/upload/14/20ceb832-4.jpg,,"[""Příjmení a jméno"",""Hodnost a pluk"",""Datum a ...",2024-04-24T13:55:04.214963Z
4,3793,23,"[{""x"":7.304634257155838,""y"":6.338028169014084,...",2024-04-24T13:55:47.019083Z,60192,"[{""x"":7.304634257155838,""y"":6.338028169014084,...",33.555,/data/upload/14/50ae42e5-5.jpg,,"[""Příjmení a jméno"",""Hodnost a pluk"",""Datum a ...",2024-04-24T13:55:47.019089Z


In [8]:
# Add a "file" column holding the part of each "ocr" path that follows its
# last "-" (e.g. "/data/upload/14/8634cacb-1.jpg" -> "1.jpg").
dataset["file"] = dataset["ocr"].map(lambda ocr_path: ocr_path.rpartition("-")[2])
dataset.head()

Unnamed: 0,annotation_id,annotator,bbox,created_at,id,label,lead_time,ocr,poly,transcription,updated_at,file
0,3535,21,"[{""x"":9.037937301226734,""y"":5.105633802816902,...",2024-04-17T17:24:34.224433Z,60188,"[{""x"":9.037937301226734,""y"":5.105633802816902,...",7898.954,/data/upload/14/8634cacb-1.jpg,,"[""Příjmení a jméno"",""Petr Efimov"",""Hodnost a p...",2024-04-24T14:29:08.202063Z,1.jpg
1,3536,22,"[{""x"":5.812827870040286,""y"":6.199460916442049,...",2024-04-17T22:27:27.896988Z,60189,"[{""x"":5.812827870040286,""y"":6.199460916442049,...",3218.212,/data/upload/14/b53d630f-2.jpg,,"[""Příjmení a jméno"",""Ivan Josefovič Tskorostin...",2024-04-24T14:29:03.211568Z,2.jpg
2,3786,23,"[{""x"":6.44465698634901,""y"":5.840071877807719,""...",2024-04-24T13:51:49.487793Z,60190,"[{""x"":6.44465698634901,""y"":5.840071877807719,""...",124.145,/data/upload/14/74ac1cc0-3.jpg,,"[""Příjmení a jméno"",""Hodnost a pluk"",""Datum a ...",2024-04-24T13:51:49.487799Z,3.jpg
3,3792,23,"[{""x"":6.933212176283508,""y"":5.28169014084507,""...",2024-04-24T13:55:04.214951Z,60191,"[{""x"":6.933212176283508,""y"":5.28169014084507,""...",146.15,/data/upload/14/20ceb832-4.jpg,,"[""Příjmení a jméno"",""Hodnost a pluk"",""Datum a ...",2024-04-24T13:55:04.214963Z,4.jpg
4,3793,23,"[{""x"":7.304634257155838,""y"":6.338028169014084,...",2024-04-24T13:55:47.019083Z,60192,"[{""x"":7.304634257155838,""y"":6.338028169014084,...",33.555,/data/upload/14/50ae42e5-5.jpg,,"[""Příjmení a jméno"",""Hodnost a pluk"",""Datum a ...",2024-04-24T13:55:47.019089Z,5.jpg


In [51]:
import json
def label_tokens(tokens, bboxes, annotation, min_overlap=0.3):
    """
    Label the tokens extracted from an ALTO file with the label of the first
    annotation region that contains or sufficiently overlaps their bounding box.

    Args:
        tokens: A list of strings, where each string is the content of a <String> element in the ALTO file.
        bboxes: A list of [x1, y1, x2, y2] pixel boxes, one per token.
        annotation: An iterable of region dicts with the keys "labels", "x",
            "y", "width", "height", "original_width" and "original_height".
            x/y/width/height are percentages of the original image size; the
            first entry of "labels" is used as the label name.
        min_overlap: A token gets a region's label when more than this
            fraction of the token box is covered by the region (default 0.3),
            or when the token box lies fully inside the region.

    Returns:
        3 lists: the tokens, the bounding boxes, and the label for each
        bounding box. Boxes that match no region are labelled "background".
        Regions are tried in annotation order and the first match wins.
    """
    # Convert each region to a pixel box once, rather than once per token.
    label_boxes = []
    for region in annotation:
        name = region["labels"][0]
        orig_w = region["original_width"]
        orig_h = region["original_height"]
        x = int(region["x"] * orig_w / 100)
        y = int(region["y"] * orig_h / 100)
        w = int(region["width"] * orig_w / 100)
        h = int(region["height"] * orig_h / 100)
        label_boxes.append((name, [x, y, x + w, y + h]))

    labels = []
    for bbox in bboxes:
        for name, label_bbox in label_boxes:
            if is_inside(bbox, label_bbox) or overlap(bbox, label_bbox) > min_overlap:
                labels.append(name)
                break
        else:
            # No region claimed this token.
            labels.append("background")

    return tokens, bboxes, labels

In [10]:
# Show the row(s) whose transcription contains "Efimov".
mentions_efimov = dataset["transcription"].map(lambda text: "Efimov" in text)
dataset[mentions_efimov]

Unnamed: 0,annotation_id,annotator,bbox,created_at,id,label,lead_time,ocr,poly,transcription,updated_at,file
0,3535,21,"[{""x"":9.037937301226734,""y"":5.105633802816902,...",2024-04-17T17:24:34.224433Z,60188,"[{""x"":9.037937301226734,""y"":5.105633802816902,...",7898.954,/data/upload/14/8634cacb-1.jpg,,"[""Příjmení a jméno"",""Petr Efimov"",""Hodnost a p...",2024-04-24T14:29:08.202063Z,1.jpg


In [52]:
sample_image = "1.jpg"
tokens, bounding_boxes = load_and_process_alto(TEST_FILE_XML)

# The "label" column stores the annotation regions of a scan as a JSON string;
# take the first row recorded for this scan.
sample_rows = dataset.loc[dataset["file"] == sample_image, "label"]
if sample_rows.empty:
    raise IndexError(f"No annotation row found for {sample_image}")
annot = json.loads(sample_rows.iloc[0])

image = cv2.imread(sample_image)
# cv2.imread returns None instead of raising when the file cannot be read.
if image is None:
    raise FileNotFoundError(f"Could not read image: {sample_image}")

# Draw the annotated regions; their geometry is stored as percentages of the
# original image size, so convert to pixels first.
for label in annot:
    orig_w = label["original_width"]
    orig_h = label["original_height"]
    x = int(label["x"] * orig_w / 100)
    y = int(label["y"] * orig_h / 100)
    w = int(label["width"] * orig_w / 100)
    h = int(label["height"] * orig_h / 100)
    cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 3)

# Draw the OCR token boxes, clamping a negative top-left corner to the origin.
for bbox in bounding_boxes:
    x1, y1, x2, y2 = bbox
    x1 = max(x1, 0)
    y1 = max(y1, 0)
    cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 3)
cv2.imwrite("out.jpg", image)

tokens, bounding_boxes, labels = label_tokens(tokens, bounding_boxes, annot)
assert len(tokens) == len(labels)
for token, token_label in zip(tokens, labels):
    print(f"{token}: {token_label}")


Přijmení: key
a: key
jméno: key
A: name
Petr: name
příslušnost: key
polit.: key
okres,: key
země: key
Efimov: name
pěšák: rank
ruský: rank
p.: rank
pl.: rank
502.: rank
Datum: key
a: key
místo: key
narození: key
1892: birth_date
(polit.: key
okres,: key
země): key
Suchaja: birth_date
Poljana: birth_date
gub.: birth_date
Kursk,: birth_date
Rusko: birth_date
tamtéž: nationality
Hodnost: key
a: key
pluk: key
Datum: key
pohřbu: key
21.: death_date
VI.: death_date
1918: death_date
táborová: death_date
nem: death_date
Osot: death_date
kom.: death_date
Vesrprém,: death_date
Uhry: death_date
23.: funeral_date
VI.: funeral_date
1918: funeral_date
Označení: key
hřbitova: key
a: key
místo: key
táborový: grave_location
(polit.: key
okres,: key
země): key
Esot,: grave_location
kom.: grave_location
Veszprém: grave_location
Uhry: grave_location
Datum: key
a: key
místo: key
úmrtí: key
(polit.: key
okres,: key
země): key
Oddělení,: key
číslo: key
hrobu: key
Opsáno: key
podle: key
matr.: information_sou