# Extract vocabulary from PAGE XML

## Setup

Define imports and basic variables.

In [1]:
from pathlib import Path
from urllib.request import urlopen
from collections import namedtuple
from functools import cmp_to_key
import re

import treelib
import tinycss2
from lxml import etree

In [2]:
# Directory holding the exported PAGE XML files. For an export that sits next
# to this notebook, use the following line instead:
#PAGE_DIR = Path('.') / 'Repertorio_clean' / 'page'
PAGE_DIR = Path.home().joinpath('Downloads', 'Repertorio_clean', 'Repertorio_clean', 'page')
# One level above the PAGE XML directory (presumably where the page images
# live -- verify against the export layout).
IMG_DIR = PAGE_DIR.joinpath('..')

In [3]:
# XML namespace URI of the PAGE content schema used by the input files.
PAGE_NS = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
# The namespace wrapped in braces, ready to be prefixed to a tag name,
# e.g. f'{PAGE}TextRegion'.
PAGE = f'{{{PAGE_NS}}}'
# Prefix-to-namespace mapping for the `pc` prefix.
NSMAP = dict(pc=PAGE_NS)

In [4]:
# Collect every `*.xml` file directly inside PAGE_DIR; sorting the paths gives
# a stable, reproducible processing order.
PAGE_FILES = sorted(PAGE_DIR.glob('*.xml'))
# Bare expression: displays the list as the cell output.
PAGE_FILES

[PosixPath('/home/frederik/Downloads/Repertorio_clean/Repertorio_clean/page/Repertorio-022.xml'),
 PosixPath('/home/frederik/Downloads/Repertorio_clean/Repertorio_clean/page/Repertorio-023.xml'),
 PosixPath('/home/frederik/Downloads/Repertorio_clean/Repertorio_clean/page/Repertorio-024.xml'),
 PosixPath('/home/frederik/Downloads/Repertorio_clean/Repertorio_clean/page/Repertorio-025.xml'),
 PosixPath('/home/frederik/Downloads/Repertorio_clean/Repertorio_clean/page/Repertorio-026.xml'),
 PosixPath('/home/frederik/Downloads/Repertorio_clean/Repertorio_clean/page/Repertorio-027.xml'),
 PosixPath('/home/frederik/Downloads/Repertorio_clean/Repertorio_clean/page/Repertorio-028.xml'),
 PosixPath('/home/frederik/Downloads/Repertorio_clean/Repertorio_clean/page/Repertorio-029.xml'),
 PosixPath('/home/frederik/Downloads/Repertorio_clean/Repertorio_clean/page/Repertorio-030.xml'),
 PosixPath('/home/frederik/Downloads/Repertorio_clean/Repertorio_clean/page/Repertorio-031.xml'),
 PosixPath('/home/fr

## Definitions

Since we’re looping through all the elements, we define a couple of functions to keep the main code readable.

This is a small helper function that makes dictionary values accessible as object properties. We need it later for printing the resulting vocabulary tree, since we collect data in dicts, but `treelib` expects objects for custom data.

In [5]:
class objectview:
    """Expose the entries of a dict as attributes of an object.

    Used to hand dict-based records to treelib as node data.
    """

    def __init__(self, d):
        # Rebind the instance dictionary to ``d`` itself (no copy), so every
        # key of ``d`` becomes an attribute and later changes to ``d`` stay
        # visible through this object.
        object.__setattr__(self, '__dict__', d)

### CSS parsing

Transkribus uses something like CSS syntax in the PAGE XML `custom` attributes to store additional information that it cannot represent in PAGE XML. We mainly need two pieces:

1. The reading order (which could actually be defined in PAGE XML, but for some reason Transkribus omits `GraphicRegion`s in the PAGE XML `ReadingOrder`, while it does add the reading order of graphic elements in their `custom` attribute). Also, we probably would not need this at all, as it appears that Transkribus exports elements in reading order, but I’m paranoid and hence better make sure.
2. The structure labels we assigned in Transkribus.

Unfortunately, I could not find an easy high-level CSS parsing library for Python, so this is more verbose than I hoped it would be, as I build a simple data dict from the CSS primitives `tinycss2` yields.

In [6]:
def parse_custom(custom_str):
    """
    Parse custom data in the pseudo-CSS syntax Transkribus uses.

    The input looks like ``readingOrder {index:0;} structure {type:heading;}``.
    Each qualified rule becomes one entry of the result: the key is built from
    the identifiers of the rule's prelude (joined with a space), the value is
    a dict mapping each declaration name to its value.

    Only the last value token of a declaration is kept, so multi-token values
    are not preserved in full. Whitespace and comments inside a rule are
    ignored. A missing or empty ``custom_str`` yields an empty dict.

    :param custom_str: content of a PAGE XML ``custom`` attribute (or ``None``)
    :return: dict of dicts, e.g. ``{'structure': {'type': 'heading'}}``
    """
    if not custom_str:
        # Elements without a ``custom`` attribute carry no data.
        return {}
    ast = tinycss2.ast
    custom_dict = {}
    for rule in tinycss2.parse_stylesheet(custom_str):
        if not isinstance(rule, ast.QualifiedRule):
            continue
        # Build data dict
        data_dict = {}
        key = None
        value = None
        new_statement = True
        for token in rule.content:
            if isinstance(token, (ast.WhitespaceToken, ast.Comment)):
                # Formatting only; must not be mistaken for a value.
                continue
            if isinstance(token, ast.LiteralToken):
                if token.value == ';':
                    # End of declaration: store it and start afresh, so that
                    # a value can never leak into the next declaration.
                    if key:
                        data_dict[key] = value
                    key = None
                    value = None
                    new_statement = True
                continue
            if new_statement and isinstance(token, ast.IdentToken):
                key = token.value
                new_statement = False
            else:
                value = token.value
        if key:
            # add final data (declaration without trailing semicolon)
            data_dict[key] = value
        # add to custom dict
        rule_name = ' '.join(t.value for t in rule.prelude if isinstance(t, ast.IdentToken))
        custom_dict[rule_name] = data_dict
    return custom_dict

### OCR extraction

This simple helper function just collects the OCR’d text from the individual lines of a region.

In [7]:
def gather_lines(region_element):
    """Return the text of all lines of a region as a list of strings.

    Collects the ``.text`` of every ``TextLine/TextEquiv/Unicode`` element
    found below ``region_element``.
    """
    unicode_path = '/'.join(f'{PAGE}{tag}' for tag in ('TextLine', 'TextEquiv', 'Unicode'))
    texts = []
    for unicode_el in region_element.findall(unicode_path):
        texts.append(unicode_el.text)
    return texts

Sometimes entries span multiple lines and we have to merge them. To handle both words that contain a hyphen and words hyphenated at a line break, we look at the final character of the first line when merging. Here, `-` is taken as a hyphen to be kept (e.g., in compound words), while `¬` is taken as a hyphen that should be removed.

In [8]:
def merge_lines(line1, line2):
    """Join two consecutive OCR lines into one string.

    The last character of ``line1`` decides how the lines are joined:

    * ``-`` is a hyphen that belongs to the word: keep it, join without space.
    * ``¬`` marks a line-break hyphenation: drop it, join without space.
    * anything else: join with a single space.

    If ``line1`` is empty there is nothing to join, and ``line2`` is returned
    unchanged (no leading space is added).
    """
    if not line1:
        return line2
    if line1.endswith('-'):
        return f'{line1}{line2}'
    if line1.endswith('¬'):
        return f"{line1.rstrip('¬')}{line2}"
    return f'{line1} {line2}'

The Repertorio uses numbered section headings that span columns and contain both the Italian and English heading separated by `|`. This function parses `heading` regions and extracts information like heading number, level, and a label in each language.

In [9]:
def parse_headings(lines):
    """Parse the OCR lines of a heading region into heading records.

    A heading line has the form ``<number> - <Italian> | <English>``, where
    the number is dot-separated (``1.1.3``) with an optional trailing dot.
    A line that does not match this form is treated as the continuation of
    the previous heading: it is merged with it and the result is re-parsed.

    :param lines: list of text lines of one heading region
    :return: list of dicts with the keys ``number``, ``it``, ``en`` (all
        lower-cased), ``type`` (always ``'heading'``) and ``level`` (number
        of dot-separated parts of ``number``); empty list for empty input
    :raises AttributeError: if a continued heading still does not match
        after merging (the offending text is printed first)
    """
    heading_re = re.compile(r'''
                        (?P<number>\d+(?:\.\d+)*)\.?  # Do not capture optional trailing dot
                        \s*-\s*
                        (?P<it>\w(?:[\w\s:])+\w)
                        \s*\|\s*
                        (?P<en>\w(?:[\w\s:])+\w)
                        ''', re.VERBOSE)
    headings = []
    prev_line = ''
    heading_data = {}
    for line in lines:
        res = heading_re.match(line)
        if res is None:
            # Incomplete line
            prev_line = merge_lines(prev_line, line)
            # An incomplete line should always follow a matching line,
            # so this should work. Report if it doesn’t.
            try:
                heading_data = heading_re.match(prev_line).groupdict()
            except AttributeError:
                print(f'No match for continued heading "{prev_line}".')
                raise
        else:
            # New heading, save old one
            if heading_data:
                headings.append(heading_data)
            # Save current line in case of continuation
            prev_line = line
            # Retrieve data
            heading_data = res.groupdict()
        # At this point heading_data is a fresh groupdict (strings only).
        heading_data = {k: v.lower() for k, v in heading_data.items()}
        heading_data['type'] = 'heading'
        heading_data['level'] = len(heading_data['number'].split('.'))
    # Append final data; with no input there is nothing to append, and an
    # empty dict must not end up among the headings.
    if heading_data:
        headings.append(heading_data)
    return headings

The concept entries are in separate columns for Italian and English and processed independently. Each concept can have three levels:

1. The main concept
2. (Optional) facets, like front and back side
3. Parts

This function parses an OCR block for a concept and returns its individual components. Since labels sometimes span more than one line, it merges lines that do not start a new entry with the preceding line and re-analyzes them.

In [10]:
def parse_concept(lines):
    """Parse the OCR lines of one concept column into its entries.

    Each line is tried against three patterns, in this order:

    * concept: ``<digits> - <label>``
    * facet:   ``<dotted number>: <label>`` (label optionally in brackets)
    * part:    ``<part id>: <label>`` (ids like ``a``, ``a1``, ``a-b``)

    A line matching none of them is treated as the continuation of the
    previous entry: it is merged with the previous line and re-parsed with
    the pattern that matched that entry.

    :param lines: list of text lines of one column region
    :return: list of dicts with the keys ``number``, ``label`` and ``type``
        (``'concept'``, ``'facet'`` or ``'part'``); empty list for empty input
    :raises AttributeError: if a continued entry does not match after merging
        (the offending text is printed first)
    """
    concept_re = re.compile(r'''
                        (?P<number>\d+)
                        \s*-\s*
                        (?P<label>.*\S)
                        ''', re.VERBOSE)
    facet_re = re.compile(r'''
                        (?P<number>\d+(?:\.\d+)*)
                        \s*:\s*    # Separator
                        \[?        # Optional brackets
                        (?P<label>.*?\S)
                        #(?:\s+\(=\s*(?P<altLabel>.*\S)\))?  # Alt label does not work reliably, deactivate
                        \]?        # Optional brackets
                        $''', re.VERBOSE)
    part_re = re.compile(r'''
                        (?P<number>\w\d?(?:-\w\d?)?(?:,\s*\w\d?(?:-\w\d?)?)?)  # part label can be a, a1, a-b, or a1-b1
                        :\s*
                        (?P<label>.*\S)
                        ''', re.VERBOSE)
    entries = []
    prev_line = ''
    prev_re = concept_re
    prev_type = 'concept'
    line_data = {}
    for line in lines:
        for current_type, current_re in (('concept', concept_re),
                                         ('facet', facet_re),
                                         ('part', part_re)):
            if (m := current_re.match(line)) is not None:
                # Found new entry, save last one
                if line_data:
                    entries.append(line_data)
                # Save current line and expression in case of continuation
                prev_line = line
                prev_re = current_re
                prev_type = current_type
                # Retrieve data
                line_data = m.groupdict()
                line_data['type'] = current_type
                break
        else:
            # No pattern matched: continuation of the previous entry.
            prev_line = merge_lines(prev_line, line)
            # Report the offending text before failing, as parse_headings does.
            try:
                line_data = prev_re.match(prev_line).groupdict()
            except AttributeError:
                print(f'No match for continued entry "{prev_line}".')
                raise
            line_data['type'] = prev_type
    # Add final data; with no input there is nothing to add, and an empty
    # dict must not end up among the entries.
    if line_data:
        entries.append(line_data)
    return entries

Since Transkribus is currently assigning a column-based reading order, we re-sort the regions to follow a row-based reading order. For that purpose, we construct the bounding box from the region coordinates.

In [11]:
def region_coords(region):
    """Return the outline of a region as a list of ``[x, y]`` integer pairs.

    Reads the ``points`` attribute of the region's ``Coords`` child, which
    holds whitespace-separated ``x,y`` pairs.

    :param region: region element with a ``Coords`` child
    :return: list of ``[x, y]`` lists of ints
    """
    points = region.find(f'{PAGE}Coords').get('points')
    # split() without argument tolerates repeated, leading or trailing
    # whitespace, which would otherwise produce empty "points".
    return [[int(value) for value in point.split(',')] for point in points.split()]

In [12]:
# Axis-aligned rectangle described by its minimum and maximum coordinates.
BoundingBox = namedtuple('BoundingBox', 'minx miny maxx maxy')


def bounding_box(coords):
    """Return the smallest BoundingBox that contains all points of ``coords``.

    ``coords`` is a sequence of points whose first item is the x and whose
    second item is the y coordinate.
    """
    xs = []
    ys = []
    for point in coords:
        xs.append(point[0])
        ys.append(point[1])
    return BoundingBox(minx=min(xs), miny=min(ys), maxx=max(xs), maxy=max(ys))

Our re-ordering of the regions is based on a simple heuristic:

* If a region is below another region, it always comes after in reading order.
* If a region is at the same height, the region that is further to the right comes after.

Because regions might vary slightly in their coordinates, we don’t just compare one point (e.g., $X_{min},Y_{min}$), but take the full shape into account. This will not work for overlapping shapes, but we don’t have them in our case.

In [13]:
def region_cmp(region1, region2):
    """Compare two regions for row-wise reading order (``cmp``-style).

    Returns -1 if ``region1`` comes first, 1 if ``region2`` comes first and
    0 if no order can be established (identical or overlapping boxes).
    Vertical separation is checked before horizontal separation.
    """
    first = bounding_box(region_coords(region1))
    second = bounding_box(region_coords(region2))
    if first == second:
        return 0
    # (condition, result) pairs in order of precedence.
    verdicts = (
        (first.maxy <= second.miny, -1),  # entirely above
        (first.miny >= second.maxy, 1),   # entirely below
        (first.maxx <= second.minx, -1),  # same height, entirely to the left
        (first.minx >= second.maxx, 1),   # same height, entirely to the right
    )
    return next((result for holds, result in verdicts if holds), 0)

We now put the previously defined functions together to extract all information from a page.

Headings are quite easy to process, as they contain all information in one block.

For the actual concept descriptions, we assume that we always have three consecutive elements:

1. A `GraphicRegion` for the illustration,
2. a `TextRegion` with structure type `column-italiano` for Italian labels, and
3. a `TextRegion` with structure type `column-english` for English labels.

*Note:* At this point, we do not process graphics yet, but we will eventually extract them from the page image here.

In [14]:
def extract_from_page(page_file):
    """Extract headings and concept rows from one PAGE XML file.

    All ``GraphicRegion`` and ``TextRegion`` elements of the page are sorted
    with ``region_cmp`` and processed in that order:

    * a text region of structure type ``heading`` adds its parsed headings
      to the result,
    * a graphic region starts/extends the current row (``depiction``),
    * a ``column-italiano`` text region fills ``it`` of the current row,
    * a ``column-english`` text region fills ``en`` and completes the row.

    Text regions with any other structure type are ignored. The reading
    order stored in the ``custom`` attribute is not checked, because the
    regions are re-sorted here anyway.

    :param page_file: path of the PAGE XML file
    :return: list of heading dicts and row dicts (``depiction``, ``it``, ``en``)
    :raises KeyError: if a text region has no structure type (the region id
        and file are printed first)
    """
    tree = etree.parse(str(page_file))
    page = tree.find(f'{PAGE}Page')

    graphic_tag = f'{PAGE}GraphicRegion'
    rows = []
    row = {}
    regions = list(page.iter(graphic_tag, f'{PAGE}TextRegion'))
    for region in sorted(regions, key=cmp_to_key(region_cmp)):
        if region.tag == graphic_tag:
            # Process graphics
            # TODO
            row['depiction'] = None
            continue
        # Everything else is a TextRegion. A missing ``custom`` attribute is
        # treated like an empty one, so it is reported as missing structure.
        custom_data = parse_custom(region.get('custom') or '')
        # Manually created regions sometimes have no structure tag, we catch that here
        try:
            structure_type = custom_data['structure']['type']
        except KeyError:
            print(f'No structure data for elem {region.get("id")} in file {page_file}!')
            raise
        if structure_type == 'heading':
            # Process heading
            rows.extend(parse_headings(gather_lines(region)))
        elif structure_type == 'column-italiano':
            # Process data (Italian)
            row['it'] = parse_concept(gather_lines(region))
        elif structure_type == 'column-english':
            # Process data (English)
            row['en'] = parse_concept(gather_lines(region))
            # Row complete, save and reset
            rows.append(row)
            row = {}
    if row:
        # A row is only saved once its English column was seen; make the
        # loss of a trailing partial row visible instead of silent.
        print(f'Incomplete row at end of file {page_file} dropped, has only: {sorted(row)}')
    return rows

Just to get an idea of the data structure so far, let’s test this with one of the page files (the third one, `PAGE_FILES[2]`):

In [15]:
extract_from_page(PAGE_FILES[2])

[{'number': '1.1.3',
  'it': 'strumenti per abrasione',
  'en': 'abrasive tools',
  'type': 'heading',
  'level': 3},
 {'depiction': None,
  'it': [{'number': '1', 'label': 'Sega a telaio', 'type': 'concept'},
   {'number': 'a', 'label': 'telaio', 'type': 'part'},
   {'number': 'a1', 'label': 'staggio', 'type': 'part'},
   {'number': 'a2', 'label': 'traversa', 'type': 'part'},
   {'number': 'a3', 'label': 'corda', 'type': 'part'},
   {'number': 'a4', 'label': 'asta per tensione', 'type': 'part'},
   {'number': 'b', 'label': 'lama', 'type': 'part'}],
  'en': [{'number': '1', 'label': 'Saw', 'type': 'concept'},
   {'number': 'a', 'label': 'frame', 'type': 'part'},
   {'number': 'a1', 'label': 'upright', 'type': 'part'},
   {'number': 'a2', 'label': 'cross piece', 'type': 'part'},
   {'number': 'a3', 'label': 'cord', 'type': 'part'},
   {'number': 'a4', 'label': 'tension bar', 'type': 'part'},
   {'number': 'b', 'label': 'blade', 'type': 'part'}]},
 {'depiction': None,
  'it': [{'number':

Now we go through all PAGE XML files and collect their information.

In [16]:
# Concatenate the rows of all pages, keeping the order of PAGE_FILES.
rows = [row for page_file in PAGE_FILES for row in extract_from_page(page_file)]
# Bare expression: displays the number of collected rows as the cell output.
len(rows)

50

We now have the separate cells and the concepts within each cell as a flat list. We can now build an actual concept tree using this information:

* Main concepts are children of the last heading we encountered on the page,
* The other terms in a block are children of the main concepts.

Since we have Italian and English labels in separate cells, we need to merge the two lists and make sure they actually have matching labels. (Did I mention I am paranoid? And yes, I do like `assert` statements.)

In [17]:
# Build the vocabulary tree: root -> headings (nested by level) -> concepts
# -> facets/parts.
tree = treelib.Tree()
# Language whose label is passed as first argument to `create_node`.
lang = 'it'
# Root node; it gets `display` and `row` like every other node's data.
vocab = tree.create_node('DiGA', data = objectview({'display': 'DiGA', 'row': 0}))
# Most recently created heading node per heading level (level 0 = root).
current_headings = {0: vocab}
# Level of the most recent heading; new concepts are attached below it.
current_level = 0
# Since treelib does not keep track of the order of nodes, and the concept numbering
# re-starts on every page, we add the row number for sorting.
for row_number, row in enumerate(rows, start=1):
    if 'type' in row and row['type'] == 'heading':
        level = row['level']
        row['display'] = f"({row['number']}) {row['it']} | {row['en']}"
        row['row'] = row_number
        # Attach below the latest heading one level up; fall back to the root
        # if no heading of that level has been seen.
        parent = current_headings.get(level - 1, vocab)
        node = tree.create_node(row[lang], parent = parent, data = objectview(row))
        current_headings[level] = node
        current_level = level
    else:
        # We have separate structures for both languages, merge these first.
        # NOTE(review): zip() stops at the shorter list, so surplus entries in
        # one language are dropped without any check -- confirm this is intended.
        for entry_it, entry_en in zip(row['it'], row['en']):
            assert entry_it['number'] == entry_en['number'], f'entry numbers do not match: {entry_it}, {entry_en}.'
            # Take Italian entry as template
            entry = entry_it.copy()
            entry['it'] = entry.pop('label')
            entry['en'] = entry_en['label']
            entry['display'] = f"({entry['number']}) {entry['it']} | {entry['en']}"
            entry['row'] = row_number
            if entry['type'] == 'concept':
                parent = current_headings[current_level]
                concept = tree.create_node(entry[lang], parent = parent, data = objectview(entry))
            else:
                # Facets and parts hang below the most recent concept.
                # NOTE(review): `concept` is only bound after a 'concept' entry
                # was seen; a block starting with a facet/part reuses the concept
                # of an earlier block, or raises NameError if there is none.
                tree.create_node(entry[lang], parent = concept, data = objectview(entry))

Eventually we can print the tree structure we extracted.

In [18]:
# Print the tree. `key` orders the nodes by row number first and entry number
# second; `data_property` selects the `display` attribute of each node's data
# as the printed text (see the treelib documentation of `Tree.show`).
tree.show(key = lambda node: (node.data.row, node.data.number), data_property = 'display')

DiGA
├── (1.1) strumenti per la lavorazione della pietra | tools for working stone
│   ├── (1.1.1) strumenti da percussione | percussion tools
│   │   ├── (1) Martello, mazza, a due facce | Hammer
│   │   │   ├── (a) testa | head
│   │   │   ├── (b) faccia | face
│   │   │   └── (c) manico | handle
│   │   ├── (2) Picco a una punta | Pick
│   │   │   ├── (a) testa | head
│   │   │   └── (b) punta | point
│   │   ├── (3) Picco a due punte | Double-pointed pick
│   │   │   ├── (a) penna | head
│   │   │   └── (b) punta | point
│   │   ├── (4) Ascia (1) | Axe (1)
│   │   │   ├── (a) testa | head
│   │   │   └── (b) tagliente | cutting edge
│   │   ├── (5) Martellina a due tagli (1) | Double-edged hammer (1)
│   │   │   ├── (a) testa | head
│   │   │   └── (b) tagliente | cutting edge
│   │   └── (6) Picchiarello (2) | Sculptor’s pick (2)
│   │       └── (a) punta | point
│   ├── (1.1.2) strumenti per taglio con percussore | percussion carving tools
│   │   ├── (1) Subbia ad asta circolare

Done!