# Tech housekeeping

In [1]:
import json
import os,re
import urllib
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

import ipywidgets as widgets
from jupyter_bbox_widget import BBoxWidget

# Define where to store annotations

We save all annotations in `annotations.json`.
The file has the following format:

<pre><code>
{
    ppn-id: {
        page-number: [
            {
                x:      2788,
                y:      607,
                width:  1892,
                height: 227,
                label:  "tibetan_content",
            },
            {
                ...
            },
            ...
        ],
        page-number: [
            ...
        ],
        ...
    },
    ppn-id: ...
}
</code></pre>

The following labels have been defined:

<pre><code>
    'arabic_numeral',         /* Latin number on the scan */
    'illustration_image',     /* Image part of an illustration */
    'illustration_caption',   /* Image caption of an illustration */
    'tibetan_page_number',    /* Tibetan page number */
    'chinese_text',           /* Chinese text */
    'chinese_number',         /* Chinese page number */
    'tibetan_content'         /* Tibetan content */
</code></pre>

In [2]:
# In-memory annotation store, shaped as {ppn: {page: [bounding box, ...]}}.
annotations = {}
# JSON file the annotations are loaded from and saved to.
annotations_path = 'annotations.json'

## Read annotations file

If an annotation file already exists, we open it so the annotations can be appended or modified.

In [3]:
# Pick up annotations from an earlier session, if a file is already there;
# otherwise keep the empty store and just say so.
try:
    annotations_file = open(annotations_path, 'r')
except FileNotFoundError:
    print( "%s does not exist yet." % annotations_path )
else:
    with annotations_file:
        annotations = json.load(annotations_file)

# Annotations workflow under the hood stuff

## Event-driven downloading of PPNs

In [4]:
# Image URLs of the currently loaded PPN, in the order they appear in its
# METS file; cleared and refilled by upd_ppn.
files = []

In [5]:
_METS_NS = '{http://www.loc.gov/METS/}'
_XLINK_HREF = '{http://www.w3.org/1999/xlink}href'
_PAGE_URL_RE = re.compile(r'PPN(\d{10})-(\d{8})')


def _default_file_urls(root):
    """Return the image URLs of the last fileGrp with USE="DEFAULT", or None if there is none."""
    file_sec = root.find(_METS_NS + 'fileSec')
    if file_sec is None:
        return None
    urls = None
    for file_grp in file_sec.findall(_METS_NS + 'fileGrp'):
        if file_grp.attrib.get('USE') != 'DEFAULT':
            continue
        urls = []
        for file_el in file_grp.findall(_METS_NS + 'file'):
            locat = file_el.find(_METS_NS + 'FLocat')
            if locat is not None and _XLINK_HREF in locat.attrib:
                urls.append(locat.attrib[_XLINK_HREF])
    return urls


def _page_options(urls):
    """Build (label, value) pairs for the page list: label is the page number from the URL, value the 1-based position."""
    options = []
    for position, url in enumerate(urls, start=1):
        match = _PAGE_URL_RE.search(url)
        # Fall back to the position when the URL does not carry a page number.
        label = str(int(match[2])) if match else str(position)
        options.append((label, position))
    return options


def upd_ppn(change):
    """Fetch the METS file of the PPN entered in w_ppn and refresh the page list.

    Observer callback for ``w_ppn``. On success the module-level ``files``
    list is replaced (in place) by the image URLs of the DEFAULT file group
    and ``w_pagelist.options`` is rebuilt. The option *values* are 1-based
    positions in ``files`` - which is how the other callbacks index the list
    (``files[value - 1]``) - while the visible labels are the page numbers
    taken from the file names.

    On failure (blank PPN, download or XML parse error, no DEFAULT file
    group) the current ``files`` and page list are left untouched and, except
    for the blank case, a message is written to ``w_debug``.

    Args:
        change: The traitlets change notification; unused, the PPN is read
            from ``w_ppn.value``.
    """
    ppn = w_ppn.value.strip()
    if not ppn:
        # Nothing entered (or field cleared): do not hit the server.
        return

    # Quote the user input so stray characters cannot produce an invalid URL.
    url = "https://content.staatsbibliothek-berlin.de/dc/%s.mets.xml" % urllib.parse.quote(ppn, safe='')
    try:
        with urllib.request.urlopen(url) as metadata_url:
            metadata = ET.parse(metadata_url).getroot()
    except (OSError, ET.ParseError) as err:
        # URLError/HTTPError are OSError subclasses; a mistyped PPN ends here.
        w_debug.value = "Could not load %s: %s" % (ppn, err)
        return

    urls = _default_file_urls(metadata)
    if urls is None:
        w_debug.value = "No DEFAULT file group found for %s" % ppn
        return

    files.clear()
    # Empty the list before refilling it, as the original code did
    # (presumably so the new options always yield a fresh selection event).
    w_pagelist.options = []
    files.extend(urls)
    w_pagelist.options = _page_options(urls)


## Create UI elements

Create all the UI elements we need:
    w_ppn: Control to enter the PPN
    w_bbox: Control to select the bounding boxes (the actual annotation control)
    w_pagelist: Control to select the page within a PPN
    w_debug: Nifty space to output some debugging info when needed
    w_container: Compound UI element containing all of the above

We'll use a `BBoxWidget` for creating annotations for an image. It already has "Submit" and "Skip" buttons for going through our list of images. A progress bar from the `ipywidgets` library is not used here; the page list (`w_pagelist`) shows and controls the current position instead.

In [6]:
# PPN entry field. continuous_update=False delays the 'value' change event
# until the input is submitted, so upd_ppn (which downloads the METS file)
# is not triggered for every partially typed PPN.
w_ppn = widgets.Text(
    value='',
    placeholder='PPN',
    description='PPN:',
    continuous_update=False,
)
w_ppn.observe(upd_ppn, names='value')

# The bbox widget: the actual annotation control.
w_bbox = BBoxWidget(
    image = '',
    classes=['arabic_numeral', 'illustration_image', 'illustration_caption', 'tibetan_page_number','chinese_text','chinese_number','tibetan_content'],
)

# Selection list to pick the page directly. It starts out empty (no page
# selected) and is filled by upd_ppn once a PPN has been loaded.
w_pagelist = widgets.Select(
    options=[],
    rows=10,
    description='Page:',
    disabled=False
)

# Text field the callbacks write status / debugging information to.
w_debug = widgets.Text(
    value='',
    placeholder='Type something',
    description='Debug:',
    disabled=False
)

# Combine the widgets into one container for display.
w_container = widgets.VBox([
    w_ppn,
    w_debug,
    w_pagelist,
    w_bbox,
])

When submit is clicked, we store the annotation into the annotations dict. When skip is pressed, we just advance to the next image.

In [7]:
# When the Skip button is pressed we move on to the next page.
@w_bbox.on_skip
def skip():
    """Select the next entry of the page list, if there is one.

    Advances by list position rather than by adding 1 to the selected value,
    so it does not raise when the last page is selected and does not depend
    on the option values being consecutive integers. Does nothing when no
    page is selected.
    """
    current = w_pagelist.index
    if current is None:
        return
    if current + 1 < len(w_pagelist.options):
        w_pagelist.index = current + 1

def upd_img(change):
    """Show the image of the newly selected page together with its stored boxes.

    Observer callback for ``w_pagelist``. ``change['new']`` is the selected
    value, interpreted as a 1-based position in the module-level ``files``
    list. The image URL is handed to ``w_bbox`` and any boxes already stored
    in ``annotations`` for that PPN/page are restored; otherwise the widget
    starts with an empty box list. (A detection model could be run here to
    propose initial boxes instead.)

    Args:
        change: The traitlets change notification; only ``change['new']`` is
            read. ``None`` (nothing selected) is ignored.
    """
    selected = change['new']
    if selected is None:
        return

    position = int(selected) - 1
    if not 0 <= position < len(files):
        # Without this check position -1 would silently wrap around to the
        # last image and larger values would raise IndexError.
        w_debug.value = "No image for page %s" % selected
        return

    # open new image in the widget
    w_bbox.image = files[position]

    m = re.search( r'PPN(\d{10})-(\d{8})', w_bbox.image )
    if m is None:
        # URL carries no PPN/page information, so nothing can be looked up.
        w_debug.value = "Unrecognised image URL: %s" % w_bbox.image
        w_bbox.bboxes = []
        return

    ppn = m[1]
    page = int(m[2])
    w_debug.value = "%s - %d" % (ppn, page)
    # Pages are stored under their number as a string (JSON object keys).
    w_bbox.bboxes = annotations.get(ppn, {}).get(str(page), [])

# When the Submit button is pressed we store the current boxes
# and then move on to the next page.
@w_bbox.on_submit
def submit():
    """Store the boxes drawn on the current page in ``annotations`` and advance.

    The selected value of ``w_pagelist`` is interpreted as a 1-based position
    in ``files`` (converted with ``int`` like in ``upd_img``). PPN and page
    number are parsed from the image URL; boxes are stored under
    ``annotations[ppn][page]`` with the page number as a string. If the URL
    carries no PPN/page information nothing is stored, but the selection
    still advances. Nothing happens when no page is selected or the
    selection does not correspond to a loaded image.
    """
    selected = w_pagelist.value
    if selected is None:
        return

    position = int(selected) - 1
    if not 0 <= position < len(files):
        # Guard against wrapping around (position -1) or an IndexError,
        # either of which would file the boxes under the wrong page or fail.
        w_debug.value = "No image for page %s" % selected
        return

    m = re.search( r'PPN(\d{10})-(\d{8})', files[position] )
    if m is not None:
        ppn = m[1]
        page = str(int(m[2]))
        # save annotations for current image; store a copy so the saved entry
        # is a list of its own rather than the widget's list object
        annotations.setdefault(ppn, {})[page] = list(w_bbox.bboxes)
    # move on to the next file
    skip()

# Call upd_img whenever the selected value of the page list changes.
w_pagelist.observe(upd_img, names='value')


# Annotation UI

Now we display the container widget and we are ready to annotate.

In [8]:
# Render the annotation UI (PPN field, debug field, page list, bbox widget).
w_container

VBox(children=(Text(value='', description='PPN:', placeholder='PPN'), Text(value='', description='Debug:', pla…

# Save annotations file

## See annotations

In [None]:
# Show the in-memory annotations as they would be written to disk.
annotations

## Save annotations

In [None]:
# Serialise into a temporary file first and only then swap it into place.
# Opening annotations_path directly with 'w' would truncate the existing file
# before the dump has succeeded, so a failure would lose earlier annotations.
tmp_annotations_path = annotations_path + '.tmp'
with open(tmp_annotations_path, 'w') as f:
    json.dump(annotations, f, indent=4)
os.replace(tmp_annotations_path, annotations_path)