### Generating metadata from slide labels.
*Last updated: 1 Oct. 2018*

Tissue is usually mounted on a glass slide, which is then scanned to generate our histopathology images. Each glass slide carries a printed label that contains relevant metadata for the slide (e.g. stain type or subject ID). We develop a computational method to extract this metadata from images of the slide labels and store it with the corresponding digital image item on a DSA server.

Keywords: optical character recognition (OCR), regular expression (regex), Digital Slide Archive (DSA)

In [1]:
# Dependencies
import girder_tools as gt
import pytesseract
import re

In [2]:
# Inputs
# Base URL of the DSA (Girder) REST API and the collection whose items are
# going to be tagged.
API_URL = 'http://computablebrain.emory.edu:8080/api/v1'
collection_name = "FOX_DATA"

# Metadata keys written to each item. Every key must be a named group of
# `regex_str` below.
tags = ['BrainID', 'Column', 'Hemi', 'Section', 'Stain', 'StainDate']

# Pattern applied to the cleaned, upper-cased OCR text of a slide label.
# Raw string literals are used so that `\d` reaches the regex engine untouched
# (in a plain literal it is an invalid escape sequence and triggers a warning).
regex_str = (
    r'.*?(?P<BrainID>FOX\d{1,2}[A-Z])? ?'
    r'(?P<Hemi>(LH MID|LH FRONTAL|LH FRONT|LH POST FRONTAL|CAUDAL|LH MISC|MISC|LH CAUDAL|LH))? ?'
    r'COL (?P<Column>\d{1,2}) S (?P<Section>\d{1,3}) '
    r'(?P<Stain>(SILVER|NISSL|1:20K AVP|OXT 1:5000)) '
    r'(?P<StainDate>(\d{1,2}/\d{1,2}/\d{1,2}|\d{1,2}/\d{1,2}))'
)

# When True, tags that already hold a value are overwritten as well.
override = False

# Initiate variables.
# Logging in prompts interactively for credentials.
gc = gt.login(API_URL, private=True)
items = gt.collection_items(gc, collection_name, limit=5000)
re_compile = re.compile(regex_str)

Login or email: admin
Password for admin: ········


In [3]:
# Local functions.
def _ocr_clean(raw_string):
    """Normalise raw OCR text before it is matched against the label regex.

    Every character that is not a letter (A-Z, a-z), a digit, a forward
    slash, a colon or whitespace is removed. Each remaining run of whitespace
    (spaces, tabs, new lines, carriage returns, form feeds) is then replaced
    by a single space.

    Args:
        raw_string: Text as returned by the OCR engine.

    Returns:
        The cleaned string. Leading and trailing whitespace is collapsed to a
        single space, not stripped.
    """
    # Forward slash and colon are kept on purpose: dates (e.g. 3/4/18) and
    # stain dilutions (e.g. 1:20K) on the labels use them.
    kept_only = re.sub(r'[^A-Za-z\d/:\s]+', '', raw_string)
    # Collapse *any* whitespace run, including a lone tab or form feed, so the
    # label regex (which expects literal single spaces) can match.
    return re.sub(r'\s+', ' ', kept_only)

def _run_regex(string_input):
    """Print ``string_input`` to standard output and return ``None``.

    NOTE(review): despite its name, this function does not apply any regular
    expression; it only echoes its argument. It looks like a debugging stub.
    """
    print(string_input)
    
def _clean_m(dict_input, tags):
    """Return a new dict restricted to ``tags`` with ``None`` values blanked.

    Args:
        dict_input: Mapping that contains every key listed in ``tags``
            (e.g. the ``groupdict()`` of a regex match).
        tags: Keys to copy into the result, in this order.

    Returns:
        A dict with exactly the keys in ``tags``; a value of ``None`` becomes
        the empty string, any other value is copied unchanged.

    Raises:
        KeyError: If a tag is not present in ``dict_input``.
    """
    return {
        tag: '' if dict_input[tag] is None else dict_input[tag]
        for tag in tags
    }

In [4]:
# Loop that reads, cleans, and pushes the metadata.
for i in items:
    # Skip items whose metadata already holds a non-empty value for every tag.
    # NOTE(review): because of this gate, `override` only has an effect on
    # items that are missing at least one tag; confirm that this is intended.
    if 'meta' in i and all(t in i['meta'] and i['meta'][t] != '' for t in tags):
        continue

    # Fetch the label image and run OCR on it. A failure in either step is
    # treated as "no usable label image" and the item is skipped. Only
    # `Exception` is caught so that Ctrl-C still interrupts the loop.
    try:
        image = gt.get_label_image(gc, i['_id'])
        raw_output = pytesseract.image_to_string(image)
    except Exception:
        print("Skipping id: %s, due to no valid image" % i['_id'])
        continue

    clean_output = _ocr_clean(raw_output).upper()
    m = re_compile.search(clean_output)
    if not m:
        print("Skipping id: %s, because of failed regex" % i['_id'])
        continue

    # Regex is good: turn the match into a {tag: value} dict in which groups
    # that did not participate in the match are empty strings.
    m = _clean_m(m.groupdict(), tags)
    if 'meta' not in i:
        # No metadata on the item yet, push every tag.
        new_metadata = m
    else:
        # Only push tags that are absent or empty (or all of them when
        # `override` is set).
        new_metadata = {
            t: m[t] for t in tags
            if t not in i['meta'] or i['meta'][t] == '' or override
        }

    # Keep the run best-effort: one failed upload must not stop the loop, but
    # it is reported as what it is instead of as a missing image.
    try:
        gc.addMetadataToItem(i['_id'], new_metadata)
    except Exception as err:
        print("Skipping id: %s, metadata upload failed: %s" % (i['_id'], err))

Skipping id: 5bad1700e62914001aa13fa5, due to no valid image
Skipping id: 5bad1730e62914001aa140a5, due to no valid image
Skipping id: 5bad173be62914001aa140e1, due to no valid image
Skipping id: 5bad173be62914001aa140e3, due to no valid image
Skipping id: 5bad173be62914001aa140e5, due to no valid image
Skipping id: 5bad173be62914001aa140e7, due to no valid image
Skipping id: 5bad173be62914001aa140e9, due to no valid image
Skipping id: 5bad173ce62914001aa140eb, due to no valid image
Skipping id: 5bad173ce62914001aa140ed, due to no valid image
Skipping id: 5bad173ce62914001aa140ef, due to no valid image
Skipping id: 5bad173ce62914001aa140f1, due to no valid image
Skipping id: 5bad173ce62914001aa140f3, due to no valid image
Skipping id: 5bad173ce62914001aa140f5, due to no valid image
Skipping id: 5bad173ce62914001aa140f7, due to no valid image
Skipping id: 5bad173ce62914001aa140f9, due to no valid image
Skipping id: 5bad173ce62914001aa140fb, due to no valid image
Skipping id: 5bad173ce62

The code below is kept for reference only and no longer needs to be run.

In [None]:
# Copy over groups metadata tag to individual tags under meta.
# for i in items:
#     if 'meta' in i and 'groups' in i['meta']:
#         groups = i['meta']['groups']
#         for k, v in groups.items():
#             if v is None:
#                 groups[k] = ''
#             else:
#                 groups[k] = re.sub(r'\s{2,}', '', v.upper().strip())  
#         gc.addMetadataToItem(i['_id'], groups)