# Vector Assisted Region Proposals (VARP) for Performant Barcode Decoding
    - Compairing the VARP method to numerious python based alternatives for decoding barcodes in herbarium specimen images.

In [1]:
# basic imports, should be easily satisfied using pip
import re
import cv2
import pandas as pd
import numpy as np
import csv
import urllib.request
import os
from shutil import copyfile as shutil_copy_file 
import time
from glob import glob
import random
from pathlib import Path
import timeit

#### Assemble the testing dataset
    - Manually surfed SERNEC for collections meeting the following criteria:
        - Has a published DwC archive
        - Tends to post high resolution images
        - Seems to use consistant catalogNumber format
    - Retrieved the DwC-A for each of the collections listed below

- Desert Botanical Garden Herbarium (DES)
- Marshall University (MUHW)
- Morris Arboretum of University of Pennsylvania (MOAR)
- Morehead State University Herbarium (MDKY)
- Muhlenberg College (MCA)
- Florida State University's Robert K. Godfrey Herbarium (FSU)
- University of South Carolina, A. C. Moore Herbarium (USCH)
- University of Maryland, Norton-Brown Herbarium (MARY)
- Lynchburg College, Ramsey-Freer Herbarium (LYN)
- University of Tennessee, Chattanooga (UCHT)

For each of those collections, sample their DwC-A and retrieve the images from associated records sampled

In [40]:
collection_codes = ["DES", "MUHW", "MOAR", "MDKY", "MCA", "FSU", "USCH", "MARY", "LYN", "UCHT"]

data_set_fn = "data_set.csv"

# check if the data_set csv has been generated
if not os.path.isfile(data_set_fn):    
    keep_cols = ['institutionCode', 'collectionID',
                 'catalogNumber', 'otherCatalogNumbers','occurrenceID',
                 'accessURI'] 
    sample_df = pd.DataFrame()
    for coll in collection_codes:
        coll_dir = f"./collections/{coll}_DwC-A"
        # make sure the eventual destination image subfolder exists
        Path(f"{coll_dir}/imgs/").mkdir(parents=True, exist_ok=True)
        
        imgCSV = pd.read_csv(f"{coll_dir}/images.csv", low_memory=False, quoting=csv.QUOTE_ALL)
        occCSV = pd.read_csv(f"{coll_dir}/occurrences.csv", low_memory=False, quoting=csv.QUOTE_ALL)
        coll_df = occCSV.merge(imgCSV, left_on='id', right_on='coreid', how='inner')[keep_cols]
        coll_df = coll_df.loc[coll_df['accessURI'].notnull()].sample(120)
        # clean up the occurrenceIDs
        coll_df['occurrenceID'] = coll_df['occurrenceID'].str.lstrip("urn:uuid:")
        # generate a file path (for the future image)
        coll_df['file_path'] = coll_df['occurrenceID'].apply(lambda x: "/imgs/".join([coll_dir, x])) + ".jpg"
        sample_df = sample_df.append(coll_df, ignore_index=True)
    sample_df.to_csv(data_set_fn, encoding='utf-8', index=False)
else:
    sample_df = pd.read_csv(data_set_fn)

# now, verify all images exist and are where they should be
def retrieve_img(row_data, retry=True, sleep_period=1.1):
    """
    makes sure the entire dataset exists.
    """
    file_path = row_data['file_path']
    if not os.path.isfile(file_path):    
        url = row_data['accessURI']
        url = url.replace("resize:4000/", "")
        try:
            urllib.request.urlretrieve(url, file_path)
        except:
            if retry:
                print("""Failed to retrieve image, may be exceeding request limits.
                         Taking a little break... zzz .. zzz..""")
                time.sleep(sleep_period * 5)
                urllib.request.urlretrieve(url, file_path)
            else:
                return ""
        time.sleep(sleep_period)

# using longer than normal sleep_period to avoid an unrelated script from getting request rate limited.
#sample_df.apply(retrieve_img, axis=1, sleep_period=3.5)

## Each image is then manualy verified to contain a barcode.
    - during this process code patterns are identified

In [41]:
# ["DES", "MUHW", "MOAR", "MDKY", "MCA", "FSU", "USCH", "MARY", "LYN", "UCHT"]
collection_patterns = {"DES":re.compile("DES\d{8}\?*"),
                       "MUHW":re.compile("MUHW\d{6}\?*"),
                       "MOAR":re.compile("MOAR\d{7}\?*"),
                       "MDKY":re.compile("MDKY\d{8}\?*"),
                       "MCA":re.compile("MCA\d{7}\?*"),
                       "FSU":re.compile("\d{9}\?*"),
                       "USCH":re.compile("((USCH|HWR-)\d{7})|(ACM\d{4})\?*"),
                       "MARY":re.compile("MARY\d{7}\?*"),
                       "LYN":re.compile("LYN-\d{7}\?*"),
                       "UCHT":re.compile("UCHT\d{6}\?*")}

if len(sample_df) > 1000:
    # reduce each collection's directory to 100 images
    for coll in collection_codes:
        coll_dir = f"./collections/{coll}_DwC-A"
        coll_images = glob(f"{coll_dir}/imgs/*.jpg")
        num_to_toss = len(coll_images) - 100
        to_toss = random.sample(coll_images, num_to_toss)
        for file_to_toss in to_toss:
            os.remove(file_to_toss)

    # identify the files existing in the file system
    coll_images = glob(f"./collections/**/imgs/*.jpg")
    # restrict the dataframe records to those with images on the file system
    sample_df = sample_df.loc[sample_df['file_path'].isin(coll_images)]
    # store the file (overwriting previous csv)
    sample_df.to_csv(data_set_fn, encoding='utf-8', index=False)

## Import the methods to be evaluated
    - Each method will be initially tested on a cropped, binary image to be sure the functions are operating as expected

In [42]:
sample_img = "./collections/MOAR_DwC-A/imgs/9128fb79-f223-424a-8973-ed3edbaf8aab.jpg"
sample_img

'./collections/MOAR_DwC-A/imgs/9128fb79-f223-424a-8973-ed3edbaf8aab.jpg'

#### The bcRead module is the VARP implimentation used in HerbASAP
    - the module is self contained in bcRead.py and retrieved from:
        - https://github.com/CapPow/HerbASAP/tree/master/libs

In [43]:
# instantiate the reader and parameters
from deps.bcRead import *
VARP_reader = bcRead(patterns="")

def varp_decode(img_path):
    """
    function used to test "VARP" method. Expects path as a string.
    """
    gray = cv2.imread(img_path, flags=cv2.IMREAD_GRAYSCALE)
    results = VARP_reader.decodeBC(gray, verifyPattern=False)
    return results

print(f"VARP test results: {varp_decode(sample_img)}")

VARP test results: ['MOAR0019195']


#### bcAudit is a rotation based pyzbar implimentation developed for and used by the Tennessee Herbarium Consortium (THC)
    - pyzbar uses the Zbar backed making this dependency stack: bcAudit > PyZbar > Zbar
    - The function tested is drastically simplified version located at:
        - https://github.com/CapPow/bcAudit
    - The modifications removed all dialog boxes, file renaming, and pattern matching
    - The simplified form is so concise, it is directly implimented in the function (and not imported)

In [44]:
# instantiate the reader and parameters
from PIL import Image
# this import function is using the distributed, pip available version of pyzbar
from pyzbar.pyzbar import decode as orig_pyzbar_decode
thc_rotationList = [12, -12,  21, -21]

def thc_rotation_decode(img_path):
    img = Image.open(img_path)
    bcData = orig_pyzbar_decode(img)  # get the barcode data
    if len(bcData) == 0:
        for i in thc_rotationList:
            img2 = img.rotate((i), resample=Image.NEAREST, expand=True)
            bcData = orig_pyzbar_decode(img2)
            if len(bcData) > 0:
                results = bcData
        else:
            return None
    else:
        results = bcData
    return results

print(f"THC_rotation test results: {thc_rotation_decode(sample_img)}")

THC_rotation test results: [Decoded(data=b'MOAR0019195', type='CODE128', rect=Rect(left=2834, top=7585, width=679, height=99), polygon=[Point(x=2834, y=7679), Point(x=2834, y=7683), Point(x=3509, y=7684), Point(x=3510, y=7678), Point(x=3512, y=7620), Point(x=3513, y=7590), Point(x=3513, y=7586), Point(x=2837, y=7585), Point(x=2836, y=7613), Point(x=2835, y=7645)])]


#### Gouda is an advanced pyzbar implimentation maintained by the Natural History Museum of London
    - Pyzbar uses the Zbar backend, making this dependency stack: Gouda > PyZbar > Zbar
    - Retrieved for python 3.8 by cloning the repo located at:
        - https://github.com/NaturalHistoryMuseum/gouda
    - Then Pip installing the local package:
        - e.g., "pip install "./gouda" --user
        
    - Gouda, by default sends results to a special handler class which writes them to console or a file.
        - This behaviour was modified to make the process compatible with testing and consequently simpler.
        - The necessary modifications were made to "./gouda/scripts/decode_barcodes.py," where the decode function was altered as below:  

<pre><code>
def decode(paths, strategies, engine, read_greyscale):
    """Finds and decodes barcodes in images given in pathss
    """
    for p in sorted(paths):
        if p.is_dir():
            # Descend into directory
            decode(p.iterdir(), strategies, engine, read_greyscale)
        else:
            # Process file
            try:
                img = read_image(p, read_greyscale)
                if img is None:
                    # Most likely not an image
                    return None
#                    for visitor in visitors:
#                        visitor.result(p, [None, []])
                else:
                    # Read barcodes
                    for strategy in strategies:
                        result = strategy(img, engine)
                        if result:
                            # Found a barcode
                            ### Modified to return results instead of send them to visitor class
                            return result[-1]
#                           break
                    else:
                        # No barcode was found
#                        result = [None, []]
                        return None
#                    return result
#
#                    for visitor in visitors:
#                        visitor.result(p, result)
            except Exception:
                print('Error processing [{0}]'.format(p))
                traceback.print_exc()
</code></pre>

In [45]:
from gouda.scripts import decode_barcodes as gouda_decode
from gouda.engines.options import engine_options
from gouda.engines.zbar import ZbarEngine
from gouda.strategies.roi.roi import roi
from gouda.strategies.resize import resize

# instantiate the reader and parameters
gouda_reader = gouda_decode
gouda_engine = ZbarEngine()
gouda_roi_strat = [roi]
gouda_resize_strat = [resize]

def gouda_roi_decode(img_path):
    """
    function used to test "Gouda-roi" method (using "roi" strategy). Expects path object.
    """
    results = gouda_decode.decode(paths=[img_path],
                                  strategies= gouda_roi_strat,
                                  engine=gouda_engine,
                                  read_greyscale=True)
    return results

print(f"Gouda-roi test results: {gouda_roi_decode(Path(sample_img))}")

def gouda_resize_decode(img_path):
    """
    function used to test "Gouda-resize" method (using "resize" strategy). Expects path object.
    """
    results = gouda_decode.decode(paths=[img_path],
                                  strategies= gouda_resize_strat,
                                  engine=gouda_engine,
                                  read_greyscale=True)
    return results

print(f"Gouda-resize test results: {gouda_resize_decode(Path(sample_img))}")

Gouda-roi test results: [Barcode(type='CODE128', data=b'MOAR0019195')]
Gouda-resize test results: [Barcode(type='CODE128', data=b'MOAR0019195')]


#### The ZXing library is actually the python implimentation of the ZXing backend.
    - Using a maintained fork of the original (because the original is not being updated)
    - Source available at: https://github.com/dlenski/python-zxing
    - retrieved with a simple pip install:
        - "pip install zxing --user"

In [46]:
import zxing

# instantiate the reader and parameters
zxing_reader = zxing.BarCodeReader()
possible_formats = ["CODE_128", "CODE_39", "CODE_93"]

def zxing_th_decode(img_path):
    """
    function used to test "ZXing-th" method (with try_harder parameter). Expects path object.
    """
    result = zxing_reader.decode(img_path,
                                 try_harder=True,
                                 possible_formats=possible_formats)
    return result

print(f"zxing-th test results: {zxing_th_decode(sample_img)}")

# zxing is always failing without the "try_harder" parameter = True
def zxing_decode(img_path):
    """
    function used to test "ZXing" method (without try_harder parameter). Expects path object.
    """
    result = zxing_reader.decode(img_path,
                                 try_harder=False,
                                 possible_formats=possible_formats)
    return result

print(f"zxing test results: {zxing_decode(sample_img)}")

zxing-th test results: BarCode('MOAR0019195', 'MOAR0019195', 'CODE_128', 'TEXT', [(2854.5, 7611.0), (3491.5, 7611.0)])
zxing test results: None


## Decoding success rate tests

In [47]:
def test_accuracy(row_data):
    """
    given a dataframe row, tests if each method can successfully decode the associated image.
    returns the row data with each method's success or failure added
    """
    collection_code = row_data['institutionCode']
    pattern = collection_patterns[collection_code]
    file_path = row_data['file_path']
    
    # test VARP
    varp_bcs = varp_decode(file_path)
    if varp_bcs:
        varp_bcs = [x for x in varp_bcs if pattern.match(x)]
        if len(varp_bcs) > 0:
            varp_res = True
        else:
            varp_res = False
    else:
        varp_res = False
    row_data['VARP_success'] = varp_res

    # test thc_rotation_decode
    thc_bcs = thc_rotation_decode(file_path)
    if thc_bcs:
        thc_bcs = [x.data.decode('utf-8') for x in thc_bcs]
        thc_bcs = [x for x in thc_bcs if pattern.match(x)]
        if len(thc_bcs) > 0:
            thc_res = True
        else:
            thc_res = False
    else:
        thc_res = False
    row_data['THC-resize_success'] = thc_res
    
    # convert path to path object
    file_path = Path(file_path)

    # test gouda_roi_decode
    gouda_roi_bcs = gouda_roi_decode(file_path)
    if gouda_roi_bcs:
        gouda_roi_bcs = [x.data.decode('utf-8') for x in gouda_roi_bcs]
        gouda_roi_bcs = [x for x in gouda_roi_bcs if pattern.match(x)]
        if len(gouda_roi_bcs) > 0:
            gouda_roi_res = True
        else:
            gouda_roi_res = False
    else:
        gouda_roi_res = False
    row_data['gouda-roi_success'] = gouda_roi_res
    
    # test gouda_resize_decode
    gouda_resize_bcs = gouda_resize_decode(file_path)
    if gouda_resize_bcs:
        gouda_resize_bcs = [x.data.decode('utf-8') for x in gouda_resize_bcs]
        gouda_resize_bcs = [x for x in gouda_resize_bcs if pattern.match(x)]
        if len(gouda_resize_bcs) > 0:
            gouda_resize_res = True
        else:
            gouda_resize_res = False
    else:
        gouda_resize_res = False
    row_data['gouda-resize_success'] = gouda_resize_res
    
    # test zxing_th_decode
    zxing_th_bc = zxing_th_decode(file_path)
    try:
        if zxing_th_bc:
            zxing_th_bc = zxing_th_bc.parsed
            zxing_th_bcs = pattern.match(zxing_th_bc).string
            if len(zxing_th_bcs) > 0:
                zxing_th_res = True
            else:
                zxing_th_res = False
        else:
            zxing_th_res = False
    except:
        zxing_th_res = False
    row_data['zxing-th_success'] = zxing_th_res

    # test zxing_decode
    zxing_bcs = zxing_decode(file_path)
    try:
        if zxing_bcs:
            zxing_bcs = zxing_bcs.parsed
            zxing_bcs = pattern.match(zxing_bcs).string
            if len(zxing_bcs) > 0:
                zxing_res = True
            else:
                zxing_res = False
        else:
            zxing_res = False
    except:
        zxing_res = False
    row_data['zxing_success'] = zxing_res

    return row_data

#### Establish the success col names 

In [48]:
success_cols = ['VARP_success', 'THC-resize_success', 'gouda-roi_success',
                'gouda-resize_success', 'zxing-th_success', 'zxing_success']

#### Test the process on a small batch ahead of time

In [49]:
x = sample_df.sample(5).copy().reset_index()
for i, y in x.iterrows():
    print(y['file_path'])
    display(test_accuracy(y)[success_cols])

./collections/UCHT_DwC-A/imgs/8efd7626-f4c8-4ac0-95e7-5e8ad5496711.jpg


VARP_success             True
THC-resize_success       True
gouda-roi_success        True
gouda-resize_success     True
zxing-th_success         True
zxing_success           False
Name: 0, dtype: object

./collections/FSU_DwC-A/imgs/3bc3054d-0917-4a55-8e28-b8f1a8a4e37e.jpg


VARP_success             True
THC-resize_success       True
gouda-roi_success        True
gouda-resize_success     True
zxing-th_success         True
zxing_success           False
Name: 1, dtype: object

./collections/UCHT_DwC-A/imgs/1024a314-c129-409f-8bac-cad74be58734.jpg


VARP_success             True
THC-resize_success       True
gouda-roi_success        True
gouda-resize_success     True
zxing-th_success         True
zxing_success           False
Name: 2, dtype: object

./collections/MUHW_DwC-A/imgs/cb707610-2872-4ea9-b3d7-883964c368e5.jpg


VARP_success             True
THC-resize_success       True
gouda-roi_success        True
gouda-resize_success     True
zxing-th_success        False
zxing_success           False
Name: 3, dtype: object

./collections/MUHW_DwC-A/imgs/31e266f2-2c83-4bf1-8c96-3b46dd0a16c5.jpg


VARP_success             True
THC-resize_success       True
gouda-roi_success        True
gouda-resize_success     True
zxing-th_success         True
zxing_success           False
Name: 4, dtype: object

#### Run the success rate test (this may take some time!)

In [50]:
sample_df.shape

(1000, 19)

In [51]:
# test the accuracy accross all images
sample_df = sample_df.apply(test_accuracy, axis=1)
# save the results
sample_df.to_csv(data_set_fn, encoding='utf-8', index=False)



In [52]:
# display the problem files which no methods succeeded in decoding.
problem_list = sample_df.loc[(sample_df['VARP_success']==False) & (sample_df['THC-resize_success']==False)
                              & (sample_df['gouda-roi_success']==False) & (sample_df['gouda-resize_success']==False)
                              & (sample_df['zxing-th_success']==False) & (sample_df['zxing_success']==False)]['file_path'].values.tolist()
print(len(problem_list))
problem_list

21


['./collections/DES_DwC-A/imgs/0ed08137-c0af-4ab4-b0e8-fb64c71cc1c0.jpg',
 './collections/DES_DwC-A/imgs/7863a40c-3fe7-4166-a14a-c4934521a949.jpg',
 './collections/DES_DwC-A/imgs/665aaf37-0b1c-4790-89a1-ee7024ee6446.jpg',
 './collections/DES_DwC-A/imgs/e21ec6ad-bf1e-410f-b259-f21f2a1971b0.jpg',
 './collections/DES_DwC-A/imgs/01caacbb-2335-4700-a4de-322653ae8b2f.jpg',
 './collections/DES_DwC-A/imgs/eed0c669-3852-43c9-b33f-4b0ac7fbdd36.jpg',
 './collections/DES_DwC-A/imgs/c9f0eccb-e3c6-4baa-ae79-f0fd8e41c04d.jpg',
 './collections/DES_DwC-A/imgs/ac71f9bf-d120-4ec6-8ceb-4605e2d506ca.jpg',
 './collections/DES_DwC-A/imgs/5a4d9199-ce82-475f-a8a3-8f62f17ecc5b.jpg',
 './collections/DES_DwC-A/imgs/5e7ab4e2-4abc-4bce-a9ab-904de588ab3a.jpg',
 './collections/DES_DwC-A/imgs/48d43af-85db-486f-a639-371e0031468f.jpg',
 './collections/DES_DwC-A/imgs/2089ce2f-f562-4899-a3b1-fc5be149f3b4.jpg',
 './collections/DES_DwC-A/imgs/0e8f5442-2a11-4333-a790-875dd0b53bf5.jpg',
 './collections/DES_DwC-A/imgs/67093022

## Establish timed functions

In [53]:
def timeit(f):

    def timed(*args, **kw):
        ts = time.time()
        result = f(*args, **kw)
        te = time.time()
        return te-ts
    return timed

def timeit_path(f):

    def timed_path(*args, **kw):
        img_path = Path(args[0])
        ts = time.time()
        result = f(img_path)
        te = time.time()
        return te-ts
    return timed_path

@timeit
def varp_decode(img_path):
    """
    function used to test "VARP" method. Expects path as a string.
    """
    gray = cv2.imread(img_path, flags=cv2.IMREAD_GRAYSCALE)
    results = VARP_reader.decodeBC(gray, verifyPattern=False)
    return results

@timeit
def thc_rotation_decode(img_path):
    img = Image.open(img_path)
    bcData = orig_pyzbar_decode(img)  # get the barcode data
    if len(bcData) == 0:
        for i in thc_rotationList:
            img2 = img.rotate((i), resample=Image.NEAREST, expand=True)
            bcData = orig_pyzbar_decode(img2)
            if len(bcData) > 0:
                results = bcData
        else:
            return None
    else:
        results = bcData
    return results

@timeit_path
def gouda_roi_decode(img_path):
    """
    function used to test "Gouda-roi" method (using "roi" strategy). Expects path object.
    """
    results = gouda_decode.decode(paths=[img_path],
                                  strategies= gouda_roi_strat,
                                  engine=gouda_engine,
                                  read_greyscale=True)
    return results

@timeit_path
def gouda_resize_decode(img_path):
    """
    function used to test "Gouda-resize" method (using "resize" strategy). Expects path object.
    """
    results = gouda_decode.decode(paths=[img_path],
                                  strategies= gouda_resize_strat,
                                  engine=gouda_engine,
                                  read_greyscale=True)
    return results

@timeit_path
def zxing_th_decode(img_path):
    """
    function used to test "ZXing-th" method (with try_harder parameter). Expects path object.
    """
    result = zxing_reader.decode(img_path,
                                 try_harder=True,
                                 possible_formats=possible_formats)
    return result

#### establish a dict to reference and a list to randomize and a function to perform the tests

In [54]:
col_to_func = {"VARP_time":varp_decode,
              "thc-rotate_time":thc_rotation_decode,
              "gouda-roi_time":gouda_roi_decode,
              "gouda-resize_time":gouda_resize_decode,
              "zxing-th_time":zxing_th_decode}

col_list = ["VARP_time",
            "thc-rotate_time",
            "gouda-roi_time",
            "gouda-resize_time",
            "zxing-th_time"]

def execution_time_tests(row_data, n=5):
    random.shuffle(col_list)
    for col in col_list:
        times = []
        try:
            for i in range(n):
                times.append(col_to_func[col](row_data['file_path']))
            row_data[col] = round(np.median(times), 6)
        except:
            print(col, times, row_data['file_path'])
    return row_data

## Execution Time Tests

In [55]:
sample_df = sample_df.apply(execution_time_tests, axis=1)
sample_df.to_csv(data_set_fn, encoding='utf-8', index=False)



## Results Analysis

In [56]:
sample_df.describe()

Unnamed: 0,VARP_time,gouda-resize_time,gouda-roi_time,thc-rotate_time,zxing-th_time,VARP_reduction
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.640308,5.197384,0.370076,3.168757,3.263542,0.98308
std,0.270719,8.559542,0.109348,2.687103,1.982094,0.008982
min,0.165615,0.436277,0.100064,0.487928,1.043149,0.902064
25%,0.474371,1.139439,0.306716,1.269581,1.873428,0.97964
50%,0.617316,1.93389,0.367012,2.273467,2.198027,0.984969
75%,0.752403,2.964441,0.434579,4.289565,3.389721,0.988987
max,3.425976,63.550875,0.8324,21.964081,7.718951,0.995665


In [57]:
method_to_cols = {"VARP":["VARP_time", "VARP_success"],
                 "gouda-resize":["gouda-resize_time", "gouda-resize_success"],
                 "gouda-roi":["gouda-roi_time", "gouda-roi_success"],
                 "THC-rotate":["thc-rotate_time", "THC-resize_success"],
                 "zxing-th":["zxing-th_time", "zxing-th_success"]}

results_df = pd.DataFrame({"Method":[], "Success Rate":[],
                           "Execution Time":[], 
                           "Successful Execution Time":[],
                           "Failure Execution Time":[]})

for method, cols in method_to_cols.items():
    time_col, success = cols
    success_rows = sample_df.loc[sample_df[success]==True]
    failed_rows = sample_df.loc[sample_df[success]==False]

    row_data = {}    
    row_data['Method'] = method
    row_data['Success Rate'] = round( len(success_rows) / len(sample_df) , 5)        
    row_data['Execution Time'] = np.median(sample_df[time_col])
    row_data['Successful Execution Time'] = np.median(success_rows[time_col])
    row_data['Failure Execution Time'] = np.median(failed_rows[time_col])
    
    results_df = results_df.append(row_data, ignore_index=True)
    
results_df

Unnamed: 0,Method,Success Rate,Execution Time,Successful Execution Time,Failure Execution Time
0,VARP,0.965,0.617316,0.612066,1.105602
1,gouda-resize,0.917,1.93389,1.689242,27.426875
2,gouda-roi,0.479,0.367012,0.37206,0.356412
3,THC-rotate,0.742,2.273467,1.468902,6.935636
4,zxing-th,0.643,2.198027,3.280999,2.17334


In [58]:
# how many of the problem list
varp_problem_list = sample_df.loc[(sample_df['VARP_success']==False)]['file_path'].values.tolist()
print(len(varp_problem_list))
solo_varp_problem_list = [x for x in varp_problem_list if x not in problem_list]
len(solo_varp_problem_list)
# Therefore, of the 35 VARP failed to decode, 14 of them were successfully decoded by some other method.

35


14

#### Determine how frequently (if ever) the catalogNumber does not match the decoded value
    - Not really important to the evaluation, but valuable information which can be opportunisticly derived
        - assuming the 10 collections can be said to be a representative sample...

In [23]:
collection_patterns = {"DES":re.compile("DES\d{8}\?*"),
                       "MUHW":re.compile("MUHW\d{6}\?*"),
                       "MOAR":re.compile("MOAR\d{7}\?*"),
                       "MDKY":re.compile("MDKY\d{8}\?*"),
                       "MCA":re.compile("MCA\d{7}\?*"),
                       "FSU":re.compile("\d{9}\?*"),
                       "USCH":re.compile("((USCH|HWR-)\d{7})|(ACM\d{4})\?*"),
                       "MARY":re.compile("MARY\d{7}\?*"),
                       "LYN":re.compile("LYN-\d{7}\?*"),
                       "UCHT":re.compile("UCHT\d{6}\?*")}

def varp_decode(img_path):
    """
    function used to test "VARP" method. Expects path as a string.
    """
    gray = cv2.imread(img_path, flags=cv2.IMREAD_GRAYSCALE)
    results = VARP_reader.decodeBC(gray, verifyPattern=False)
    return results

def det_correct_value(row_data):
    """
    given a dataframe row, tests if each method can successfully decode the associated image.
    returns the row data with each method's success or failure added
    """
    collection_code = row_data['institutionCode']
    catalogNumber = row_data['catalogNumber']
    # handle the FSU's trunicated leading zeroes
    if collection_code == "FSU":
        catalogNumber = str(catalogNumber).zfill(9) 

    pattern = collection_patterns[collection_code]
    file_path = row_data['file_path']

    # test VARP
    varp_bcs = varp_decode(file_path)
    if varp_bcs:
        varp_bcs = [x for x in varp_bcs if pattern.match(x)]
        if catalogNumber in varp_bcs:
            row_data['bc_matches'] = True
        else:
            row_data['bc_matches'] = False
    return row_data

In [24]:
cat_verif = sample_df.apply(det_correct_value, axis=1)
cat_verif

Unnamed: 0,THC-resize_success,VARP_reduction,VARP_success,VARP_time,accessURI,bc_matches,catalogNumber,collectionID,file_path,gouda-resize_success,gouda-resize_time,gouda-roi_success,gouda-roi_time,institutionCode,occurrenceID,otherCatalogNumbers,thc-rotate_time,zxing-th_success,zxing-th_time,zxing_success
0,True,0.953881,True,0.646803,https://serv.biokic.asu.edu/imglib/h_seinet/se...,True,DES00032153,edb9860c-c481-4d19-88cc-224a536ebcd4,./collections/DES_DwC-A/imgs/9b08d2b0-99b6-472...,True,1.353585,False,0.482916,DES,9b08d2b0-99b6-4726-b467-b5a0075b5b85,,1.425708,True,1.098230,False
1,True,0.967761,True,0.523438,https://serv.biokic.asu.edu/imglib/h_seinet/se...,True,DES00010750,edb9860c-c481-4d19-88cc-224a536ebcd4,./collections/DES_DwC-A/imgs/06a17a59-e2aa-4a7...,True,0.932862,False,0.171819,DES,06a17a59-e2aa-4a74-bad7-e38f06b323a5,,1.008749,True,1.067084,False
2,False,0.980085,True,0.338213,https://serv.biokic.asu.edu/imglib/h_seinet/se...,True,DES00051133,edb9860c-c481-4d19-88cc-224a536ebcd4,./collections/DES_DwC-A/imgs/48210812-bc59-4f6...,True,8.895074,False,0.191055,DES,48210812-bc59-4f61-becb-444caa23a7f8,,3.589678,False,1.289185,False
3,False,0.983913,False,0.821538,https://serv.biokic.asu.edu/imglib/h_seinet/se...,,DES00020925,edb9860c-c481-4d19-88cc-224a536ebcd4,./collections/DES_DwC-A/imgs/0ed08137-c0af-4ab...,False,22.909585,False,0.292756,DES,0ed08137-c0af-4ab4-b0e8-fb64c71cc1c0,,4.804950,False,1.501495,False
4,True,0.967268,True,0.482620,https://serv.biokic.asu.edu/imglib/h_seinet/se...,True,DES00050626,edb9860c-c481-4d19-88cc-224a536ebcd4,./collections/DES_DwC-A/imgs/72b786f1-0da7-49c...,True,0.897832,False,0.314793,DES,72b786f1-0da7-49cd-af8c-2170659f0a49,,0.993489,True,1.089518,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,False,0.974564,True,0.837261,https://bisque.cyverse.org/image_service/image...,True,HWR-0002271,a13355a1-1ccc-46d9-9672-521d8d668804,./collections/USCH_DwC-A/imgs/5e6ad056-ff6d-11...,True,9.577035,False,0.489137,USCH,5e6ad056-ff6d-11e2-9beb-001c23ce290f,102790.0,7.922155,False,7.008716,False
996,False,0.981449,True,0.721884,https://bisque.cyverse.org/image_service/image...,True,USCH0000551,a13355a1-1ccc-46d9-9672-521d8d668804,./collections/USCH_DwC-A/imgs/5da77f52-ff6d-11...,True,4.953367,False,0.513058,USCH,5da77f52-ff6d-11e2-9beb-001c23ce290f,71063.0,8.783523,False,7.087594,False
997,True,0.957977,True,0.914995,https://bisque.cyverse.org/image_service/image...,True,USCH0042115,a13355a1-1ccc-46d9-9672-521d8d668804,./collections/USCH_DwC-A/imgs/9b401698-1583-4f...,True,1.357288,True,0.312536,USCH,9b401698-1583-4fd5-80e2-80209c03e576,30873.0,1.510768,True,1.932974,False
998,True,0.988529,True,0.559320,https://bisque.cyverse.org/image_service/image...,True,USCH0045125,a13355a1-1ccc-46d9-9672-521d8d668804,./collections/USCH_DwC-A/imgs/300966c2-4651-49...,True,1.356677,True,0.384144,USCH,300966c2-4651-4935-a828-21e6dfedc5c0,123146.0,1.506480,True,1.924870,False


In [25]:
mismatched_df = cat_verif.loc[cat_verif['bc_matches']==False].copy()

In [27]:
for i, row in mismatched_df.iterrows():
    print(row['institutionCode'])
    print(row["file_path"])
    print(varp_decode(row['file_path']))
    print(row['catalogNumber'])
    print()

FSU
./collections/FSU_DwC-A/imgs/1a972c37-2fc8-4a5e-9585-85db088deba5.jpg
['000168051']
168050

MARY
./collections/MARY_DwC-A/imgs/7f5a4bab-1b80-4217-ba50-e41a68e397b9.jpg
['MARY1019245']
MARY1019250

MUHW
./collections/MUHW_DwC-A/imgs/eb5c2bc1-0e3c-4614-a3c5-a454f5731225.jpg
['MUHW018748']
MUHW018723



## Determine average resolution reduction using VARP method
    - bcRead module was modified with a new function: reduction_determination_extract_by_squares()
        - which is essentially the same as the typical decode method caled by VARP except
        - it returns the proportion of resolution reduction, determed by:
        - ( (orig_h * orig_w) / (composite_h * composite_w) ) - 1
        - the absolute value of that value is taken in the function below, producing "% reduced by" value.

In [1]:
def varp_decode(img_path):
    """
    function used to test "VARP" method. Expects path as a string.
    """
    gray = cv2.imread(img_path, flags=cv2.IMREAD_GRAYSCALE)
    try:
        v, reduction = VARP_reader.reduction_determination_extract_by_squares(gray)
    except ValueError: # a failure to decode
        print(VARP_reader.reduction_determination_extract_by_squares(gray))
        v, reduction = False, False
    return (v, reduction)

def det_reduction_factor(row_data):
    fp = row_data['file_path']
    v, r = varp_decode(fp)
    if r:
        row_data['VARP_reduction'] = abs(r)
    else:
        row_data['VARP_reduction'] = 0 #failed to decode, so no resolution reduction
    return row_data

sample_df = sample_df.apply(det_reduction_factor, axis=1)
sample_df.to_csv(data_set_fn, encoding='utf-8', index=False)

In [11]:
reduction_median = np.median(sample_df['VARP_reduction'])
reduction_median

0.9849501549999999