In [1]:
import re
import numpy as np
import pandas as pd
import cv2
from glob import glob

import pytesseract as ocr
import os
import io
import urllib.request as rq

import csv
# using sernec CSV instead of IDB
#import idigbio
#api = idigbio.pandas()


import time
from datetime import datetime

from joblib import Parallel, delayed, parallel_backend
import multiprocessing

In [2]:
# set some global variables
# Number of workers handed to joblib for the per-threshold OCR passes.
# One core is left free, but the value never drops below 1: on a single-core
# machine cpu_count() - 1 is 0, which joblib rejects as an n_jobs value.
num_cores = max(1, multiprocessing.cpu_count() - 1)
print(num_cores)

15


#### Set up Tesseract & OCR processing functions

In [3]:
# set up inclusion and exclusion lists
def _load_terms(path):
    """
    Reads one search term per line from `path`, stripped and lower-cased.

    Blank lines are skipped: an empty string is a substring of every input,
    so an empty term would match (or exclude) every record in filter_ocr.
    """
    with open(path, 'r') as file:
        # strip() already removes the trailing newline of each line
        terms = [line.strip().lower() for line in file]
    return [term for term in terms if term]

include = _load_terms('include.txt')
print(f"include = {include}")

exclude = _load_terms('exclude.txt')
print(f"exclude = {exclude}")


# functions necessary for processing URLs
def filter_ocr(inputStr):
    """
    Checks the lower-cased text against the global `exclude` and `include` lists.

    Returns the include terms found in the text joined with "|", or None when
    any exclude term is present or no include term is found. Matching is by
    plain substring, so 'duck' also matches inside 'ducktown'.
    """
    text = inputStr.lower()
    # a single excluded term anywhere in the text rejects it outright
    if any(term in text for term in exclude):
        return None
    matches = [term for term in include if term in text]
    return "|".join(matches) if matches else None

def url_to_ocr(url, retry=True):
    """
    Accepts a URL, converts it into multiple binary images at varying thresholds
    and runs OCR on each image returning the result.

    Parameters:
        url: address of the image to download.
        retry: when True (the first attempt) everything from "resize" onward
            is trimmed from the URL, and a failed download is retried once
            with the URL exactly as given.

    Returns:
        (joined_ocr, hits): the distinct raw OCR strings joined with "|", and
        the distinct matched keywords joined with "|". Both are "" when the
        image cannot be downloaded or decoded.
    """
    # download the image; the URL handling and the read both happen inside the
    # try so a bad URL value or a dropped connection takes the retry path, and
    # the with-block guarantees the response is closed
    try:
        # trim out any resize commands (first attempt only)
        if retry and "resize" in url:
            use_URL = url.split("resize")[0]
        else:
            use_URL = url
        with rq.urlopen(use_URL) as resp:
            raw_bytes = resp.read()
    except Exception:
        # Exception rather than a bare except, so Ctrl-C still stops a long run
        print("url error, sleeping")
        # back off for 5 seconds after any download error
        time.sleep(5)
        if retry:
            # second and final attempt, using the untrimmed URL
            return url_to_ocr(url, retry=False)
        return "", ""

    # convert to a NumPy array and read it into opencv as grayscale
    img = np.asarray(bytearray(raw_bytes), dtype="uint8")
    if img.size == 0:
        # nothing was downloaded, so there is nothing to decode
        return "", ""
    img = cv2.imdecode(img, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # imdecode gives None when the bytes are not a readable image
        return "", ""

    # blindly cut off most of the image, keeping the lower-right region
    h, w = img.shape[0:2]
    # address potentially rotated images
    if w > h:
        img = img[int(h / 2.5):, int(w / 2):]
    else:
        img = img[int(h / 2):, int(w / 2.5):]

    # binarize at multiple thresholds (50, 105, 160, 215) and run OCR on each
    with parallel_backend('threading'):
        returnedLists = Parallel(n_jobs=num_cores)(delayed(ocr_gray_img)(img, i) for i in range(50, 245, 55))

    # clean up the results
    all_hits, all_rawOcr = zip(*returnedLists)

    # De-duplicate in first-seen order. A set would order strings differently
    # from one kernel session to the next, making the output irreproducible.
    unique_ocr = dict.fromkeys(text for text in all_rawOcr if text is not None)
    unique_hits = dict.fromkeys(
        term
        for found in all_hits if found is not None
        for term in found.split("|")
    )

    # join all ocr results together
    joined_ocr = "|".join(unique_ocr)
    # join all unique hit terms together into a | separated list
    hits = "|".join(unique_hits)
    # return the combined results
    return joined_ocr, hits

def ocr_gray_img(img, thresh):
    """
    Binarizes `img` at the given threshold and runs OCR on the result.

    Returns (hits, rawOcr): the filter_ocr result for the recognised text,
    and the text itself after lower-casing and whitespace clean-up.
    """
    _, binary = cv2.threshold(img, thresh, 255, cv2.THRESH_BINARY)
    text = ocr.image_to_string(binary, lang='eng')
    # lower-case, turn newlines into spaces, then collapse double spaces
    # (a single pass, so longer runs of spaces are shortened, not removed)
    text = text.lower()
    text = text.replace('\n', ' ')
    text = text.replace('  ', ' ')
    return (filter_ocr(text), text)

def process_row(rowData):
    """
    given a dataframe row, reads the url runs ocr and adds informative fields

    Parameters:
        rowData: a mapping-like row (e.g. a pandas Series) with an
            'accessURI' entry holding the image URL.

    Returns:
        A copy of the row with three added fields: 'has_keyword' (the string
        'True' or 'False'), 'keywords' (the "|"-joined hits, or "") and
        'ocr' (the joined raw OCR text). The row passed in is left untouched.
    """
    # store start time for speed limit checks.
    start_time = time.time()

    # work on a copy: pandas does not support functions that mutate the
    # object DataFrame.apply hands to them
    rowData = rowData.copy()

    url = rowData['accessURI']
    # named ocr_text so the `ocr` name is not reused for a plain string here
    ocr_text, hits = url_to_ocr(url)

    found = len(hits) > 0
    rowData['has_keyword'] = 'True' if found else 'False'
    rowData['keywords'] = hits if found else ""
    rowData['ocr'] = ocr_text
    # be sure we're not exceeding the speed limit (at most one record per second)
    run_time = time.time() - start_time
    if run_time < 1:
        print('waiting')
        print(run_time)
        # sleep only for what is left of the second
        time.sleep(1 - run_time)
    return rowData

# Hard-coded search terms. These assignments replace whatever `include` and
# `exclude` held before this point in the cell.
include = [
    'duck', 'ducktown',
    'copper', 'copperhill',
    'bog', 'potato', 'basin', 'burra',
    'ellis', 'london', 'gypsum',
]
exclude = [
    'murrell', 'hiwassee', 'frog', 'wyrick', 'gee',
]


In [4]:
# Perform a quick test of the word filtering
# Each sample string states the outcome expected from filter_ocr:
# a "|"-joined list of terms for a pass, None for a fail.
test_strs = [
    "THIS SHOULD PASS: a place in copperhill which is called ducktown",
    "THIS SHOULD FAIL: a copperhill location near the hiwassee with frogs. gee wiz!",
    "THIS SHOULD FAIL: a place outside of north carolina which is very swampy.",
]

for test in test_strs:
    # one line naming the input, one line with the filter result
    print(f"testing: {test}", filter_ocr(test), sep="\n")

testing: THIS SHOULD PASS: a place in copperhill which is called ducktown
duck|ducktown|copper|copperhill
testing: THIS SHOULD FAIL: a copperhill location near the hiwassee with frogs. gee wiz!
None
testing: THIS SHOULD FAIL: a place outside of north carolina which is very swampy.
None


In [5]:
# Time one end-to-end download + OCR pass against a single sample image URL
testUrl = "https://s.idigbio.org/idigbio-images-prod-fullsize/2d48591483a3262529727af641d45b17.jpg"
start = datetime.now()
x = url_to_ocr(testUrl)
# total_seconds() so the printed number really is in seconds, as the message
# says; a raw timedelta would print as H:MM:SS.ffffff
end = (datetime.now() - start).total_seconds()
print(f"retrieving results required: {end} seconds.")
# x is the (joined_ocr, hits) pair; index 1 holds the matched keywords
print(f"hits were: {x[1]}")

retrieving results required: 0:00:03.145838 seconds.
hits were: copperhill|copper


#### Load in the dataset

In [6]:
dwcFolder = 'SERNEC_Query'

# both CSV files are read with identical parser settings
read_opts = dict(encoding='utf-8', low_memory=False, quoting=csv.QUOTE_ALL)
imgCSV = pd.read_csv(f"{dwcFolder}/images.csv", **read_opts)
occCSV = pd.read_csv(f"{dwcFolder}/occurrences.csv", **read_opts)

# pair each occurrence with its image rows (occurrence 'id' == image 'coreid');
# the inner join drops occurrences without an image and images without an occurrence
df = pd.merge(occCSV, imgCSV, how='inner', left_on='id', right_on='coreid')

# reduce the dataset to the fields we want
keep_cols = [
    # identifiers
    'coreid', 'id', 'institutionCode', 'collectionCode',
    'basisOfRecord', 'occurrenceID', 'catalogNumber',
    # taxonomy
    'kingdom', 'phylum', 'class', 'order', 'family',
    'genus', 'scientificName',
    # collection date
    'eventDate', 'year', 'month', 'day',
    # remarks and locality
    'occurrenceRemarks', 'habitat', 'stateProvince',
    'county', 'municipality', 'locality',
    'locationRemarks', 'decimalLatitude', 'decimalLongitude',
    'coordinateUncertaintyInMeters',
    # image URL consumed by process_row
    'accessURI',
]
df = df[keep_cols]
print(df.shape)

(9557, 29)


In [7]:
# optional: uncomment to try the pipeline on a small random sample before the full run
#df = df.sample(96)

In [8]:
n_records = df.shape[0]
print(f"processing {n_records} records")
print(f'currently it is {datetime.now()}')
# rough estimate: about 4 seconds per record, converted from seconds to hours
est_hours = round(n_records * 4 / 60 / 60, 2)
print( f'may take {est_hours} hours to run!')
start_time = time.time()

# one download + OCR pass per row; process_row adds has_keyword / keywords / ocr
ext_df = df.apply(process_row, axis=1)

run_time = time.time() - start_time
display( 'DONE!' )
print(f'currently it is {datetime.now()}')
print(f"run time per record = {run_time/ n_records}")
print(f"total runtime = {run_time}")
display(ext_df.shape)

processing 9557 records
currently it is 2020-02-05 07:20:49.146165
may take 10.62 hours to run!
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url error, sleeping
url 

'DONE!'

currently it is 2020-02-05 17:36:28.206718
run time per record = 3.865130568319027
total runtime = 36939.05284142494


(9557, 32)

In [9]:
# write the annotated records to disk (every field quoted, no index column),
# then show the frame as the cell output
ext_df.to_csv(
    "ocr_Results.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
    encoding='utf-8',
)
ext_df

Unnamed: 0,coreid,id,institutionCode,collectionCode,basisOfRecord,occurrenceID,catalogNumber,kingdom,phylum,class,...,municipality,locality,locationRemarks,decimalLatitude,decimalLongitude,coordinateUncertaintyInMeters,accessURI,has_keyword,keywords,ocr
0,2430062,2430062,USU,UTC,PreservedSpecimen,dfa754e1-1219-4cad-b2e6-26c02a8096ad,UTC00187912,Plantae,Magnoliophyta,,...,,"Junction of highway 1926 and Highway 938, alon...",,,,,https://api.idigbio.org/v2/media/a0803cc40e265...,False,,|plants of texas justicia lanceolata (chapm.) ...
1,2431498,2431498,ASU,Plants,PreservedSpecimen,93cc2229-ebe7-4f2c-a84a-36909d12ea99,ASU0053470,Plantae,Magnoliophyta,,...,,Vicinity Ocoee River.,,35.147415,-84.693514,2000.0,http://storage.idigbio.org/portals/seinet/ariz...,False,,sanifraeaceae she ds is hemddera comcnican...
2,3845980,3845980,APSC,,PreservedSpecimen,f8e86145-1742-454c-8e01-605e1fffc908,APSC0000070,Plantae,Magnoliophyta,,...,,1.3 roadmiles W of intersection of US Hwy 64 a...,,35.108062,-84.597614,,https://bisque.cyverse.org/image_service/image...,False,,austin peay state university herbarium (apsc) ...
3,3846450,3846450,APSC,,PreservedSpecimen,59e38590-5812-4ebb-ad68-b65a0ae1a8a7,APSC0000539,Plantae,Magnoliophyta,,...,,"Flowers white, with yellow eye and purple vein...",,,,,https://bisque.cyverse.org/image_service/image...,False,,d )7 s ! - g) & n o) qo @ plants of tennessee ...
4,3846563,3846563,APSC,,PreservedSpecimen,d5660865-2f4d-4c56-b141-be0072349612,APSC0000652,Plantae,Magnoliophyta,,...,,0.6 roadmile W of intersection of US Hwy 64 an...,,,,,https://bisque.cyverse.org/image_service/image...,False,,austin peay state university herbarium (apsc) ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9552,23861200,23861200,Harvard,GH,PreservedSpecimen,8ab5866f-e540-48bd-9958-248dbfd38e64,barcode-02033083,Plantae,Magnoliophyta,,...,,[data not captured],,,,,http://data.huh.harvard.edu/8ab5866f-e540-48bd...,False,,|tennessee potk county r .o - /3 » p a 7 - 1 b...
9553,23861270,23861270,Harvard,GH,PreservedSpecimen,5371f7f8-b002-4066-a9d5-724501ad958f,barcode-01990381,Plantae,Magnoliophyta,,...,,North side of Ocoee River,,,,,http://data.huh.harvard.edu/5371f7f8-b002-4066...,True,basin,< e v # o a 4 ¥ : 4 fereariunm of cornel...
9554,23869563,23869563,Harvard,GH,PreservedSpecimen,6f57e096-54e1-46a0-b859-a2bdcf59f24c,barcode-01881329,Plantae,Magnoliophyta,,...,,[data not captured],,,,,http://data.huh.harvard.edu/6f57e096-54e1-46a0...,False,,ln d 1 arvard university (c arium of 11 ray he...
9555,20609936,20609936,UCHT,,PreservedSpecimen,667ab2b1-f813-4e8a-832e-003e8ffdc0e7,UCHT028579,Plantae,Magnoliophyta,,...,,,,,,,https://bisque.cyverse.org/image_service/image...,False,,|herbarmmof’l‘hellnimtyof'femat : ; | astermma...
