In [1]:
import pytesseract as ocr
from pytesseract import Output
from glob import glob
import cv2
import os
import numpy as np
import itertools

# using joblib to schedule parallel tasks
from joblib import Parallel, delayed, parallel_backend
# multiprocessing is really only here to detect core counts.. could replace this with an explicit count
import multiprocessing

__Note:__ The path to the source images will need to be changed to match your setup. Also, output images are currently written to the notebook's working directory; it may be wise to add a dedicated, clean output directory.

In [2]:
# Worker count for joblib: leave four cores free for the rest of the system, but
# never go below one worker. joblib rejects n_jobs=0 and gives negative values a
# different meaning, so the raw subtraction misbehaves on machines with <= 4 cores.
num_cores = max(1, multiprocessing.cpu_count() - 4)
# Source images; sorted so the processing order is reproducible between runs.
img_files = sorted(glob('../data_gathering/output/*.jpg'))
print(len(img_files))

14


__Note:__ It is worth trying different confidence values to fine-tune the ratio of Type I to Type II errors. The confidence is currently set by the optional `conf` argument of `ocr_gray_img`. It may also be worth testing some of the commented-out Tesseract configurations in the same function.

In [3]:
def ocr_gray_img(img, thresh, conf=80):
    """
    Binary-threshold an image, run Tesseract OCR on it, and return text boxes.

    OCR is run twice: once on the thresholded image and once on the input image
    rotated 90 degrees counter-clockwise. Boxes found in the rotated pass are
    mapped back into the coordinate system of the unrotated image.

    img = an opened cv2 image file
    thresh = the value used in cv2 binary thresholding
    conf = tesseract's minimum confidence threshold when identifying text;
           only boxes whose confidence is strictly greater are kept

    returns a list of (top, left, width, height) tuples in pixels. Boxes that are
    at least 40% of the image width wide, or at least 30% of the image height
    tall, are discarded.
    """
    # cv2.threshold returns (retval, image); only the thresholded image is needed
    gray = cv2.threshold(img, thresh, 255, cv2.THRESH_BINARY)[1]

    # Tesseract configuration. Alternatives that were tried are kept below for
    # reference; it may be worth experimenting with them.
    #config = "--psm 13 --oem 3"
    #config = "--dpi 300 --psm 11 --oem 1 -c tessedit_char_whitelist=0123456789"
    #config = "--dpi 300 --psm 11 --oem 2 \
    #config = "--dpi 1200 --psm 11 --oem 2 \
    #config = "--dpi 1200 --psm 12 --oem 2 \
    #config = "--dpi 300 --psm 11 --oem 2 \
    #            -c language_model_penalty_non_freq_dict_word=0.0 \
    #            -c language_model_penalty_non_dict_word=0.0 \
    #            -c language_model_penalty_punc=0.0 \
    #            -c language_model_penalty_case=0.0 \
    #            -c language_model_penalty_script=0.0 \
    #            -c language_model_penalty_chartype=0.0 \
    #            -c language_model_penalty_font=0.0 \
    #            -c language_model_penalty_spacing=0.2"
    config = "--dpi 300 --psm 11 --oem 2"

    ocr_boxes = ocr.image_to_data(gray,
                                  output_type=Output.DICT,
                                  lang=None,
                                  config=config)

    # NOTE(review): the rotated pass OCRs the un-thresholded `img`, so its result
    # does not depend on `thresh` -- confirm whether `gray` was intended here.
    # Use the public cv2 constant; the nested `cv2.cv2` module is not available
    # in every opencv-python build.
    rot_img = cv2.rotate(img, cv2.ROTATE_90_COUNTERCLOCKWISE)
    rot_ocr_boxes = ocr.image_to_data(rot_img,
                                      output_type=Output.DICT,
                                      lang=None,
                                      config=config)

    img_h, img_w = gray.shape[0:2]
    # container to hold the results
    boxes = []
    # generate a bounding box for each text blob
    # see: https://nanonets.com/blog/ocr-with-tesseract/
    # The rotated flag is paired explicitly with each result: comparing the two
    # dicts with == would mislabel the first pass whenever both passes happen to
    # return identical data.
    for box_group, rotated in ((ocr_boxes, False), (rot_ocr_boxes, True)):
        n_boxes = len(box_group['text'])
        for i in range(n_boxes):
            # confidences may arrive as ints, floats or numeric strings such as
            # '95.5'; go through float() so all of them parse, then truncate
            if int(float(box_group['conf'][i])) <= conf:
                continue

            left = box_group['left'][i]
            top = box_group['top'][i]
            w = box_group['width'][i]
            h = box_group['height'][i]

            # if the image was rotated 90deg ccw, correct the coords:
            # the rotated x axis is the original y axis, and the rotated y axis
            # runs right-to-left along the original x axis
            if rotated:
                top, left = left, img_w - (top + h)
                h, w = w, h

            # only keep boxes of a reasonable size relative to the image
            if w < img_w * 0.4 and h < img_h * 0.3:
                boxes.append((top, left, w, h))

    return boxes

def apply_blur(img, boxes):
    """
    Return a copy of img in which every box region is Gaussian-blurred.

    img = an opened cv2 image (any number of channels)
    boxes = iterable of (top, left, width, height) tuples in pixels

    The whole image is blurred once (51x51 kernel, sigma 30 on both axes) and the
    blurred pixels are copied in only where a box covers them. img itself is not
    modified.
    """
    # boolean mask with the same shape as img; True marks pixels to blur
    mask = np.zeros(img.shape, dtype=bool)
    for y, x, w, h in boxes:
        # clamp the origin so a negative coordinate cannot wrap around as a
        # negative slice index
        y0, x0 = max(y, 0), max(x, 0)
        # a scalar assignment broadcasts over any channel layout
        mask[y0:y + h, x0:x + w, ...] = True

    # sigmaY must be passed by keyword: the 4th positional parameter of
    # cv2.GaussianBlur is the output array `dst`, not sigmaY
    blurred = cv2.GaussianBlur(img, (51, 51), sigmaX=30, sigmaY=30)
    #blurred = cv2.GaussianBlur(img, (75, 75), sigmaX=100, sigmaY=100)

    return np.where(mask, blurred, img)

def _boxes_are_near(box_a, box_b, distance_thresh):
    """True when the gap between two (top, left, w, h) boxes is at most distance_thresh on both axes."""
    a_top, a_left, a_w, a_h = box_a
    b_top, b_left, b_w, b_h = box_b
    # a negative gap means the boxes overlap along that axis
    horizontal_gap = max(a_left, b_left) - min(a_left + a_w, b_left + b_w)
    vertical_gap = max(a_top, b_top) - min(a_top + a_h, b_top + b_h)
    return horizontal_gap <= distance_thresh and vertical_gap <= distance_thresh


def _bounding_box(box_a, box_b):
    """Smallest (top, left, w, h) box that contains both input boxes."""
    a_top, a_left, a_w, a_h = box_a
    b_top, b_left, b_w, b_h = box_b
    top = min(a_top, b_top)
    left = min(a_left, b_left)
    # take the max over both far edges so a box fully contained in the other
    # cannot shrink the result
    right = max(a_left + a_w, b_left + b_w)
    bottom = max(a_top + a_h, b_top + b_h)
    return (top, left, right - left, bottom - top)


def merge_nearby_boxes(boxes, distance_thresh=10):
    """
    Join boxes that overlap or lie within distance_thresh pixels of each other.

    boxes = iterable of (top, left, width, height) tuples
    distance_thresh = largest gap, in pixels, allowed on each axis for two
                      boxes to be considered neighbours

    returns a set of (top, left, width, height) tuples containing
      * the bounding box of every pair of neighbouring input boxes, and
      * every input box that has no neighbour, unchanged.

    Merging is pairwise, not transitive: a chain of neighbours produces several
    overlapping bounding boxes rather than one large box.
    """
    # de-duplicate while keeping a stable order
    unique_boxes = list(dict.fromkeys(tuple(box) for box in boxes))

    merged_rects = set()   # bounding boxes produced by merging a pair
    absorbed_rects = set()  # input boxes that were merged into a bounding box

    for rect, next_rect in itertools.combinations(unique_boxes, 2):
        if _boxes_are_near(rect, next_rect, distance_thresh):
            merged_rects.add(_bounding_box(rect, next_rect))
            absorbed_rects.add(rect)
            absorbed_rects.add(next_rect)

    # Only the *input* boxes are filtered against absorbed_rects. A merged box
    # that happens to equal one of its inputs (one box contains the other) must
    # survive, and unmerged boxes are kept wherever they sit in the input.
    untouched_rects = {box for box in unique_boxes if box not in absorbed_rects}

    return merged_rects | untouched_rects



In [None]:
# set to True to also write the cleaned image handed to OCR for each kernel size
debug = False

# contrast adjustment lookup table
# (see: https://stackoverflow.com/questions/42257173/contrast-stretching-in-python-opencv)
# It is identical for every image, so build it once outside the loop.
xp = [0, 64, 128, 192, 255]
fp = [0, 16, 128, 240, 255]
x = np.arange(256)
table = np.interp(x, xp, fp).astype('uint8')

for img_file in img_files:
    img = cv2.imread(img_file)
    # cv2.imread signals an unreadable/missing file by returning None
    if img is None:
        print(f"skipping unreadable image: {img_file}")
        continue
    boxes = []

    # output file names are derived from the source file name
    fn_w_ext = os.path.basename(img_file)
    fn, ext = os.path.splitext(fn_w_ext)

    cleaned = cv2.LUT(img, table)

    # k == 0 runs OCR on the contrast-adjusted image as is; larger k values apply
    # an erode+dilate pass with a k x k kernel on top of the previous pass
    for k in [0, 3, 5]:
        # minor text cleaning before we ocr
        if k > 0:
            kernel = np.ones((k, k), np.uint8)
            cleaned = cv2.erode(cleaned, kernel, iterations=1)
            cleaned = cv2.dilate(cleaned, kernel, iterations=1)

        # used to check out the cleaned image being handed to ocr;
        # k is part of the name so the passes do not overwrite each other
        if debug:
            cv2.imwrite(f"{fn}_cleaned_k{k}{ext}", cleaned)

        # threshold at several levels and run OCR on each (one task per threshold)
        with parallel_backend('threading'):
            boxes.extend(
                Parallel(n_jobs=num_cores)(
                    delayed(ocr_gray_img)(cleaned, i) for i in range(50, 200, 24)
                )
            )

    # flatten the list of lists (one list of boxes per threshold) and reduce to unique elements
    boxes = list(set(box for box_list in boxes for box in box_list))
    # combine nearby boxes to blur interspaces
    distance_thresh = int(min(img.shape[0:2]) * 0.05)  # 0.025
    boxes = merge_nearby_boxes(boxes, distance_thresh)

    # blur the area within each box
    img = apply_blur(img, boxes)

    # save it next to the notebook as <name>_blurred<ext>
    new_fn = f"{fn}_blurred{ext}"
    cv2.imwrite(new_fn, img)

In [None]:
# `boxes` still holds the merged boxes of the last image handled by the loop above;
# show how many regions were blurred in it
len(boxes)