# OCR Tools Benchmark

    Author: Jean-Romain Roy 
    Date: 23/09/2020

This notebook contains scripts, test data and instructions explaining how to benchmark different OCR tools including CAI Meza, Google Vision, Amazon Textract and Tesseract.

The test data is composed of actual pictures of logbooks from our pilots in the Republic of Congo. Out of the valid pictures (relatively clear handwriting, logbook borders are visible, page is well lit), a few were selected randomly.

Images take two forms. In *test_data/full_images/*, images are in their raw format. In *test_data/cell_images/*, we already took care of cutting the individual cells from the source images. The tedious task of decoding the handwritten characters in the images has already been done and can be found in the same directories.

The *test_data/full_images/* directory also contains a JSON file describing the shape of each potential template, because Meza is configured to match incoming images against preloaded templates. As an aside, preloading templates is the feature that lets Meza organize images and reference each extracted cell. It also lets Meza automatically rotate an image that is not upright.


In [None]:
# Dependencies
import os
import json
import pandas as pd

In [None]:
# Root folders shared by the paths below
_CELL_IMAGES_ROOT = 'test_data/cell_images/'
_FULL_IMAGES_ROOT = 'test_data/full_images/'
_CELL_RESULTS_ROOT = 'results/cell_images/'

# Ground truth for the pre-cut cell images: one image folder and one
# ';'-separated values file per data type
BUBBLES_IMG_PATH = _CELL_IMAGES_ROOT + 'bubbles/images/'
BUBBLES_VALUES = _CELL_IMAGES_ROOT + 'bubbles/values.txt'

EASTERN_ARABIC_NUMERALS_IMG_PATH = _CELL_IMAGES_ROOT + 'eastern_numerals/images/'
EASTERN_ARABIC_NUMERALS_VALUES = _CELL_IMAGES_ROOT + 'eastern_numerals/values.txt'

WESTERN_ARABIC_NUMERALS_IMG_PATH = _CELL_IMAGES_ROOT + 'western_numerals/images/'
WESTERN_ARABIC_NUMERALS_VALUES = _CELL_IMAGES_ROOT + 'western_numerals/values.txt'

# Ground truth and template shapes for the raw (full page) images
FULL_IMG_PATH = _FULL_IMAGES_ROOT + 'images/'
FULL_IMG_VALUES = _FULL_IMAGES_ROOT + 'values.json'
FULL_IMG_TEMPLATES = _FULL_IMAGES_ROOT + 'templates.json'

# Folders holding the saved predictions for the cell images; the cells below
# append '<tool>/results.txt' to these (e.g. 'meza/results.txt')
RESULTS_BUBBLES_DIR = _CELL_RESULTS_ROOT + 'bubbles/'
RESULTS_EASTERN_ARABIC_NUMERALS_DIR = _CELL_RESULTS_ROOT + 'eastern_numerals/'
RESULTS_WESTERN_ARABIC_NUMERALS_DIR = _CELL_RESULTS_ROOT + 'western_numerals/'

# NOTE(review): the Tesseract, Google and AWS cells further down read
# RESULTS_TESSERACT_CELLS, RESULTS_GOOGLE_CELLS and RESULTS_AWS_CELLS, which are
# not defined in this cell -- confirm where they are supposed to come from.

## Load Test Data

### Bubbles

In [None]:
# Load the ground truth for the bubble cells: a ';'-separated file whose
# `filename` and `value` columns are both read as strings
bubbles_df = pd.read_csv(
    BUBBLES_VALUES,
    index_col=False,
    delimiter=";",
    dtype={'filename': 'str', 'value': 'str'},
)

# Prefix the bare filename with the image folder
bubbles_df['filename'] = BUBBLES_IMG_PATH + bubbles_df['filename'].astype(str)

# Replace , with . and turn missing values into ''.
# The result is assigned back to the column on purpose: calling
# replace(..., inplace=True) on `df['value']` is a chained in-place update,
# which pandas 2.x warns about and which leaves the DataFrame unchanged once
# Copy-on-Write is enabled (the default from pandas 3.0).
bubbles_df['value'] = (
    bubbles_df['value'].replace({',': '.'}, regex=True).fillna('')
)

# print stats
print(f"Length of Validation Set = {len(bubbles_df.index)}")

### Eastern Arabic Numerals

In [None]:
# Load the ground truth for the Eastern Arabic numeral cells: a ';'-separated
# file whose `filename` and `value` columns are both read as strings
eastern_numerals_df = pd.read_csv(
    EASTERN_ARABIC_NUMERALS_VALUES,
    index_col=False,
    delimiter=";",
    dtype={'filename': 'str', 'value': 'str'},
)

# Prefix the bare filename with the image folder
eastern_numerals_df['filename'] = (
    EASTERN_ARABIC_NUMERALS_IMG_PATH + eastern_numerals_df['filename'].astype(str)
)

# Replace , with . and turn missing values into ''.
# The result is assigned back to the column on purpose: calling
# replace(..., inplace=True) on `df['value']` is a chained in-place update,
# which pandas 2.x warns about and which leaves the DataFrame unchanged once
# Copy-on-Write is enabled (the default from pandas 3.0).
eastern_numerals_df['value'] = (
    eastern_numerals_df['value'].replace({',': '.'}, regex=True).fillna('')
)

# print stats
print(f"Length of Validation Set = {len(eastern_numerals_df.index)}")

### Western Arabic Numerals

In [None]:
# Load the ground truth for the Western Arabic numeral cells: a ';'-separated
# file whose `filename` and `value` columns are both read as strings
western_numerals_df = pd.read_csv(
    WESTERN_ARABIC_NUMERALS_VALUES,
    index_col=False,
    delimiter=";",
    dtype={'filename': 'str', 'value': 'str'},
)

# Prefix the bare filename with the image folder
western_numerals_df['filename'] = (
    WESTERN_ARABIC_NUMERALS_IMG_PATH + western_numerals_df['filename'].astype(str)
)

# Replace , with . and turn missing values into ''.
# The result is assigned back to the column on purpose: calling
# replace(..., inplace=True) on `df['value']` is a chained in-place update,
# which pandas 2.x warns about and which leaves the DataFrame unchanged once
# Copy-on-Write is enabled (the default from pandas 3.0).
western_numerals_df['value'] = (
    western_numerals_df['value'].replace({',': '.'}, regex=True).fillna('')
)

# print stats
print(f"Length of Validation Set = {len(western_numerals_df.index)}")

### Full Images

In [None]:
# Load the ground truth for the full images. The comparison cell below
# iterates over the parsed JSON and reads 'image_id' and 'cells' from each
# entry.
# The encoding is spelled out so the file is decoded the same way on every
# platform (open() otherwise falls back to the locale's default encoding).
with open(FULL_IMG_VALUES, encoding="utf-8") as json_file:
    full_dataframe = json.load(json_file)

# CAI Meza
Meza is able to fit the templates on the image and then decode each cell. We compare its accuracy with the actual values.

## Full Images
Images were submitted to Meza and, once they were processed, we simply downloaded the results.

In [None]:
# Load the values that the comparison cell below treats as Meza's predictions.
# NOTE(review): this opens FULL_IMG_VALUES, i.e. the very same file the ground
# truth is loaded from, so the comparison below scores the ground truth against
# itself. Presumably this should point at the results downloaded from Meza --
# confirm the correct path before trusting the reported accuracy.
# The encoding is spelled out so the file is decoded the same way on every
# platform (open() otherwise falls back to the locale's default encoding).
with open(FULL_IMG_VALUES, encoding="utf-8") as json_file:
    meza_full_dataframe = json.load(json_file)

In [None]:
# Per-image accuracy on the full images, keyed by image id: the fraction of
# predicted cells whose value equals the ground-truth value for the same
# rect_id.
accuracy_dict = {}

# Index the predictions by image id once instead of rescanning the whole list
# for every ground-truth image. setdefault keeps the first entry for an id.
pred_by_image = {}
for image_df in meza_full_dataframe:
    pred_by_image.setdefault(image_df['image_id'], image_df)

# go through images
for image in full_dataframe:

    image_id = image['image_id']

    # check that predictions exist for this image
    pred_df = pred_by_image.get(image_id)
    if pred_df is None:
        raise Exception(f'Could not find results for image with id ({image_id})')

    # rect_id -> value, for the predictions and for the ground truth
    pred_dict = {cell['rect_id']: cell['value'] for cell in pred_df['cells']}
    actual_dict = {cell['rect_id']: cell['value'] for cell in image['cells']}

    # A predicted cell that the ground truth does not know about cannot be
    # scored; say which ones instead of failing with a bare KeyError.
    unknown = [rect_id for rect_id in pred_dict if rect_id not in actual_dict]
    if unknown:
        raise KeyError(
            f'Image ({image_id}) has predictions for cells missing from the '
            f'actual values: {unknown}'
        )

    # compare
    # NOTE: only predicted cells are scored; ground-truth cells without a
    # prediction do not lower the accuracy.
    total = len(pred_dict)
    nbr_match = sum(
        1 for rect_id, pred_val in pred_dict.items()
        if actual_dict[rect_id] == pred_val
    )

    # set accuracy (an image without any predicted cell scores 0 instead of
    # dividing by zero)
    accuracy_dict[image_id] = nbr_match / float(total) if total else 0.0

In [None]:
# Print one line per image with its accuracy as a percentage, rounded to two
# decimal places
print("### Meza Results ###")
for image_id, ratio in accuracy_dict.items():
    accuracy = round(ratio * 10000.0) / 100.0
    print(f'Accuracy for {image_id}: {accuracy}%')

## Cell images
Here we fed the individual cell images to Meza one by one and noted each prediction.

### Western Arabic Numerals

In [None]:
# Load the predicted values for the western numeral cells
results_df = pd.read_csv(
    f'{RESULTS_WESTERN_ARABIC_NUMERALS_DIR}meza/results.txt',
    index_col=False,
    delimiter=";",
    dtype={'filename': 'str', 'value': 'str'},
)

# Replace , with . and turn missing values into ''. The result is assigned back
# because replace(..., inplace=True) on `df['value']` is a chained in-place
# update that leaves the DataFrame unchanged under pandas Copy-on-Write.
results_df['value'] = results_df['value'].replace({',': '.'}, regex=True).fillna('')

# filename -> predicted value, built once so each cell is a dict lookup rather
# than a scan of the whole DataFrame. setdefault keeps the first row when a
# filename appears more than once.
pred_by_filename = {}
for pred_filename, pred_value in zip(results_df['filename'], results_df['value']):
    pred_by_filename.setdefault(pred_filename, pred_value)

# Compare
total = 0
nbr_match = 0
for full_path, actual_val in zip(western_numerals_df['filename'], western_numerals_df['value']):

    # the results file stores bare filenames, so drop the folder part
    filename = full_path.split('/')[-1]

    # check that a prediction exists for this cell (previously a missing row
    # surfaced as an IndexError before this check could run)
    if filename not in pred_by_filename:
        raise Exception(f'Could not find results for cell with id ({filename})')

    if actual_val == pred_by_filename[filename]:
        nbr_match += 1
    total += 1

# Display (an empty validation set reports 0 instead of dividing by zero)
accuracy = round((nbr_match/float(total))*10000.0)/100.0 if total else 0.0
print(f'Accuracy on cell images : {accuracy}%')


### Eastern Arabic Numerals

In [None]:
# Load the predicted values for the eastern numeral cells
results_df = pd.read_csv(
    f'{RESULTS_EASTERN_ARABIC_NUMERALS_DIR}meza/results.txt',
    index_col=False,
    delimiter=";",
    dtype={'filename': 'str', 'value': 'str'},
)

# Replace , with . and turn missing values into ''. The result is assigned back
# because replace(..., inplace=True) on `df['value']` is a chained in-place
# update that leaves the DataFrame unchanged under pandas Copy-on-Write.
results_df['value'] = results_df['value'].replace({',': '.'}, regex=True).fillna('')

# filename -> predicted value, built once so each cell is a dict lookup rather
# than a scan of the whole DataFrame. setdefault keeps the first row when a
# filename appears more than once.
pred_by_filename = {}
for pred_filename, pred_value in zip(results_df['filename'], results_df['value']):
    pred_by_filename.setdefault(pred_filename, pred_value)

# Compare
total = 0
nbr_match = 0
for full_path, actual_val in zip(eastern_numerals_df['filename'], eastern_numerals_df['value']):

    # the results file stores bare filenames, so drop the folder part
    filename = full_path.split('/')[-1]

    # check that a prediction exists for this cell (previously a missing row
    # surfaced as an IndexError before this check could run)
    if filename not in pred_by_filename:
        raise Exception(f'Could not find results for cell with id ({filename})')

    if actual_val == pred_by_filename[filename]:
        nbr_match += 1
    total += 1

# Display (an empty validation set reports 0 instead of dividing by zero)
accuracy = round((nbr_match/float(total))*10000.0)/100.0 if total else 0.0
print(f'Accuracy on cell images : {accuracy}%')

### Bubbles

In [None]:
# Load the predicted values for the bubble cells
results_df = pd.read_csv(
    f'{RESULTS_BUBBLES_DIR}meza/results.txt',
    index_col=False,
    delimiter=";",
    dtype={'filename': 'str', 'value': 'str'},
)

# Replace , with . and turn missing values into ''. The result is assigned back
# because replace(..., inplace=True) on `df['value']` is a chained in-place
# update that leaves the DataFrame unchanged under pandas Copy-on-Write.
results_df['value'] = results_df['value'].replace({',': '.'}, regex=True).fillna('')

# filename -> predicted value, built once so each cell is a dict lookup rather
# than a scan of the whole DataFrame. setdefault keeps the first row when a
# filename appears more than once.
pred_by_filename = {}
for pred_filename, pred_value in zip(results_df['filename'], results_df['value']):
    pred_by_filename.setdefault(pred_filename, pred_value)

# Compare
total = 0
nbr_match = 0
for full_path, actual_val in zip(bubbles_df['filename'], bubbles_df['value']):

    # the results file stores bare filenames, so drop the folder part
    filename = full_path.split('/')[-1]

    # check that a prediction exists for this cell (previously a missing row
    # surfaced as an IndexError before this check could run)
    if filename not in pred_by_filename:
        raise Exception(f'Could not find results for cell with id ({filename})')

    if actual_val == pred_by_filename[filename]:
        nbr_match += 1
    total += 1

# Display (an empty validation set reports 0 instead of dividing by zero)
accuracy = round((nbr_match/float(total))*10000.0)/100.0 if total else 0.0
print(f'Accuracy on cell images : {accuracy}%')

# Tesseract
Tesseract doesn't feature true structured extraction. It can find text and return bounding boxes around it, but can't return the real table shape. Thus, we won't test it with the full images, only with the cell images. 

In [None]:
#!pip3 install pytesseract

In [None]:
# Score the saved Tesseract predictions against the ground-truth cells.
# NOTE(review): RESULTS_TESSERACT_CELLS and cells_dataframe are not defined in
# the cells shown earlier in this notebook (those build bubbles_df,
# eastern_numerals_df and western_numerals_df) -- confirm where they come from.
# cells_dataframe is presumably a list of {'filename', 'value'} records.
if os.path.exists(RESULTS_TESSERACT_CELLS):

    # one dict per row of the results file, keyed by the header names
    tesseract_cells_dataframe = []

    with open(RESULTS_TESSERACT_CELLS, 'r', encoding="utf-8") as f:

        # first line holds the ';'-separated column names
        headers = [val.strip() for val in f.readline().split(';')]

        for row in f:

            # a blank (e.g. trailing) line carries no record; without this
            # check it fails with an IndexError below
            if not row.strip():
                continue

            values = [val.strip() for val in row.split(';')]
            tesseract_cells_dataframe.append(
                {header: values[i] for i, header in enumerate(headers)}
            )

    # filename -> predicted value, built once so each cell is a dict lookup;
    # setdefault keeps the first row when a filename appears more than once
    pred_by_filename = {}
    for datum in tesseract_cells_dataframe:
        pred_by_filename.setdefault(datum['filename'], datum['value'])

    # Compare
    total = 0
    nbr_match = 0
    for cell in cells_dataframe:

        # the results file stores bare filenames, so drop the folder part
        filename = cell['filename'].split('/')[-1]

        # check that a prediction exists for this cell
        if filename not in pred_by_filename:
            raise Exception(f'Could not find results for cell with id ({filename})')

        if cell['value'] == pred_by_filename[filename]:
            nbr_match += 1
        total += 1

    # Display. This lives inside the `if` so that a missing results file can no
    # longer report the counters left over from a previous cell.
    accuracy = round((nbr_match/float(total))*10000.0)/100.0 if total else 0.0
    print(f'Accuracy on cell images : {accuracy}%')

else:
    print(f'No saved Tesseract results found at {RESULTS_TESSERACT_CELLS}')

### To rerun tesseract on the cell images

In [None]:
# try:
#     from PIL import Image
# except ImportError:
#     import Image
# import pytesseract

# # go through cells
# total = 0
# nbr_match = 0
# for i, cell in enumerate(cells_dataframe):
    
#     # grab attributes
#     filename = cell['filename']
#     actual_val = cell['value']
    
#     # load image
#     img = Image.open(filename)
    
#     # decode using tesseract
#     pred_val = pytesseract.image_to_string(img)
#     if(pred_val is None):
#         pred_val = 'failed'
    
#     # format
#     pred_val = str(pred_val).strip()
#     pred_val = pred_val.replace('\n', '')
#     pred_val = pred_val.replace(',', '.')
    
#     # set
#     cells_dataframe[i]['pred'] = pred_val
    
#     # Simple image to string
#     if(actual_val == pred_val):
#         nbr_match += 1
#     total += 1
    

# # save results
# with open(RESULTS_TESSERACT_CELLS, 'w+', newline='', encoding="utf-8") as outfile:
#     outfile.write("filename;value\n")
    
#     # go through cells
#     for cell in cells_dataframe:
#         filename = cell['filename'].split('/')[-1]
#         pred_val = cell['pred']
#         outfile.write(f"{filename};{pred_val}\n")

# Google Vision
Google also doesn't really feature table extraction, so we will only test the cell images.

In [None]:
# Libraries
#!pip3 install google-cloud-vision

In [None]:
# Score the saved Google Vision predictions against the ground-truth cells.
# NOTE(review): RESULTS_GOOGLE_CELLS and cells_dataframe are not defined in the
# cells shown earlier in this notebook (those build bubbles_df,
# eastern_numerals_df and western_numerals_df) -- confirm where they come from.
# cells_dataframe is presumably a list of {'filename', 'value'} records.
if os.path.exists(RESULTS_GOOGLE_CELLS):

    # one dict per row of the results file, keyed by the header names
    google_cells_dataframe = []

    with open(RESULTS_GOOGLE_CELLS, 'r', encoding="utf-8") as f:

        # first line holds the ';'-separated column names
        headers = [val.strip() for val in f.readline().split(';')]

        for row in f:

            # a blank (e.g. trailing) line carries no record; without this
            # check it fails with an IndexError below
            if not row.strip():
                continue

            values = [val.strip() for val in row.split(';')]
            google_cells_dataframe.append(
                {header: values[i] for i, header in enumerate(headers)}
            )

    # filename -> predicted value, built once so each cell is a dict lookup;
    # setdefault keeps the first row when a filename appears more than once
    pred_by_filename = {}
    for datum in google_cells_dataframe:
        pred_by_filename.setdefault(datum['filename'], datum['value'])

    # Compare
    total = 0
    nbr_match = 0
    for cell in cells_dataframe:

        # the results file stores bare filenames, so drop the folder part
        filename = cell['filename'].split('/')[-1]

        # check that a prediction exists for this cell
        if filename not in pred_by_filename:
            raise Exception(f'Could not find results for cell with id ({filename})')

        if cell['value'] == pred_by_filename[filename]:
            nbr_match += 1
        total += 1

    # Display. This lives inside the `if` so that a missing results file can no
    # longer report the counters left over from a previous cell.
    accuracy = round((nbr_match/float(total))*10000.0)/100.0 if total else 0.0
    print(f'Accuracy on cell images : {accuracy}%')

else:
    print(f'No saved Google Vision results found at {RESULTS_GOOGLE_CELLS}')

### To rerun google on the cell images

In [None]:
# # YOU NEED TO SETUP YOUR API KEY BEFORE RUNNING THIS
# from google.cloud import vision

# client = vision.ImageAnnotatorClient()

# # Detects text in the file
# def google_detect_text(path):

#     with open(path, 'rb') as image_file:
#         content = image_file.read()

#     image = vision.Image(content=content)

#     response = client.text_detection(image=image)
#     texts = response.text_annotations

#     for text in texts:
#         return str('{}'.format(text.description)).strip()


# # go through cells
# total = 0
# nbr_match = 0
# for i, cell in western_numerals_df.iterrows():
    
#     # grab attributes
#     filename = cell['filename']
#     actual_val = cell['value']
    
#     # decode using google
#     pred_val = google_detect_text(filename)
#     if(pred_val is None):
#         pred_val = 'failed'
    
#     # format
#     pred_val = str(pred_val).strip()
#     pred_val = pred_val.replace('\n', '')
#     pred_val = pred_val.replace(',', '.')
    
#     # set
#     cells_dataframe[i]['pred'] = pred_val
    
#     # Simple image to string
#     if(actual_val == pred_val):
#         nbr_match += 1
#     total += 1
    

# # save results
# with open(RESULTS_GOOGLE_CELLS, 'w+', newline='', encoding="utf-8") as outfile:
#     outfile.write("filename;value\n")
    
#     # go through cells
#     for cell in cells_dataframe:
#         filename = cell['filename'].split('/')[-1]
#         pred_val = cell['pred']
#         outfile.write(f"{filename};{pred_val}\n")

# Amazon Textract
Amazon Textract is theoretically able to do structured extraction. Thus, we run both tests (full images and cell images) for this benchmark.

### Full Images

For the full images, I suggest that you upload the test images to Amazon Textract and use its web console:
    https://us-east-2.console.aws.amazon.com/textract/home?region=us-east-2#/demo

The results are saved in *results/aws/full_images/*. I have not taken the time to write code to analyse the results, but I invite TECI to do so. Images captured with a high-end camera in perfect lighting work relatively well, but the images from the field perform very poorly. Also, because we can't preload templates, Amazon Textract messes up the table shape. On a final note, Amazon Textract doesn't handle checkboxes/bubbles or the future data types that we are working on, such as signatures, fingerprints, etc.

### Cell Images
The performance is too low; I don't think I am hitting the right endpoint for this type of image. Still, the full-image test is indicative of the performance on single-cell images.

In [None]:
#!pip3 install boto3

In [None]:
# Score the saved Amazon Textract predictions against the ground-truth cells.
# NOTE(review): RESULTS_AWS_CELLS and cells_dataframe are not defined in the
# cells shown earlier in this notebook (those build bubbles_df,
# eastern_numerals_df and western_numerals_df) -- confirm where they come from.
# cells_dataframe is presumably a list of {'filename', 'value'} records.
if os.path.exists(RESULTS_AWS_CELLS):

    # one dict per row of the results file, keyed by the header names.
    # This used to be stored in `google_cells_dataframe` (copy-paste from the
    # Google cell), which overwrote the Google results with the AWS ones.
    aws_cells_dataframe = []

    with open(RESULTS_AWS_CELLS, 'r', encoding="utf-8") as f:

        # first line holds the ';'-separated column names
        headers = [val.strip() for val in f.readline().split(';')]

        for row in f:

            # a blank (e.g. trailing) line carries no record; without this
            # check it fails with an IndexError below
            if not row.strip():
                continue

            values = [val.strip() for val in row.split(';')]
            aws_cells_dataframe.append(
                {header: values[i] for i, header in enumerate(headers)}
            )

    # filename -> predicted value, built once so each cell is a dict lookup;
    # setdefault keeps the first row when a filename appears more than once
    pred_by_filename = {}
    for datum in aws_cells_dataframe:
        pred_by_filename.setdefault(datum['filename'], datum['value'])

    # Compare
    total = 0
    nbr_match = 0
    for cell in cells_dataframe:

        # the results file stores bare filenames, so drop the folder part
        filename = cell['filename'].split('/')[-1]

        # check that a prediction exists for this cell
        if filename not in pred_by_filename:
            raise Exception(f'Could not find results for cell with id ({filename})')

        if cell['value'] == pred_by_filename[filename]:
            nbr_match += 1
        total += 1

    # Display. This lives inside the `if` so that a missing results file can no
    # longer report the counters left over from a previous cell.
    accuracy = round((nbr_match/float(total))*10000.0)/100.0 if total else 0.0
    print(f'Accuracy on cell images : {accuracy}%')

else:
    print(f'No saved Amazon Textract results found at {RESULTS_AWS_CELLS}')

### To rerun aws on the cell images

In [None]:
# import boto3

# # SET YOUR API KEY HERE
# AWS_REGION_NAME=''
# AWS_ACCESS_KEY_ID=''
# AWS_SECRET_ACCESS_KEY=''
# SESSION_TOKEN=''

# # Amazon Textract client
# textract = boto3.client(
#     'textract', 
#     region_name=AWS_REGION_NAME,
#     aws_access_key_id=AWS_ACCESS_KEY_ID,
#     aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
#     aws_session_token=SESSION_TOKEN
# )

# def aws_detect_text(path):

#     # Read document content
#     with open(path, 'rb') as document:
#         imageBytes = bytearray(document.read())


#     # Call Amazon Textract
#     response = textract.detect_document_text(Document={'Bytes': imageBytes})

#     textRes = []
#     # Print detected text
#     for item in response["Blocks"]:
#         if item["BlockType"] == "LINE":
#             textRes.append(item["Text"])

#     textRes = ','.join(textRes)
#     if(len(textRes) == 0):
#         textRes = 'failed'

#     return textRes


# # go through cells
# total = 0
# nbr_match = 0
# for i, cell in enumerate(cells_dataframe):
    
#     # grab attributes
#     filename = cell['filename']
#     actual_val = cell['value']
    
#     # decode using aws
#     pred_val = aws_detect_text(filename)
#     if(pred_val is None):
#         pred_val = 'failed'
    
#     # format
#     pred_val = str(pred_val).strip()
#     pred_val = pred_val.replace('\n', '')
#     pred_val = pred_val.replace(',', '.')
    
#     # set
#     cells_dataframe[i]['pred'] = pred_val
    
#     # Simple image to string
#     if(actual_val == pred_val):
#         nbr_match += 1
#     total += 1
    

# # save results
# with open(RESULTS_AWS_CELLS, 'w+', newline='', encoding="utf-8") as outfile:
#     outfile.write("filename;value\n")
    
#     # go through cells
#     for cell in cells_dataframe:
#         filename = cell['filename'].split('/')[-1]
#         pred_val = cell['pred']
#         outfile.write(f"{filename};{pred_val}\n")