# OCR engine comparison

<img src="https://media.arxiv-vanity.com/render-output/6158804/images/fig_2_no_trademarks.png" alt="text to images" width="600" height="600">

In [1]:
import pandas as pd
import numpy as np
import random

from glob import glob
# from tqdm.notebook import tqdm
from tqdm import tqdm

import matplotlib.pyplot as plt
from PIL import Image
import cv2
import Levenshtein as Lv
from statistics import mean

plt.style.use('ggplot')

# Outline
1. Take a look at the data
2. Extract text from images:
    - pytesseract
    - easyocr
    - keras_ocr
3. Run on a few examples and compare the results

## About the data
TextOCR requires models to perform text-recognition on arbitrary shaped scene-text present on natural images. TextOCR provides ~1M high quality word annotations on TextVQA images allowing application of end-to-end reasoning on downstream tasks such as visual question answering or image captioning.

- 28,134 natural images from TextVQA
- 903,069 annotated scene-text words
- 32 words per image on average

In [2]:
annot = pd.read_parquet('train_val_images/annot.parquet')
imgs = pd.read_parquet('train_val_images/img.parquet')
img_fns = glob('train_val_images/train_images/*')

In [None]:
annot.head(5)

# Plot Example Images

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(plt.imread(img_fns[100]))
ax.axis('off')
plt.show()

In [None]:
image_id = img_fns[100].split('/')[-1].split('\\')[-1].split('.')[0]
annot[annot["image_id"] == image_id]

## Display for first 25 images

In [None]:
fig, axs = plt.subplots(5, 5, figsize=(20, 20))
axs = axs.flatten()
for i in range(25):
    axs[i].imshow(plt.imread(img_fns[i]))
    axs[i].axis('off')
    image_id = img_fns[i].split('/')[-1].split('\\')[-1].split('.')[0]
    n_annot = len(annot.query('image_id == @image_id'))
    axs[i].set_title(f'{image_id} - {n_annot}')
plt.show()

***

# Comparison method:
Because the bounding box can be slightly different depending on the OCR engine, it would not be possible to to let the engines detect text, and compare the result to the ground truth by comparing it with the text in the bounding boxes provided by the dataset. Since words can repeat in a single picture, looking if a string is included in the annotations could also lead to false positives. 
To avoid these problems, the bounding boxes described by the dataset for the text will be used by the OCR engines as Regions Of Interet (ROIs). The resulting detected string can then be compared one-to-one to the base truth for this area of the picture.
Working on ROIs rather than the whole picture can improve the accuracy of the detection and recognition of characters by reducing noise, leading to skewed comparison results, but since the use cases of this thesis will also make use of ROIs, this method is here appropriate to choose the best engine for our uses.

### Statistical tests:
Test on 100 randomly chosen pictures from dataset
1. Word Error Rate (WER)
2. Character Error Rate (CER) -> Levenshtein distance
<br>-> Remove punctuation?

In [3]:
sample_size = 100
sample_idxs = random.sample(range(0, len(img_fns)), sample_size)

class Img:
    def __init__(self, idx):
        self.image = cv2.imread(img_fns[idx])
        self.image_id = img_fns[idx].split('/')[-1].split('\\')[-1].split('.')[0]
        self.annotations = annot.query('image_id == @self.image_id')

sample = [Img(idx) for idx in sample_idxs]


In [None]:
sample[0].annotations

# Engine 1: pytesseract

In [4]:
from pytesseract import pytesseract
pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'

In [13]:
with open('engine_acc_results/pytesseract_accuracy_test.txt', 'w', encoding="utf-8") as f:
    f.write(f"Pytesseract inaccurate results:\n")

total_accurate = 0
total_annot = 0
total_char = 0
lv_distances = []

for img in sample[:20]:

    with open('engine_acc_results/pytesseract_accuracy_test.txt', 'a', encoding="utf-8") as f:
        f.write(f"Image {img.image_id}:\n")

    img_accurate = 0
    img_annot = len(img.annotations)
    total_annot += img_annot

    for _, row in img.annotations.iterrows():
        
        if row['utf8_string'] == ".":
            continue

        # Pass ROI to engine
        roi = [int(x) for x in row['bbox']]
        roi_img = img.image[roi[1]:roi[1]+roi[3], roi[0]:roi[0]+roi[2]]
        result = pytesseract.image_to_string(roi_img)

        # Get Levenshtein distance
        lv_distance = Lv.distance(row['utf8_string'].lower(), result.lower())
        lv_distances.append(lv_distance)
        
        # Check if word recognition is correct
        if result.lower() == row['utf8_string'].lower():
            img_accurate += 1
        else:
            with open('engine_acc_results/pytesseract_accuracy_test.txt', 'a', encoding="utf-8") as f:
                f.write(f"Expected: {row['utf8_string'].lower()}, actual: {result.lower()}, lv_distance: {lv_distance}\n")
            
        total_accurate += img_accurate
        total_char += len(row['utf8_string'])

    with open('engine_acc_results/pytesseract_accuracy_test.txt', 'a', encoding="utf-8") as f:
        f.write("\n")
    
    # print(f"Image word accuracy = {img_accurate}/{img_annot}")

In [9]:
print(f"Total word accuracy = {total_accurate}/{total_annot}")
print(f"Sum of Levenshtein distances / Total number of characters = {sum(lv_distances)} / {total_char}")

Total word accuracy = 0/581
Sum of Levenshtein distances / Total number of characters = 2183 / 2537


# Method 2: easyocr

In [11]:
import easyocr
reader = easyocr.Reader(['en'], gpu = True)

CUDA not available - defaulting to CPU. Note: This module is much faster with a GPU.


In [12]:
with open('engine_acc_results/easy_accuracy_test.txt', 'w', encoding="utf-8") as f:
    f.write(f"EasyOCR inaccurate results:\n")

total_accurate = 0
total_annot = 0
total_char = 0
lv_distances = []

for img in sample[:20]:

    with open('engine_acc_results/easy_accuracy_test.txt', 'a', encoding="utf-8") as f:
        f.write(f"Image {img.image_id}:\n")

    img_accurate = 0
    img_annot = len(img.annotations)
    total_annot += img_annot

    for _, row in img.annotations.iterrows():
        
        # Pass ROI to engine
        roi = [int(x) for x in row['bbox']]
        roi_img = img.image[roi[1]:roi[1]+roi[3], roi[0]:roi[0]+roi[2]]
        result = reader.readtext(roi_img)
        result_df = pd.DataFrame(result, columns=['bbox','text','conf'])
        result = ''.join(result_df['text'])
        
        # Get Levenshtein distance
        lv_distance = Lv.distance(row['utf8_string'].lower(), result.lower())
        lv_distances.append(lv_distance)
        
        # Check if word recognition is correct
        if result.lower() == row['utf8_string'].lower():
            img_accurate += 1
        else:
            with open('engine_acc_results/easy_accuracy_test.txt', 'a', encoding="utf-8") as f:
                f.write(f"Expected: {row['utf8_string'].lower()}, actual: {result.lower()}, lv_distance: {lv_distance}\n")

        total_accurate += img_accurate
        total_char += len(row['utf8_string'])

    with open('engine_acc_results/easy_accuracy_test.txt', 'a', encoding="utf-8") as f:
        f.write("\n")
    
    print(f"Image word accuracy = {img_accurate}/{img_annot}")

Image word accuracy = 0/0
Image word accuracy = 3/13
Image word accuracy = 5/114
Image word accuracy = 0/0
Image word accuracy = 2/42
Image word accuracy = 7/73
Image word accuracy = 1/12
Image word accuracy = 0/0
Image word accuracy = 0/0
Image word accuracy = 0/35
Image word accuracy = 3/61
Image word accuracy = 25/29
Image word accuracy = 1/15
Image word accuracy = 1/40
Image word accuracy = 2/76
Image word accuracy = 1/8
Image word accuracy = 0/0
Image word accuracy = 0/2
Image word accuracy = 3/21
Image word accuracy = 3/40


In [None]:
print(f"Total word accuracy = {total_accurate}/{total_annot}")
print(f"Sum of Levenshtein distances / Total number of characters = {sum(lv_distances)} / {total_char}")

# Method 3: keras_ocr

In [None]:
import keras_ocr

pipeline = keras_ocr.pipeline.Pipeline()

In [None]:
results = pipeline.recognize([img_fns[100]])

In [None]:
pd.DataFrame(results[0], columns=['text', 'bbox'])