# PyTesseract

In [None]:
import pytesseract
from PIL import Image
import os

#Tesseract executable
pytesseract.pytesseract.tesseract_cmd = "C:/Program Files/Tesseract-OCR/tesseract.exe"

image_directory = 'cleaned/000-cropped'

#expected output directory as text file
#output_directory = 'cleaned/000'
#file 1 in one directory will be file 1 in another directory

for filename in os.listdir(image_directory):

    if filename.endswith('.png'):
        image_path = os.path.join(image_directory, filename)

        # Open using PIL
        img = Image.open(image_path)

        # Output file format
        #output_file ="outputs/pytesseract/" + os.path.splitext(filename)[0] + '.txt'
        output_file ="outputs/py_cleaned/" + os.path.splitext(filename)[0] + '.txt'

        # Run Tesseract and writes into output file
        with open(output_file, 'w') as f:
            text = pytesseract.image_to_string(img)
            f.write(text)

        print(f"Text extracted from {filename} and saved to {output_file}")


Text extracted from a01-000.png and saved to outputs/py_cleaned/a01-000.txt
Text extracted from a01-003.png and saved to outputs/py_cleaned/a01-003.txt
Text extracted from a01-007.png and saved to outputs/py_cleaned/a01-007.txt
Text extracted from a01-011.png and saved to outputs/py_cleaned/a01-011.txt
Text extracted from a01-014.png and saved to outputs/py_cleaned/a01-014.txt
Text extracted from a01-020.png and saved to outputs/py_cleaned/a01-020.txt
Text extracted from a01-026.png and saved to outputs/py_cleaned/a01-026.txt
Text extracted from a01-030.png and saved to outputs/py_cleaned/a01-030.txt
Text extracted from a01-043.png and saved to outputs/py_cleaned/a01-043.txt
Text extracted from a01-049.png and saved to outputs/py_cleaned/a01-049.txt
Text extracted from a01-049x.png and saved to outputs/py_cleaned/a01-049x.txt
Text extracted from a01-053.png and saved to outputs/py_cleaned/a01-053.txt
Text extracted from a01-058.png and saved to outputs/py_cleaned/a01-058.txt
Text extra

In [18]:
import os
from difflib import SequenceMatcher
import Levenshtein

# Directories
expected_output_directory = 'cleaned/000'
actual_output_directory = 'outputs/py_cleaned'

# METHOD 1:
# Similarity percentage using Levenshtein distance (edit distance), more robust for OCR
# Levenshtein distance outputs how similar, not dis-similar
def calculate_similarity_lev(text1, text2):
    distance = Levenshtein.distance(text1, text2)
    max_length = max(len(text1), len(text2))
    if max_length == 0:  # Avoids dividing by zero
        return 100.0
    return (1 - distance / max_length) * 100

# METHOD 2:
def calculate_similarity(text1, text2): 
    return SequenceMatcher(None, text1, text2).ratio() * 100

# Iterate through files
for filename in os.listdir(actual_output_directory):
    if filename.endswith('.txt'):
        actual_output_path = os.path.join(actual_output_directory, filename)
        expected_output_path = os.path.join(expected_output_directory, filename)

        # Read actual output
        with open(actual_output_path, 'r') as f:
            actual_text = f.read()

        # Read expected output
        if os.path.exists(expected_output_path):
            with open(expected_output_path, 'r') as f:
                expected_text = f.read()

            # Calculate similarity
            lev_similarity = calculate_similarity_lev(expected_text, actual_text)
            similarity = calculate_similarity(expected_text, actual_text)

            print(f"File: {filename}")
            print(f"Similarity Percentage from Levenshtein: {lev_similarity:.2f}%")
            print(f"Similarity Percentage from SeqMatch: {similarity:.2f}%")
        else:
            print(f"Expected output not found for {filename}")
    print("---------------------------------------------------")

File: a01-000.txt
Similarity Percentage from Levenshtein: 58.30%
Similarity Percentage from SeqMatch: 17.61%
---------------------------------------------------
File: a01-003.txt
Similarity Percentage from Levenshtein: 45.33%
Similarity Percentage from SeqMatch: 1.64%
---------------------------------------------------
File: a01-007.txt
Similarity Percentage from Levenshtein: 50.79%
Similarity Percentage from SeqMatch: 11.35%
---------------------------------------------------
Expected output not found for a01-011.txt
---------------------------------------------------
Expected output not found for a01-014.txt
---------------------------------------------------
Expected output not found for a01-020.txt
---------------------------------------------------
Expected output not found for a01-026.txt
---------------------------------------------------
File: a01-030.txt
Similarity Percentage from Levenshtein: 47.59%
Similarity Percentage from SeqMatch: 5.42%
----------------------------------

# Keras-OCR

In [2]:
import matplotlib.pyplot as plt

import keras_ocr

# keras-ocr will automatically download pretrained
# weights for the detector and recognizer.
pipeline = keras_ocr.pipeline.Pipeline()

# Get a set of three example images
images = [
    keras_ocr.tools.read(url) for url in [
        'https://upload.wikimedia.org/wikipedia/commons/b/bd/Army_Reserves_Recruitment_Banner_MOD_45156284.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/e/e8/FseeG2QeLXo.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/b/b4/EUBanana-500x112.jpg'
    ]
]

# Each list of predictions in prediction_groups is a list of
# (word, box) tuples.
prediction_groups = pipeline.recognize(images)

# Plot the predictions
fig, axs = plt.subplots(nrows=len(images), figsize=(20, 20))
for ax, image, predictions in zip(axs, images, prediction_groups):
    keras_ocr.tools.drawAnnotations(image=image, predictions=predictions, ax=ax)

ModuleNotFoundError: No module named 'tensorflow'

# EasyOCR

In [10]:
import easyocr
import os

reader = easyocr.Reader(['en'])

folder_path = 'cleaned/000-cropped'

for filename in os.listdir(folder_path):

    if filename.endswith('.png'):
        image_path = os.path.join(folder_path, filename)

        output = reader.readtext(image_path, detail=0)

        output_file ="outputs/easy-ocr-cleaned/" + os.path.splitext(filename)[0] + '.txt'

        with open(output_file, 'w') as f:
            for line in output:
                f.write(line + '\n')  

        print(f"Text for {filename} written to {output_file}")


Text for a01-000.png written to outputs/easy-ocr-cleaned/a01-000.txt
Text for a01-003.png written to outputs/easy-ocr-cleaned/a01-003.txt
Text for a01-007.png written to outputs/easy-ocr-cleaned/a01-007.txt
Text for a01-011.png written to outputs/easy-ocr-cleaned/a01-011.txt
Text for a01-014.png written to outputs/easy-ocr-cleaned/a01-014.txt
Text for a01-020.png written to outputs/easy-ocr-cleaned/a01-020.txt
Text for a01-026.png written to outputs/easy-ocr-cleaned/a01-026.txt
Text for a01-030.png written to outputs/easy-ocr-cleaned/a01-030.txt
Text for a01-043.png written to outputs/easy-ocr-cleaned/a01-043.txt
Text for a01-049.png written to outputs/easy-ocr-cleaned/a01-049.txt
Text for a01-049x.png written to outputs/easy-ocr-cleaned/a01-049x.txt
Text for a01-053.png written to outputs/easy-ocr-cleaned/a01-053.txt
Text for a01-058.png written to outputs/easy-ocr-cleaned/a01-058.txt
Text for a01-063.png written to outputs/easy-ocr-cleaned/a01-063.txt
Text for a01-068.png written to 