# PyTesseract

In [16]:
import pytesseract
from PIL import Image
import os

# Tesseract executable.
# The Windows install location below is only a default: set the TESSERACT_CMD
# environment variable to point somewhere else. If the resolved path does not
# exist, pytesseract's own default is left untouched so that a `tesseract`
# binary available on PATH is still found on other machines.
DEFAULT_TESSERACT_CMD = "C:/Program Files/Tesseract-OCR/tesseract.exe"
tesseract_cmd = os.environ.get("TESSERACT_CMD", DEFAULT_TESSERACT_CMD)
if os.path.isfile(tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

image_directory = 'data/000'
output_directory = "outputs/pytesseract"

# Create the output folder up front; open(..., 'w') does not create missing
# parent directories and would otherwise raise FileNotFoundError.
os.makedirs(output_directory, exist_ok=True)

# sorted() gives a deterministic processing order (os.listdir order is arbitrary).
for filename in sorted(os.listdir(image_directory)):

    # Case-insensitive extension check so "*.PNG" files are processed too.
    if not filename.lower().endswith('.png'):
        continue

    image_path = os.path.join(image_directory, filename)

    # Output file: same base name as the image, with a .txt extension.
    output_file = output_directory + "/" + os.path.splitext(filename)[0] + '.txt'

    # Open using PIL; the context manager releases the image's file handle
    # as soon as OCR is done instead of leaking one handle per image.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)

    # Write only after OCR succeeded, so a failed run does not leave an empty
    # file behind. UTF-8 is explicit because the recognised text may contain
    # characters the platform's default encoding cannot represent.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(text)

    print(f"Text extracted from {filename} and saved to {output_file}")


Text extracted from a01-000u.png and saved to outputs/pytesseract/a01-000u.txt
Text extracted from a01-003u.png and saved to outputs/pytesseract/a01-003u.txt
Text extracted from a01-007u.png and saved to outputs/pytesseract/a01-007u.txt
Text extracted from a01-011u.png and saved to outputs/pytesseract/a01-011u.txt
Text extracted from a01-014u.png and saved to outputs/pytesseract/a01-014u.txt
Text extracted from a01-020u.png and saved to outputs/pytesseract/a01-020u.txt
Text extracted from a01-026u.png and saved to outputs/pytesseract/a01-026u.txt
Text extracted from a01-030u.png and saved to outputs/pytesseract/a01-030u.txt
Text extracted from a01-043u.png and saved to outputs/pytesseract/a01-043u.txt
Text extracted from a01-049u.png and saved to outputs/pytesseract/a01-049u.txt
Text extracted from a01-049x.png and saved to outputs/pytesseract/a01-049x.txt
Text extracted from a01-053u.png and saved to outputs/pytesseract/a01-053u.txt
Text extracted from a01-058u.png and saved to output

# Keras-OCR

In [2]:
import matplotlib.pyplot as plt

import keras_ocr

# Build the default keras-ocr pipeline. The pretrained detector and
# recognizer weights are downloaded automatically by keras-ocr.
pipeline = keras_ocr.pipeline.Pipeline()

# Three example images, fetched by URL.
sample_urls = [
    'https://upload.wikimedia.org/wikipedia/commons/b/bd/Army_Reserves_Recruitment_Banner_MOD_45156284.jpg',
    'https://upload.wikimedia.org/wikipedia/commons/e/e8/FseeG2QeLXo.jpg',
    'https://upload.wikimedia.org/wikipedia/commons/b/b4/EUBanana-500x112.jpg'
]
images = [keras_ocr.tools.read(sample_url) for sample_url in sample_urls]

# prediction_groups holds one list per input image; each of those lists
# is made of (word, box) tuples.
prediction_groups = pipeline.recognize(images)

# Plot the predictions: one subplot row per image, annotated with its words.
fig, axs = plt.subplots(nrows=len(images), figsize=(20, 20))
for axis, picture, word_boxes in zip(axs, images, prediction_groups):
    keras_ocr.tools.drawAnnotations(
        image=picture,
        predictions=word_boxes,
        ax=axis,
    )

ModuleNotFoundError: No module named 'tensorflow'

# EasyOCR

In [14]:
import easyocr
import os

# Create the English reader once and reuse it for every image in the folder.
reader = easyocr.Reader(['en'])

folder_path = 'data/000'
output_folder = "outputs/easy-ocr"

# Create the output folder up front; open(..., 'w') does not create missing
# parent directories and would otherwise raise FileNotFoundError.
os.makedirs(output_folder, exist_ok=True)

# sorted() gives a deterministic processing order (os.listdir order is arbitrary).
for filename in sorted(os.listdir(folder_path)):

    # Case-insensitive extension check so "*.PNG" files are processed too.
    if not filename.lower().endswith('.png'):
        continue

    image_path = os.path.join(folder_path, filename)

    # detail=0 requests EasyOCR's simple output: the recognised text pieces
    # only, which are written below one per line.
    output = reader.readtext(image_path, detail=0)

    # Output file: same base name as the image, with a .txt extension.
    output_file = output_folder + "/" + os.path.splitext(filename)[0] + '.txt'

    # UTF-8 is explicit because the recognised text may contain characters
    # the platform's default encoding cannot represent.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in output)

    print(f"Text for {filename} written to {output_file}")


Text for a01-000u.png written to outputs/easy-ocr/a01-000u.txt
Text for a01-003u.png written to outputs/easy-ocr/a01-003u.txt
Text for a01-007u.png written to outputs/easy-ocr/a01-007u.txt
Text for a01-011u.png written to outputs/easy-ocr/a01-011u.txt
Text for a01-014u.png written to outputs/easy-ocr/a01-014u.txt
Text for a01-020u.png written to outputs/easy-ocr/a01-020u.txt
Text for a01-026u.png written to outputs/easy-ocr/a01-026u.txt
Text for a01-030u.png written to outputs/easy-ocr/a01-030u.txt
Text for a01-043u.png written to outputs/easy-ocr/a01-043u.txt
Text for a01-049u.png written to outputs/easy-ocr/a01-049u.txt
Text for a01-049x.png written to outputs/easy-ocr/a01-049x.txt
Text for a01-053u.png written to outputs/easy-ocr/a01-053u.txt
Text for a01-058u.png written to outputs/easy-ocr/a01-058u.txt
Text for a01-063u.png written to outputs/easy-ocr/a01-063u.txt
Text for a01-068u.png written to outputs/easy-ocr/a01-068u.txt
Text for a01-072u.png written to outputs/easy-ocr/a01-0