In [1]:
import numpy as np #2.2.1
import cv2 #4.10.0.84
import os
from natsort import natsorted
from PIL import Image, ImageDraw, ImageFont
import pandas as pd
#OCR Framework
import pytesseract
from paddleocr import PaddleOCR
import paddle
import easyocr

#OCR Metrics
from jiwer import wer, cer
import time

In [6]:
# Report which device PaddlePaddle selected for this kernel session.
print(f"PaddlePaddle device: {paddle.device.get_device()}")

PaddlePaddle device: cpu


OCR frameworks compared: Pytesseract, PaddleOCR, EasyOCR
Metrics used: CER (Character Error Rate), WER (Word Error Rate), EMA (Exact Match Accuracy)

In [7]:
def parse_label_files(label_folder, num_files):
    """Read the first ``num_files`` label files (natural sort order) from a folder.

    Each non-blank line of a label file is expected to look like
    ``x1,y1,x2,y2,x3,y3,x4,y4,text``: eight integer polygon coordinates
    followed by the transcription.

    Args:
        label_folder: Directory containing the label files.
        num_files: Maximum number of label files to read.

    Returns:
        dict mapping label file name -> list of ``(coords, text)`` tuples, where
        ``coords`` is a list of 8 ints. Keys are inserted in natural sort order.

    Raises:
        ValueError: If a non-blank line has fewer than 9 comma-separated fields
            or a coordinate is not an integer.
    """
    # Drop directories and hidden entries (e.g. Jupyter's ".ipynb_checkpoints")
    # *before* slicing, so they neither get opened as labels nor use up one of
    # the ``num_files`` slots.
    label_files = [
        name
        for name in natsorted(os.listdir(label_folder))
        if not name.startswith('.')
        and os.path.isfile(os.path.join(label_folder, name))
    ][:num_files]

    annotations = {}
    for label_file in label_files:
        file_path = os.path.join(label_folder, label_file)
        annotation = []
        # Explicit encoding: the transcriptions are non-ASCII, and "utf-8-sig"
        # also swallows a leading BOM that would otherwise break int() on the
        # first coordinate.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            for line_no, line in enumerate(file, start=1):
                line = line.strip()
                if not line:
                    continue  # tolerate blank / trailing empty lines
                # Split at most 8 times so commas inside the transcription
                # stay part of the text instead of truncating it.
                data = line.split(',', 8)
                if len(data) < 9:
                    raise ValueError(
                        f"{file_path}:{line_no}: expected 8 coordinates and a text field, got {line!r}"
                    )
                coords = [int(value) for value in data[:8]]
                annotation.append((coords, data[8]))
        annotations[label_file] = annotation
    return annotations

def draw_boxes(image_folder, label_folder, num_files, font_path="/usr/share/fonts/truetype/noto/NotoSans-Regular.ttf", font_size = 16):
    """Show each image with its ground-truth polygons and transcriptions drawn on it.

    The k-th label file (in the order returned by ``parse_label_files``) is
    paired with ``im{k:04d}.jpg`` in ``image_folder``. Each image is displayed
    in an OpenCV window and the function blocks until a key is pressed.

    Args:
        image_folder: Directory containing ``imNNNN.jpg`` images.
        label_folder: Directory containing the label files.
        num_files: Maximum number of label files to visualise.
        font_path: TrueType font used to render the transcriptions.
        font_size: Font size in points; also used as the vertical offset of the
            text above the first polygon vertex.
    """
    annotations = parse_label_files(label_folder, num_files)

    # The font does not depend on the image, so load it once.
    font = ImageFont.truetype(font_path, font_size)

    # Iterate in the dict's insertion order (natural sort from parse_label_files).
    # Re-sorting the keys lexicographically would put e.g. "gt_10" before "gt_2"
    # and pair labels with the wrong image index.
    for idx, (label_file, boxes) in enumerate(annotations.items()):
        # Generate the corresponding image name starting from "im0001.jpg"
        image_file = f"im{idx + 1:04d}.jpg"
        image_path = os.path.join(image_folder, image_file)

        if not os.path.exists(image_path):
            print(f"Image file not found for {label_file}")
            continue

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports an unreadable file by returning None, not by raising.
            print(f"Could not read image {image_file} for {label_file}")
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Draw with Pillow so the TrueType font can render non-ASCII text.
        pil_image = Image.fromarray(image)
        draw = ImageDraw.Draw(pil_image)

        for coords, text in boxes:
            # Flat [x1, y1, x2, y2, ...] -> [(x1, y1), (x2, y2), ...]
            pts = [(coords[k], coords[k + 1]) for k in range(0, len(coords), 2)]
            draw.polygon(pts, outline="green", width=2)
            draw.text((pts[0][0], pts[0][1] - font_size), text, fill="green", font=font)

        # Convert back to OpenCV's BGR layout for display
        image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

        cv2.imshow(f"Image: {image_file}", image)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

In [8]:
def crop_text_regions(image, coords):
    """Crop the axis-aligned bounding rectangle of a 4-point polygon from an image.

    Args:
        image: Array indexed as ``image[row, column, ...]``.
        coords: Eight values ``x1, y1, ..., x4, y4`` describing the polygon.

    Returns:
        A slice (view) of ``image`` covering the polygon's bounding rectangle,
        clipped to the image. It is empty when the polygon lies entirely
        outside the image.
    """
    pts = np.asarray(coords, dtype=np.int32).reshape((4, 2))
    x_min, y_min = (int(v) for v in pts.min(axis=0))
    x_max, y_max = (int(v) for v in pts.max(axis=0))
    height, width = image.shape[:2]

    # Clamp to the image. A negative start index would otherwise be interpreted
    # by NumPy as "count from the end" and silently yield an empty or wrong crop
    # for boxes that stick out past the top/left border.
    # The "+ 1" makes the extent inclusive of the max vertex, matching the
    # width/height cv2.boundingRect reports for integer points.
    left = min(max(x_min, 0), width)
    top = min(max(y_min, 0), height)
    right = min(max(x_max + 1, 0), width)
    bottom = min(max(y_max + 1, 0), height)
    return image[top:bottom, left:right]

In [9]:
def apply_easyocr(image_folder, label_folder, num_files):
    """Run EasyOCR on every labelled text region and print truth vs. prediction.

    The k-th label file (in the order returned by ``parse_label_files``) is
    paired with ``im{k:04d}.jpg`` in ``image_folder``.

    Args:
        image_folder: Directory containing ``imNNNN.jpg`` images.
        label_folder: Directory containing the label files.
        num_files: Maximum number of label files to process.
    """
    reader = easyocr.Reader(['en', 'vi'])  # Adjust languages as needed
    annotations = parse_label_files(label_folder, num_files)

    for idx, (label_file, annotation) in enumerate(annotations.items()):
        # Generate the correct image name (im0001.jpg, im0002.jpg, ...)
        image_file = f"im{idx + 1:04d}.jpg"
        image_path = os.path.join(image_folder, image_file)

        if not os.path.exists(image_path):
            print(f"Image file not found for {label_file}")
            continue

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports an unreadable file by returning None, not by
            # raising; without this check the crop below fails with a TypeError.
            print(f"Could not read image {image_file} for {label_file}")
            continue

        for coords, true_text in annotation:
            cropped = crop_text_regions(image, coords)
            result = reader.readtext(cropped)
            # Only the text of the first detection in the crop is reported.
            predicted_text = result[0][-2] if result else ""
            print(f"True: {true_text}, Predicted (EasyOCR): {predicted_text}")

In [10]:
def apply_paddleocr(image_folder, label_folder, num_files):
    """Run PaddleOCR (recognition only) on every labelled region and print truth vs. prediction.

    The k-th label file (in the order returned by ``parse_label_files``) is
    paired with ``im{k:04d}.jpg`` in ``image_folder``.

    Args:
        image_folder: Directory containing ``imNNNN.jpg`` images.
        label_folder: Directory containing the label files.
        num_files: Maximum number of label files to process.
    """
    ocr = PaddleOCR(use_angle_cls=True, lang='en')  # Adjust language if needed
    annotations = parse_label_files(label_folder, num_files)

    for idx, (label_file, annotation) in enumerate(annotations.items()):
        # Generate the correct image name (im0001.jpg, im0002.jpg, ...)
        image_file = f"im{idx + 1:04d}.jpg"
        image_path = os.path.join(image_folder, image_file)

        if not os.path.exists(image_path):
            print(f"Image file not found for {label_file}")
            continue

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports an unreadable file by returning None, not by raising.
            print(f"Could not read image {image_file} for {label_file}")
            continue

        for coords, true_text in annotation:
            cropped = crop_text_regions(image, coords)

            # Process the cropped image directly without saving
            result = ocr.ocr(cropped, det=False)

            # With det=False the result holds, per input image, a list of
            # (text, confidence) tuples: [[("text", 0.98)]]. The text is
            # therefore result[0][0][0]; indexing result[0][1] asks for a
            # second recognition entry that does not exist.
            predicted_text = ""
            if result and result[0]:
                first = result[0][0]
                if isinstance(first, (list, tuple)) and first and isinstance(first[0], str):
                    predicted_text = first[0]

            print(f"True: {true_text}, Predicted (PaddleOCR): {predicted_text}")


In [11]:
def apply_tesseractocr(image_folder, label_folder, num_files):
    """Run Tesseract on every labelled text region and print truth vs. prediction.

    The k-th label file (in the order returned by ``parse_label_files``) is
    paired with ``im{k:04d}.jpg`` in ``image_folder``.

    Args:
        image_folder: Directory containing ``imNNNN.jpg`` images.
        label_folder: Directory containing the label files.
        num_files: Maximum number of label files to process.
    """
    annotations = parse_label_files(label_folder, num_files)

    for idx, (label_file, annotation) in enumerate(annotations.items()):
        # Generate the correct image name (im0001.jpg, im0002.jpg, ...)
        image_file = f"im{idx + 1:04d}.jpg"
        image_path = os.path.join(image_folder, image_file)

        if not os.path.exists(image_path):
            print(f"Image file not found for {label_file}")
            continue

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports an unreadable file by returning None, not by raising.
            print(f"Could not read image {image_file} for {label_file}")
            continue

        for coords, true_text in annotation:
            cropped = crop_text_regions(image, coords)
            # OpenCV loads images as BGR, while pytesseract treats NumPy input
            # as RGB, so swap the channel order before recognition.
            cropped_rgb = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)
            predicted_text = pytesseract.image_to_string(cropped_rgb, lang='eng')
            print(f"True: {true_text}, Predicted (TesseractOCR): {predicted_text.strip()}")

In [None]:
# Calculate runtime and metrics.
# The EasyOCR and PaddleOCR engines are built once here, at module level, and
# reused by calculate_metrics; their construction therefore happens outside the
# per-region timing done inside that function.
easyocr_reader = easyocr.Reader(["en", "vi"])  # EasyOCR reader for English + Vietnamese
paddle_ocr = PaddleOCR(use_angle_cls=True, lang="en")  # PaddleOCR, English model, angle classifier enabled
# NOTE(review): EasyOCR is given "vi" but PaddleOCR is configured with lang="en" only —
# confirm this asymmetry is intended before comparing their scores.

def _paddle_rec_text(result):
    """Return the recognised string from a ``paddle_ocr.ocr(...)`` result, or ``""``.

    Handles the recognition-only layout ``[[(text, confidence)]]`` produced by
    ``det=False`` and also tolerates a ``[[[box, (text, confidence)], ...]]``
    layout (the one the previous indexing was written for).
    """
    try:
        first = result[0][0]
    except (IndexError, TypeError):
        return ""
    if not isinstance(first, (list, tuple)) or not first:
        return ""
    if isinstance(first[0], str):
        return first[0]
    if len(first) > 1:
        rec = first[1]
        if isinstance(rec, (list, tuple)) and rec and isinstance(rec[0], str):
            return rec[0]
    return ""


def calculate_metrics(ocr_framework, image_folder, label_folder, num_files, ignore_labels=()):
    """Measure average runtime, CER, WER and exact-match accuracy for one OCR engine.

    The k-th label file (in the order returned by ``parse_label_files``) is
    paired with ``im{k:04d}.jpg``. Every labelled region is cropped, recognised
    and compared against its transcription.

    Args:
        ocr_framework: ``"easyocr"``, ``"paddleocr"`` or ``"tesseract"``.
        image_folder: Directory containing ``imNNNN.jpg`` images.
        label_folder: Directory containing the label files.
        num_files: Maximum number of label files to evaluate.
        ignore_labels: Transcriptions to exclude from scoring. Empty by default,
            so nothing is excluded. NOTE(review): the labels contain many
            ``###`` entries; if that marks unreadable regions in this dataset,
            pass ``("###",)`` — confirm against the dataset description.

    Returns:
        dict with keys ``"Framework"``, ``"Avg Runtime (s)"``, ``"Avg CER (%)"``,
        ``"Avg WER (%)"`` and ``"Exact Match Accuracy (%)"``. All averages are
        taken over the regions that were actually evaluated; they are NaN when
        no region was evaluated.

    Raises:
        ValueError: If ``ocr_framework`` is not one of the supported names.
    """
    if ocr_framework not in ("easyocr", "paddleocr", "tesseract"):
        raise ValueError("Invalid OCR framework")

    annotations = parse_label_files(label_folder, num_files)
    runtimes = []
    cer_list = []
    wer_list = []
    exact_matches = 0

    for idx, (label_file, annotation) in enumerate(annotations.items()):
        image_file = f"im{idx + 1:04d}.jpg"
        image_path = os.path.join(image_folder, image_file)

        if not os.path.exists(image_path):
            print(f"Image file not found: {image_file}")
            continue

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports an unreadable file by returning None, not by raising.
            print(f"Could not read image: {image_file}")
            continue

        for coords, true_text in annotation:
            # jiwer cannot score against an empty reference, so skip such
            # labels together with any explicitly ignored ones.
            if not true_text.strip() or true_text in ignore_labels:
                continue

            cropped = crop_text_regions(image, coords)

            # Measure runtime. perf_counter is monotonic and high-resolution,
            # and nothing but the OCR call itself sits inside the timed span.
            start_time = time.perf_counter()
            if ocr_framework == "easyocr":
                result = easyocr_reader.readtext(cropped)
                predicted_text = result[0][-2] if result else ""
            elif ocr_framework == "paddleocr":
                # Process the cropped image directly without saving
                predicted_text = _paddle_rec_text(paddle_ocr.ocr(cropped, det=False))
            else:  # "tesseract"
                # pytesseract treats NumPy input as RGB; OpenCV crops are BGR.
                cropped_rgb = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)
                predicted_text = pytesseract.image_to_string(cropped_rgb, lang="eng").strip()
            runtimes.append(time.perf_counter() - start_time)

            # Update metrics
            cer_list.append(cer(true_text, predicted_text))
            wer_list.append(wer(true_text, predicted_text))
            if true_text == predicted_text:
                exact_matches += 1

    # Aggregate results over the regions that were actually evaluated, so that
    # boxes belonging to missing/unreadable images do not dilute the accuracy.
    evaluated = len(runtimes)
    if evaluated:
        avg_runtime = np.mean(runtimes)
        avg_cer = np.mean(cer_list) * 100  # Convert to percentage
        avg_wer = np.mean(wer_list) * 100  # Convert to percentage
        exact_match_accuracy = (exact_matches / evaluated) * 100
    else:
        avg_runtime = avg_cer = avg_wer = exact_match_accuracy = float("nan")

    return {
        "Framework": ocr_framework,
        "Avg Runtime (s)": avg_runtime,
        "Avg CER (%)": avg_cer,
        "Avg WER (%)": avg_wer,
        "Exact Match Accuracy (%)": exact_match_accuracy,
    }


[2024/12/24 10:29:34] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/nhduong141103/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/home/nhduong141103/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

In [19]:
# Dataset locations and the font used by the (optional) visualisation.
image_folder = "/home/nhduong141103/VegaCop/OCR/OCR_project/ocr-dataset-20241224/images/"
label_folder = "/home/nhduong141103/VegaCop/OCR/OCR_project/ocr-dataset-20241224/labels/"
font_path = "/usr/share/fonts/truetype/noto/NotoSans-Regular.ttf"

#draw_boxes(image_folder, label_folder, 5, font_path = font_path, font_size = 20)

# Number of files to process
num_files = 5

# Print per-region predictions for each engine, in the order
# EasyOCR -> PaddleOCR -> TesseractOCR.
# Author's status notes: EasyOCR working, PaddleOCR setup error, TesseractOCR working.
for run_ocr in (apply_easyocr, apply_paddleocr, apply_tesseractocr):
    run_ocr(image_folder, label_folder, num_files)

# Calculate metrics; the generator is consumed left to right, so the engines
# are evaluated in the order the names are listed.
results_easyocr, results_paddleocr, results_tesseract = (
    calculate_metrics(framework, image_folder, label_folder, num_files)
    for framework in ("easyocr", "paddleocr", "tesseract")
)

results_df = pd.DataFrame([results_easyocr, results_paddleocr, results_tesseract])
print(results_df)

True: ###, Predicted (EasyOCR): 
True: CHẤT, Predicted (EasyOCR): OH
True: LƯỢNG, Predicted (EasyOCR): WUI
True: TỐT, Predicted (EasyOCR): 
True: ĐỂ, Predicted (EasyOCR): 
True: CÓ, Predicted (EasyOCR): 
True: VIỆC, Predicted (EasyOCR): VEC
True: LÀM, Predicted (EasyOCR): 
True: NĂNG, Predicted (EasyOCR): HiN
True: SUẤT, Predicted (EasyOCR): SV]
True: CAO, Predicted (EasyOCR): 6
True: ĐỂ, Predicted (EasyOCR): 
True: TĂNG, Predicted (EasyOCR): TINA
True: THU, Predicted (EasyOCR): 
True: NHẬP, Predicted (EasyOCR): HKAP
True: ###, Predicted (EasyOCR): 
True: ###, Predicted (EasyOCR): 
True: ###, Predicted (EasyOCR): 
True: ###, Predicted (EasyOCR): 
True: ###, Predicted (EasyOCR): 
True: ###, Predicted (EasyOCR): 
True: ###, Predicted (EasyOCR): 
True: ###, Predicted (EasyOCR): 
True: ###, Predicted (EasyOCR): 
True: ###, Predicted (EasyOCR): 
True: ###, Predicted (EasyOCR): 
True: ###, Predicted (EasyOCR): 
True: ###, Predicted (EasyOCR): 
True: ###, Predicted (EasyOCR): 
True: ###, Pred