In [2]:
import numpy as np
import tensorflow as tf
from paddleocr import PaddleOCR
import cv2

2024-04-30 14:44:49.055659: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Shared OCR engine used by extract_text. The angle classifier is requested
# here (use_angle_cls=True), but extract_text calls ocr.ocr(..., cls=False),
# so it is not applied on that code path.
ocr = PaddleOCR(use_angle_cls=True, lang="ch")

[2024/04/30 14:44:56] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/abdo/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/home/abdo/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_char_d

: 

In [None]:
def extract_text(img):
    """Run OCR on a single image and return its detected text lines.

    Uses the module-level ``ocr`` engine with the angle classifier disabled
    (``cls=False``).

    Args:
        img: Image to recognise; passed unchanged to ``ocr.ocr``.

    Returns:
        The first per-image entry of the OCR result. Callers in this
        notebook index each line as ``line[0]`` (box corner points) and
        ``line[1]`` (``(text, score)``). An empty list is returned when
        nothing was detected, so callers can always iterate the result.
    """
    result = ocr.ocr(img, cls=False)
    # PaddleOCR reports an image with no detected text as a ``None`` entry;
    # normalise that (and an empty result) to an empty list instead of
    # letting callers fail when they iterate over it.
    if not result or result[0] is None:
        return []
    return result[0]


def get_raw_text_from_pages(pages):
    """OCR every page and collect both the raw results and the plain text.

    Args:
        pages: Iterable of page images, each passed to ``extract_text``.

    Returns:
        A ``(results, raw_text)`` tuple where ``results`` is a list of
        ``(page, ocr_lines)`` pairs in input order and ``raw_text`` maps the
        1-based page number to that page's recognised strings joined by
        newlines.
    """
    results, raw_text = [], {}
    for page in pages:
        ocr_lines = extract_text(page)
        results.append((page, ocr_lines))
        # After the append, len(results) is this page's 1-based number.
        raw_text[len(results)] = "\n".join(entry[1][0] for entry in ocr_lines)
    return results, raw_text

def intersection(box_1, box_2):
    """Build the cell where a row band and a column band cross.

    The horizontal extent (indices 0 and 2) is taken from ``box_2`` and the
    vertical extent (indices 1 and 3) from ``box_1``. No overlap test is
    performed; the coordinates are simply recombined.

    Args:
        box_1: ``[x1, y1, x2, y2]`` box supplying the y-range.
        box_2: ``[x1, y1, x2, y2]`` box supplying the x-range.

    Returns:
        A new ``[x1, y1, x2, y2]`` list.
    """
    x_start, x_end = box_2[0], box_2[2]
    y_start, y_end = box_1[1], box_1[3]
    return [x_start, y_start, x_end, y_end]


def iou(box_1, box_2):
    """Intersection-over-union of two axis-aligned ``[x1, y1, x2, y2]`` boxes.

    Args:
        box_1: First box.
        box_2: Second box.

    Returns:
        ``0`` when the boxes do not overlap (touching edges count as no
        overlap), otherwise the overlap area divided by the union area as a
        float.
    """
    left, top = max(box_1[0], box_2[0]), max(box_1[1], box_2[1])
    right, bottom = min(box_1[2], box_2[2]), min(box_1[3], box_2[3])

    # Clamp each side at zero so disjoint boxes give an empty overlap
    # rather than a (possibly positive) product of two negative lengths.
    overlap = max(right - left, 0) * max(bottom - top, 0)
    if overlap == 0:
        return 0

    def _area(box):
        return abs((box[2] - box[0]) * (box[3] - box[1]))

    union = _area(box_1) + _area(box_2) - overlap
    return overlap / float(union)


def get_data(img, output, nms_iou_threshold=0.1, match_iou_threshold=0.1):
    """Regroup the OCR detections of one page into row-wise text lines.

    Every detection is stretched into a full-width "row" band and a
    full-height "column" band. Non-max suppression collapses overlapping
    bands into distinct rows and columns, columns are ordered by their left
    edge, and each row/column crossing collects the text of every detection
    whose box overlaps that cell.

    Args:
        img: Page image; only ``img.shape[0]`` (height) and ``img.shape[1]``
            (width) are read.
        output: OCR lines for the page. Each line is indexed as ``line[0]``
            (box corner points), ``line[1][0]`` (text) and ``line[1][1]``
            (score). Points 0 and 2 of each box are used as opposite
            corners (presumably top-left and bottom-right; confirm against
            the OCR engine's output order).
        nms_iou_threshold: IoU above which two row (or column) bands are
            treated as the same row (or column) during suppression.
        match_iou_threshold: Minimum IoU between a row/column cell and a
            detection box for the detection's text to be added to that row.

    Returns:
        A 1-D numpy array of strings, one per row kept by suppression, in
        ascending order of detection index. Each matched text is prefixed
        with a single space. An empty array is returned when ``output`` is
        empty or ``None``.
    """
    # Nothing was detected on this page: return early, because the
    # suppression call below cannot handle an empty box list.
    if output is None or len(output) == 0:
        return np.array([], dtype=str)

    image_height = img.shape[0]
    image_width = img.shape[1]

    boxes = [line[0] for line in output]
    texts = [line[1][0] for line in output]
    probabilities = [line[1][1] for line in output]

    horiz_boxes = []
    vert_boxes = []

    for box in boxes:
        # Row band: spans the whole image width at the detection's y-range.
        # Column band: spans the whole image height at its x-range.
        x_h, x_v = 0, int(box[0][0])
        y_h, y_v = int(box[0][1]), 0
        width_h, width_v = image_width, int(box[2][0] - box[0][0])
        height_h, height_v = int(box[2][1] - box[0][1]), image_height

        horiz_boxes.append([x_h, y_h, x_h + width_h, y_h + height_h])
        vert_boxes.append([x_v, y_v, x_v + width_v, y_v + height_v])

    # Boxes are passed as [x1, y1, x2, y2]; IoU is unaffected by swapping the
    # axis order consistently, so suppression still behaves as intended.
    horiz_out = tf.image.non_max_suppression(
        horiz_boxes,
        probabilities,
        max_output_size=1000,
        iou_threshold=nms_iou_threshold,
        score_threshold=float("-inf"),
    )
    horiz_lines = np.sort(np.array(horiz_out))

    vert_out = tf.image.non_max_suppression(
        vert_boxes,
        probabilities,
        max_output_size=1000,
        iou_threshold=nms_iou_threshold,
        score_threshold=float("-inf"),
    )
    vert_lines = np.sort(np.array(vert_out))

    # Visit the surviving columns from left to right (by left edge).
    ordered_boxes = np.argsort([vert_boxes[i][0] for i in vert_lines])

    # Corner form of each detection, built once instead of once per cell.
    corner_boxes = [[box[0][0], box[0][1], box[2][0], box[2][1]] for box in boxes]

    out_array = ["" for _ in range(len(horiz_lines))]

    for i in range(len(horiz_lines)):
        for j in range(len(vert_lines)):
            resultant = intersection(
                horiz_boxes[horiz_lines[i]], vert_boxes[vert_lines[ordered_boxes[j]]]
            )

            for the_box, text in zip(corner_boxes, texts):
                if iou(resultant, the_box) > match_iou_threshold:
                    out_array[i] += f" {text}"

    return np.array(out_array)


def get_processed_text_from_pages(results):
    """Turn per-page OCR results into row-structured text.

    Args:
        results: Iterable of ``(image, ocr_lines)`` pairs, as produced by
            ``get_raw_text_from_pages``.

    Returns:
        Dict mapping the 1-based page number to the rows returned by
        ``get_data`` for that page, joined by newlines.
    """
    return {
        page_no: "\n".join(get_data(page_img, ocr_lines))
        for page_no, (page_img, ocr_lines) in enumerate(results, start=1)
    }

In [None]:
filename = "27786.jpg"
img = cv2.imread(filename)
# cv2.imread does not raise on a missing or undecodable file; it returns
# None. Fail here with the offending path rather than deep inside the OCR
# or layout code.
if img is None:
    raise FileNotFoundError(f"Could not read image file: {filename!r}")
pages = [img]

In [None]:
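# Alternative input (disabled): build `pages` from a PDF instead of a single
# image. `convert_from_path` is not imported in this notebook (presumably
# pdf2image's; 500 would then be the render DPI -- confirm before enabling).
# pdf_pages = convert_from_path(filename, 500)
# pages = [np.asarray(page) for page in pdf_pages]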


In [None]:
# Run OCR on every page: `results` holds (page, ocr_lines) pairs and
# `raw_text` maps the 1-based page number to the newline-joined raw text.
results, raw_text = get_raw_text_from_pages(pages)

In [None]:
# Regroup each page's detections into rows; maps the 1-based page number to
# the newline-joined row text produced by get_data.
processed_text = get_processed_text_from_pages(results)