Сравнение готовых решений из интернета

# Загрузка библиотек

In [1]:
import os
import pandas as pd
from pathlib import Path
import pytesseract
import cv2
from PIL import Image
import easyocr
from tqdm import tqdm
import numpy as np
from ultralytics import YOLO
import torch
import torch.nn.functional as F
import onnx
import onnxruntime

In [2]:
path_images = Path('images')
images = [str(path_images / x) for x in os.listdir(path_images)]

# EasyOCR

## Baseline: default EasyOCR

In [3]:
reader = easyocr.Reader(['en'])

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [4]:
%%time
for filename in tqdm(images):
    index = filename.split('\\')[1].split('.')[0]
    # image = cv2.imread(filename) # read image
    # if image.shape[0]<image.shape[1]:
    #     image = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE) # read image
    image = Image.open(filename)
    if image.size[0]>image.size[1]: image = image.rotate(270)
    image = np.array(image)
    results = reader.readtext(image, allowlist ='0123456789')
    # # iterate on all results
    # for res in results:
    #     top_left = (int(res[0][0][0]), int(res[0][0][1])) # convert float to int
    #     bottom_right = (int(res[0][2][0]), int(res[0][2][1])) # convert float to int
    #     cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 3)
    #     cv2.putText(image, res[1], (top_left[0], top_left[1]-10), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 2)
    # cv2.imwrite(f'images/{index}_1.jpg', image)

100%|██████████| 10/10 [02:00<00:00, 12.02s/it]

CPU times: total: 7min 56s
Wall time: 2min





## Terminal OD + EasyOCR

In [5]:
model_terminal = YOLO("../models/terminal_od.pt")

In [6]:
%%time
dict_easyocr = {}
for filename in tqdm(images):
    image = Image.open(filename)
    if image.size[0]>image.size[1]: image = image.rotate(270)
    box_terminal = model_terminal(image, verbose=False)
    x1, y1, x2, y2 = box_terminal[0].boxes.xyxy[0].numpy()
    image = image.crop((x1, y1, x2, y2))
    image = np.array(image)
    results = reader.readtext(image, allowlist='0123456789')
    dict_easyocr[filename] = results

100%|██████████| 10/10 [00:43<00:00,  4.39s/it]

CPU times: total: 1min 59s
Wall time: 43.9 s





In [7]:
(60+13) / 10

7.3

## Terminal OD + EasyOCR ONNX

https://github.com/JaidedAI/EasyOCR/issues/746

https://colab.research.google.com/drive/1pcoueUxhWFX5Ac6AA4paYDLgZMf819GT?usp=sharing#scrollTo=CvV_DDmohmBy

https://github.com/JaidedAI/EasyOCR/blob/ca9f9b0ac081f2874a603a5614ddaf9de40ac339/easyocr/config.py

In [8]:
path_onnx_files = Path('onnx_files')
if not os.path.exists(path_onnx_files): os.makedirs(path_onnx_files)

In [9]:
from easyocr_onnx import detection
import torch

model = detection.get_detector(trained_model='onnx_files/craft_mlt_25k.pth', device='cpu', quantize=False)

# input_shape = (1, 3, 480, 640)
# inputs = torch.ones(*input_shape)
# input_names=['input']
# output_names=['output']

# dynamic_axes= {'input':{0:'batch_size', 2:'height', 3:'width'}, 'output':{0:'batch_size', 2:'height', 3:'width'}} #adding names for better debugging
# torch.onnx.export(model, inputs, "craft.onnx", dynamic_axes=dynamic_axes, input_names=input_names, output_names=output_names)

In [10]:
import onnxruntime as rt
import cv2
import numpy as np
from easyocr_onnx.craft_utils import getDetBoxes, adjustResultCoordinates
from easyocr_onnx.imgproc import resize_aspect_ratio, normalizeMeanVariance
from easyocr_onnx.utils import reformat_input, get_image_list

# Сокращенный код

In [11]:
def read_image(image):
    if type(image) == str:
        img = cv2.imread(image)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    elif type(image) == bytes:
        nparr = np.frombuffer(image, np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    elif type(image) == np.ndarray:
        if len(image.shape) == 2:  # grayscale
            img = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        elif len(image.shape) == 3 and image.shape[2] == 3:
            img = image
        elif len(image.shape) == 3 and image.shape[2] == 4:  # RGBAscale
            img = image[:, :, :3]
    return img


def rectify_poly(img, poly):
    # Use Affine transform
    n = int(len(poly) / 2) - 1
    width = 0
    height = 0
    for k in range(n):
        box = np.float32([poly[k], poly[k + 1], poly[-k - 2], poly[-k - 1]])
        width += int(
            (np.linalg.norm(box[0] - box[1]) + np.linalg.norm(box[2] - box[3])) / 2
        )
        height += np.linalg.norm(box[1] - box[2])
    width = int(width)
    height = int(height / n)

    output_img = np.zeros((height, width, 3), dtype=np.uint8)
    width_step = 0
    for k in range(n):
        box = np.float32([poly[k], poly[k + 1], poly[-k - 2], poly[-k - 1]])
        w = int((np.linalg.norm(box[0] - box[1]) + np.linalg.norm(box[2] - box[3])) / 2)

        # Top triangle
        pts1 = box[:3]
        pts2 = np.float32(
            [[width_step, 0], [width_step + w - 1, 0], [width_step + w - 1, height - 1]]
        )
        M = cv2.getAffineTransform(pts1, pts2)
        warped_img = cv2.warpAffine(
            img, M, (width, height), borderMode=cv2.BORDER_REPLICATE)
        warped_mask = np.zeros((height, width, 3), dtype=np.uint8)
        warped_mask = cv2.fillConvexPoly(warped_mask, np.int32(pts2), (1, 1, 1))
        output_img[warped_mask == 1] = warped_img[warped_mask == 1]

        # Bottom triangle
        pts1 = np.vstack((box[0], box[2:]))
        pts2 = np.float32(
            [
                [width_step, 0],
                [width_step + w - 1, height - 1],
                [width_step, height - 1],
            ]
        )
        M = cv2.getAffineTransform(pts1, pts2)
        warped_img = cv2.warpAffine(
            img, M, (width, height), borderMode=cv2.BORDER_REPLICATE
        )
        warped_mask = np.zeros((height, width, 3), dtype=np.uint8)
        warped_mask = cv2.fillConvexPoly(warped_mask, np.int32(pts2), (1, 1, 1))
        cv2.line(
            warped_mask, (width_step, 0), (width_step + w - 1, height - 1), (0, 0, 0), 1
        )
        output_img[warped_mask == 1] = warped_img[warped_mask == 1]

        width_step += w
    return output_img


def crop_poly(image, poly):
    # points should have 1*x*2  shape
    if len(poly.shape) == 2:
        poly = np.array([np.array(poly).astype(np.int32)])

    # create mask with shape of image
    mask = np.zeros(image.shape[0:2], dtype=np.uint8)

    # method 1 smooth region
    cv2.drawContours(mask, [poly], -1, (255, 255, 255), -1, cv2.LINE_AA)
    # method 2 not so smooth region
    # cv2.fillPoly(mask, points, (255))

    # crop around poly
    res = cv2.bitwise_and(image, image, mask=mask)
    rect = cv2.boundingRect(poly)  # returns (x,y,w,h) of the rect
    cropped = res[rect[1] : rect[1] + rect[3], rect[0] : rect[0] + rect[2]]
    return cropped


def export_detected_region(image, poly, rectify=True):
    """
    Arguments:
        image: full image
        points: bbox or poly points        
        rectify: rectify detected polygon by affine transform
    """
    if rectify:
        # rectify poly region
        result_rgb = rectify_poly(image, poly)
    else:
        result_rgb = crop_poly(image, poly)
    # export corpped region
    result_bgr = cv2.cvtColor(result_rgb, cv2.COLOR_RGB2BGR)
    return result_bgr


import copy

def export_detected_regions(
    image,
    regions,    
    rectify: bool = False,
):
    """
    Arguments:
        image: path to the image to be processed or numpy array or PIL image
        regions: list of bboxes or polys        
        rectify: rectify detected polygon by affine transform
    """
    # read/convert image
    image = read_image(image)
    # deepcopy image so that original is not altered
    image = copy.deepcopy(image)
    # init exported file paths
    exported_images = []
    # export regions
    for ind, region in enumerate(regions):
        # get export path
        #file_path = os.path.join(crops_dir, "crop_" + str(ind) + ".png")
        # export region
        crop = export_detected_region(image, poly=region, rectify=rectify)
        # note exported file path
        exported_images.append(crop)
    return exported_images


import torch.nn.functional as F
from easyocr_onnx import recognition

def recognizer_predict(converter, test_loader, batch_max_length,\
                       ignore_idx, char_group_idx, decoder = 'greedy', beamWidth= 5, device = 'cpu'):    
    result = []
    with torch.no_grad():
        for image_tensors in test_loader:
            batch_size = image_tensors.size(0)
            image = image_tensors.to(device)
            # For max length prediction
            length_for_pred = torch.IntTensor([batch_max_length] * batch_size).to(device)
            text_for_pred = torch.LongTensor(batch_size, batch_max_length + 1).fill_(0).to(device)

            #preds = model(image, text_for_pred)

            providers = ['CPUExecutionProvider']
            session = rt.InferenceSession("onnx_files/recog.onnx", providers=providers)
            inputs = session.get_inputs()

            inp = {inputs[0].name: image.numpy()}
            preds = session.run(None, inp)

            preds = torch.from_numpy(preds[0])

            # Select max probabilty (greedy decoding) then decode index to character
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)

            ######## filter ignore_char, rebalance
            preds_prob = F.softmax(preds, dim=2)
            preds_prob = preds_prob.cpu().detach().numpy()
            preds_prob[:,:,ignore_idx] = 0.
            pred_norm = preds_prob.sum(axis=2)
            preds_prob = preds_prob/np.expand_dims(pred_norm, axis=-1)
            preds_prob = torch.from_numpy(preds_prob).float().to(device)

            if decoder=='greedy':
                # Select max probabilty (greedy decoding) then decode index to character
                _, preds_index = preds_prob.max(2)
                preds_index = preds_index.view(-1)
                preds_str = converter.decode_greedy(preds_index.data.cpu().detach().numpy(), preds_size.data)
            elif decoder == 'beamsearch':
                k = preds_prob.cpu().detach().numpy()
                preds_str = converter.decode_beamsearch(k, beamWidth=beamWidth)
            elif decoder == 'wordbeamsearch':
                k = preds_prob.cpu().detach().numpy()
                preds_str = converter.decode_wordbeamsearch(k, beamWidth=beamWidth)

            preds_prob = preds_prob.cpu().detach().numpy()
            values = preds_prob.max(axis=2)
            indices = preds_prob.argmax(axis=2)
            preds_max_prob = []
            for v, i in zip(values, indices):
                max_probs = v[i!=0]
                if len(max_probs)>0:
                    preds_max_prob.append(max_probs)
                else:
                    preds_max_prob.append(np.array([0]))

            for pred, pred_max_prob in zip(preds_str, preds_max_prob):
                confidence_score = recognition.custom_mean(pred_max_prob)
                result.append([pred, confidence_score])
    return result

def get_text(character, imgH, imgW, converter, image_list,\
             ignore_char = '',decoder = 'greedy', beamWidth =5, batch_size=1, contrast_ths=0.1,\
             adjust_contrast=0.5, filter_ths = 0.003, workers = 1, device = 'cpu'):
    batch_max_length = int(imgW/10)

    char_group_idx = {}
    ignore_idx = []
    for char in ignore_char:
        try: ignore_idx.append(character.index(char)+1)
        except: pass

    coord = [item[0] for item in image_list]
    img_list = [item[1] for item in image_list]
    AlignCollate_normal = recognition.AlignCollate(imgH=imgH, imgW=imgW, keep_ratio_with_pad=True)
    test_data = recognition.ListDataset(img_list)
    test_loader = torch.utils.data.DataLoader(
        test_data, batch_size=batch_size, shuffle=False,
        num_workers=int(workers), collate_fn=AlignCollate_normal, pin_memory=True)

    # predict first round
    result1 = recognizer_predict(converter, test_loader,batch_max_length,\
                                 ignore_idx, char_group_idx, decoder, beamWidth, device = device)

    # predict second round
    low_confident_idx = [i for i,item in enumerate(result1) if (item[1] < contrast_ths)]
    if len(low_confident_idx) > 0:
        img_list2 = [img_list[i] for i in low_confident_idx]
        AlignCollate_contrast = recognition.AlignCollate(imgH=imgH, imgW=imgW, keep_ratio_with_pad=True, adjust_contrast=adjust_contrast)
        test_data = recognition.ListDataset(img_list2)
        test_loader = torch.utils.data.DataLoader(
                        test_data, batch_size=batch_size, shuffle=False,
                        num_workers=int(workers), collate_fn=AlignCollate_contrast, pin_memory=True)
        result2 = recognizer_predict(converter, test_loader, batch_max_length,\
                                     ignore_idx, char_group_idx, decoder, beamWidth, device = device)

    result = []
    for i, zipped in enumerate(zip(coord, result1)):
        box, pred1 = zipped
        if i in low_confident_idx:
            pred2 = result2[low_confident_idx.index(i)]
            if pred1[1]>pred2[1]:
                result.append( (box, pred1[0], pred1[1]) )
            else:
                result.append( (box, pred2[0], pred2[1]) )
        else:
            result.append( (box, pred1[0], pred1[1]) )
    return result

In [12]:
# character = "0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ €ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
# symbol = "0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ €"
# recog_network = 'generation2'
# model_path = "onnx_files/english_g2.pth"
# separator_list = {}
# cyrillic_lang_list = ['en']
# package_dir = os.path.dirname(recognition.__file__)
# network_params = {
#     'input_channel': 1,
#     'output_channel': 256,
#     'hidden_size': 256
#     }
# dict_list = {}
# for lang in cyrillic_lang_list:
#     dict_list[lang] = os.path.join(package_dir, 'dict', lang + ".txt")

# model, converter = recognition.get_recognizer(recog_network=recog_network, network_params=network_params, character=character, separator_list=separator_list, dict_list=dict_list, model_path=model_path, device='cpu', quantize=False)

In [13]:
# batch_size = 1
# num_channels = 1
# image_height = imgH = 64
# image_width = 128
# image_input_shape = (batch_size, 1, image_height, image_width)
# image_input = torch.ones(*image_input_shape)
# max_text_length = 10
# text_input_shape = (batch_size, max_text_length)
# text_input = torch.ones(*text_input_shape)
# input_names=['image_input', 'text_input']
# output_names=['output']
# dynamic_axes = {"image_input": {0: "batch_size", 3: "width"}, "text_input": {0: "batch_size"}}
# opset_version = 12

# torch.onnx.export(model, (image_input, text_input), "onnx_files/recog.onnx", 
#                   input_names=input_names, output_names=output_names, 
#                   dynamic_axes=dynamic_axes, opset_version=opset_version)

In [14]:
character = '0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ €₽ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюяЂђЃѓЄєІіЇїЈјЉљЊњЋћЌќЎўЏџҐґҒғҚқҮүҲҳҶҷӀӏӢӣӨөӮӯ'
symbol = '0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ €₽'
model_path = "cyrillic_g2.pth"
separator_list = {}
cyrillic_lang_list = ['ru','rs_cyrillic','be','bg','uk','mn','abq','ady','kbd',\
                      'ava','dar','inh','che','lbe','lez','tab','tjk', 'en']
package_dir = os.path.dirname(recognition.__file__)

dict_list = {}
for lang in cyrillic_lang_list:
    dict_list[lang] = os.path.join(package_dir, 'dict', lang + ".txt")

In [15]:
converter = recognition.CTCLabelConverter(character, separator_list, dict_list)

In [16]:
# Create ONNX Runtime session and load model
providers = ['CPUExecutionProvider']
session = rt.InferenceSession("onnx_files/craft.onnx", providers=providers)
input_name = session.get_inputs()[0].name

batch_size = 1
num_channels = 1
image_height = imgH = 64
device = 'cpu'

In [17]:
def get_easyocr_results(filename):
    image = Image.open(filename)
    if image.size[0]>image.size[1]: image = image.rotate(270)
    box_terminal = model_terminal(image, verbose=False)
    x1, y1, x2, y2 = box_terminal[0].boxes.xyxy[0].numpy()
    image = image.crop((x1, y1, x2, y2))
    img, _ = reformat_input(np.array(image))

    # Resize and normalize input image
    img_resized, target_ratio, size_heatmap = resize_aspect_ratio(img, 512, interpolation=cv2.INTER_LINEAR, mag_ratio=1.)
    ratio_h = ratio_w = 1 / target_ratio
    x = normalizeMeanVariance(img_resized)
    x = torch.from_numpy(x).permute(2, 0, 1).unsqueeze(0)
    # Prepare input tensor for inference
    inp = {input_name: x.numpy()}
    # Run inference and get output
    y, _ = session.run(None, inp)
    # Extract score and link maps
    score_text = y[0, :, :, 0]
    score_link = y[0, :, :, 1]
    # Post-processing to obtain bounding boxes and polygons
    boxes, _, _ = getDetBoxes(score_text, score_link, 0.5, 0.4, 0.4)
    boxes = adjustResultCoordinates(boxes, ratio_w, ratio_h)

    result = []
    lang_char = symbol[:10]
    ignore_char = ''.join(set(character)-set(lang_char))  

    crops = export_detected_regions(image=img, regions=boxes, rectify=True)
    for crop in crops:
        img, img_cv_grey = reformat_input(crop)
        y_max, x_max = img_cv_grey.shape
        horizontal_list = [[0, x_max, 0, y_max]]
        for bbox in horizontal_list:
            h_list = [bbox]
            f_list = []
            image_list, max_width = get_image_list(h_list, f_list, img_cv_grey, model_height=64) # 64 is default value
            # result0 = get_text(character, imgH, int(max_width), converter, image_list,\
            #                             ignore_char, 'greedy', beamWidth=5, batch_size=batch_size, contrast_ths=0.1, adjust_contrast=0.5, filter_ths=0.003,\
            #                             workers=0, device=device)
            result0 = get_text(character, imgH, int(max_width), converter, image_list,\
                                        ignore_char, 'greedy', batch_size=batch_size, \
                                        workers=0, device=device)
            result += result0
    return result

In [18]:
dict_results_onnx = {} 
for filename in tqdm(images):
    dict_results_onnx[filename] = get_easyocr_results(filename)

100%|██████████| 10/10 [00:18<00:00,  1.89s/it]


In [19]:
lists_onnx = []
for row in list(dict_results_onnx.values()):
    lists_onnx.append([x[1] for x in row])

In [20]:
lists_easyocr = []
for row in list(dict_easyocr.values()):
    lists_easyocr.append([x[1] for x in row])

In [21]:
import easyocr_onnx
easyocr_onnx.__file__

'C:\\Users\\dimaz\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\easyocr_onnx\\__init__.py'

In [22]:
df_onnx = pd.DataFrame({
    'filename': list(dict_results_onnx.keys()),
    'ocr_rows_onnx': lists_onnx})
df_easyocr = pd.DataFrame({
    'filename': list(dict_easyocr.keys()),
    'ocr_rows_easyocr': lists_easyocr
})

In [23]:
df_result = df_onnx.merge(df_easyocr, left_on='filename', right_on='filename')

In [24]:
correct_tids = ['30398560', '30395549',
 '30374544', '30370954',
 '30321657', '30372393',
 '30398967', '30370910',
 '30398363', '30409949'
 ]

In [25]:
onnx_flags = []
for i, tid in enumerate(correct_tids):
    onnx_flags.append(tid in df_result['ocr_rows_onnx'][i])

easyocr_flags = []
for i, tid in enumerate(correct_tids):
    easyocr_flags.append(tid in df_result['ocr_rows_easyocr'][i])

In [26]:
onnx_flags

[True, True, True, True, True, True, False, True, False, False]

In [27]:
easyocr_flags

[True, True, True, True, True, True, True, True, False, True]

In [28]:
sum(onnx_flags)

7

In [29]:
sum(easyocr_flags)

9

In [30]:
# filename = 'images/8.jpg'
# image = Image.open(filename)
# if image.size[0]>image.size[1]: image = image.rotate(270)
# box_terminal = model_terminal(image, verbose=False)
# x1, y1, x2, y2 = box_terminal[0].boxes.xyxy[0].numpy()
# image = image.crop((x1, y1, x2, y2))
# img, _ = reformat_input(np.array(image))

# # Resize and normalize input image
# img_resized, target_ratio, size_heatmap = resize_aspect_ratio(img, 512, interpolation=cv2.INTER_LINEAR, mag_ratio=1.)
# ratio_h = ratio_w = 1 / target_ratio
# x = normalizeMeanVariance(img_resized)
# x = torch.from_numpy(x).permute(2, 0, 1).unsqueeze(0)
# # Prepare input tensor for inference
# inp = {input_name: x.numpy()}
# # Run inference and get output
# y, _ = session.run(None, inp)
# # Extract score and link maps
# score_text = y[0, :, :, 0]
# score_link = y[0, :, :, 1]
# # Post-processing to obtain bounding boxes and polygons
# boxes, polys, mapper = getDetBoxes(score_text, score_link, 0.5, 0.4, 0.4)
# boxes = adjustResultCoordinates(boxes, ratio_w, ratio_h)

# result = []
# lang_char = symbol[:10]
# ignore_char = ''.join(set(character)-set(lang_char))  

# crops = export_detected_regions(image=img, regions=boxes, rectify=True)
# for crop in crops:
#     img, img_cv_grey = reformat_input(crop)
#     y_max, x_max = img_cv_grey.shape
#     horizontal_list = [[0, x_max, 0, y_max]]
#     for bbox in horizontal_list:
#         h_list = [bbox]
#         f_list = []
#         image_list, max_width = get_image_list(h_list, f_list, img_cv_grey, model_height=64) # 64 is default value
#         result0 = get_text(character, imgH, int(max_width), converter, image_list,\
#                                     ignore_char, 'greedy', beamWidth=5, batch_size=batch_size, contrast_ths=0.1, adjust_contrast=0.5, filter_ths=0.003,\
#                                     workers=0, device=device)
#         result += result0

In [31]:
# char = []
# for i in range(len(result)):
#     char.append(result[i][1])
# char

# Рабочий ONNX скрипт

In [32]:
# # im = Image.open('images/10.jpg') #.rotate(270)
# filename = 'images/1.jpg'
# image = Image.open(filename)
# if image.size[0]>image.size[1]: image = image.rotate(270)
# box_terminal = model_terminal(image, verbose=False)
# x1, y1, x2, y2 = box_terminal[0].boxes.xyxy[0].numpy()
# image = image.crop((x1, y1, x2, y2))
# img, _ = reformat_input(np.array(image))

In [33]:
# # Resize and normalize input image
# img_resized, target_ratio, size_heatmap = resize_aspect_ratio(img, 512, interpolation=cv2.INTER_LINEAR, mag_ratio=1.)
# ratio_h = ratio_w = 1 / target_ratio
# x = normalizeMeanVariance(img_resized)
# x = torch.from_numpy(x).permute(2, 0, 1).unsqueeze(0)

# # Create ONNX Runtime session and load model
# providers = ['CPUExecutionProvider']
# session = rt.InferenceSession("onnx_files/craft.onnx", providers=providers)
# input_name = session.get_inputs()[0].name

# # Prepare input tensor for inference
# inp = {input_name: x.numpy()}

# # Run inference and get output
# y, _ = session.run(None, inp)

# # Extract score and link maps
# score_text = y[0, :, :, 0]
# score_link = y[0, :, :, 1]

# # Post-processing to obtain bounding boxes and polygons
# boxes, polys, mapper = getDetBoxes(score_text, score_link, 0.5, 0.4, 0.4)
# boxes = adjustResultCoordinates(boxes, ratio_w, ratio_h)
# # polys = adjustResultCoordinates(polys, ratio_w, ratio_h)

In [34]:
# def read_image(image):
#     if type(image) == str:
#         img = cv2.imread(image)
#         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

#     elif type(image) == bytes:
#         nparr = np.frombuffer(image, np.uint8)
#         img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
#         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

#     elif type(image) == np.ndarray:
#         if len(image.shape) == 2:  # grayscale
#             img = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
#         elif len(image.shape) == 3 and image.shape[2] == 3:
#             img = image
#         elif len(image.shape) == 3 and image.shape[2] == 4:  # RGBAscale
#             img = image[:, :, :3]
#     return img


# def rectify_poly(img, poly):
#     # Use Affine transform
#     n = int(len(poly) / 2) - 1
#     width = 0
#     height = 0
#     for k in range(n):
#         box = np.float32([poly[k], poly[k + 1], poly[-k - 2], poly[-k - 1]])
#         width += int(
#             (np.linalg.norm(box[0] - box[1]) + np.linalg.norm(box[2] - box[3])) / 2
#         )
#         height += np.linalg.norm(box[1] - box[2])
#     width = int(width)
#     height = int(height / n)

#     output_img = np.zeros((height, width, 3), dtype=np.uint8)
#     width_step = 0
#     for k in range(n):
#         box = np.float32([poly[k], poly[k + 1], poly[-k - 2], poly[-k - 1]])
#         w = int((np.linalg.norm(box[0] - box[1]) + np.linalg.norm(box[2] - box[3])) / 2)

#         # Top triangle
#         pts1 = box[:3]
#         pts2 = np.float32(
#             [[width_step, 0], [width_step + w - 1, 0], [width_step + w - 1, height - 1]]
#         )
#         M = cv2.getAffineTransform(pts1, pts2)
#         warped_img = cv2.warpAffine(
#             img, M, (width, height), borderMode=cv2.BORDER_REPLICATE
#         )
#         warped_mask = np.zeros((height, width, 3), dtype=np.uint8)
#         warped_mask = cv2.fillConvexPoly(warped_mask, np.int32(pts2), (1, 1, 1))
#         output_img[warped_mask == 1] = warped_img[warped_mask == 1]

#         # Bottom triangle
#         pts1 = np.vstack((box[0], box[2:]))
#         pts2 = np.float32(
#             [
#                 [width_step, 0],
#                 [width_step + w - 1, height - 1],
#                 [width_step, height - 1],
#             ]
#         )
#         M = cv2.getAffineTransform(pts1, pts2)
#         warped_img = cv2.warpAffine(
#             img, M, (width, height), borderMode=cv2.BORDER_REPLICATE
#         )
#         warped_mask = np.zeros((height, width, 3), dtype=np.uint8)
#         warped_mask = cv2.fillConvexPoly(warped_mask, np.int32(pts2), (1, 1, 1))
#         cv2.line(
#             warped_mask, (width_step, 0), (width_step + w - 1, height - 1), (0, 0, 0), 1
#         )
#         output_img[warped_mask == 1] = warped_img[warped_mask == 1]

#         width_step += w
#     return output_img


# def crop_poly(image, poly):
#     # points should have 1*x*2  shape
#     if len(poly.shape) == 2:
#         poly = np.array([np.array(poly).astype(np.int32)])

#     # create mask with shape of image
#     mask = np.zeros(image.shape[0:2], dtype=np.uint8)

#     # method 1 smooth region
#     cv2.drawContours(mask, [poly], -1, (255, 255, 255), -1, cv2.LINE_AA)
#     # method 2 not so smooth region
#     # cv2.fillPoly(mask, points, (255))

#     # crop around poly
#     res = cv2.bitwise_and(image, image, mask=mask)
#     rect = cv2.boundingRect(poly)  # returns (x,y,w,h) of the rect
#     cropped = res[rect[1] : rect[1] + rect[3], rect[0] : rect[0] + rect[2]]
#     return cropped


# def export_detected_region(image, poly, rectify=True):
#     """
#     Arguments:
#         image: full image
#         points: bbox or poly points        
#         rectify: rectify detected polygon by affine transform
#     """
#     if rectify:
#         # rectify poly region
#         result_rgb = rectify_poly(image, poly)
#     else:
#         result_rgb = crop_poly(image, poly)
#     # export corpped region
#     result_bgr = cv2.cvtColor(result_rgb, cv2.COLOR_RGB2BGR)
#     return result_bgr


# import copy

# def export_detected_regions(
#     image,
#     regions,    
#     rectify: bool = False,
# ):
#     """
#     Arguments:
#         image: path to the image to be processed or numpy array or PIL image
#         regions: list of bboxes or polys        
#         rectify: rectify detected polygon by affine transform
#     """
#     # read/convert image
#     image = read_image(image)
#     # deepcopy image so that original is not altered
#     image = copy.deepcopy(image)
#     # init exported file paths
#     exported_images = []
#     # export regions
#     for ind, region in enumerate(regions):
#         # get export path
#         #file_path = os.path.join(crops_dir, "crop_" + str(ind) + ".png")
#         # export region
#         crop = export_detected_region(image, poly=region, rectify=rectify)
#         # note exported file path
#         exported_images.append(crop)
#     return exported_images

In [35]:
# crops = export_detected_regions(
#     image=img,
#     regions=boxes,
#     rectify=True)

In [36]:
# import torch.nn.functional as F
# from easyocr_onnx import recognition

# def recognizer_predict(converter, test_loader, batch_max_length,\
#                        ignore_idx, char_group_idx, decoder = 'greedy', beamWidth= 5, device = 'cpu'):    
#     result = []
#     with torch.no_grad():
#         for image_tensors in test_loader:
#             batch_size = image_tensors.size(0)
#             image = image_tensors.to(device)
#             # For max length prediction
#             length_for_pred = torch.IntTensor([batch_max_length] * batch_size).to(device)
#             text_for_pred = torch.LongTensor(batch_size, batch_max_length + 1).fill_(0).to(device)

#             #preds = model(image, text_for_pred)

#             providers = ['CPUExecutionProvider']
#             session = rt.InferenceSession("onnx_files/recog.onnx", providers=providers)
#             inputs = session.get_inputs()

#             inp = {inputs[0].name: image.numpy()}
#             preds = session.run(None, inp)

#             preds = torch.from_numpy(preds[0])

#             # Select max probabilty (greedy decoding) then decode index to character
#             preds_size = torch.IntTensor([preds.size(1)] * batch_size)

#             ######## filter ignore_char, rebalance
#             preds_prob = F.softmax(preds, dim=2)
#             preds_prob = preds_prob.cpu().detach().numpy()
#             preds_prob[:,:,ignore_idx] = 0.
#             pred_norm = preds_prob.sum(axis=2)
#             preds_prob = preds_prob/np.expand_dims(pred_norm, axis=-1)
#             preds_prob = torch.from_numpy(preds_prob).float().to(device)

#             if decoder=='greedy':
#                 # Select max probabilty (greedy decoding) then decode index to character
#                 _, preds_index = preds_prob.max(2)
#                 preds_index = preds_index.view(-1)
#                 preds_str = converter.decode_greedy(preds_index.data.cpu().detach().numpy(), preds_size.data)
#             elif decoder == 'beamsearch':
#                 k = preds_prob.cpu().detach().numpy()
#                 preds_str = converter.decode_beamsearch(k, beamWidth=beamWidth)
#             elif decoder == 'wordbeamsearch':
#                 k = preds_prob.cpu().detach().numpy()
#                 preds_str = converter.decode_wordbeamsearch(k, beamWidth=beamWidth)

#             preds_prob = preds_prob.cpu().detach().numpy()
#             values = preds_prob.max(axis=2)
#             indices = preds_prob.argmax(axis=2)
#             preds_max_prob = []
#             for v, i in zip(values, indices):
#                 max_probs = v[i!=0]
#                 if len(max_probs)>0:
#                     preds_max_prob.append(max_probs)
#                 else:
#                     preds_max_prob.append(np.array([0]))

#             for pred, pred_max_prob in zip(preds_str, preds_max_prob):
#                 confidence_score = recognition.custom_mean(pred_max_prob)
#                 result.append([pred, confidence_score])
#     return result

# def get_text(character, imgH, imgW, converter, image_list,\
#              ignore_char = '',decoder = 'greedy', beamWidth =5, batch_size=1, contrast_ths=0.1,\
#              adjust_contrast=0.5, filter_ths = 0.003, workers = 1, device = 'cpu'):
#     batch_max_length = int(imgW/10)

#     char_group_idx = {}
#     ignore_idx = []
#     for char in ignore_char:
#         try: ignore_idx.append(character.index(char)+1)
#         except: pass

#     coord = [item[0] for item in image_list]
#     img_list = [item[1] for item in image_list]
#     AlignCollate_normal = recognition.AlignCollate(imgH=imgH, imgW=imgW, keep_ratio_with_pad=True)
#     test_data = recognition.ListDataset(img_list)
#     test_loader = torch.utils.data.DataLoader(
#         test_data, batch_size=batch_size, shuffle=False,
#         num_workers=int(workers), collate_fn=AlignCollate_normal, pin_memory=True)

#     # predict first round
#     result1 = recognizer_predict(converter, test_loader,batch_max_length,\
#                                  ignore_idx, char_group_idx, decoder, beamWidth, device = device)

#     # predict second round
#     low_confident_idx = [i for i,item in enumerate(result1) if (item[1] < contrast_ths)]
#     if len(low_confident_idx) > 0:
#         img_list2 = [img_list[i] for i in low_confident_idx]
#         AlignCollate_contrast = recognition.AlignCollate(imgH=imgH, imgW=imgW, keep_ratio_with_pad=True, adjust_contrast=adjust_contrast)
#         test_data = recognition.ListDataset(img_list2)
#         test_loader = torch.utils.data.DataLoader(
#                         test_data, batch_size=batch_size, shuffle=False,
#                         num_workers=int(workers), collate_fn=AlignCollate_contrast, pin_memory=True)
#         result2 = recognizer_predict(converter, test_loader, batch_max_length,\
#                                      ignore_idx, char_group_idx, decoder, beamWidth, device = device)

#     result = []
#     for i, zipped in enumerate(zip(coord, result1)):
#         box, pred1 = zipped
#         if i in low_confident_idx:
#             pred2 = result2[low_confident_idx.index(i)]
#             if pred1[1]>pred2[1]:
#                 result.append( (box, pred1[0], pred1[1]) )
#             else:
#                 result.append( (box, pred2[0], pred2[1]) )
#         else:
#             result.append( (box, pred1[0], pred1[1]) )
#     return result

In [37]:
# character = '0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ €₽ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюяЂђЃѓЄєІіЇїЈјЉљЊњЋћЌќЎўЏџҐґҒғҚқҮүҲҳҶҷӀӏӢӣӨөӮӯ'
# symbol = '0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ €₽'
# model_path = "cyrillic_g2.pth"
# separator_list = {}
# cyrillic_lang_list = ['ru','rs_cyrillic','be','bg','uk','mn','abq','ady','kbd',\
#                       'ava','dar','inh','che','lbe','lez','tab','tjk', 'en']
# package_dir = os.path.dirname(recognition.__file__)

# dict_list = {}
# for lang in cyrillic_lang_list:
#     dict_list[lang] = os.path.join(package_dir, 'dict', lang + ".txt")

In [38]:
# from easyocr_onnx.utils import reformat_input, get_image_list


# batch_size = 1
# num_channels = 1
# image_height = imgH = 64

# result = []
# # read image
# converter = recognition.CTCLabelConverter(character, separator_list, dict_list)
# device = 'cpu'

# lang_char = []
# for lang in cyrillic_lang_list:
#   char_file = os.path.join(package_dir, 'character', lang + "_char.txt")
#   with open(char_file, "r", encoding = "utf-8-sig") as input_file:
#     char_list =  input_file.read().splitlines()
#   lang_char += char_list
# lang_char = set(lang_char).union(set(symbol))
# # lang_char = set(lang_char).union(set(symbol[:10]))
# lang_char = symbol[:10]
# ignore_char = ''.join(set(character)-set(lang_char))  

# for crop in crops:
#   img, img_cv_grey = reformat_input(crop)
#   y_max, x_max = img_cv_grey.shape
#   horizontal_list = [[0, x_max, 0, y_max]]
#   for bbox in horizontal_list:
#       h_list = [bbox]
#       f_list = []
#       image_list, max_width = get_image_list(h_list, f_list, img_cv_grey, model_height=64) # 64 is default value
#       result0 = get_text(character, imgH, int(max_width), converter, image_list,\
#                                 ignore_char, 'greedy', beamWidth = 5, batch_size=batch_size, contrast_ths = 0.1, adjust_contrast = 0.5, filter_ths = 0.003,\
#                                 workers = 0, device = device)
#       result += result0

In [39]:
# char = []
# for i in range(len(result)):
#     char.append(result[i][1])
# char

# Terminal OD + ONNX en_model

In [55]:
model_terminal = YOLO("../models/terminal_od.pt")

In [63]:
import cv2
import numpy as np
import onnxruntime as rt
from easyocr_onnx.craft_utils import getDetBoxes, adjustResultCoordinates
from easyocr_onnx.imgproc import resize_aspect_ratio, normalizeMeanVariance
from easyocr_onnx.utils import reformat_input, get_image_list
from easyocr_onnx import recognition

def read_image(image):
    if type(image) == str:
        img = cv2.imread(image)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    elif type(image) == bytes:
        nparr = np.frombuffer(image, np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    elif type(image) == np.ndarray:
        if len(image.shape) == 2:  # grayscale
            img = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        elif len(image.shape) == 3 and image.shape[2] == 3:
            img = image
        elif len(image.shape) == 3 and image.shape[2] == 4:  # RGBAscale
            img = image[:, :, :3]

    return img

def rectify_poly(img, poly):
    # Use Affine transform
    n = int(len(poly) / 2) - 1
    width = 0
    height = 0
    for k in range(n):
        box = np.float32([poly[k], poly[k + 1], poly[-k - 2], poly[-k - 1]])
        width += int(
            (np.linalg.norm(box[0] - box[1]) + np.linalg.norm(box[2] - box[3])) / 2
        )
        height += np.linalg.norm(box[1] - box[2])
    width = int(width)
    height = int(height / n)

    output_img = np.zeros((height, width, 3), dtype=np.uint8)
    width_step = 0
    for k in range(n):
        box = np.float32([poly[k], poly[k + 1], poly[-k - 2], poly[-k - 1]])
        w = int((np.linalg.norm(box[0] - box[1]) + np.linalg.norm(box[2] - box[3])) / 2)

        # Top triangle
        pts1 = box[:3]
        pts2 = np.float32(
            [[width_step, 0], [width_step + w - 1, 0], [width_step + w - 1, height - 1]]
        )
        M = cv2.getAffineTransform(pts1, pts2)
        warped_img = cv2.warpAffine(
            img, M, (width, height), borderMode=cv2.BORDER_REPLICATE
        )
        warped_mask = np.zeros((height, width, 3), dtype=np.uint8)
        warped_mask = cv2.fillConvexPoly(warped_mask, np.int32(pts2), (1, 1, 1))
        output_img[warped_mask == 1] = warped_img[warped_mask == 1]

        # Bottom triangle
        pts1 = np.vstack((box[0], box[2:]))
        pts2 = np.float32(
            [
                [width_step, 0],
                [width_step + w - 1, height - 1],
                [width_step, height - 1],
            ]
        )
        M = cv2.getAffineTransform(pts1, pts2)
        warped_img = cv2.warpAffine(
            img, M, (width, height), borderMode=cv2.BORDER_REPLICATE
        )
        warped_mask = np.zeros((height, width, 3), dtype=np.uint8)
        warped_mask = cv2.fillConvexPoly(warped_mask, np.int32(pts2), (1, 1, 1))
        cv2.line(
            warped_mask, (width_step, 0), (width_step + w - 1, height - 1), (0, 0, 0), 1)
        output_img[warped_mask == 1] = warped_img[warped_mask == 1]

        width_step += w
    return output_img

def crop_poly(image, poly):
    # points should have 1*x*2  shape
    if len(poly.shape) == 2:
        poly = np.array([np.array(poly).astype(np.int32)])

    # create mask with shape of image
    mask = np.zeros(image.shape[0:2], dtype=np.uint8)

    # method 1 smooth region
    cv2.drawContours(mask, [poly], -1, (255, 255, 255), -1, cv2.LINE_AA)
    # method 2 not so smooth region
    # cv2.fillPoly(mask, points, (255))

    # crop around poly
    res = cv2.bitwise_and(image, image, mask=mask)
    rect = cv2.boundingRect(poly)  # returns (x,y,w,h) of the rect
    cropped = res[rect[1] : rect[1] + rect[3], rect[0] : rect[0] + rect[2]]
    return cropped

def export_detected_region(image, poly, rectify=True):
    """
    Arguments:
        image: full image
        points: bbox or poly points
        rectify: rectify detected polygon by affine transform
    """
    if rectify:
        # rectify poly region
        result_rgb = rectify_poly(image, poly)
    else:
        result_rgb = crop_poly(image, poly)

    # export corpped region
    result_bgr = cv2.cvtColor(result_rgb, cv2.COLOR_RGB2BGR)
    return result_bgr

import copy

def export_detected_regions(
    image,
    regions,
    rectify: bool = False,
):
    """
    Arguments:
        image: path to the image to be processed or numpy array or PIL image
        regions: list of bboxes or polys
        rectify: rectify detected polygon by affine transform
    """

    # read/convert image
    image = read_image(image)

    # deepcopy image so that original is not altered
    image = copy.deepcopy(image)

    # init exported file paths
    exported_images = []

    # export regions
    for ind, region in enumerate(regions):
        # get export path
        #file_path = os.path.join(crops_dir, "crop_" + str(ind) + ".png")
        # export region
        crop = export_detected_region(image, poly=region, rectify=rectify)
        # note exported file path
        exported_images.append(crop)

    return exported_images

In [64]:
import torch.nn.functional as F

def recognizer_predict(converter, test_loader, batch_max_length,\
                       ignore_idx, char_group_idx, decoder='greedy', beamWidth=5, device='cpu'):
    result = []
    with torch.no_grad():
        for image_tensors in test_loader:
            batch_size = image_tensors.size(0)
            image = image_tensors.to(device)
            # print(image.shape)
            # For max length prediction
            length_for_pred = torch.IntTensor([batch_max_length] * batch_size).to(device)
            text_for_pred = torch.LongTensor(batch_size, batch_max_length + 1).fill_(0).to(device)

            #preds = model(image, text_for_pred)

            providers = ['CPUExecutionProvider']
            session = rt.InferenceSession("onnx_files/recog_en.onnx", providers=providers)
            inputs = session.get_inputs()

            # print(len(inputs))

            inp = {inputs[0].name: image.numpy()}
            preds = session.run(None, inp)

            preds = torch.from_numpy(preds[0])

            # Select max probabilty (greedy decoding) then decode index to character
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)

            ######## filter ignore_char, rebalance
            preds_prob = F.softmax(preds, dim=2)
            preds_prob = preds_prob.cpu().detach().numpy()
            preds_prob[:,:,ignore_idx] = 0.
            pred_norm = preds_prob.sum(axis=2)
            preds_prob = preds_prob/np.expand_dims(pred_norm, axis=-1)
            preds_prob = torch.from_numpy(preds_prob).float().to(device)

            if decoder == 'greedy':
                # Select max probabilty (greedy decoding) then decode index to character
                _, preds_index = preds_prob.max(2)
                preds_index = preds_index.view(-1)
                preds_str = converter.decode_greedy(preds_index.data.cpu().detach().numpy(), preds_size.data)
            elif decoder == 'beamsearch':
                k = preds_prob.cpu().detach().numpy()
                preds_str = converter.decode_beamsearch(k, beamWidth=beamWidth)
            elif decoder == 'wordbeamsearch':
                k = preds_prob.cpu().detach().numpy()
                preds_str = converter.decode_wordbeamsearch(k, beamWidth=beamWidth)

            preds_prob = preds_prob.cpu().detach().numpy()
            values = preds_prob.max(axis=2)
            indices = preds_prob.argmax(axis=2)
            preds_max_prob = []
            for v,i in zip(values, indices):
                max_probs = v[i!=0]
                if len(max_probs)>0:
                    preds_max_prob.append(max_probs)
                else:
                    preds_max_prob.append(np.array([0]))

            for pred, pred_max_prob in zip(preds_str, preds_max_prob):
                confidence_score = recognition.custom_mean(pred_max_prob)
                result.append([pred, confidence_score])

    return result

def get_text(character, imgH, imgW, converter, image_list,\
             ignore_char='', decoder='greedy', beamWidth=5, batch_size=1, contrast_ths=0.1,\
             adjust_contrast=0.5, filter_ths = 0.003, workers = 1, device = 'cpu'):
    batch_max_length = int(imgW/10)

    char_group_idx = {}
    ignore_idx = []
    for char in ignore_char:
        try: ignore_idx.append(character.index(char)+1)
        except: pass

    coord = [item[0] for item in image_list]
    img_list = [item[1] for item in image_list]
    AlignCollate_normal = recognition.AlignCollate(imgH=imgH, imgW=imgW, keep_ratio_with_pad=True)
    test_data = recognition.ListDataset(img_list)
    test_loader = torch.utils.data.DataLoader(
        test_data, batch_size=batch_size, shuffle=False,
        num_workers=int(workers), collate_fn=AlignCollate_normal, pin_memory=True)

    # predict first round
    result1 = recognizer_predict(converter, test_loader,batch_max_length,\
                                 ignore_idx, char_group_idx, decoder, beamWidth, device = device)

    # predict second round
    low_confident_idx = [i for i,item in enumerate(result1) if (item[1] < contrast_ths)]
    if len(low_confident_idx) > 0:
        img_list2 = [img_list[i] for i in low_confident_idx]
        AlignCollate_contrast = recognition.AlignCollate(imgH=imgH, imgW=imgW, keep_ratio_with_pad=True, adjust_contrast=adjust_contrast)
        test_data = recognition.ListDataset(img_list2)
        test_loader = torch.utils.data.DataLoader(
                        test_data, batch_size=batch_size, shuffle=False,
                        num_workers=int(workers), collate_fn=AlignCollate_contrast, pin_memory=True)
        result2 = recognizer_predict(converter, test_loader, batch_max_length,\
                                     ignore_idx, char_group_idx, decoder, beamWidth, device = device)
    result = []
    for i, zipped in enumerate(zip(coord, result1)):
        box, pred1 = zipped
        if i in low_confident_idx:
            pred2 = result2[low_confident_idx.index(i)]
            if pred1[1]>pred2[1]:
                result.append( (box, pred1[0], pred1[1]) )
            else:
                result.append( (box, pred2[0], pred2[1]) )
        else:
            result.append( (box, pred1[0], pred1[1]) )
    return result

In [65]:
recog_network = 'generation2'
network_params = {
    'input_channel': 1,
    'output_channel': 256,
    'hidden_size': 256}
character = "0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ €ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
symbol = "0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ €"
model_path = "english_g2.pth"
separator_list = {}
cyrillic_lang_list = ['en']
package_dir = os.path.dirname(recognition.__file__)

dict_list = {}
for lang in cyrillic_lang_list:
    dict_list[lang] = os.path.join(package_dir, 'dict', lang + ".txt")

batch_size = 1
num_channels = 1
image_height = imgH = 64
image_width = 128
lang_char = []
for lang in cyrillic_lang_list:
    char_file = os.path.join(package_dir, 'character', lang + "_char.txt")
    with open(char_file, "r", encoding = "utf-8-sig") as input_file:
        char_list =  input_file.read().splitlines()
    lang_char += char_list

lang_char = set(lang_char).union(set(symbol))
ignore_char = ''.join(set(character)-set(lang_char))

# read image
converter = recognition.CTCLabelConverter(character, separator_list, dict_list)
device = 'cpu'

In [75]:
dict_list

{'en': 'C:\\Users\\dimaz\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\easyocr_onnx\\dict\\en.txt'}

In [78]:
easyocr_onnx.__file__

'C:\\Users\\dimaz\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\easyocr_onnx\\__init__.py'

In [66]:
filename = 'images/6.jpg'
def easyocr_onnx_inference(filename):
    image = Image.open(filename)
    if image.size[0]>image.size[1]: image = image.rotate(270)
    box_terminal = model_terminal(image, verbose=False)
    x1, y1, x2, y2 = box_terminal[0].boxes.xyxy[0].numpy()
    image = image.crop((x1, y1, x2, y2))
    img, _ = reformat_input(np.array(image))
    # Resize and normalize input image
    img_resized, target_ratio, size_heatmap = resize_aspect_ratio(img, 512, interpolation=cv2.INTER_LINEAR, mag_ratio=1.)
    ratio_h = ratio_w = 1 / target_ratio
    x = normalizeMeanVariance(img_resized)
    x = torch.from_numpy(x).permute(2, 0, 1).unsqueeze(0)
    # Create ONNX Runtime session and load model
    providers = ['CPUExecutionProvider']
    session = rt.InferenceSession("onnx_files/craft_en.onnx", providers=providers)
    input_name = session.get_inputs()[0].name
    # Prepare input tensor for inference
    inp = {input_name: x.numpy()}
    # Run inference and get output
    y, _ = session.run(None, inp)
    # Extract score and link maps
    score_text = y[0, :, :, 0]
    score_link = y[0, :, :, 1]
    # Post-processing to obtain bounding boxes and polygons
    boxes, polys, mapper = getDetBoxes(score_text, score_link, 0.5, 0.4, 0.4)
    boxes = adjustResultCoordinates(boxes, ratio_w, ratio_h)
    crops = export_detected_regions(
        image=img,
        regions=boxes,
        rectify=True)
    result = []
    for crop in crops:
        img, img_cv_grey = reformat_input(crop)
        y_max, x_max = img_cv_grey.shape
        horizontal_list = [[0, x_max, 0, y_max]]
        for bbox in horizontal_list:
            h_list = [bbox]
            f_list = []
            image_list, max_width = get_image_list(h_list, f_list, img_cv_grey, model_height=64) # 64 is default value
            result0 = get_text(character, imgH, int(max_width), converter, image_list,\
                                    ignore_char, 'greedy', beamWidth=5, batch_size=batch_size, 
                                    contrast_ths=0.1, adjust_contrast=0.5, filter_ths=0.003,\
                                    workers=0, device=device)
            result += result0
    char = []
    for i in range(len(result)):
        char.append(result[i][1])
    return char

In [67]:
%%time
dict_results_onnx_en = {} 
for filename in tqdm(images):
    dict_results_onnx_en[filename] = easyocr_onnx_inference(filename)
    # chars.append(easyocr_onnx_inference(filename))
# chars = [easyocr_onnx_inference(filename) for filename in tqdm(images)]

100%|██████████| 10/10 [00:19<00:00,  1.92s/it]

CPU times: total: 54.4 s
Wall time: 19.2 s





In [68]:
lists_onnx_en = []
for row in list(dict_results_onnx_en.values()):
    lists_onnx_en.append(row)

In [69]:
df_onnx_en = pd.DataFrame({
    'filename': list(dict_results_onnx_en.keys()),
    'ocr_rows_onnx_en': lists_onnx_en})

In [72]:
df_result = df_result.merge(df_onnx_en, left_on='filename', right_on='filename')
onnx_flags_en = []
for i, tid in enumerate(correct_tids):
    onnx_flags_en.append(tid in df_result['ocr_rows_onnx_en'][i])

In [73]:
sum(onnx_flags_en)

8

# Terminal OD + models quant ONNX

In [96]:
# from onnxruntime.quantization import quantize_dynamic, QuantType

In [117]:
# craft_fp32 = 'onnx_files/craft_en.onnx'
# craft_quant = 'onnx_files/craft_en_quant.onnx' 
# model_craft_quant = quantize_dynamic(craft_fp32, craft_quant,
#                                      nodes_to_exclude=['Conv_quant'])
# # weight_type=QuantType.QInt8, 



In [127]:
import onnx
from onnxsim import simplify

# load your predefined ONNX model
model = onnx.load('onnx_files/craft_en.onnx')

# convert model
model_simp, check = simplify(model)

assert check, "Simplified ONNX model could not be validated"

In [130]:
onnx.save_model(model_simp, 'onnx_files/craft_en_onnxsim.onnx' )

In [119]:
recog_fp32 = 'onnx_files/recog_en.onnx'
recog_quant = 'onnx_files/recog_en_quant.onnx' 
# model_recog_quant = quantize_dynamic(recog_fp32, recog_quant)



In [131]:
model = onnx.load('onnx_files/recog_en.onnx')
# convert model
model_simp, check = simplify(model)

assert check, "Simplified ONNX model could not be validated"
onnx.save_model(model_simp, 'onnx_files/recog_en_onnxsim.onnx' )

In [132]:
import cv2
import numpy as np
import onnxruntime as rt
from easyocr_onnx.craft_utils import getDetBoxes, adjustResultCoordinates
from easyocr_onnx.imgproc import resize_aspect_ratio, normalizeMeanVariance
from easyocr_onnx.utils import reformat_input, get_image_list
from easyocr_onnx import recognition

def read_image(image):
    if type(image) == str:
        img = cv2.imread(image)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    elif type(image) == bytes:
        nparr = np.frombuffer(image, np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    elif type(image) == np.ndarray:
        if len(image.shape) == 2:  # grayscale
            img = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        elif len(image.shape) == 3 and image.shape[2] == 3:
            img = image
        elif len(image.shape) == 3 and image.shape[2] == 4:  # RGBAscale
            img = image[:, :, :3]

    return img

def rectify_poly(img, poly):
    # Use Affine transform
    n = int(len(poly) / 2) - 1
    width = 0
    height = 0
    for k in range(n):
        box = np.float32([poly[k], poly[k + 1], poly[-k - 2], poly[-k - 1]])
        width += int(
            (np.linalg.norm(box[0] - box[1]) + np.linalg.norm(box[2] - box[3])) / 2
        )
        height += np.linalg.norm(box[1] - box[2])
    width = int(width)
    height = int(height / n)

    output_img = np.zeros((height, width, 3), dtype=np.uint8)
    width_step = 0
    for k in range(n):
        box = np.float32([poly[k], poly[k + 1], poly[-k - 2], poly[-k - 1]])
        w = int((np.linalg.norm(box[0] - box[1]) + np.linalg.norm(box[2] - box[3])) / 2)

        # Top triangle
        pts1 = box[:3]
        pts2 = np.float32(
            [[width_step, 0], [width_step + w - 1, 0], [width_step + w - 1, height - 1]]
        )
        M = cv2.getAffineTransform(pts1, pts2)
        warped_img = cv2.warpAffine(
            img, M, (width, height), borderMode=cv2.BORDER_REPLICATE
        )
        warped_mask = np.zeros((height, width, 3), dtype=np.uint8)
        warped_mask = cv2.fillConvexPoly(warped_mask, np.int32(pts2), (1, 1, 1))
        output_img[warped_mask == 1] = warped_img[warped_mask == 1]

        # Bottom triangle
        pts1 = np.vstack((box[0], box[2:]))
        pts2 = np.float32(
            [
                [width_step, 0],
                [width_step + w - 1, height - 1],
                [width_step, height - 1],
            ]
        )
        M = cv2.getAffineTransform(pts1, pts2)
        warped_img = cv2.warpAffine(
            img, M, (width, height), borderMode=cv2.BORDER_REPLICATE
        )
        warped_mask = np.zeros((height, width, 3), dtype=np.uint8)
        warped_mask = cv2.fillConvexPoly(warped_mask, np.int32(pts2), (1, 1, 1))
        cv2.line(
            warped_mask, (width_step, 0), (width_step + w - 1, height - 1), (0, 0, 0), 1)
        output_img[warped_mask == 1] = warped_img[warped_mask == 1]

        width_step += w
    return output_img

def crop_poly(image, poly):
    # points should have 1*x*2  shape
    if len(poly.shape) == 2:
        poly = np.array([np.array(poly).astype(np.int32)])

    # create mask with shape of image
    mask = np.zeros(image.shape[0:2], dtype=np.uint8)

    # method 1 smooth region
    cv2.drawContours(mask, [poly], -1, (255, 255, 255), -1, cv2.LINE_AA)
    # method 2 not so smooth region
    # cv2.fillPoly(mask, points, (255))

    # crop around poly
    res = cv2.bitwise_and(image, image, mask=mask)
    rect = cv2.boundingRect(poly)  # returns (x,y,w,h) of the rect
    cropped = res[rect[1] : rect[1] + rect[3], rect[0] : rect[0] + rect[2]]
    return cropped

def export_detected_region(image, poly, rectify=True):
    """
    Arguments:
        image: full image
        points: bbox or poly points
        rectify: rectify detected polygon by affine transform
    """
    if rectify:
        # rectify poly region
        result_rgb = rectify_poly(image, poly)
    else:
        result_rgb = crop_poly(image, poly)

    # export corpped region
    result_bgr = cv2.cvtColor(result_rgb, cv2.COLOR_RGB2BGR)
    return result_bgr

import copy

def export_detected_regions(
    image,
    regions,
    rectify: bool = False,
):
    """
    Arguments:
        image: path to the image to be processed or numpy array or PIL image
        regions: list of bboxes or polys
        rectify: rectify detected polygon by affine transform
    """

    # read/convert image
    image = read_image(image)

    # deepcopy image so that original is not altered
    image = copy.deepcopy(image)

    # init exported file paths
    exported_images = []

    # export regions
    for ind, region in enumerate(regions):
        # get export path
        #file_path = os.path.join(crops_dir, "crop_" + str(ind) + ".png")
        # export region
        crop = export_detected_region(image, poly=region, rectify=rectify)
        # note exported file path
        exported_images.append(crop)

    return exported_images

In [133]:
import torch.nn.functional as F

def recognizer_predict(converter, test_loader, batch_max_length,\
                       ignore_idx, char_group_idx, decoder='greedy', beamWidth=5, device='cpu'):
    result = []
    with torch.no_grad():
        for image_tensors in test_loader:
            batch_size = image_tensors.size(0)
            image = image_tensors.to(device)
            # print(image.shape)
            # For max length prediction
            length_for_pred = torch.IntTensor([batch_max_length] * batch_size).to(device)
            text_for_pred = torch.LongTensor(batch_size, batch_max_length + 1).fill_(0).to(device)

            #preds = model(image, text_for_pred)

            providers = ['CPUExecutionProvider']
            session = rt.InferenceSession("onnx_files/recog_en_onnxsim.onnx", providers=providers)
            inputs = session.get_inputs()

            # print(len(inputs))

            inp = {inputs[0].name: image.numpy()}
            preds = session.run(None, inp)

            preds = torch.from_numpy(preds[0])

            # Select max probabilty (greedy decoding) then decode index to character
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)

            ######## filter ignore_char, rebalance
            preds_prob = F.softmax(preds, dim=2)
            preds_prob = preds_prob.cpu().detach().numpy()
            preds_prob[:,:,ignore_idx] = 0.
            pred_norm = preds_prob.sum(axis=2)
            preds_prob = preds_prob/np.expand_dims(pred_norm, axis=-1)
            preds_prob = torch.from_numpy(preds_prob).float().to(device)

            if decoder == 'greedy':
                # Select max probabilty (greedy decoding) then decode index to character
                _, preds_index = preds_prob.max(2)
                preds_index = preds_index.view(-1)
                preds_str = converter.decode_greedy(preds_index.data.cpu().detach().numpy(), preds_size.data)
            elif decoder == 'beamsearch':
                k = preds_prob.cpu().detach().numpy()
                preds_str = converter.decode_beamsearch(k, beamWidth=beamWidth)
            elif decoder == 'wordbeamsearch':
                k = preds_prob.cpu().detach().numpy()
                preds_str = converter.decode_wordbeamsearch(k, beamWidth=beamWidth)

            preds_prob = preds_prob.cpu().detach().numpy()
            values = preds_prob.max(axis=2)
            indices = preds_prob.argmax(axis=2)
            preds_max_prob = []
            for v,i in zip(values, indices):
                max_probs = v[i!=0]
                if len(max_probs)>0:
                    preds_max_prob.append(max_probs)
                else:
                    preds_max_prob.append(np.array([0]))

            for pred, pred_max_prob in zip(preds_str, preds_max_prob):
                confidence_score = recognition.custom_mean(pred_max_prob)
                result.append([pred, confidence_score])

    return result

def get_text(character, imgH, imgW, converter, image_list,\
             ignore_char='', decoder='greedy', beamWidth=5, batch_size=1, contrast_ths=0.1,\
             adjust_contrast=0.5, filter_ths = 0.003, workers = 1, device = 'cpu'):
    batch_max_length = int(imgW/10)

    char_group_idx = {}
    ignore_idx = []
    for char in ignore_char:
        try: ignore_idx.append(character.index(char)+1)
        except: pass

    coord = [item[0] for item in image_list]
    img_list = [item[1] for item in image_list]
    AlignCollate_normal = recognition.AlignCollate(imgH=imgH, imgW=imgW, keep_ratio_with_pad=True)
    test_data = recognition.ListDataset(img_list)
    test_loader = torch.utils.data.DataLoader(
        test_data, batch_size=batch_size, shuffle=False,
        num_workers=int(workers), collate_fn=AlignCollate_normal, pin_memory=True)

    # predict first round
    result1 = recognizer_predict(converter, test_loader,batch_max_length,\
                                 ignore_idx, char_group_idx, decoder, beamWidth, device = device)

    # predict second round
    low_confident_idx = [i for i,item in enumerate(result1) if (item[1] < contrast_ths)]
    if len(low_confident_idx) > 0:
        img_list2 = [img_list[i] for i in low_confident_idx]
        AlignCollate_contrast = recognition.AlignCollate(imgH=imgH, imgW=imgW, keep_ratio_with_pad=True, adjust_contrast=adjust_contrast)
        test_data = recognition.ListDataset(img_list2)
        test_loader = torch.utils.data.DataLoader(
                        test_data, batch_size=batch_size, shuffle=False,
                        num_workers=int(workers), collate_fn=AlignCollate_contrast, pin_memory=True)
        result2 = recognizer_predict(converter, test_loader, batch_max_length,\
                                     ignore_idx, char_group_idx, decoder, beamWidth, device = device)
    result = []
    for i, zipped in enumerate(zip(coord, result1)):
        box, pred1 = zipped
        if i in low_confident_idx:
            pred2 = result2[low_confident_idx.index(i)]
            if pred1[1]>pred2[1]:
                result.append( (box, pred1[0], pred1[1]) )
            else:
                result.append( (box, pred2[0], pred2[1]) )
        else:
            result.append( (box, pred1[0], pred1[1]) )
    return result

In [134]:
recog_network = 'generation2'
network_params = {
    'input_channel': 1,
    'output_channel': 256,
    'hidden_size': 256}
character = "0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ €ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
symbol = "0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ €"
model_path = "english_g2.pth"
separator_list = {}
cyrillic_lang_list = ['en']
package_dir = os.path.dirname(recognition.__file__)

dict_list = {}
for lang in cyrillic_lang_list:
    dict_list[lang] = os.path.join(package_dir, 'dict', lang + ".txt")

batch_size = 1
num_channels = 1
image_height = imgH = 64
image_width = 128
lang_char = []
for lang in cyrillic_lang_list:
    char_file = os.path.join(package_dir, 'character', lang + "_char.txt")
    with open(char_file, "r", encoding = "utf-8-sig") as input_file:
        char_list =  input_file.read().splitlines()
    lang_char += char_list

lang_char = set(lang_char).union(set(symbol))
ignore_char = ''.join(set(character)-set(lang_char))

# read image
converter = recognition.CTCLabelConverter(character, separator_list, dict_list)
device = 'cpu'

In [135]:
dict_list

{'en': 'C:\\Users\\dimaz\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\easyocr_onnx\\dict\\en.txt'}

In [136]:
easyocr_onnx.__file__

'C:\\Users\\dimaz\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\easyocr_onnx\\__init__.py'

In [137]:
filename = 'images/6.jpg'
def easyocr_onnx_inference(filename):
    image = Image.open(filename)
    if image.size[0]>image.size[1]: image = image.rotate(270)
    box_terminal = model_terminal(image, verbose=False)
    x1, y1, x2, y2 = box_terminal[0].boxes.xyxy[0].numpy()
    image = image.crop((x1, y1, x2, y2))
    img, _ = reformat_input(np.array(image))
    # Resize and normalize input image
    img_resized, target_ratio, size_heatmap = resize_aspect_ratio(img, 512, interpolation=cv2.INTER_LINEAR, mag_ratio=1.)
    ratio_h = ratio_w = 1 / target_ratio
    x = normalizeMeanVariance(img_resized)
    x = torch.from_numpy(x).permute(2, 0, 1).unsqueeze(0)
    # Create ONNX Runtime session and load model
    providers = ['CPUExecutionProvider']
    session = rt.InferenceSession("onnx_files/craft_en_onnxsim.onnx", providers=providers)
    input_name = session.get_inputs()[0].name
    # Prepare input tensor for inference
    inp = {input_name: x.numpy()}
    # Run inference and get output
    y, _ = session.run(None, inp)
    # Extract score and link maps
    score_text = y[0, :, :, 0]
    score_link = y[0, :, :, 1]
    # Post-processing to obtain bounding boxes and polygons
    boxes, polys, mapper = getDetBoxes(score_text, score_link, 0.5, 0.4, 0.4)
    boxes = adjustResultCoordinates(boxes, ratio_w, ratio_h)
    crops = export_detected_regions(
        image=img,
        regions=boxes,
        rectify=True)
    result = []
    for crop in crops:
        img, img_cv_grey = reformat_input(crop)
        y_max, x_max = img_cv_grey.shape
        horizontal_list = [[0, x_max, 0, y_max]]
        for bbox in horizontal_list:
            h_list = [bbox]
            f_list = []
            image_list, max_width = get_image_list(h_list, f_list, img_cv_grey, model_height=64) # 64 is default value
            result0 = get_text(character, imgH, int(max_width), converter, image_list,\
                                    ignore_char, 'greedy', beamWidth=5, batch_size=batch_size, 
                                    contrast_ths=0.1, adjust_contrast=0.5, filter_ths=0.003,\
                                    workers=0, device=device)
            result += result0
    char = []
    for i in range(len(result)):
        char.append(result[i][1])
    return char

In [138]:
%%time
dict_results_onnx_en_sim = {} 
for filename in tqdm(images):
    dict_results_onnx_en_sim[filename] = easyocr_onnx_inference(filename)
    # chars.append(easyocr_onnx_inference(filename))
# chars = [easyocr_onnx_inference(filename) for filename in tqdm(images)]

100%|██████████| 10/10 [00:19<00:00,  1.98s/it]

CPU times: total: 1min 2s
Wall time: 19.9 s





In [139]:
lists_onnx_en_quant = []
for row in list(dict_results_onnx_en_sim.values()):
    lists_onnx_en_quant.append(row)

In [140]:
df_onnx_en_sim = pd.DataFrame({
    'filename': list(dict_results_onnx_en_sim.keys()),
    'ocr_rows_onnx_en_sim': lists_onnx_en_quant})

In [141]:
df_result = df_result.merge(df_onnx_en_sim, left_on='filename', right_on='filename')
onnx_flags_en_sim = []
for i, tid in enumerate(correct_tids):
    onnx_flags_en_sim.append(tid in df_result['ocr_rows_onnx_en_sim'][i])

In [142]:
sum(onnx_flags_en_sim)

8