In [1]:
import base64
import json
import os
import time
from io import StringIO
from typing import List

import cv2
import numpy as np
import pytesseract

from img_doc.data_structures import Image, ImageSegment
from img_doc.data_structures import Word, Block
from img_doc.editors.binarizer import ValleyEmphasisBinarizer
from img_doc.extractors.block_extractors.block_extractor_from_word import KMeanBlockExtractor
from img_doc.extractors.block_extractors.block_label_extractor import MLPExtractor, MLPAngLenExtractor, AngleLengthExtractor
from img_doc.extractors.word_extractors import BaseWordExtractor
from img_doc.extractors.word_extractors.word_bold_extractor import PsBoldExtractor, WidthBoldExtractor, ISPBoldExtractor

class TesseractWordExtractor(BaseWordExtractor):
    """Word extractor that runs Tesseract OCR (through ``pytesseract``) on an image."""

    # Value of the "level" field that marks a word row in Tesseract's
    # image_to_data output (coarser levels such as lines are ignored here).
    _WORD_LEVEL = 5

    def extract_from_img(self, img: np.ndarray) -> List[Word]:
        """Run OCR on ``img`` and return one ``Word`` per recognised word box.

        Args:
            img: Image handed to ``pytesseract.image_to_data`` unchanged.
                NOTE(review): presumably a numpy image array (the original
                annotation was the ``np`` module itself) -- confirm with callers.

        Returns:
            A list of ``Word`` objects in the order Tesseract reported them.
            Each carries the recognised text and a box given by its top-left
            corner, width and height.
        """
        ocr_data = pytesseract.image_to_data(
            config="-l eng+rus",  # recognise English and Russian text
            image=img,
            output_type=pytesseract.Output.DICT)

        word_list = []
        for index_bbox, level in enumerate(ocr_data["level"]):
            if level != self._WORD_LEVEL:
                continue
            word = Word(text=ocr_data["text"][index_bbox])
            word.set_point_and_size({
                "x_top_left": ocr_data["left"][index_bbox],
                "y_top_left": ocr_data["top"][index_bbox],
                "width": ocr_data["width"][index_bbox],
                "height": ocr_data["height"][index_bbox],
            })
            word_list.append(word)
        return word_list


class ImgDocManager:
    """Builds feature/label datasets for block-label models from annotated images.

    ``get_dataset_from_dir`` turns an annotation file into the intermediate
    ``{"documents": [...], "segments": [...]}`` structure, and
    ``get_file_dataset`` converts that structure into feature vectors and labels.
    """

    def __init__(self):
        self.word_ext = TesseractWordExtractor()
        # Not used by the methods of this class; kept for external callers.
        self.kmeanext = KMeanBlockExtractor()
        # Feature extractors keyed by the "model_type" value that
        # get_file_dataset receives in its parameter dict.
        self.LABEL_BLOCK_EXTRACTOR = {
            "mlp_len": MLPExtractor("../models/model-2.sav", {"len_vec": 5}),
            "mlp_len_ang": MLPAngLenExtractor("../models/model-3.sav", {"len_vec": 5})
        }
        # Not used by the methods of this class; kept for external callers.
        self.binarizer = ValleyEmphasisBinarizer()

    @staticmethod
    def _is_into_segment(point, json_seg):
        """Return True when ``point`` (x, y) lies strictly inside the box ``json_seg``."""
        return (json_seg["x_top_left"] < point[0] < json_seg["x_bottom_right"] and
                json_seg["y_top_left"] < point[1] < json_seg["y_bottom_right"])

    def get_file_dataset(self, dataset, parametr, fun_get_image):
        """Compute one feature vector and label per annotated segment.

        Args:
            dataset: Dict with ``"documents"`` (items with ``"image64"`` and
                ``"id"``) and ``"segments"`` (items with ``"json_data"``,
                ``"marking_id"`` and ``"document_id"``).
            parametr: Dict with ``"vec_len"`` (passed to the extractor) and
                ``"model_type"`` (a key of ``self.LABEL_BLOCK_EXTRACTOR``).
            fun_get_image: Callable that receives a document's ``"image64"``
                value and returns an object exposing the pixels as ``.img``.

        Returns:
            ``{"x": [...], "y": [...]}`` where ``x[i]`` is the feature vector
            (as a plain list) and ``y[i]`` the ``"marking_id"`` of the i-th
            successfully processed segment.

        Raises:
            KeyError: If ``parametr`` lacks a required key or names an unknown
                model type. This is checked before any OCR is run.

        A segment whose processing fails is reported on stdout and skipped;
        the remaining segments are still processed.
        """
        vec_len = parametr["vec_len"]
        # Resolve the extractor up front so a bad model type fails immediately
        # instead of after OCR-ing every document.
        extractor = self.LABEL_BLOCK_EXTRACTOR[parametr["model_type"]]

        # Group segments by document once instead of rescanning the whole
        # segment list for every document.
        segments_by_doc = {}
        for seg in dataset["segments"]:
            segments_by_doc.setdefault(seg["document_id"], []).append(seg)

        list_vec = []
        list_y = []
        for doc in dataset["documents"]:
            list_seg = segments_by_doc.get(doc["id"])
            if not list_seg:
                # Nothing to label in this document, so skip loading and OCR.
                continue

            image = fun_get_image(doc["image64"])
            words = self.word_ext.extract_from_img(image.img)

            for seg in list_seg:
                try:
                    # Parse the box once per segment rather than once per word.
                    box = json.loads(seg["json_data"])
                    seg_words = [word for word in words
                                 if self._is_into_segment(word.segment.get_center(), box)]
                    vec = extractor.get_vec_from_words(seg_words, vec_len).tolist()
                    label = seg["marking_id"]
                except Exception as exc:
                    # Best effort: report the offending segment and carry on.
                    print(seg, f"-> skipped ({exc!r})")
                    continue
                # Append both together so x and y can never get out of step.
                list_vec.append(vec)
                list_y.append(label)

        return {"x": list_vec, "y": list_y}

    def get_dataset_from_dir(self, path_dir, balans = 1000):
        """Read ``train.json`` from ``path_dir`` and build a balanced dataset dict.

        Args:
            path_dir: Directory containing ``train.json``. The file must have
                ``"images"`` (items with ``"file_name"`` and ``"id"``) and
                ``"annotations"`` (items with ``"category_id"``, ``"image_id"``
                and ``"bbox"`` given as ``[x, y, width, height]``).
            balans: Maximum number of annotations kept per category. They are
                taken in file order; later ones of a full category are dropped.

        Returns:
            ``{"documents": [...], "segments": [...]}``. Each document is
            ``{"image64": <file name>, "id": <image id>}``. Each segment has
            ``"json_data"`` (a JSON string with integer ``x_top_left``,
            ``y_top_left``, ``x_bottom_right``, ``y_bottom_right``),
            ``"marking_id"`` (category id) and ``"document_id"`` (image id).
        """
        with open(os.path.join(path_dir, "train.json"), "r", encoding="utf-8") as f:
            train_json = json.load(f)

        dataset = {
            # Note: "image64" holds the image file name here, not encoded data.
            "documents": [{"image64": img["file_name"], "id": img["id"]}
                          for img in train_json["images"]],
            "segments": [],
        }

        # Counts are created on demand so any category id is accepted.
        count_by_category = {}
        for seg in train_json["annotations"]:
            category = seg["category_id"]
            if count_by_category.get(category, 0) >= balans:
                continue
            x, y, width, height = seg["bbox"][:4]
            dataset["segments"].append({
                "json_data": json.dumps({
                    "x_top_left": int(x),
                    "y_top_left": int(y),
                    "x_bottom_right": int(x + width),
                    "y_bottom_right": int(y + height),
                }),
                "marking_id": category,
                "document_id": seg["image_id"],
            })
            count_by_category[category] = count_by_category.get(category, 0) + 1

        return dataset
        

In [2]:
# Root directory of the mini PubLayNet dataset. Set the MINI_PUBLAYNET_DIR
# environment variable to run the notebook on another machine.
path_mini_publaynet = os.environ.get("MINI_PUBLAYNET_DIR", "/home/daniil/program/dataset/mini_publaynet")
# Training images are read from the "train" subdirectory of the dataset root.
path_mini_publaynet_train = os.path.join(path_mini_publaynet, "train")

In [3]:
# Create the manager; its constructor builds the word extractor, block extractor,
# both label extractors and the binarizer.
doc_manage = ImgDocManager()

In [4]:
# Read train.json from the dataset root, keeping at most `balans` (default 1000)
# annotations per category.
dataset = doc_manage.get_dataset_from_dir(path_mini_publaynet)

In [5]:
def read_image(name_file):
    """Return a new ``Image`` loaded from ``name_file`` in the training image directory.

    The file is looked up inside ``path_mini_publaynet_train``.
    """
    loaded = Image()
    image_path = os.path.join(path_mini_publaynet_train, name_file)
    loaded.set_img_from_path(image_path)
    return loaded

In [7]:
# Parameter grid for dataset generation: each model type paired with each
# vector length, ordered by model type first and vector length second.
param = [
    {"model_type": model_type, "vec_len": vec_len}
    for model_type in ("mlp_len", "mlp_len_ang")
    for vec_len in (5, 50)
]

In [8]:
# Sanity check: number of annotations kept after per-category balancing.
print(len(dataset["segments"]))

5000


In [13]:
# Build and save one dataset file per parameter set, timing each run.
# param[0] is skipped by the slice.
times_list = []
for p in param[1:]:
    # perf_counter is monotonic, so the measurement is immune to clock changes.
    start_time = time.perf_counter()
    rez = doc_manage.get_file_dataset(dataset, p, read_image)
    # Open the output only after the long computation has succeeded, so a
    # failure cannot leave behind an empty file or truncate an earlier result.
    with open(f"mini_publaynat_{p['model_type']}_veclen{p['vec_len']}.json", "w") as f:
        json.dump(rez, f)
    # Elapsed seconds cover both feature extraction and writing the file.
    times_list.append(time.perf_counter() - start_time)

print(times_list)

{'json_data': '{"x_top_left":40, "y_top_left":591, "x_bottom_right": 561, "y_bottom_right": 753}', 'marking_id': 4, 'document_id': 0}
{'json_data': '{"x_top_left":38, "y_top_left":77, "x_bottom_right": 546, "y_bottom_right": 229}', 'marking_id': 4, 'document_id': 45}
{'json_data': '{"x_top_left":53, "y_top_left":78, "x_bottom_right": 389, "y_bottom_right": 355}', 'marking_id': 4, 'document_id': 170}
{'json_data': '{"x_top_left":51, "y_top_left":57, "x_bottom_right": 289, "y_bottom_right": 351}', 'marking_id': 5, 'document_id': 456}
{'json_data': '{"x_top_left":51, "y_top_left":57, "x_bottom_right": 544, "y_bottom_right": 250}', 'marking_id': 5, 'document_id': 459}
{'json_data': '{"x_top_left":306, "y_top_left":57, "x_bottom_right": 544, "y_bottom_right": 203}', 'marking_id': 5, 'document_id': 465}
{'json_data': '{"x_top_left":51, "y_top_left":57, "x_bottom_right": 544, "y_bottom_right": 264}', 'marking_id': 5, 'document_id': 475}
{'json_data': '{"x_top_left":35, "y_top_left":45, "x_bot

In [9]:
import time

In [10]:
# Record a wall-clock start timestamp for an ad-hoc timing check.
a1 = time.time()

In [12]:
# Seconds elapsed since the timestamp stored in a1.
time.time()-a1

20.863629817962646

4 часа 22 мин (21, 18 мин соответственно)