ERROR: Invalid requirement: '....'


In [23]:
import pytesseract
import cv2
import base64
import time
from img_doc.editors.binarizer import ValleyEmphasisBinarizer
from img_doc.extractors.word_extractors import BaseWordExtractor
from img_doc.extractors.word_extractors.word_bold_extractor import PsBoldExtractor, WidthBoldExtractor, ISPBoldExtractor
from img_doc.extractors.block_extractors.block_extractor_from_word import KMeanBlockExtractor
from img_doc.extractors.block_extractors.block_label_extractor import *
from img_doc.data_structures import Word, Block
from img_doc.data_structures import Image, ImageSegment
import numpy as np
from typing import List
from io import StringIO
import json
import os

class TesseractWordExtractor(BaseWordExtractor):
    """Word extractor that runs Tesseract OCR with the English and Russian models."""

    # Value of the "level" column that marks word rows in Tesseract's
    # image_to_data output (coarser layout levels use smaller numbers).
    _WORD_LEVEL = 5

    def extract_from_img(self, img: np.ndarray) -> List[Word]:
        """Run OCR on ``img`` and return one ``Word`` per recognised word row.

        Args:
            img: Image handed to ``pytesseract.image_to_data`` unchanged.

        Returns:
            ``Word`` objects in the order Tesseract reported them, each with its
            text and its bounding box (top-left corner, width, height).
        """
        ocr = pytesseract.image_to_data(
            config="-l eng+rus",
            image=img,
            output_type=pytesseract.Output.DICT)

        # The DICT output is column-oriented; zip the parallel columns back
        # into rows instead of indexing every column by position.
        rows = zip(ocr["level"], ocr["text"], ocr["left"],
                   ocr["top"], ocr["width"], ocr["height"])

        word_list = []
        for level, text, left, top, width, height in rows:
            if level != self._WORD_LEVEL:
                continue
            word = Word(text=text)
            word.set_point_and_size({
                "x_top_left": left,
                "y_top_left": top,
                "width": width,
                "height": height,
            })
            word_list.append(word)
        return word_list


class ImgDocManager:
    """Builds labelled feature datasets for segment classification from page images.

    ``get_dataset_from_dir`` turns a COCO-style ``train.json`` into a dict of
    documents and segments; ``get_file_dataset`` OCRs each document and converts
    the words of every segment into a feature vector.
    """

    def __init__(self):
        """Create the word extractor, block extractor, label extractors and binarizer."""
        self.word_ext = TesseractWordExtractor()
        self.kmeanext = KMeanBlockExtractor()
        # NOTE(review): all three label extractors are given the same model file,
        # including the one configured with len_vec=50 -- confirm this is intended.
        self.LABEL_BLOCK_EXTRACTOR = {
            "mlp_len": MLPExtractor("../models/model-1.sav", {"len_vec": 5}),
            "mlp_len_ang": MLPAngLenExtractor("../models/model-1.sav", {"len_vec": 5}),
            "rnd_walk_dist": MLPRandomWalkExtractor("../models/model-1.sav", {"len_vec": 50})
        }
        self.binarizer = ValleyEmphasisBinarizer()

    @staticmethod
    def _is_inside(point, box):
        """Return True when ``point`` (x, y) lies strictly inside the rectangle ``box``."""
        return (box["x_top_left"] < point[0] < box["x_bottom_right"]
                and box["y_top_left"] < point[1] < box["y_bottom_right"])

    def get_file_dataset(self, dataset, parametr, fun_get_image):
        """Convert every segment of ``dataset`` into a feature vector and a label.

        Args:
            dataset: Dict with ``"documents"`` (items with ``"image64"`` and
                ``"id"``) and ``"segments"`` (items with ``"json_data"``,
                ``"marking_id"`` and ``"document_id"``).
            parametr: Dict with ``"vec_len"`` (passed to the label extractor) and
                ``"model_type"`` (a key of ``self.LABEL_BLOCK_EXTRACTOR``).
            fun_get_image: Callable that receives ``doc["image64"]`` and returns
                an object whose ``img`` attribute is given to the word extractor.

        Returns:
            ``{"x": [...], "y": [...]}`` where ``x[k]`` is the feature vector and
            ``y[k]`` the ``marking_id`` of the k-th successfully processed segment.

        Raises:
            KeyError: If ``parametr`` lacks a key or ``model_type`` is unknown.
                This is checked up front so a misconfiguration fails before any
                OCR is run.
        """
        vec_len = parametr["vec_len"]
        label_extractor = self.LABEL_BLOCK_EXTRACTOR[parametr["model_type"]]

        # Group the segments by document once instead of rescanning the whole
        # segment list for every document.
        segments_by_doc = {}
        for seg in dataset["segments"]:
            segments_by_doc.setdefault(seg["document_id"], []).append(seg)

        documents = dataset["documents"]
        total = len(documents)
        list_vec = []
        list_y = []
        for index, doc in enumerate(documents):
            # Progress is relative to the real number of documents.
            print(f"{index / total * 100:.2f}%")

            doc_segments = segments_by_doc.get(doc["id"])
            if not doc_segments:
                # Nothing to label on this page: skip the image load and OCR.
                continue

            image = fun_get_image(doc["image64"])
            words = self.word_ext.extract_from_img(image.img)

            for seg in doc_segments:
                try:
                    box = json.loads(seg["json_data"])
                    seg_words = [word for word in words
                                 if self._is_inside(word.segment.get_center(), box)]
                    vec = label_extractor.get_vec_from_words(seg_words, vec_len).tolist()
                    label = seg["marking_id"]
                except Exception as exc:
                    # Best effort: report the segment and carry on. Catching
                    # Exception (not everything) keeps Ctrl-C working.
                    print(f"skipped segment {seg}: {exc!r}")
                    continue
                # Append both only after everything succeeded so x and y stay aligned.
                list_vec.append(vec)
                list_y.append(label)
        return {"x": list_vec, "y": list_y}

    def get_dataset_from_dir(self, path_dir, balans = 1000):
        """Build a class-balanced dataset dict from ``<path_dir>/train.json``.

        Args:
            path_dir: Directory containing ``train.json`` with ``"images"``
                (``file_name``, ``id``) and ``"annotations"`` (``category_id``,
                ``image_id``, ``bbox`` read as x, y, width, height).
            balans: Maximum number of annotations kept per category; annotations
                are taken in file order.

        Returns:
            Dict with ``"documents"`` (``image64`` holds the image file name) and
            ``"segments"`` (``json_data`` is a JSON string with the integer
            corners of the box, plus ``marking_id`` and ``document_id``).
        """
        with open(os.path.join(path_dir, "train.json"), "r", encoding="utf-8") as f:
            train_json = json.load(f)

        dataset = dict()
        dataset["documents"] = [{"image64": img["file_name"], "id": img["id"]}
                                for img in train_json["images"]]
        dataset["segments"] = []

        # Counted lazily so any category id is accepted, not only 1..5.
        count_by_category = {}
        for seg in train_json["annotations"]:
            category = seg["category_id"]
            already_taken = count_by_category.get(category, 0)
            if already_taken >= balans:
                continue

            x, y, width, height = seg["bbox"][:4]
            json_data = (
                f'{{"x_top_left":{int(x)}, "y_top_left":{int(y)}, '
                f'"x_bottom_right": {int(x + width)}, "y_bottom_right": {int(y + height)}}}'
            )
            dataset["segments"].append({"json_data": json_data,
                                        "marking_id": category,
                                        "document_id": seg["image_id"]})
            count_by_category[category] = already_taken + 1

        return dataset
        

In [24]:
# Dataset root (contains train.json) and the directory the train images are read from.
path_mini_publaynet = "/home/daniil/program/dataset/mini_publaynet"
path_mini_publaynet_train = f"{path_mini_publaynet}/train"

In [25]:
# Build the manager; its constructor creates the Tesseract word extractor,
# the k-means block extractor, the three label extractors and the binarizer.
doc_manage = ImgDocManager()

In [26]:
# Read train.json under the dataset root and keep at most `balans`
# (default 1000) annotations per category.
dataset = doc_manage.get_dataset_from_dir(path_mini_publaynet)

In [27]:
def read_image(name_file):
    """Return an ``Image`` loaded from ``name_file`` inside the train-image directory."""
    loaded = Image()
    full_path = os.path.join(path_mini_publaynet_train, name_file)
    loaded.set_img_from_path(full_path)
    return loaded

In [28]:
# Experiment configurations, one per (label extractor, vector length) pair.
# "model_type" must be a key of ImgDocManager.LABEL_BLOCK_EXTRACTOR and
# "vec_len" is the feature-vector length passed to that extractor.
# Every slot holds a real configuration (no integer placeholders), so any
# slice of the list can be fed to the extraction loop.
param = [
    {"model_type": "mlp_len", "vec_len": 5},
    {"model_type": "mlp_len", "vec_len": 50},
    {"model_type": "mlp_len_ang", "vec_len": 5},
    {"model_type": "mlp_len_ang", "vec_len": 50},
    {"model_type": "rnd_walk_dist", "vec_len": 50},
]

In [29]:
# Number of annotated segments kept after per-category balancing.
print(len(dataset["segments"]))

5000


In [30]:
# Build and save one feature dataset per selected configuration and record
# how long each one took (extraction plus JSON dump), in seconds.
times_list = []
for p in param[4:]:
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments during a multi-hour run.
    start_time = time.perf_counter()

    # Run the long extraction BEFORE opening the output file: opening with "w"
    # truncates it, so an interrupted run would otherwise leave an empty file
    # and destroy the result of a previous run.
    rez = doc_manage.get_file_dataset(dataset, p, read_image)
    with open(f"mini_publaynat_{p['model_type']}_veclen{p['vec_len']}.json", "w") as f:
        json.dump(rez, f)

    times_list.append(time.perf_counter() - start_time)

print(times_list)

0.00%
{'json_data': '{"x_top_left":40, "y_top_left":591, "x_bottom_right": 561, "y_bottom_right": 753}', 'marking_id': 4, 'document_id': 0}
0.02%
0.04%
0.06%
0.08%
0.10%
0.12%
0.14%
0.16%
0.18%
0.20%
0.22%
0.24%
0.26%
0.28%
0.30%
0.32%
0.34%
0.36%
0.38%
0.40%
0.42%
0.44%
0.46%
0.48%
0.50%
0.52%
0.54%
0.56%
0.58%
0.60%
0.62%
0.64%
0.66%
0.68%
0.70%
0.72%
0.74%
0.76%
0.78%
0.80%
0.82%
0.84%
0.86%
0.88%
0.90%
{'json_data': '{"x_top_left":38, "y_top_left":77, "x_bottom_right": 546, "y_bottom_right": 229}', 'marking_id': 4, 'document_id': 45}
0.92%
0.94%
0.96%
0.98%
1.00%
1.02%
1.04%
1.06%
1.08%
1.10%
1.12%
1.14%
1.16%
1.18%
1.20%
1.22%
1.24%
1.26%
1.28%
1.30%
1.32%
1.34%
1.36%
1.38%
1.40%
1.42%
1.44%
1.46%
1.48%
1.50%
1.52%
1.54%
1.56%
1.58%
1.60%
1.62%
1.64%
1.66%
1.68%
1.70%
1.72%
1.74%
1.76%
1.78%
1.80%
1.82%
1.84%
1.86%
1.88%
1.90%
1.92%
1.94%
1.96%
1.98%
2.00%
2.02%
2.04%
2.06%
2.08%
2.10%
2.12%
2.14%
2.16%
2.18%
2.20%
2.22%
2.24%
2.26%
2.28%
2.30%
2.32%
2.34%
2.36%
2.38%
2.40%
2.42%


In [10]:
# Manual stopwatch: remember the current wall-clock time.
a1 = time.time()

In [12]:
# Seconds elapsed since `a1` was recorded.
time.time()-a1

20.863629817962646

4 часа 22 мин (21, 18 мин соответственно)