## Обучение Tesseract

In [None]:
from PIL import Image
import pytesseract
import subprocess
import os, re


class Tesseract:
    def __init__(self):
        """
        Инициализирует объект класса.
        """

        self.directory = os.getcwd()
        self.language_name = "eng"
        self.training_list = None
        self.font_name = None
        self.box_list = None

    def run_all(self):
        """
        Запускает полный процесс обучения для списка шрифтов.
        """

        self.clean_images()
        font_names = ["Arial", "Campanella", "Minecraft", "Montesuma", "Ryuk", "Times"]
        for font_name in font_names:
            self.box_list = None
            self.training_list = None
            self.font_name = font_name
            self.create_font_file()
            self.rename_files()
            self.extract_unicode()
            self.run_shape_clustering()
            self.run_mf_training()
            self.run_cn_training()
            self.create_tess_data()

    def clean_images(self):
        """
        Очищает множество изображений.
        """

        print("CLEANING IMAGES...")
        for folder_path, _, file_names in os.walk(self.directory + "/../../lections/4. ЛР Tesseract/captchaz/"):
            for file_name in file_names:
                file_path = os.path.join(folder_path, file_name)
                print(file_path)
                if file_name.endswith((".jpg", ".jpeg", ".png", ".gif")):
                    image = self.clears_the_image(file_path)
                    rotated_image = self.rotate_image(image, file_path)
                    if self.selection(file_path, rotated_image):
                       self.save_image_and_box_file(file_path, rotated_image)

    def save_image_and_box_file(self, file_path, image):
        """
        Save a cleaned image as TIFF and write its box file next to it.

        The font name is taken from the file name (this updates
        self.font_name) and selects the target directory train/<font_name>/,
        which is created on demand.

        Args:
            file_path: Path of the original captcha image; only its base name
                is used to derive the output file names.
            image: Image object exposing save(path).
        """

        file_name = os.path.basename(file_path)
        without_ext, _ = os.path.splitext(file_name)
        self.extract_font_name(file_path)

        font_dir = self.directory + "/train/" + self.font_name + "/"
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(font_dir, exist_ok=True)

        tiff_path = font_dir + without_ext + ".tiff"
        box_path = font_dir + without_ext + ".box"
        image.save(tiff_path)

        # Run the OCR before opening the box file so that a failure does not
        # leave an empty .box behind.
        boxes = pytesseract.image_to_boxes(tiff_path)
        # Box data may contain non-ASCII characters; do not depend on the
        # platform default encoding.
        with open(box_path, "w", encoding="utf-8") as f:
            f.write(boxes)

    def rename_files(self):
        """
        Ищет файлы box, использует имя box-файла для поиска соответствующего tiff-файла.
        Переименовывает все файлы с соответствующим именем файла "<language>.<fontName>.exp<i>".
        """

        box_string = ""
        i = 0
        for folder_path, _, file_names in os.walk(self.directory + "/train/" + self.font_name + "/"):
            for file_name in file_names:
                file_path = os.path.join(folder_path, file_name)

                if file_path.endswith(".box"):
                    self.extract_font_name(file_name)
                    without_ext, _ = os.path.splitext(file_name)
                    tiff_file = self.language_name + "." + self.font_name + ".exp" + str(i) + ".tiff"
                    box_file = self.language_name + "." + self.font_name + ".exp" + str(i) + ".box"

                    os.rename(self.directory + "/train/" + self.font_name + "/" + without_ext + ".tiff", self.directory + "/train/" + self.font_name + "/" + tiff_file)
                    os.rename(self.directory + "/train/" + self.font_name + "/" + without_ext + ".box", self.directory + "/train/" + self.font_name + "/" + box_file)

                    box_string += " " + box_file
                    self.create_training_file(self.language_name + "." + self.font_name + ".exp" + str(i))
                    i += 1

        return box_string

    def create_training_file(self, prefix):
        """
        Run the tesseract box.train step for one tiff/box pair.

        Called while the files are being renamed. The process working
        directory is switched to train/<font_name>/. If tesseract reports
        "Empty page!!" on stderr, the command is repeated with "-psm 7".

        Args:
            prefix: Common file name of the pair without extension.
        """

        print("CREATING TRAINING DATA...")
        os.chdir(self.directory + "/train/" + self.font_name + "/")
        p = subprocess.Popen(
            ["tesseract", prefix + ".tiff", prefix, "nobatch", "box.train"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # Only stderr is inspected; stdout is drained so the child cannot block.
        _, stderr_value = p.communicate()
        # errors="replace": tool output is not guaranteed to be valid UTF-8
        # and a decoding error must not abort the training run.
        if "Empty page!!" in stderr_value.decode("utf-8", errors="replace"):
            subprocess.call(["tesseract", "-psm", "7", prefix + ".tiff", prefix, "nobatch", "box.train"])

    def extract_unicode(self):
        """
        Функция extractUnicode извлекает Unicode-информацию из файлов шрифта,
        используя инструмент unicharset_extractor, и сохраняет результаты в каталоге train/<fontName>/.
        """

        print("EXTRACTING UNICODE...")
        box_list = self.get_box_file_list()
        box_arr = box_list.split(" ")
        box_arr.insert(0, "unicharset_extractor")
        box_arr = [i for i in box_arr if i != ""]
        os.chdir(self.directory + "/train/" + self.font_name + "/")
        p = subprocess.Popen(box_arr)
        p.wait()

    def create_font_file(self):
        """
        Создаёт файл свойств шрифта.
        """

        fname = self.directory + "/train/" + self.font_name + "/font_properties"
        with open(fname, "w") as fout:
            fout.write(self.font_name + " 0 0 0 0 0")

    def run_shape_clustering(self):
        """
        Запускает кластеризацию форм.
        """

        print("RUNNING SHAPE CLUSTERING...")
        self.get_training_file_list()
        shape_command = self.training_list.split(" ")
        shape_command.insert(0, "shapeclustering")
        shape_command.insert(1, "-F")
        shape_command.insert(2, "font_properties")
        shape_command.insert(3, "-U")
        shape_command.insert(4, "unicharset")
        shape_command = [i for i in shape_command if i != ""]
        os.chdir(self.directory + "/train/" + self.font_name + "/")
        p = subprocess.Popen(shape_command)
        p.wait()

    def run_mf_training(self):
        """
        Тренирует матричное распознавание.
        """

        print("RUNNING MF CLUSTERING...")
        self.get_training_file_list()
        mf_command = self.training_list.split(" ")
        mf_command.insert(0, "mftraining")
        mf_command.insert(1, "-F")
        mf_command.insert(2, "font_properties")
        mf_command.insert(3, "-U")
        mf_command.insert(4, "unicharset")
        mf_command = [i for i in mf_command if i != ""]
        os.chdir(self.directory + "/train/" + self.font_name + "/")
        p = subprocess.Popen(mf_command)
        p.wait()

    def run_cn_training(self):
        """
        Тренирует контурное распознавание.
        """

        print("RUNNING MF CLUSTERING...")
        self.get_training_file_list()
        cn_command = self.training_list.split(" ")
        cn_command.insert(0, "cntraining")
        cn_command.insert(1, "-F")
        cn_command.insert(2, "font_properties")
        cn_command.insert(3, "-U")
        cn_command.insert(4, "unicharset")
        cn_command = [i for i in cn_command if i != ""]
        os.chdir(self.directory + "/train/" + self.font_name + "/")
        p = subprocess.Popen(cn_command)
        p.wait()

    def create_tess_data(self):
        """
        Prefix the training outputs with the language name and pack them.

        Inside train/<font_name>/ (the process working directory is changed)
        the files unicharset, shapetable, inttemp, normproto and pffmtable
        are renamed to "<language>.<name>", then combine_tessdata is run
        with "<language>." and waited for.
        """

        print("CREATING TESS DATA...")
        os.chdir(self.directory + "/train/" + self.font_name + "/")

        language_prefix = self.language_name + "."
        for component in ("unicharset", "shapetable", "inttemp", "normproto", "pffmtable"):
            os.rename(component, language_prefix + component)

        combine = subprocess.Popen(["combine_tessdata", language_prefix])
        combine.wait()

    def get_box_file_list(self):
        """
        Возвращает список box-файлов .
        """

        if self.box_list is not None:
            return self.box_list
        self.box_list = ""
        files = os.listdir(self.directory + "/train/" + self.font_name + "/")
        command_string = "unicharset_extractor"
        files_found = False

        for file_name in files:
            if file_name.endswith(".box"):
                files_found = True
                self.box_list += " " + file_name

        if not files_found:
            self.box_list = None
        return self.box_list

    def get_training_file_list(self):
        """
        Получает список созданных обучающих файлов, список кэшей.
        """
    
        if self.training_list is not None:
            return self.training_list

        self.training_list = ""
        files = os.listdir(self.directory + "/train/" + self.font_name + "/")
        command_string = "unicharset_extractor"
        files_found = False

        for file_name in files:
            if file_name.endswith(".tr"):
                files_found = True
                self.training_list += " " + file_name

        if not files_found:
            self.training_list = None
        return self.training_list

    def extract_font_name(self, file_name):
        """
        Извлекаем шрифт из имени файла.
        """

        file_name = os.path.basename(file_name)
        font_name = file_name.split(".")[0]
        self.font_name = font_name

        return self.font_name

    def remove_image_noise(self, image, image_new, params):
        """
        Copy the text pixels of image into image_new, dropping noise.

        Only interior pixels are visited (the one-pixel border is skipped).

        With params['flag'] set, a first pass paints first_color into
        image_new wherever the source pixel and the pixel below it both
        differ from the background. A second pass, working in place on
        image_new, resets a pixel to the background when the pixel above and
        the pixel to the left are background and the pixel below or to the
        right is background as well.

        Without the flag, only source pixels equal to first_color are copied.

        Args:
            image: Source image exposing getpixel((x, y)).
            image_new: Target image exposing getpixel / putpixel.
            params: Dict with the keys 'flag', 'first_color', 'background',
                'width' and 'height'.

        Returns:
            image_new, modified in place.
        """

        use_neighbours = params['flag']
        ink = params['first_color']
        paper = params['background']
        columns = range(1, params['width'] - 1)
        rows = range(1, params['height'] - 1)

        src = image.getpixel

        if not use_neighbours:
            for y in rows:
                for x in columns:
                    if src((x, y)) != paper and src((x, y)) == ink:
                        image_new.putpixel((x, y), ink)
            return image_new

        for y in rows:
            for x in columns:
                if (
                    (src((x - 1, y)) != paper or src((x, y)) != paper)
                    and (src((x + 1, y)) != paper or src((x, y)) != paper)
                    and src((x, y + 1)) != paper
                    and src((x, y)) != paper
                ):
                    image_new.putpixel((x, y), ink)

        # This pass reads pixels it has already rewritten, so the row-major
        # visiting order is part of the result.
        dst = image_new.getpixel
        for y in rows:
            for x in columns:
                open_below_or_right = dst((x, y + 1)) == paper or dst((x + 1, y)) == paper
                if open_below_or_right and dst((x, y - 1)) == paper and dst((x - 1, y)) == paper:
                    image_new.putpixel((x, y), paper)

        return image_new

    def clears_the_image(self, file_path):
        """
        Очищает изображение, поворачивает его и сохраняет.
        """

        _, file_ext = os.path.splitext(file_path)
        if file_ext.lower() in [".gif"]:
            image = Image.open(file_path).convert("P")
            histogram = image.histogram()
            palette = {i: histogram[i] for i in range(256)}
            sorted_palette = sorted(palette.items(), key=lambda x: x[1], reverse=True)
            print(sorted_palette)
            background = sorted_palette[0][0]
            first_color = sorted_palette[1][0]
            width, height = image.size
            image_new = Image.new("P", image.size, (255, 255, 255))
            image_new.putpalette(image.getpalette())

            if first_color not in [113, 254]:
                flag = True
            else:
                flag = False

            params = {
                'flag': flag,
                'first_color': first_color,
                'background': background,
                'width': width,
                'height': height
            }
            image_without_noise = self.remove_image_noise(image, image_new, params)

        return image_without_noise

    def rotate_image(self, image, file_path):
        """
        Rotate the image back by the angle encoded in its file path.

        The angle is the text between the last underscore and the dot that
        follows it; the image is rotated by the negated angle and the
        uncovered corners are filled with white.
        """

        angle_match = re.search(r".+_(.+?)\.", file_path)
        encoded_angle = int(angle_match.group(1))
        return image.rotate(-encoded_angle, fillcolor=(255, 255, 255))

    def selection(self, file_path, image):
        """
        Осуществляет выборку изображений.
        """

        file_name, file_ext = os.path.splitext(file_path)
        if file_ext.lower() in [".jpg", ".jpeg", ".png", ".gif", ".tiff"]:
            file_name = os.path.basename(file_path)
            parts = file_name.split("_")
            captcha = parts[1]

            image_to_string = pytesseract.image_to_string(image).strip()
            correct_symbols = sum(1 for symbol_1, symbol_2 in zip(image_to_string, captcha) if symbol_1 == symbol_2)

            if image_to_string == captcha:
                return True

            if self.font_name == "Minecraft":
                if len(captcha) == 5 and correct_symbols == 4:
                    return True
                elif len(captcha) == 6 and correct_symbols == 5:
                    return True
                elif len(captcha) == 4 and correct_symbols == 3:
                    return True

            if self.font_name == "Campanella" and correct_symbols > 2:
                return True

            return False


# Run the whole training pipeline. The class defined in this cell is named
# Tesseract; the previous name Tesseract_trainer does not exist.
trainer = Tesseract()
trainer.run_all()

In [8]:
from PIL import Image
from pprint import pprint
import pytesseract
import shutil
import os
import re


# To run this cell, first create a structured font directory ./statistic/
# that contains the cleaned captchas:
# statistic:
#     --Arial
#     --Campanella
#     --Minecraft
#     --Montesuma
#     --Ryuk
#     --Times


def recognize_info(folder, font_name, statistic, flag, tessdata_dir="/opt/homebrew/share/tessdata"):
    """
    Count how many captchas of one font are recognised correctly.

    Before recognition the "eng" model inside tessdata_dir is replaced:
    with flag set by train/<font_name>/eng.traineddata, otherwise by
    reserve/eng.traineddata (both relative to the current directory).

    Every file below folder whose name ends in "tiff" is recognised, except
    files whose leading "<letters>." prefix names a different font. The
    expected text is the part of the file name between the first two
    underscores ("" when there is none).

    Args:
        folder: Root directory that is searched recursively.
        font_name: Font to evaluate; also the key used in statistic.
        statistic: Dict that is updated in place; each entry has the keys
            "all_count" and "normal_count".
        flag: True to use the trained model, False to use the reserve one.
        tessdata_dir: Directory whose eng.traineddata gets overwritten.

    Returns:
        The statistic dict that was passed in.
    """

    current_dir = os.getcwd()
    model = f"train/{font_name}/eng.traineddata" if flag else "reserve/eng.traineddata"
    shutil.copy2(
        os.path.join(current_dir, model),
        os.path.join(tessdata_dir, "eng.traineddata"),
    )

    # [A-Za-z] rather than [A-z]: the latter also matches "[", "\", "]",
    # "^", "_" and "`", which sit between "Z" and "a" in ASCII.
    font_pattern = re.compile(r"([A-Za-z]+)\.")
    text_pattern = re.compile(r"_(.+?)_")

    for folder_path, _, file_names in os.walk(folder):
        for file_name in file_names:
            if not file_name.endswith("tiff"):
                continue

            font_name_match = font_pattern.search(file_name)
            if font_name_match and font_name_match.group(1) != font_name:
                continue

            image_path = os.path.join(folder_path, file_name)
            # Close every image right away; thousands of files are processed.
            with Image.open(image_path) as image:
                text = pytesseract.image_to_string(image).strip()

            normal_text_match = text_pattern.search(file_name)
            normal_text = normal_text_match.group(1) if normal_text_match else ""

            counters = statistic.setdefault(font_name, {"all_count": 0, "normal_count": 0})
            counters["all_count"] += 1
            if text == normal_text:
                counters["normal_count"] += 1

    return statistic


# Collect recognition statistics for every font twice: first with the
# reserve model (flag False), then with the model trained for that font
# (flag True), and print both result tables.
folder = os.getcwd() + "/statistic/"
fonts = ["Arial", "Campanella", "Minecraft", "Montesuma", "Ryuk", "Times"]

untrained = {}
for font_name in fonts:
    untrained = recognize_info(folder, font_name, untrained, False)
pprint(untrained)

trained = {}
for font_name in fonts:
    trained = recognize_info(folder, font_name, trained, True)
pprint(trained)

{'Arial': {'all_count': 6977, 'normal_count': 4978},
 'Campanella': {'all_count': 6973, 'normal_count': 2},
 'Minecraft': {'all_count': 6981, 'normal_count': 209},
 'Montesuma': {'all_count': 6979, 'normal_count': 822},
 'Ryuk': {'all_count': 6983, 'normal_count': 650},
 'Times': {'all_count': 6985, 'normal_count': 3761}}
{'Arial': {'all_count': 6977, 'normal_count': 4727},
 'Campanella': {'all_count': 6973, 'normal_count': 7},
 'Minecraft': {'all_count': 6981, 'normal_count': 327},
 'Montesuma': {'all_count': 6979, 'normal_count': 3602},
 'Ryuk': {'all_count': 6983, 'normal_count': 4393},
 'Times': {'all_count': 6985, 'normal_count': 2758}}
