In [None]:
import os
import shutil
import uuid
import xml.etree.ElementTree as ET

import pandas as pd
from PIL import Image, ImageOps
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from ultralytics.utils.ops import segment2box


In [None]:
# Output locations for the split datasets.
# Text detector: an images/ and a labels/ directory for each split.
images_text_detector_train_dir = "../../data/processed/3 Production/text_detector/train/images"
labels_text_detector_train_dir = "../../data/processed/3 Production/text_detector/train/labels"
images_text_detector_valid_dir = "../../data/processed/3 Production/text_detector/valid/images"
labels_text_detector_valid_dir = "../../data/processed/3 Production/text_detector/valid/labels"
images_text_detector_test_dir = "../../data/processed/3 Production/text_detector/test/images"
labels_text_detector_test_dir = "../../data/processed/3 Production/text_detector/test/labels"

# Text recognizer: a single directory of cropped images for each split.
images_text_recognizer_train_dir = "../../data/processed/3 Production/text_recognizer/train"
images_text_recognizer_valid_dir = "../../data/processed/3 Production/text_recognizer/valid"
images_text_recognizer_test_dir = "../../data/processed/3 Production/text_recognizer/test"

# Create every output directory up front; exist_ok=True keeps re-runs from failing
# on directories that are already there.
for _output_dir in (
    images_text_detector_train_dir,
    labels_text_detector_train_dir,
    images_text_detector_valid_dir,
    labels_text_detector_valid_dir,
    images_text_detector_test_dir,
    labels_text_detector_test_dir,
    images_text_recognizer_train_dir,
    images_text_recognizer_valid_dir,
    images_text_recognizer_test_dir,
):
    os.makedirs(_output_dir, exist_ok=True)


# Parallel lists: image paths, annotation paths and the source label of each pair.
images = []
annotations = []
# Source labels:
# 0 - governors' reports
# 1 - charter deeds (Afanasenkov)
# 2 - charter deeds in jpg (Prosvetov)
# 3 - Pobedonostsev (segment annotation)
labels = []

###############################################################################
############# Build the dataset from the governors' reports folder ############
###############################################################################

# Root directory of the raw data
data_dir = "../../data/raw/Распознавание текстов/Губернаторские отчеты"

# os.walk yields directories and files in arbitrary, filesystem-dependent order.
# Sorting both makes the input order stable, so the seeded splits below select
# the same files on every machine and every run.
for root, dirs, files in os.walk(data_dir):
    dirs.sort()  # in-place: controls the order in which os.walk descends
    for file in sorted(files):
        if file.endswith(".JPG"):
            image_path = os.path.join(root, file)
            # Swap only the trailing extension for the annotation suffix
            annotation_path = os.path.join(root, file[: -len(".JPG")] + "_pvoc_imglab.xml")
            # Keep only images that have an annotation file next to them
            if os.path.exists(annotation_path):
                images.append(image_path)
                annotations.append(annotation_path)
                labels.append(0)

# First split off the test part (4%), then a validation part (4% of the remainder).
train_img_g, test_img_g, train_annot_g, test_annot_g, train_labels_g, test_labels_g = train_test_split(
    images, annotations, labels, test_size=0.04, random_state=42
)
train_img_g, valid_images_g, train_annot_g, valid_annot_g, train_labels_g, valid_labels_g = train_test_split(
    train_img_g, train_annot_g, train_labels_g, test_size=0.04, random_state=42
)

########################################################################################
############# Build the dataset from the charter deeds (Afanasenkov) folder ############
########################################################################################
images = []
annotations = []
labels = []

# Root directory of the raw data
data_dir = "../../data/raw/Распознавание текстов/Уставные грамоты – Афанасенков"
# os.walk yields entries in arbitrary, filesystem-dependent order; sorting keeps
# the seeded splits below reproducible.
for root, dirs, files in os.walk(data_dir):
    dirs.sort()  # in-place: controls the order in which os.walk descends
    for file in sorted(files):
        if file.endswith(".jpg"):
            image_path = os.path.join(root, file)
            # Swap only the trailing extension for the annotation suffix
            annotation_path = os.path.join(root, file[: -len(".jpg")] + "_pvoc_imglab.xml")
            # The annotation is expected next to the image
            if os.path.exists(annotation_path):
                images.append(image_path)
                annotations.append(annotation_path)
                labels.append(1)

############################################################################################
############# Build the dataset from the charter deeds in jpg (Prosvetov) folder ###########
############################################################################################

image_dir = "../../data/raw/Распознавание текстов/Уставные грамоты в jpg (Просветов)"
annotation_dir = "../../data/raw/Распознавание текстов/Уставные грамоты в jpg (Просветов)/Обработка/Просветов (13.12)"

# Images are found anywhere under image_dir, but every annotation is looked up
# by file name in the single flat annotation_dir.
for root, dirs, files in os.walk(image_dir):
    dirs.sort()  # in-place: controls the order in which os.walk descends
    for file in sorted(files):
        if file.endswith(".jpg"):
            image_path = os.path.join(root, file)
            annotation_file = file[: -len(".jpg")] + "_pvoc_imglab.xml"
            annotation_path = os.path.join(annotation_dir, annotation_file)
            if os.path.exists(annotation_path):
                images.append(image_path)
                annotations.append(annotation_path)
                labels.append(2)


# Split both charter sources together: 12% test, then 12% of the remainder as validation.
train_images, test_images, train_annotations, test_annotations, train_labels, test_labels = train_test_split(
    images, annotations, labels, test_size=0.12, random_state=42
)
train_images, valid_images, train_annotations, valid_annotations, train_labels, valid_labels = train_test_split(
    train_images, train_annotations, train_labels, test_size=0.12, random_state=42
)


# Append the governors'-report splits to the charter-deed splits, split by split.
# Within each split the three lists stay aligned element-by-element.
train_images, train_annotations, train_labels = (
    train_images + train_img_g,
    train_annotations + train_annot_g,
    train_labels + train_labels_g,
)

valid_images, valid_annotations, valid_labels = (
    valid_images + valid_images_g,
    valid_annotations + valid_annot_g,
    valid_labels + valid_labels_g,
)

test_images, test_annotations, test_labels = (
    test_images + test_img_g,
    test_annotations + test_annot_g,
    test_labels + test_labels_g,
)


def convert_coordinates(size, box):
    """Turn an absolute corner box into a relative centre/size box.

    Args:
        size: Indexable ``(width, height)`` of the image.
        box: Indexable ``(xmin, ymin, xmax, ymax)`` in the same units as ``size``.

    Returns:
        Tuple ``(x_center, y_center, width, height)``, each divided by the
        matching image dimension.

    Raises:
        ZeroDivisionError: If either image dimension is zero.
    """
    # Reciprocals are computed first, exactly once per axis
    inv_width = 1. / size[0]
    inv_height = 1. / size[1]

    center_x = (box[0] + box[2]) / 2.0
    center_y = (box[1] + box[3]) / 2.0
    box_width = box[2] - box[0]
    box_height = box[3] - box[1]

    return (
        center_x * inv_width,
        center_y * inv_height,
        box_width * inv_width,
        box_height * inv_height,
    )

def write_yolo_annotation(size, objects, output_file):
    """Write one YOLO label line per object into ``output_file``.

    Args:
        size: ``(width, height)`` of the image, forwarded to ``convert_coordinates``.
        objects: Iterable of dicts; only the ``"bbox"`` entry
            (``xmin, ymin, xmax, ymax``) of each dict is read.
        output_file: Path of the label file; it is created or overwritten.
    """
    with open(output_file, "w") as label_file:
        label_file.writelines(
            # The leading 0 is the id of the only annotation class, 'text'
            f"0 {center_x} {center_y} {rel_width} {rel_height}\n"
            for center_x, center_y, rel_width, rel_height in (
                convert_coordinates(size, entry["bbox"]) for entry in objects
            )
        )

def process_data(
        images, annotations, labels,
        text_detector_images_dir, text_detector_labels_dir,
        text_recognizer_images_dir,
        data_type
):
    """Convert Pascal VOC annotated pages into detector labels and recognizer crops.

    For every ``(image, annotation, label)`` triple this function:

    * copies the page image into ``text_detector_images_dir``;
    * writes a YOLO label file (one class-0 box per ``<object>``) into
      ``text_detector_labels_dir``;
    * crops every box from the EXIF-transposed page and saves the crop into
      ``text_recognizer_images_dir`` as ``<stem>___<row_num><ext>``.

    Args:
        images: Sequence of page image paths.
        annotations: Sequence of Pascal VOC XML paths, aligned with ``images``.
        labels: Sequence of source labels, aligned with ``images``.
        text_detector_images_dir: Destination directory for the page copies.
        text_detector_labels_dir: Destination directory for the YOLO label files.
        text_recognizer_images_dir: Destination directory for the cropped boxes.
        data_type: Split name, used only in the progress-bar caption.

    Returns:
        pandas.DataFrame with columns ``file_name``, ``text`` and ``label``,
        one row per crop that was saved successfully.
    """
    err_count = 0
    ocr_objects = []

    for image_path, annotation_path, dir_label in tqdm(zip(images, annotations, labels), total=len(images), desc=f"Подготавливаем {data_type} выборку"):
        image_name = os.path.basename(image_path)
        # splitext keeps dots inside the stem, so "a.b.jpg" and "a.c.jpg" do not
        # collapse onto the same label / crop names
        image_stem, image_ext = os.path.splitext(image_name)

        # NOTE(review): outputs are flattened by base name, so identically named
        # pages from different source folders would overwrite each other -
        # confirm the names are unique across sources.
        shutil.copy(image_path, text_detector_images_dir + "/" + image_name)

        # Parse the annotation once; both output formats reuse these objects
        root = ET.parse(annotation_path).getroot()
        size = (int(root.find("size/width").text), int(root.find("size/height").text))
        objects = [
            {
                "name": obj.find("name").text,
                "bbox": [
                    float(obj.find(f"bndbox/{corner}").text)
                    for corner in ("xmin", "ymin", "xmax", "ymax")
                ],
            }
            for obj in root.findall("object")
        ]

        ###### YOLO labels for the text detector
        write_yolo_annotation(size, objects, text_detector_labels_dir + "/" + image_stem + ".txt")

        ###### Cropped boxes for the text recognizer (TrOCR)
        # Open and EXIF-transpose the page once per image instead of once per box,
        # and close the file handle afterwards. Opening is guarded as well because
        # DecompressionBombError is raised by Image.open itself.
        try:
            with Image.open(image_path) as raw_page:
                page = ImageOps.exif_transpose(raw_page)
        except Exception:
            # None of this page's boxes can be produced
            err_count += len(objects)
            continue

        for row_num, obj in enumerate(objects):
            segment_name = f"{image_stem}___{row_num}{image_ext}"
            try:
                page.crop(obj["bbox"]).save(os.path.join(text_recognizer_images_dir, segment_name))
            # Annotation errors (e.g. degenerate boxes) are counted and skipped
            except Exception:
                err_count += 1
                continue
            ocr_objects.append({"file_name": segment_name, "text": obj["name"], "label": dir_label})

    print(f"Ошибок: {err_count}")

    # Explicit columns keep the schema stable even when no crop was produced
    return pd.DataFrame(data=ocr_objects, columns=["file_name", "text", "label"])

In [None]:
from joblib import Parallel, delayed
import numpy as np

def chunkify(lst, n):
    """Partition ``lst`` into ``n`` consecutive pieces with ``numpy.array_split``.

    Args:
        lst: Sequence to partition.
        n: Number of pieces to produce.

    Returns:
        List of ``n`` NumPy arrays; pieces may be empty when ``n`` exceeds
        the length of ``lst``.
    """
    pieces = np.array_split(lst, indices_or_sections=n)
    return pieces

def run_in_parallel(
        images, annotations, labels,
        text_detector_images_dir, text_detector_labels_dir,
        text_recognizer_images_dir,
        data_type, n_jobs):
    """Run ``process_data`` on ``n_jobs`` chunks of the inputs via joblib.

    Args:
        images, annotations, labels: Parallel sequences; each one is cut into
            ``n_jobs`` chunks with ``chunkify``.
        text_detector_images_dir, text_detector_labels_dir,
        text_recognizer_images_dir, data_type: Forwarded unchanged to every
            ``process_data`` call.
        n_jobs: Number of chunks and value of joblib's ``n_jobs``.

    Returns:
        pandas.DataFrame: per-chunk results concatenated with a fresh index.
    """
    # Chunk the three sequences identically (assumes they have equal lengths,
    # so chunk i of each one covers the same items)
    image_chunks, annotation_chunks, labels_chunks = (
        chunkify(sequence, n_jobs) for sequence in (images, annotations, labels)
    )

    # Arguments that are the same for every chunk
    shared_args = (
        text_detector_images_dir,
        text_detector_labels_dir,
        text_recognizer_images_dir,
        data_type,
    )

    # One delayed process_data call per chunk; tqdm wraps the task iterator
    jobs = (
        delayed(process_data)(image_part, annotation_part, label_part, *shared_args)
        for image_part, annotation_part, label_part in tqdm(
            zip(image_chunks, annotation_chunks, labels_chunks)
        )
    )
    per_chunk_frames = Parallel(n_jobs=n_jobs)(jobs)

    # Stitch the per-chunk frames back into a single table
    return pd.concat(per_chunk_frames, ignore_index=True)

In [None]:
# Number of chunks / joblib workers used for every split
n_jobs = 20

# Convert the training split
train_dataframe = run_in_parallel(
    images=train_images,
    annotations=train_annotations,
    labels=train_labels,
    text_detector_images_dir=images_text_detector_train_dir,
    text_detector_labels_dir=labels_text_detector_train_dir,
    text_recognizer_images_dir=images_text_recognizer_train_dir,
    data_type="обучающую",
    n_jobs=n_jobs,
)

# Convert the validation split
valid_dataframe = run_in_parallel(
    images=valid_images,
    annotations=valid_annotations,
    labels=valid_labels,
    text_detector_images_dir=images_text_detector_valid_dir,
    text_detector_labels_dir=labels_text_detector_valid_dir,
    text_recognizer_images_dir=images_text_recognizer_valid_dir,
    data_type="валидационную",
    n_jobs=n_jobs,
)

# Convert the test split
test_dataframe = run_in_parallel(
    images=test_images,
    annotations=test_annotations,
    labels=test_labels,
    text_detector_images_dir=images_text_detector_test_dir,
    text_detector_labels_dir=labels_text_detector_test_dir,
    text_recognizer_images_dir=images_text_recognizer_test_dir,
    data_type="тестовую",
    n_jobs=n_jobs,
)

In [None]:
############################################################################################
############# Build the dataset from the Pobedonostsev folder (segment annotation) #########
############################################################################################


image_dir = "../../data/raw/Распознавание текстов/Победоносцев/images"
annotation_file = "../../data/raw/Распознавание текстов/Победоносцев/project-14-at-2024-03-18-16-02-b43f1e84.json"
segment_annotations = pd.read_json(annotation_file)

segment_images = []

# Every file under image_dir is taken (no extension filter). os.walk yields
# entries in arbitrary, filesystem-dependent order, so both levels are sorted
# to keep the seeded splits below reproducible.
for root, dirs, files in os.walk(image_dir):
    dirs.sort()  # in-place: controls the order in which os.walk descends
    for file in sorted(files):
        image_path = os.path.join(root, file)
        segment_images.append(image_path)

# NOTE(review): this rebinds train_images / valid_images / test_images, which
# earlier cells use for the Pascal VOC sources - do not re-run those cells
# after this one without rebuilding their inputs.
# 20% test, then 20% of the remainder as validation.
train_images, test_images = train_test_split(segment_images, test_size=0.2, random_state=42)
train_images, valid_images = train_test_split(train_images, test_size=0.2, random_state=42)

In [None]:
def moving_average_smoothing_y(points, window_size=3):
    """Smooth the y-coordinates of a point sequence with a centred circular moving average.

    The sequence is treated as closed: the window of the first point wraps
    around to the last points and vice versa. Each output y is the mean of the
    ``window_size`` input y values centred on the same index; x values are
    returned unchanged.

    Args:
        points: Array-like of shape ``(n, 2)`` holding ``(x, y)`` pairs.
        window_size: Odd, positive number of points averaged per output value.

    Returns:
        numpy.ndarray of shape ``(n, 2)`` with the original x column and the
        smoothed y column (floating point).

    Raises:
        ValueError: If ``window_size`` is even or not positive.
    """
    if window_size % 2 == 0:
        raise ValueError("Window size must be odd.")
    if window_size < 1:
        raise ValueError("Window size must be positive.")

    points = np.asarray(points)
    # Work in floating point so integer input does not truncate the averages
    y_values = points[:, 1].astype(float)
    count = len(y_values)
    if count == 0:
        return np.column_stack((points[:, 0], y_values))

    # Row i lists the indices i-half .. i+half, wrapped around the sequence.
    # (Padding with `-window_size//2` elements would take one element too many,
    # because that expression is (-window_size)//2, and shift every window by one.)
    half = window_size // 2
    window_index = (np.arange(count)[:, None] + np.arange(-half, half + 1)[None, :]) % count
    smoothed_y = y_values[window_index].mean(axis=1)

    return np.column_stack((points[:, 0], smoothed_y))

In [None]:
# One DataFrame of recognizer samples per split, in train / valid / test order
datas = []

for image_list, detector_split, label_detector, recognizer_split in tqdm(zip(
        [train_images, valid_images, test_images],
        [images_text_detector_train_dir, images_text_detector_valid_dir, images_text_detector_test_dir],
        [labels_text_detector_train_dir, labels_text_detector_valid_dir, labels_text_detector_test_dir],
        [images_text_recognizer_train_dir, images_text_recognizer_valid_dir, images_text_recognizer_test_dir]
    ),
    total=3
):

    ocr_objects = []

    for image in tqdm(image_list, total=len(image_list)):
        image_name = os.path.basename(image)
        # splitext handles any extension and keeps dots inside the stem
        image_stem, image_ext = os.path.splitext(image_name)
        shutil.copy(image, detector_split + "/" + image_name)

        # Annotation rows exported for this file; the code below reads the first
        # annotation of the first matching row and fails with IndexError when
        # there is none
        image_annotations = segment_annotations[segment_annotations["file_upload"] == image_name]["annotations"].values
        txt_path = label_detector + "/" + image_stem + ".txt"

        # Boxes of THIS image only. Keeping one list per split would write the
        # boxes of all previously processed images into every label file.
        objects = []
        img_size = None

        # Open the page once per image and close it when all crops are saved
        with Image.open(image) as img:
            for row_num, annotation in enumerate(image_annotations[0][0]["result"]):
                if annotation["type"] != "textarea":
                    continue

                # Transcription of the annotated region
                row_text_label = annotation["value"]["text"][0]

                points = moving_average_smoothing_y(
                    np.array(annotation["value"]["points"]),
                    window_size=5,
                    )

                # Reduce the polygon to its bounding box
                text_box = segment2box(
                    points,
                    annotation["original_width"],
                    annotation["original_height"]
                )
                # Image size as recorded in the annotation
                img_size = (annotation["original_width"], annotation["original_height"])
                # Rescale the box to pixels (the /100 treats the stored
                # coordinates as percentages of the image size)
                x1, y1, x2, y2 = ((text_box[0] * annotation["original_width"]) / 100,
                                  (text_box[1] * annotation["original_height"]) / 100,
                                  (text_box[2] * annotation["original_width"]) / 100,
                                  (text_box[3] * annotation["original_height"]) / 100)
                bbox = [x1, y1, x2, y2]

                objects.append({"name": row_text_label, "bbox": bbox})

                cropped_segment = img.crop(bbox)

                segment_name = f"{image_stem}___{row_num}{image_ext}"
                cropped_segment.save(os.path.join(recognizer_split, segment_name))
                ocr_objects.append({"file_name": segment_name, "text": row_text_label, "label": 3})

        # With no boxes this writes an empty label file; img_size is not read then
        write_yolo_annotation(img_size, objects, txt_path)

    # Explicit columns keep the schema stable even for a split without samples
    split_frame = pd.DataFrame(ocr_objects, columns=["file_name", "text", "label"])
    datas.append(split_frame)

data_train = datas[0]
data_valid = datas[1]
data_test = datas[2]

In [None]:
# Append the Pobedonostsev samples to the Pascal VOC samples of each split.
# ignore_index=True renumbers the rows 0..n-1; without it both inputs keep their
# own 0-based index and the combined frame (and its CSV) has duplicate labels.
# NOTE(review): this cell rebinds its own inputs, so running it twice appends
# the same rows again - re-run the producing cells first.
train_dataframe = pd.concat([train_dataframe, data_train], ignore_index=True)
valid_dataframe = pd.concat([valid_dataframe, data_valid], ignore_index=True)
test_dataframe = pd.concat([test_dataframe, data_test], ignore_index=True)

In [None]:
# Persist the recognizer sample tables, one CSV per split (row index included)
for _split_frame, _csv_path in (
    (train_dataframe, "../../data/processed/3 Production/train.csv"),
    (valid_dataframe, "../../data/processed/3 Production/valid.csv"),
    (test_dataframe, "../../data/processed/3 Production/test.csv"),
):
    _split_frame.to_csv(_csv_path)

In [None]:
# Display the combined training table as the cell output
train_dataframe

In [None]:
# Number of training samples per source label
train_dataframe['label'].value_counts()

In [None]:
# Number of validation samples per source label
valid_dataframe['label'].value_counts()

In [None]:
# Number of test samples per source label
test_dataframe['label'].value_counts()