In [None]:
from pathlib import Path


# Scratch directory that the downloaded test image is moved into by a later cell.
# parents/exist_ok make this cell safe to re-run on a fresh or an existing session.
tmp = Path("/kaggle/tmp")
tmp.mkdir(exist_ok=True, parents=True)

In [None]:
!pip install -q ultralytics transformers

# Download test image

In [None]:
!gdown --fuzzy https://drive.google.com/file/d/1l9p59MrCcTrmeiSWzptVrAb-vE4hlLo5/view?usp=sharing
!mv /kaggle/working/"Copy of 2.jpg" /kaggle/tmp

In [None]:
import cv2


# Load every test image and resize it to a fixed 1280x900 (width x height) canvas.
# The images stay in OpenCV's BGR channel order.
imgs = []
paths = ["/kaggle/tmp/Copy of 2.jpg"]

for path in paths:
    image = cv2.imread(path)
    if image is None:
        # cv2.imread reports a missing or unreadable file by returning None rather
        # than raising, which would otherwise surface as an opaque resize error.
        raise FileNotFoundError(f"Could not read image: {path}")
    image = cv2.resize(image, (1280, 900))
    imgs.append(image)

# Model loading and configuration

## YOLO

In [None]:
from ultralytics import YOLO


# Detector that the inference loop calls on each image to locate text regions;
# the weights are read from a local checkpoint file.
text_detection = YOLO("/kaggle/input/yolo11x-dialectic/pytorch/default/1/best.pt")

In [None]:
def yolo_result_to_boxes(res):
    """Return the corner-format box coordinates stored on a detection result.

    Args:
        res: A detection result exposing ``res.boxes.xyxy``.

    Returns:
        The ``xyxy`` attribute of ``res.boxes``, unchanged.
    """
    detections = res.boxes
    return detections.xyxy

## TrOCR

In [None]:
from transformers import GenerationConfig


def beam_search(model, processor):
    """Configure ``model`` in place for beam-search text generation.

    The special-token ids are taken from ``processor.tokenizer``: the CLS token
    starts decoding, the SEP token ends it and the PAD token is used for padding.

    Args:
        model: Encoder-decoder model whose ``config`` and ``generation_config``
            are overwritten.
        processor: Processor exposing a ``tokenizer`` with ``cls_token_id``,
            ``pad_token_id`` and ``sep_token_id``.

    Returns:
        None. The model is modified in place.
    """
    tokenizer = processor.tokenizer
    start_id = tokenizer.cls_token_id
    pad_id = tokenizer.pad_token_id

    model_config = model.config
    model_config.decoder_start_token_id = start_id
    model_config.pad_token_id = pad_id
    # Mirror the decoder vocabulary size on the top-level config.
    model_config.vocab_size = model_config.decoder.vocab_size

    # Beam-search parameters.
    beam_params = dict(
        max_length=64,
        early_stopping=True,
        no_repeat_ngram_size=3,
        length_penalty=2.0,
        num_beams=4,
    )
    model.generation_config = GenerationConfig(**beam_params)

    # The generation config needs the same special tokens as the model config.
    decoding = model.generation_config
    decoding.eos_token_id = tokenizer.sep_token_id
    decoding.decoder_start_token_id = start_id
    decoding.pad_token_id = pad_id

In [None]:
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel


# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "/kaggle/input/trocr-base-ru-dialectic/pytorch/default/1"

# Processor (image preprocessing + tokenizer) and recognition model share one checkpoint.
text_processor = TrOCRProcessor.from_pretrained(model_name)
text_recognition = VisionEncoderDecoderModel.from_pretrained(model_name)

beam_search(text_recognition, text_processor)
# Assign the result so the cell does not end on a bare expression: otherwise the
# notebook prints the complete model architecture as the cell output.
text_recognition = text_recognition.to(device)

In [None]:
def recognize(image):
    """Run the recognition model on a single image and return the decoded text.

    Uses the module-level ``text_processor``, ``text_recognition`` and ``device``.

    Args:
        image: Image passed to ``text_processor`` as ``images=``.

    Returns:
        The first decoded string, with special tokens removed.
    """
    encoded = text_processor(images=image, return_tensors="pt")
    token_ids = text_recognition.generate(encoded.pixel_values.to(device))
    decoded = text_processor.batch_decode(token_ids, skip_special_tokens=True)
    # Exactly one image was encoded, so only the first decoded string is relevant.
    return decoded[0]

# Text detection

## Grouping detected regions by text lines

In [None]:
def clamp(min_val, max_val, value):
    """Restrict ``value`` to the closed interval ``[min_val, max_val]``.

    Note the argument order: both bounds come first, the value last.

    Args:
        min_val: Lower bound.
        max_val: Upper bound.
        value: Value to restrict.

    Returns:
        ``value`` if it lies inside the interval, otherwise the nearest bound.
    """
    capped = value if value < max_val else max_val
    return min_val if min_val > capped else capped

def _vertical_overlap_ratio(box, other):
    """Return the fraction of ``box``'s height that ``other`` also covers.

    Both boxes are indexable as ``(x1, y1, x2, y2)``. A box without positive
    height yields ``0.0`` instead of dividing by zero.
    """
    height = box[3] - box[1]
    if height <= 0:
        return 0.0

    # Length of the intersection of the two vertical intervals.
    shared = min(box[3], other[3]) - max(box[1], other[1])
    if shared <= 0:
        return 0.0
    return shared / height


def boxes_to_groups(boxes, min_overlap=0.6):
    """Group boxes into text lines by vertical overlap.

    Boxes are processed in input order. Each box joins the first existing
    group containing a member that covers at least ``min_overlap`` of the
    box's own height; otherwise it starts a new group. The overlap is measured
    relative to the incoming box only, so the grouping depends on input order.

    Args:
        boxes: Iterable of boxes, each indexable as ``(x1, y1, x2, y2)``.
        min_overlap: Minimum share of the incoming box's height that must be
            covered by a group member for the box to join that group.

    Returns:
        A list of groups (lists of boxes). Boxes inside a group are sorted by
        ``x1`` (left to right); groups are sorted by the ``y1`` of their
        leftmost box (top to bottom). Empty input gives an empty list.
    """
    groups = []

    for box in boxes:
        for group in groups:
            if any(_vertical_overlap_ratio(box, member) >= min_overlap for member in group):
                group.append(box)
                break
        else:
            # No existing line matched: this box opens a new one.
            groups.append([box])

    for group in groups:
        group.sort(key=lambda member: member[0])
    groups.sort(key=lambda group: group[0][1])

    return groups

In [None]:
import torch

def combining_boxes(groups):
    """Merge horizontally overlapping neighbours inside every group.

    Neighbouring boxes whose x-ranges intersect (boxes that merely touch do not
    count) are replaced by their bounding box. The input lists are not
    modified; new lists are returned.

    Args:
        groups: List of groups, each a list of 1-D tensors ``(x1, y1, x2, y2)``.

    Returns:
        A new list of groups with overlapping neighbours merged.
    """
    combined_groups = []

    for group in groups:
        # Work on a copy so the caller's lists keep their original boxes.
        merged = list(group)
        i = 0

        while i < len(merged) - 1:
            box_cur, box_next = merged[i], merged[i + 1]

            # Only the existence of an overlap matters, so compare the interval
            # ends directly instead of dividing by the box width.
            left = torch.max(box_cur[0], box_next[0])
            right = torch.min(box_cur[2], box_next[2])

            if right > left:
                merged[i] = torch.stack([
                    torch.min(box_cur[0], box_next[0]),  # xmin
                    torch.min(box_cur[1], box_next[1]),  # ymin
                    torch.max(box_cur[2], box_next[2]),  # xmax
                    torch.max(box_cur[3], box_next[3]),  # ymax
                ])
                del merged[i + 1]
                # The grown box may now reach its left neighbour: step back and re-check.
                i = max(i - 1, 0)
            else:
                i += 1

        combined_groups.append(merged)

    return combined_groups

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


def show_groups(image, groups):
    """Display ``image`` with the boxes of every group drawn in a random color.

    Args:
        image: Image array in OpenCV's BGR channel order (as returned by
            ``cv2.imread``). It is not modified.
        groups: List of groups of ``(x1, y1, x2, y2)`` boxes. Overlapping
            neighbours are merged via ``combining_boxes`` before drawing; the
            caller's lists are left untouched.

    Returns:
        None. The figure is rendered with matplotlib.
    """
    # matplotlib expects RGB, OpenCV stores BGR: convert so colors are shown
    # correctly. cvtColor returns a new array, so the input stays intact.
    if image.ndim == 3 and image.shape[2] == 3:
        canvas = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    else:
        canvas = image.copy()

    # combining_boxes may merge boxes in place, so hand it shallow copies.
    groups = combining_boxes([list(group) for group in groups])

    for group in groups:
        color = np.random.randint(256, size=3).tolist()

        for box in group:
            x1, y1, x2, y2 = map(int, box)
            canvas = cv2.rectangle(canvas, (x1, y1), (x2, y2), color=color, thickness=3)

    plt.figure(figsize=(15, 10))
    plt.imshow(canvas)
    plt.axis('off')
    plt.show()

# Text recognition

In [None]:
def recognize_text(image, groups, recognizer=None):
    """Recognize the text inside every box and assemble it line by line.

    Args:
        image: Image array indexed as ``image[y, x]``; crops are taken from it.
        groups: List of groups (text lines), each a list of
            ``(x1, y1, x2, y2)`` boxes in reading order.
        recognizer: Callable mapping a crop to its text. Defaults to the
            module-level ``recognize``.

    Returns:
        The recognized words of each group joined by single spaces, with the
        groups joined by newlines. Boxes are clipped to the image; boxes with
        no area after clipping, and groups left without any word, are skipped.
    """
    if recognizer is None:
        recognizer = recognize

    height, width = image.shape[:2]
    lines = []

    for group in groups:
        words = []

        for box in group:
            x1, y1, x2, y2 = map(int, box)
            # Clip to the image: a negative start would otherwise be treated
            # as an index counted from the end of the axis.
            x1, x2 = max(x1, 0), min(x2, width)
            y1, y2 = max(y1, 0), min(y2, height)
            if x2 <= x1 or y2 <= y1:
                # An empty crop has nothing to recognize.
                continue

            words.append(recognizer(image[y1:y2, x1:x2]))

        if words:
            lines.append(' '.join(words))

    return '\n'.join(lines)

# Full inference pipeline

In [None]:
for image in imgs:
    # Detect text regions with a 0.3 confidence threshold and keep the first
    # element of the returned results (a single image is passed in).
    yolo_result = text_detection(image, conf=0.3)[0]
    yolo_boxes = yolo_result_to_boxes(yolo_result)
    # Group the boxes into text lines, then merge horizontally overlapping
    # boxes inside each line.
    yolo_groups = boxes_to_groups(yolo_boxes)
    combined_groups = combining_boxes(yolo_groups)

    # Show the detected regions, then print the recognized text, one line per group.
    show_groups(image, combined_groups)
    print(recognize_text(image, combined_groups))