# Train

In [2]:
from app.core.training.data import CreateLoader

In [3]:
root = "app/core/datasets/Persian OCR Dataset - kaggle farboodi"

In [7]:
q, w = CreateLoader(root, processor, "cuda")(batch_size=32, shuffle=True)

In [12]:
import os

In [17]:
images = [os.path.join(q.images_path, i) for i in q.images]
labels = [os.path.join(q.labels_path, i) for i in q.labels]

In [40]:
labels

['app/core/datasets/Persian OCR Dataset - kaggle farboodi/labels/text_0.txt',
 'app/core/datasets/Persian OCR Dataset - kaggle farboodi/labels/text_1.txt',
 'app/core/datasets/Persian OCR Dataset - kaggle farboodi/labels/text_10.txt',
 'app/core/datasets/Persian OCR Dataset - kaggle farboodi/labels/text_100.txt',
 'app/core/datasets/Persian OCR Dataset - kaggle farboodi/labels/text_1000.txt',
 'app/core/datasets/Persian OCR Dataset - kaggle farboodi/labels/text_10000.txt',
 'app/core/datasets/Persian OCR Dataset - kaggle farboodi/labels/text_100000.txt',
 'app/core/datasets/Persian OCR Dataset - kaggle farboodi/labels/text_100001.txt',
 'app/core/datasets/Persian OCR Dataset - kaggle farboodi/labels/text_100002.txt',
 'app/core/datasets/Persian OCR Dataset - kaggle farboodi/labels/text_100003.txt',
 'app/core/datasets/Persian OCR Dataset - kaggle farboodi/labels/text_100004.txt',
 'app/core/datasets/Persian OCR Dataset - kaggle farboodi/labels/text_100005.txt',
 'app/core/datasets/Pers

In [44]:
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import shutil

In [49]:
def save_split(x_files, y_files, split_name, base_dir):
    """
    Save split images and labels into a new directory structure with progress bars.

    Args:
        x_files (list): List of image file paths.
        y_files (list): List of label file paths.
        split_name (str): One of 'train', 'valid', 'test'.
        base_dir (str): Base directory where split dirs will be created.
    """
    img_dir = os.path.join(base_dir, split_name, "images")
    lbl_dir = os.path.join(base_dir, split_name, "labels")
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(lbl_dir, exist_ok=True)

    for img, lbl in tqdm(zip(x_files, y_files),
                         total=len(x_files),
                         desc=f"Copying {split_name}"):
        shutil.copy(img, os.path.join(img_dir, os.path.basename(img)))
        shutil.copy(lbl, os.path.join(lbl_dir, os.path.basename(lbl)))

In [50]:
x_train, x_temp, y_train, y_temp = train_test_split(images, labels, test_size=0.3, shuffle=True, random_state=42) 

In [51]:
x_valid, x_test, y_valid, y_test = train_test_split(x_temp, y_temp, test_size=0.5, shuffle=True, random_state=42) 

In [52]:
base_out = "app/core/datasets/Persian-OCR-Dataset-kaggle-farboodi"

save_split(x_train, y_train, "train", base_out)
save_split(x_valid, y_valid, "valid", base_out)
save_split(x_test,  y_test,  "test",  base_out)

Copying train:   0%|          | 0/200164 [00:00<?, ?it/s]

Copying valid:   0%|          | 0/42892 [00:00<?, ?it/s]

Copying test:   0%|          | 0/42893 [00:00<?, ?it/s]

# Dev

In [5]:
from transformers import (
    VisionEncoderDecoderModel,
    ViTModel,
    BertLMHeadModel,
    AutoTokenizer,
    TrOCRProcessor,
    ViTImageProcessor,
    EncoderDecoderModel,
    BertConfig
)
import torch
from PIL import Image
from torchinfo import summary
import numpy as np

In [6]:
processor = TrOCRProcessor.from_pretrained(
    pretrained_model_name_or_path="app/core/models/trocr-base-printed",
    tokenizer = AutoTokenizer.from_pretrained("app/core/models/bert-fa-base-uncased"),
    use_fast=True,
    device="cuda",
)
# tokenizer = AutoTokenizer.from_pretrained("app/core/models/bert-fa-base-uncased")
encoder = ViTModel.from_pretrained("app/core/models/trocr-base-printed")
decoder = BertLMHeadModel.from_pretrained(
    "app/core/models/bert-fa-base-uncased",
    is_decoder=True,
    add_cross_attention = True,
)
model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder).to("cuda")
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.generation_config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.bos_token_id = processor.tokenizer.cls_token_id
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id

Some weights of ViTModel were not initialized from the model checkpoint at app/core/models/trocr-base-printed and are newly initialized: ['embeddings.cls_token', 'embeddings.patch_embeddings.projection.bias', 'embeddings.patch_embeddings.projection.weight', 'embeddings.position_embeddings', 'encoder.layer.0.attention.attention.key.weight', 'encoder.layer.0.attention.attention.query.weight', 'encoder.layer.0.attention.attention.value.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.layernorm_after.bias', 'encoder.layer.0.layernorm_after.weight', 'encoder.layer.0.layernorm_before.bias', 'encoder.layer.0.layernorm_before.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.attention.attention.key.weight', 'encoder.layer.1.attention.attention.query.weight', 'encoder.layer.1.attention

In [23]:
width, height = 384, 384
dummy_image = Image.fromarray(np.uint8(np.random.rand(height, width, 3) * 255))

In [24]:
pixel_values = processor(images=[dummy_image, dummy_image], return_tensors="pt", device="cuda").pixel_values
pixel_values.shape

torch.Size([2, 3, 384, 384])

In [26]:
generated_ids = model.generate(pixel_values.to("cuda"))
processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

'و و و و و و و و و و و و و و و و و و و و'

In [27]:
text = ["احمد محسن", "سلام علیکم و رحمه الله"]
labels = processor.tokenizer(text, return_tensors="pt", padding=True).input_ids.to("cuda")

In [241]:
processor.batch_decode(labels, skip_special_tokens=False)

['[CLS] احمد محسن [SEP] [PAD] [PAD] [PAD]',
 '[CLS] سلام علیکم و رحمه الله [SEP]']

In [249]:
labels.shape

torch.Size([2, 7])

In [243]:
a = model(pixel_values=pixel_values.to("cuda"), labels=labels)

In [245]:
a.loss

tensor(14.3094, device='cuda:0', grad_fn=<NllLossBackward0>)

In [250]:
generated_ids

tensor([[   2, 1379, 1379, 1379, 1379, 1379, 1379, 1379, 1379, 1379, 1379, 1379,
         1379, 1379, 1379, 1379, 1379, 1379, 1379, 1379, 1379],
        [   2, 1379, 1379, 1379, 1379, 1379, 1379, 1379, 1379, 1379, 1379, 1379,
         1379, 1379, 1379, 1379, 1379, 1379, 1379, 1379, 1379]],
       device='cuda:0')

In [251]:
processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

'و و و و و و و و و و و و و و و و و و و و'

In [14]:
processor.tokenizer

RobertaTokenizerFast(name_or_path='app/core/models/trocr-base-printed', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}
)