### Imports

In [None]:
import torch
import pandas as pd
import os
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, default_data_collator
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_metric
from transformers import EarlyStoppingCallback

### Config

In [None]:
# Hub checkpoint that fine-tuning starts from.
open_source_model_name = "microsoft/trocr-base-handwritten"

# Output directory: the checkpoint's short name (the part after the last "/")
# with a "-finetuned" suffix, under "models/".
checkpoint_basename = open_source_model_name.rpartition('/')[-1]
save_model_name = f"models/{checkpoint_basename}-finetuned"

In [None]:
# Root folder of the training data; each split lives in its own sub-folder.
dataset_path = 'dataset/dataset_training/'
train_dataset_path, val_dataset_path = (
    os.path.join(dataset_path, split_name) for split_name in ('train', 'val')
)

In [None]:
# Report whether torch can use a CUDA device and the CUDA version torch reports.
cuda_available = torch.cuda.is_available()
cuda_version = torch.version.cuda
print("CUDA available:", cuda_available)
print("CUDA version:", cuda_version)

### Load Dataset

In [None]:
# Raw directory listings for each split.
train_df_list = os.listdir(train_dataset_path)
val_df_list = os.listdir(val_dataset_path)

# Keep only the entries ending in '.jpg', in listing order.
train_df_jpg_list = [entry for entry in train_df_list if entry.endswith('.jpg')]
val_df_jpg_list = [entry for entry in val_df_list if entry.endswith('.jpg')]

In [None]:
def _load_split_frame(split_dir, image_files):
    """Build a (file_name, text) table for one dataset split.

    For every image file name, the transcription is read from the text file
    with the same stem and a ``.txt`` extension inside ``split_dir``.

    Args:
        split_dir: Directory containing the images and their ``.txt`` files.
        image_files: Image file names (relative to ``split_dir``).

    Returns:
        pandas.DataFrame with columns ``file_name`` and ``text``, one row per
        image, indexed 0..n-1 in the order of ``image_files``.

    Raises:
        FileNotFoundError: If an image has no matching ``.txt`` file.
    """
    records = []
    for image_file in image_files:
        # splitext drops only the final extension, so stems that contain
        # extra dots (e.g. "page.01.jpg") still map to "page.01.txt".
        stem = os.path.splitext(image_file)[0]
        # Explicit encoding so the labels do not depend on the platform locale.
        with open(os.path.join(split_dir, f"{stem}.txt"), 'r', encoding='utf-8') as f:
            # NOTE(review): the text is kept verbatim, including any trailing
            # newline in the file — confirm that is wanted in the labels.
            text = f.read()
        records.append({'file_name': image_file, 'text': text})
    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=['file_name', 'text'])


train_df = _load_split_frame(train_dataset_path, train_df_jpg_list)
val_df = _load_split_frame(val_dataset_path, val_df_jpg_list)

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Preview the first rows of the validation table.
val_df.head()

### Dataset Class

In [None]:
class Dataset:
    """Map-style dataset pairing text-line images with tokenized transcriptions.

    Each item is a dict with:
      * ``pixel_values``: the processor's image tensor with the leading batch
        dimension removed;
      * ``labels``: token ids of the transcription, padded/truncated to
        ``max_target_length``, with pad positions replaced by -100.

    Args:
        root_dir: Directory containing the image files.
        df: DataFrame with ``file_name`` and ``text`` columns.
        processor: Object that is callable on a PIL image (returning
            ``pixel_values``) and exposes a ``tokenizer`` attribute.
        max_target_length: Fixed length of the label sequence.
    """

    def __init__(self, root_dir, df, processor, max_target_length=128):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of samples (rows of the DataFrame)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded sample at position ``idx``.

        Raises:
            IndexError: If ``idx`` is out of range. This is also what lets a
                plain ``for`` loop over the dataset terminate cleanly.
        """
        # Positional lookup: independent of the DataFrame's index labels and
        # raises IndexError (not KeyError) past the end.
        row = self.df.iloc[idx]
        file_name = row['file_name']
        text = str(row['text'])  # non-string cells (e.g. numbers) become text

        # Prepare image (resize + normalize is delegated to the processor).
        image_path = os.path.join(self.root_dir, file_name)
        with Image.open(image_path) as raw_image:
            # convert() returns a new image, so the file can be closed afterwards.
            image = raw_image.convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        # Encode the text. Truncation keeps every label sequence at exactly
        # max_target_length so samples can be stacked into a batch.
        tokenizer = self.processor.tokenizer
        label_ids = tokenizer(text,
                              padding="max_length",
                              truncation=True,
                              max_length=self.max_target_length).input_ids

        # Important: make sure that PAD tokens are ignored by the loss function.
        pad_token_id = tokenizer.pad_token_id
        label_ids = [-100 if token_id == pad_token_id else token_id
                     for token_id in label_ids]

        # squeeze(0) removes only the batch dimension added by the processor.
        return {"pixel_values": pixel_values.squeeze(0),
                "labels": torch.tensor(label_ids)}

### Dataset

In [None]:
# A single processor instance is shared by both splits.
processor = TrOCRProcessor.from_pretrained(open_source_model_name)

train_dataset = Dataset(root_dir=train_dataset_path, df=train_df, processor=processor)
eval_dataset = Dataset(root_dir=val_dataset_path, df=val_df, processor=processor)

In [None]:
# Report how many samples each split contains.
num_train_examples = len(train_dataset)
num_eval_examples = len(eval_dataset)
print("Number of training examples:", num_train_examples)
print("Number of validation examples:", num_eval_examples)

### Examples

In [None]:
# Fetch the first training sample and print the shape of each entry in it.
encoding = train_dataset[0]
for field_name, tensor in encoding.items():
    print(field_name, tensor.shape)

In [None]:
# Display the raw image behind the first training sample.
first_image_path = os.path.join(train_dataset.root_dir, train_df['file_name'][0])
image = Image.open(first_image_path).convert("RGB")
image

In [None]:
# Decode the sample's labels back to text as a sanity check.
# Work on a clone so `encoding['labels']` keeps its -100 markers; assigning
# through the original tensor would silently rewrite the inspected sample.
labels = encoding['labels'].clone()
labels[labels == -100] = processor.tokenizer.pad_token_id
label_str = processor.decode(labels, skip_special_tokens=True)
print(label_str)

### Training

In [None]:
# Load the pretrained checkpoint that fine-tuning starts from.
model = VisionEncoderDecoderModel.from_pretrained(open_source_model_name)

In [None]:
tokenizer = processor.tokenizer
config = model.config

# Special tokens: start/pad ids are used to build decoder_input_ids from the
# labels, and the separator token marks end-of-sequence.
config.decoder_start_token_id = tokenizer.cls_token_id
config.pad_token_id = tokenizer.pad_token_id
config.eos_token_id = tokenizer.sep_token_id

# Keep the top-level vocab size in sync with the decoder's.
config.vocab_size = config.decoder.vocab_size

# Beam-search settings.
config.num_beams = 4
config.max_length = 64
config.early_stopping = True
config.no_repeat_ngram_size = 3
config.length_penalty = 2.0

In [None]:
training_args = Seq2SeqTrainingArguments(
    # Checkpoints and logs go to the current working directory.
    output_dir="./",
    # Training schedule and batch sizes.
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=False,
    # Evaluate and checkpoint every 200 steps; log every 2 steps.
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=200,
    logging_steps=2,
    # Generate sequences during evaluation so text metrics can be computed,
    # and restore the best checkpoint once training finishes.
    predict_with_generate=True,
    load_best_model_at_end=True,
)

In [None]:
# Character error rate metric, used by compute_metrics.
# NOTE(review): `datasets.load_metric` is deprecated upstream in favor of the
# `evaluate` package — confirm against the installed `datasets` version.
cer_metric = load_metric("cer", trust_remote_code=True)

In [None]:
def compute_metrics(pred):
    """Compute the character error rate (CER) of generated predictions.

    Args:
        pred: Evaluation output exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids, with ignored positions set
            to -100).

    Returns:
        dict: ``{"cer": value}`` where ``value`` is what ``cer_metric.compute``
        returns for the decoded predictions and references.
    """
    import numpy as np  # local import: keeps this cell self-contained

    pad_token_id = processor.tokenizer.pad_token_id

    # -100 is not a vocabulary id, so it must be replaced before decoding.
    # np.where builds new arrays, leaving the caller's `pred` untouched.
    # Predictions are sanitised too: depending on the transformers version they
    # may carry -100 padding as well (harmless no-op when they do not).
    raw_pred_ids = np.asarray(pred.predictions)
    raw_label_ids = np.asarray(pred.label_ids)
    pred_ids = np.where(raw_pred_ids == -100, pad_token_id, raw_pred_ids)
    labels_ids = np.where(raw_label_ids == -100, pad_token_id, raw_label_ids)

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

In [None]:
# NOTE(review): `early_stopping_patience` is counted in evaluation rounds, not
# optimizer steps; with evaluation every 200 steps, 400 rounds is unlikely to
# ever trigger — confirm this value is intended.
early_stopping = EarlyStoppingCallback(early_stopping_patience=400)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.image_processor,
    callbacks=[early_stopping],
)
trainer.train()

In [None]:
# Run a final evaluation pass on the validation split and show its metrics.
results = trainer.evaluate(eval_dataset=eval_dataset)
print(results)

### Save Model

In [None]:
# Write the trained model to the configured output directory.
trainer.save_model(save_model_name)

### Try Model

In [None]:
# Reload for inference: the processor comes from the base checkpoint, the
# model weights from the fine-tuned directory saved above.
processor = TrOCRProcessor.from_pretrained(open_source_model_name)
model = VisionEncoderDecoderModel.from_pretrained(save_model_name)

In [None]:
print("Predicted; True")
# Iterate by explicit index: the dataset exposes only __len__/__getitem__, so
# implicit iteration would rely on __getitem__ raising IndexError past the
# end, which a pandas label lookup does not guarantee. The loop variable is
# also named `sample` so the builtin `eval` is not shadowed.
for i in range(len(eval_dataset)):
    sample = eval_dataset[i]
    # generate() expects a batch dimension in front of the image tensor.
    pixel_values = sample['pixel_values'].unsqueeze(0)
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    real_text = val_df['text'].iloc[i]
    print(generated_text, real_text)