### Imports

In [None]:
import torch
import pandas as pd
import os
from torchvision import transforms
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, default_data_collator
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

### Config

In [None]:
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 24
# Number of training epochs.
EPOCHS = 25
# Name of this run; becomes the sub-directory under base_path for saved output.
MODEL_NAME = "small-stage1"
# Whether the training dataset applies random image augmentation.
AUGMENTATION = False

In [None]:
# Identifier of the pretrained TrOCR checkpoint that is fine-tuned below.
open_source_model_name = "microsoft/trocr-small-stage1"
# Root directory for locally stored models.
base_path = "models/trocr/"
# Directory for this run: used as the trainer output dir, for the final
# saved model and for the results plot.
save_model_name = os.path.join(base_path, MODEL_NAME)

In [None]:
# Dataset root; it holds one sub-directory per split, each containing
# .jpg images and same-named .txt transcriptions.
dataset_path = 'dataset/transfer_dataset/'
train_dataset_path = os.path.join(dataset_path, 'train')
val_dataset_path = os.path.join(dataset_path, 'val')

In [None]:
# Sanity check: report whether torch sees a CUDA device and the CUDA
# version reported by torch.
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

### Load Dataset

In [None]:
# List every entry of each split directory. os.listdir returns names in
# arbitrary order, so sort them to keep the row order reproducible across
# runs and machines.
train_df_list = sorted(os.listdir(train_dataset_path))
val_df_list = sorted(os.listdir(val_dataset_path))

# Keep only the JPEG images; their transcriptions live in same-named .txt files.
train_df_jpg_list = [name for name in train_df_list if name.endswith('.jpg')]
val_df_jpg_list = [name for name in val_df_list if name.endswith('.jpg')]

In [None]:
def _load_split(split_dir, image_names):
    """Build the (file_name, text) table for one dataset split.

    Args:
        split_dir: Directory containing the images and their .txt transcriptions.
        image_names: Image file names (relative to ``split_dir``).

    Returns:
        A DataFrame with columns ``file_name`` and ``text`` and a 0..n-1 index;
        ``|`` characters in the transcription are replaced by spaces.

    Raises:
        FileNotFoundError: If an image has no matching ``<stem>.txt`` file.
    """
    rows = []
    for image_name in image_names:
        # splitext only strips the final extension, so names containing
        # additional dots still map to the right transcription file.
        stem, _ = os.path.splitext(image_name)
        # NOTE(review): files are read with the platform default encoding, as
        # before — pass encoding="utf-8" if the transcriptions are UTF-8.
        with open(os.path.join(split_dir, f"{stem}.txt"), 'r') as f:
            text = f.read()
        rows.append({'file_name': image_name, 'text': text.replace('|', ' ')})
    # Build the frame in one go instead of growing it row by row with .loc.
    return pd.DataFrame(rows, columns=['file_name', 'text'])


train_df = _load_split(os.path.join(dataset_path, 'train'), train_df_jpg_list)
val_df = _load_split(os.path.join(dataset_path, 'val'), val_df_jpg_list)

In [None]:
# Preview the first rows of the training table (file_name, text).
train_df.head()

In [None]:
# Preview the first rows of the validation table (file_name, text).
val_df.head()

### Dataset Class

In [None]:
from torchvision import transforms
from PIL import Image
import os

class Dataset:
    """Map-style dataset pairing images with tokenized transcriptions.

    Each item is a dict with:
      * ``pixel_values``: the processor output for the image, with the
        leading batch dimension removed;
      * ``labels``: token ids of the transcription, padded/truncated to
        ``max_target_length``, with padding positions set to -100.

    Args:
        root_dir: Directory containing the image files.
        df: DataFrame with ``file_name`` and ``text`` columns.
        processor: Callable image processor exposing a ``tokenizer`` attribute.
        max_target_length: Fixed length of the label sequence.
        augment: If True, apply random rotation and colour jitter to the image.
        target_size: Stored on the instance; not used by this class.
    """

    def __init__(self, root_dir, df, processor, max_target_length=128, augment=False, target_size=(1024, 128)):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length
        self.augment = augment
        self.target_size = target_size

        # Always define the attribute so the instance has a consistent shape
        # regardless of the augment flag.
        self.augment_transforms = None
        if augment:
            self.augment_transforms = transforms.Compose([
                transforms.RandomRotation(2),
                # Randomly change brightness and contrast.
                transforms.ColorJitter(brightness=0.25, contrast=0.25),
            ])

    def __len__(self):
        """Return the number of samples (rows in the DataFrame)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, optionally augment and encode the sample at position ``idx``.

        Raises:
            IndexError: If ``idx`` is out of range.
        """
        # Positional lookup: works for any DataFrame index and raises
        # IndexError past the end, which lets plain ``for`` iteration over
        # the dataset terminate cleanly.
        row = self.df.iloc[idx]
        file_name = row['file_name']
        text = str(row['text'])

        image_path = os.path.join(self.root_dir, file_name)
        # convert() returns a new in-memory image, so the file can be closed.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        if self.augment:
            image = self.augment_transforms(image)

        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        tokenizer = self.processor.tokenizer
        labels = tokenizer(
            text,
            padding="max_length",
            max_length=self.max_target_length,
            truncation=True,
        ).input_ids
        # Replace padding ids with -100, the index conventionally ignored by the loss.
        pad_token_id = tokenizer.pad_token_id
        labels = [label if label != pad_token_id else -100 for label in labels]

        # squeeze(0) drops only the batch dimension added by the processor.
        return {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}


### Dataset

In [None]:
# Load the processor of the pretrained checkpoint and wrap both splits.
# Only the training split receives the augmentation flag.
processor = TrOCRProcessor.from_pretrained(open_source_model_name)
train_dataset = Dataset(
    root_dir=train_dataset_path,
    df=train_df,
    processor=processor,
    augment=AUGMENTATION,
)
eval_dataset = Dataset(
    root_dir=val_dataset_path,
    df=val_df,
    processor=processor,
)

In [None]:
# Report the size of each split.
print(f"Number of training examples: {len(train_dataset)}")
print(f"Number of validation examples: {len(eval_dataset)}")

### Examples

In [None]:
import matplotlib.pyplot as plt

# Plot Example Image
# Fetch the sample once: every __getitem__ call reloads the image (and, with
# augmentation on, draws new random transforms), so the printed shape should
# describe the exact tensor that is displayed.
example_pixel_values = train_dataset[0]["pixel_values"]
# Move the first axis last for imshow (assumes channels-first input — TODO confirm).
plt.imshow(example_pixel_values.permute(1, 2, 0))
print("Image shape:", example_pixel_values.shape)
plt.axis("off")


### Training

In [None]:
# Load the pretrained encoder-decoder checkpoint that will be fine-tuned.
model = VisionEncoderDecoderModel.from_pretrained(open_source_model_name)

In [None]:
# set special tokens used for creating the decoder_input_ids from the labels
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size

# set beam search parameters
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = 64
model.config.early_stopping = True
model.config.no_repeat_gram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the lowest "levenshtein" metric when training ends.
training_args = Seq2SeqTrainingArguments(  
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=False, 
    output_dir=save_model_name,
    logging_steps=2,
    load_best_model_at_end=True,
    # Must match the key returned by compute_metrics.
    metric_for_best_model="levenshtein",  
    # A lower edit distance is better.
    greater_is_better=False  
)

In [None]:
from Levenshtein import distance

In [None]:
def compute_metrics(pred):
    """Compute the mean Levenshtein distance between predictions and labels.

    Args:
        pred: Evaluation output exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids, with -100 on ignored
            positions), both as arrays.

    Returns:
        ``{"levenshtein": value}`` where ``value`` is the average edit distance
        per sample, or NaN when there are no samples.
    """
    pad_token_id = processor.tokenizer.pad_token_id

    # Work on copies so the arrays owned by the caller are not modified.
    labels_ids = pred.label_ids.copy()
    pred_ids = pred.predictions.copy()

    # -100 is not a valid token id and cannot be decoded. It marks ignored
    # label positions and may also pad generated sequences gathered across
    # batches, so map it back to the pad token in both arrays.
    labels_ids[labels_ids == -100] = pad_token_id
    pred_ids[pred_ids == -100] = pad_token_id

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    if not label_str:
        # Nothing to score; avoid dividing by zero.
        return {"levenshtein": float("nan")}

    total_distance = sum(
        distance(reference, hypothesis)
        for reference, hypothesis in zip(label_str, pred_str)
    )
    return {"levenshtein": total_distance / len(label_str)}

In [None]:
# Assemble the trainer from the model, datasets, metric function and
# training arguments defined above, then run fine-tuning.
trainer = Seq2SeqTrainer(
    model=model,
    # NOTE(review): the image processor is passed in the tokenizer slot,
    # presumably so it is saved alongside checkpoints — confirm.
    tokenizer=processor.image_processor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
)
trainer.train()

### Plot 

In [None]:
import matplotlib.pyplot as plt

# Split the trainer's log history into two series: evaluation entries
# (identified by 'eval_loss') and training entries (identified by 'loss').
log_history = trainer.state.log_history
epochs = []
train_epochs = []
levenshtein_distances = []
train_losses = []
eval_losses = []
for entry in log_history:
    if 'eval_loss' in entry:
        epochs.append(entry['epoch'])
        levenshtein_distances.append(entry['eval_levenshtein'])
        eval_losses.append(entry["eval_loss"])
    if 'loss' in entry:
        train_losses.append(entry["loss"])
        train_epochs.append(entry['epoch'])

# Best (lowest) value of each series for the title; default=nan keeps this
# from raising when a series is empty (e.g. no evaluation has run yet).
best_levenshtein = round(min(levenshtein_distances, default=float("nan")), 2)
best_train_loss = round(min(train_losses, default=float("nan")), 2)
best_eval_loss = round(min(eval_losses, default=float("nan")), 2)

# Plotting
plt.figure(figsize=(10, 5))
# Labels are attached to each line so the legend cannot drift out of order.
plt.plot(epochs, levenshtein_distances, marker='.', linestyle='-', color='g', label="Levenshtein Distance")
plt.plot(train_epochs, train_losses, marker='.', linestyle='-', color='r', label="Training Loss")
plt.plot(epochs, eval_losses, marker='.', linestyle='-', color='b', label="Eval Loss")
plt.legend()
plt.title(f'Model: {os.path.basename(save_model_name)} Levenshtein: {best_levenshtein} Loss: {best_train_loss} Eval Loss: {best_eval_loss} ')
plt.xlabel('Training Epochs')
# ylabel expects a single string; the three series share one axis.
plt.ylabel('Metric Value')
plt.grid(True)
# Make sure the target directory exists before writing the figure.
os.makedirs(save_model_name, exist_ok=True)
plt.savefig(os.path.join(save_model_name, "results.png"))
plt.show()

### Save Model

In [None]:
# Write the trainer's current model to the run directory.
trainer.save_model(save_model_name)

### Try Model

In [None]:
# Reload for inference: the processor comes from the original pretrained
# checkpoint, the model weights from the fine-tuned run directory.
processor = TrOCRProcessor.from_pretrained(open_source_model_name)
model = VisionEncoderDecoderModel.from_pretrained(save_model_name)

In [None]:
# Run the fine-tuned model on every validation sample and print the
# prediction next to the ground-truth transcription.
print("Predicted; True")
# Iterate by explicit index: the dataset defines no __iter__, and relying on
# the implicit __getitem__ protocol only stops cleanly on IndexError.
for i in range(len(eval_dataset)):
    sample = eval_dataset[i]  # avoid shadowing the builtin `eval`
    pixel_values = sample['pixel_values'].unsqueeze(0)  # add a batch dimension
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # Positional lookup so the row matches the dataset position.
    real_text = val_df['text'].iloc[i]
    print(generated_text, real_text)