In [None]:
# Import URL, image (PIL), filesystem and plotting modules
import urllib.request
from PIL import Image
import os
import matplotlib.pyplot as plt

# Mount Google Drive in the Colab runtime (force_remount=True requests a fresh mount)
# and switch the working directory to the dataset folder on the Drive.
# A later cell reads os.getcwd() and uses it as the dataset root.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/My Drive/Colab Notebooks/Logbook_Dataset')

In [None]:
# install dataset library, sentencepiece and tensor board packages
!pip install -q datasets sentencepiece tensorboard

In [None]:
import os
import json
from pathlib import Path
import shutil
from datasets import load_dataset

# Use the current working directory as the dataset root
# (the first cell changed into the dataset folder) and print it as a sanity check
curr_path = os.getcwd()
print(curr_path)


# Load the images under curr_path with the "imagefolder" builder, "train" split only
dataset = load_dataset("imagefolder", data_dir=curr_path, split="train")

# Report the number of rows and the available feature names
print(f"Dataset has {len(dataset)} images")
print(f"Dataset features are: {dataset.features.keys()}")

In [None]:
#install Transformers package
!pip install transformers==4.45.2 sentence-transformers==3.1.1

In [None]:
import random

# Pick a valid row index. random.randint(0, len(dataset)) includes its upper
# bound, so it could return len(dataset) and raise IndexError on the lookups
# below; random.randrange excludes the upper bound.
random_sample = random.randrange(len(dataset))

print(f"Random sample is {random_sample}")
print(f"OCR text is {dataset[random_sample]['text']}")
# Last expression of the cell: the resized image is rendered as the cell output
dataset[random_sample]['image'].resize((400,600))

In [None]:
new_special_tokens = [] # new tokens which will be added to the tokenizer
task_start_token = "<s>"  # start of task token
eos_token = "</s>" # eos token of tokenizer


# This function converts the JSON annotations into a token string
def json2token(obj, update_special_tokens_for_json_key: bool = True, sort_json_key: bool = True):
    """
    Convert a JSON-like object into a Donut-style token sequence.

    - A mapping becomes ``<s_key>value</s_key>`` for every key, with the values
      converted recursively. A mapping whose only key is ``"text_sequence"``
      is returned as that raw value instead.
    - A list becomes its converted items joined with ``<sep/>``.
    - Anything else is converted with ``str``; if ``<value/>`` is already a
      registered special token, that token is returned instead of the text.

    Args:
        obj: The annotation (dict, list or scalar) to convert.
        update_special_tokens_for_json_key: When True, the opening and closing
            tag of every key are appended to the module-level
            ``new_special_tokens`` list (each tag only once).
        sort_json_key: When True, keys are emitted in reverse-sorted order;
            otherwise in the mapping's own iteration order.

    Returns:
        The token sequence (a ``str``, except for the ``"text_sequence"``
        shortcut, which returns the stored value unchanged).
    """
    # isinstance (not type(obj) == dict) so that dict subclasses such as
    # OrderedDict are tokenized instead of falling through to str(obj).
    if isinstance(obj, dict):
        if len(obj) == 1 and "text_sequence" in obj:
            return obj["text_sequence"]

        keys = sorted(obj.keys(), reverse=True) if sort_json_key else obj.keys()
        output = ""
        for k in keys:
            start_tag = fr"<s_{k}>"
            end_tag = fr"</s_{k}>"
            if update_special_tokens_for_json_key:
                # Register each tag once; the list keeps first-seen order.
                for tag in (start_tag, end_tag):
                    if tag not in new_special_tokens:
                        new_special_tokens.append(tag)
            output += (
                start_tag
                + json2token(obj[k], update_special_tokens_for_json_key, sort_json_key)
                + end_tag
            )
        return output

    if isinstance(obj, list):
        return r"<sep/>".join(
            [json2token(item, update_special_tokens_for_json_key, sort_json_key) for item in obj]
        )

    # Scalar leaf: plain text, unless it names a categorical special token
    obj = str(obj)
    if f"<{obj}/>" in new_special_tokens:
        obj = f"<{obj}/>"  # for categorical special tokens
    return obj


def preprocess_documents_for_donut(sample):
    """Build one Donut-style training example from a raw dataset row.

    The row's "text" field is parsed as JSON, flattened with json2token and
    wrapped between task_start_token and eos_token. The row's "image" is
    converted to RGB.

    Returns:
        A dict with the keys "image" (the RGB image) and "text" (the target
        token string).
    """
    annotation = json.loads(sample["text"])
    # Join start token, flattened annotation and EOS token into the target string
    target_sequence = "".join([task_start_token, json2token(annotation), eos_token])
    rgb_image = sample["image"].convert('RGB')
    return {"image": rgb_image, "text": target_sequence}

# Apply preprocess_documents_for_donut to every row of the dataset
proc_dataset = dataset.map(preprocess_documents_for_donut)

# Spot-check one converted row and the collected special tokens.
# NOTE(review): index 5 assumes the dataset has at least 6 rows.
print(f"Sample: {proc_dataset[5]['text']}")
print(f"New special tokens: {new_special_tokens + [task_start_token] + [eos_token]}")

In [None]:
# Print the converted row 5 and the special-token list.
# NOTE(review): these are the same two prints as at the end of the previous cell.
print(f"Sample: {proc_dataset[5]['text']}")
print(f"New special tokens: {new_special_tokens + [task_start_token] + [eos_token]}")

In [None]:
from transformers import DonutProcessor

# Load Donut processor of the pre-trained model
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")

# Add the key tags collected by json2token, plus the task start and EOS tokens, to the tokenizer
processor.tokenizer.add_special_tokens({"additional_special_tokens": new_special_tokens + [task_start_token] + [eos_token]})

# Update settings which differ from pretraining: the image size, and no long-axis alignment (rotation).
# `processor.feature_extractor` is a deprecated alias of `processor.image_processor`,
# so the image processor is configured through its current attribute name.
processor.image_processor.size = [720,960] # should be (width, height)
processor.image_processor.do_align_long_axis = False

In [None]:
#this function will convert the images into tensors
def transform_and_tokenize(sample, processor=processor, split="train", max_length=512, ignore_id=-100):
    """Turn one preprocessed row into model inputs.

    Args:
        sample: Row with an "image" and a "text" entry.
        processor: Processor used for both the image and the text; defaults to
            the module-level ``processor`` bound when this function is defined.
        split: Random padding is requested from the processor only when this
            equals "train".
        max_length: Length the token ids are padded / truncated to.
        ignore_id: Value that replaces pad-token positions in the labels.

    Returns:
        A dict with "pixel_values", "labels" and "target_sequence", or an
        empty dict when the image could not be processed (the row and the
        error are printed in that case).
    """
    # Image -> tensor; any failure is reported and the row yields an empty dict
    try:
        image_features = processor(
            sample["image"],
            random_padding=(split == "train"),
            return_tensors="pt",
        )
        pixel_values = image_features.pixel_values.squeeze()
    except Exception as e:
        print(sample)
        print(f"Error: {e}")
        return {}

    # Text -> fixed-length token ids
    tokenizer = processor.tokenizer
    encoding = tokenizer(
        sample["text"],
        add_special_tokens=False,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    token_ids = encoding["input_ids"].squeeze(0)

    # Labels are a copy of the ids with pad positions replaced by ignore_id,
    # so the model doesn't need to predict the pad token
    labels = token_ids.clone()
    pad_positions = labels == tokenizer.pad_token_id
    labels[pad_positions] = ignore_id

    return {
        "pixel_values": pixel_values,
        "labels": labels,
        "target_sequence": sample["text"],
    }

# Run transform_and_tokenize over every row and drop the raw "image" and "text" columns.
# Need at least 32-64GB of RAM to run this.
processed_dataset = proc_dataset.map(transform_and_tokenize,remove_columns=["image","text"])

In [None]:
# Print two processed rows (indices 8 and 5) together with the special-token list.
# NOTE(review): the special-token list is printed twice with identical content;
# the indices assume the dataset has at least 9 rows.
print(f"Sample1: {processed_dataset[8]}")
print(f"New special tokens1: {new_special_tokens + [task_start_token] + [eos_token]}")
print(f"Sample2: {processed_dataset[5]}")
print(f"New special tokens2: {new_special_tokens + [task_start_token] + [eos_token]}")

In [None]:
# Save the processed dataset and the processor to disk so they can be reused after a later error.
# Both targets are relative paths, so they are written under the current working directory
# (the first cell changed into the dataset folder on Google Drive).
# Comment these two lines out to skip saving.
processed_dataset.save_to_disk("processed_dataset")
processor.save_pretrained("processor")

In [None]:
# Split the data into 85 % train and 15 % test.
# A fixed seed keeps the shuffled split reproducible across kernel restarts.
processed_dataset = processed_dataset.train_test_split(test_size=0.15, seed=42)
print(processed_dataset)

In [None]:
#!pip install -q git+https://github.com/huggingface/transformers.git

import torch
from transformers import VisionEncoderDecoderModel, VisionEncoderDecoderConfig

# Load pre-trained model from huggingface.co
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")

# Resize the decoder embedding layer to match the tokenizer's vocabulary size
new_emb = model.decoder.resize_token_embeddings(len(processor.tokenizer))
print(f"New embedding size: {new_emb}")
# Adjust our image size and output sequence lengths
#model.config.encoder.image_size = processor.feature_extractor.size[::-1] # (height, width)
# Decoder max_length = length of the longest label sequence in the train split.
# NOTE(review): transform_and_tokenize pads labels to a fixed max_length, so this
# presumably always equals that value — confirm.
model.config.decoder.max_length = len(max(processed_dataset["train"]["labels"], key=len))

# Set the pad token id and use '<s>' as the token the decoder starts from
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids(['<s>'])[0]

# Move the model to the GPU when CUDA is available, otherwise keep it on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

In [None]:
from huggingface_hub import notebook_login
# Hugging Face login; the training cell passes HfFolder.get_token() as hub_token
notebook_login()

In [None]:
import torch
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Repository id on the Hub; also used as the local output directory
hf_repository_id = "donut-base-engine_logbook"

# Arguments for training
training_args = Seq2SeqTrainingArguments(
    output_dir=hf_repository_id,
    num_train_epochs=20,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    weight_decay=0.01,
    # Mixed precision needs a GPU; hard-coding True makes the arguments fail
    # on a CPU-only runtime, which the model cell explicitly allows.
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_total_limit=2,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`
    eval_strategy="no",
    save_strategy="epoch",
    predict_with_generate=True,
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=hf_repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer (train split only; no evaluation dataset is passed)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
)

# Start training the model
trainer.train()

In [None]:
# Save the processor into the local output directory (same name as the Hub repository),
# then create the model card and push to the Hugging Face Hub
processor.save_pretrained(hf_repository_id)
trainer.create_model_card()
trainer.push_to_hub()