In [6]:
import re
from transformers import DonutProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, DatasetDict
import torch
from PIL import Image
from torch.utils.data import Dataset
import os

In [None]:
# Load the pretrained Donut processor and model.
# The checkpoint id is named once so the processor and the model are always
# loaded from the same checkpoint.
DONUT_CHECKPOINT = "naver-clova-ix/donut-base-finetuned-docvqa"
processor = DonutProcessor.from_pretrained(DONUT_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(DONUT_CHECKPOINT)

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Assign the result instead of leaving a bare expression at the end of the
# cell, so the notebook does not echo the full module repr as cell output.
model = model.to(device)

In [None]:
# One row per sample image: (file name, entity being asked about, expected value).
# Both lookup tables below are derived from this single table so their keys
# cannot drift apart.
_LABELED_SAMPLES = (
    ("219gbqQt+ML.jpg", "height", "12 cm"),
    ("218vf17tHkL.jpg", "weight", "250 mg"),
    ("21-VzxP3BDL.jpg", "item_volume", "200 ml"),
    ("217V+UhIrHL.jpg", "length", "5 cm"),
    ("11j0F4QOiFL.jpg", "height", "2.75 inches"),
    ("211sXYcOHcL.jpg", "height", "8 cm"),
    ("218zo3iJ2IL.jpg", "length", "44.2 cm"),
    ("213VIsNlvzL.jpg", "height", "11 cm"),
    ("21+quvMwZSL.jpg", "weight", "1.6 lbs"),
    ("217+y-mckBL.jpg", "weight", "400 mg"),
    ("211EIgVhPEL.jpg", "voltage", "3.7 V"),
    ("218tBdpDGPS.jpg", "length", "104.5 inches"),
    ("21-V2Kx5BVL.jpg", "length", "80 inches"),
)

# Image file name -> entity name.
x_dict = {file_name: entity for file_name, entity, _ in _LABELED_SAMPLES}

# Image file name -> value string (number plus unit).
y_dict = {file_name: value for file_name, _, value in _LABELED_SAMPLES}

In [3]:
def process_example(example):
    """Turn one question/answer record into tensors.

    Args:
        example: Mapping with the keys ``image_path`` (a path that
            ``PIL.Image.open`` can read), ``question`` and ``answer`` (strings).

    Returns:
        dict with three tensors, each without the leading batch axis:
        ``pixel_values`` (processed image), ``input_ids`` (token ids of the
        task prompt containing the question) and ``labels`` (token ids of the
        answer).
    """
    # Open the file inside a context manager so the handle is released;
    # convert() returns a separate in-memory RGB image that outlives the file.
    with Image.open(example['image_path']) as source_image:
        image = source_image.convert("RGB")
    question = example['question']
    answer = example['answer']

    # Prepare input for the model
    task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
    prompt = task_prompt.replace("{user_input}", question)

    # Tokenize the prompt. squeeze(0) drops only the batch axis; a bare
    # squeeze() would also collapse a one-token sequence to a 0-d scalar.
    # NOTE(review): VisionEncoderDecoderModel.forward takes `decoder_input_ids`
    # rather than `input_ids` — confirm how this key is consumed downstream.
    input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids.squeeze(0)

    # Process image
    pixel_values = processor(image, return_tensors="pt").pixel_values.squeeze(0)

    # Tokenize answer as label (kept 1-D even for a single-token answer)
    labels = processor.tokenizer(answer, add_special_tokens=False, return_tensors="pt").input_ids.squeeze(0)

    return {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
        "labels": labels
    }

In [8]:
class ImageDataset(Dataset):
    """Map-style dataset over the regular files found directly in a directory.

    Args:
        image_dir: Directory to scan (not recursive; sub-directories are skipped).
        questions: Stored on the instance as ``self.question``; this class does
            not read it itself.
        transform: Optional callable applied to each opened image before it is
            returned.
    """

    def __init__(self, image_dir, questions, transform=None):
        self.image_dir = image_dir
        self.transform = transform
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # idx -> file mapping reproducible across runs and machines.
        self.image_files = sorted(
            f for f in os.listdir(image_dir) if os.path.isfile(os.path.join(image_dir, f))
        )
        self.question = questions

    def __len__(self):
        """Return the number of files discovered at construction time."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Open the idx-th file as an image and apply ``transform`` if one was given."""
        img_name = os.path.join(self.image_dir, self.image_files[idx])
        # Image.open() is lazy and keeps the file open until the pixels are
        # read. Load inside the context manager so the handle is closed on
        # exit; the decoded image stays usable afterwards.
        with Image.open(img_name) as image:
            image.load()
        if self.transform:
            image = self.transform(image)
        return image

In [3]:
import os

def count_files_in_folder(folder_path):
    """Return how many entries directly inside ``folder_path`` are regular files.

    Sub-directories are not counted and are not descended into.
    """
    file_count = 0
    for entry_name in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry_name)
        if os.path.isfile(entry_path):
            file_count += 1
    return file_count

# Directory whose files are counted. The TEST_IMAGE_DIR environment variable
# overrides the location so this cell is not tied to one machine's absolute
# path; when it is unset the original path is used.
folder_path = os.environ.get(
    "TEST_IMAGE_DIR",
    '/home/arjun/Desktop/Github/AmazonML-Hackathon/images/test',
)
print(f"Number of files in the folder: {count_files_in_folder(folder_path)}")

Number of files in the folder: 0


In [None]:
# load_dataset() returns a DatasetDict keyed by split name unless `split=` is
# given. Request a split explicitly so each entry below is a single Dataset
# rather than a nested DatasetDict.
# NOTE(review): assumes each source exposes its rows under the default "train"
# split (the usual case for a plain folder or file source) — confirm once the
# placeholder paths are replaced.
dataset = DatasetDict({
    "train": load_dataset("path/to/train_dataset", split="train"),  # Update this with your dataset
    "validation": load_dataset("path/to/val_dataset", split="train")  # Update this with your validation dataset
})

# Preprocess dataset: process_example is applied to every record of each split.
train_dataset = dataset["train"].map(process_example)
val_dataset = dataset["validation"].map(process_example)

In [None]:

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=2,  # Adjust batch size based on available memory
    per_device_eval_batch_size=2,
    logging_steps=100,
    save_steps=500,
    eval_steps=500,
    save_total_limit=3,
    num_train_epochs=3,
    output_dir="./donut-finetuned-docvqa"
)


def collate_donut_batch(features):
    """Assemble a list of preprocessed examples into one training batch.

    Args:
        features: List of mappings that each hold ``pixel_values`` and
            ``labels`` (tensors, or nested lists of numbers).

    Returns:
        dict with ``pixel_values`` stacked along a new first axis and
        ``labels`` right-padded with -100 to the longest sequence in the batch.
    """
    pixel_values = torch.stack([torch.as_tensor(f["pixel_values"]) for f in features])
    # reshape(-1) keeps a single-token label 1-D so pad_sequence accepts it.
    label_rows = [
        torch.as_tensor(f["labels"], dtype=torch.long).reshape(-1) for f in features
    ]
    # -100 is the default ignore_index of torch's cross-entropy loss, so padded
    # positions should not contribute to the loss.
    # NOTE(review): confirm the model config has pad_token_id and
    # decoder_start_token_id set before training on these labels.
    labels = torch.nn.utils.rnn.pad_sequence(
        label_rows, batch_first=True, padding_value=-100
    )
    return {"pixel_values": pixel_values, "labels": labels}


# Define trainer
# DonutProcessor has no `data_collator` attribute, so the original
# `processor.data_collator` raised AttributeError; pass an explicit collator.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor.tokenizer,
    data_collator=collate_donut_batch
)


In [None]:
# Start the training run on the `trainer` built above, using its configured
# arguments and datasets.
trainer.train()


In [None]:
# Write the model and the processor into the same directory so both can be
# found together later.
save_dir = "./donut-finetuned-docvqa"
model.save_pretrained(save_dir)
processor.save_pretrained(save_dir)