In [6]:
import re
from transformers import DonutProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, DatasetDict
import torch
from PIL import Image
from torch.utils.data import Dataset
import os

In [3]:
# Load pretrained processor and model
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa").to(device)

In [4]:
x_dict = {
    "219gbqQt+ML.jpg": "height",
    "218vf17tHkL.jpg": "weight",
    "21-VzxP3BDL.jpg": "item_volume",
    "217V+UhIrHL.jpg": "length",
    "11j0F4QOiFL.jpg": "height",
    "211sXYcOHcL.jpg": "height",
    "218zo3iJ2IL.jpg": "length",
    "213VIsNlvzL.jpg": "height",
    "21+quvMwZSL.jpg": "weight",
    "217+y-mckBL.jpg": "weight",
    "211EIgVhPEL.jpg": "voltage",
    "218tBdpDGPS.jpg": "length",
    "21-V2Kx5BVL.jpg": "length"
}

y_dict = {
    "219gbqQt+ML.jpg": "12 cm",
    "218vf17tHkL.jpg": "250 mg",
    "21-VzxP3BDL.jpg": "200 ml",
    "217V+UhIrHL.jpg": "5 cm",
    "11j0F4QOiFL.jpg": "2.75 inches",
    "211sXYcOHcL.jpg": "8 cm",
    "218zo3iJ2IL.jpg": "44.2 cm",
    "213VIsNlvzL.jpg": "11 cm",
    "21+quvMwZSL.jpg": "1.6 lbs",
    "217+y-mckBL.jpg": "400 mg",
    "211EIgVhPEL.jpg": "3.7 V",
    "218tBdpDGPS.jpg": "104.5 inches",
    "21-V2Kx5BVL.jpg": "80 inches"
}

In [5]:
def process_example(example):
    image = Image.open(example['image_path']).convert("RGB")
    question = example['question']
    answer = example['answer']

    # Prepare input for the model
    task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
    prompt = task_prompt.replace("{user_input}", question)
    
    # Tokenize input question
    input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids.squeeze()

    # Process image
    pixel_values = processor(image, return_tensors="pt").pixel_values.squeeze()

    # Tokenize answer as label
    labels = processor.tokenizer(answer, add_special_tokens=False, return_tensors="pt").input_ids.squeeze()

    return {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
        "labels": labels
    }

In [5]:
class ImageDataset(Dataset):
    def __init__(self, image_dir, x_dict, y_dict, transform=None):
        self.image_dir = image_dir
        self.transform = transform
        self.images = list(x_dict.keys())
        self.question = list(x_dict.values())
        self.answer = list(y_dict.values())
        self.pre_finetune_text = '''Given the image, what is the''' # to stay consistent for finetuning
        assert type(self.answer) == list, "Answer should be a list of strings"
        

    def __len__(self):
        return len(self.question)

    def __getitem__(self, idx):
        img_name = os.path.join(self.image_dir, self.image_files[idx])
        image = Image.open(img_name)
        if self.transform:
            image = self.transform(image)
        return image

In [7]:
dataset = ImageDataset("images/test", x_dict, y_dict)

In [None]:

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=2,  # Adjust batch size based on available memory
    per_device_eval_batch_size=2,
    logging_steps=100,
    save_steps=500,
    eval_steps=500,
    save_total_limit=3,
    num_train_epochs=3,
    output_dir="./donut-finetuned-docvqa"
)

# Define trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # eval_dataset=val_dataset,
    tokenizer=processor.tokenizer,
    # data_collator=processor.data_collator
)


In [None]:
trainer.train()


In [None]:
model.save_pretrained("./donut-finetuned-docvqa")
processor.save_pretrained("./donut-finetuned-docvqa")

In [None]:
import os
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from transformers import DonutProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Load pretrained processor and model
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa").to(device)

x_dict = {
    "219gbqQt+ML.jpg": "height",
    "218vf17tHkL.jpg": "weight",
    "21-VzxP3BDL.jpg": "item_volume",
    "217V+UhIrHL.jpg": "length",
    "11j0F4QOiFL.jpg": "height",
    "211sXYcOHcL.jpg": "height",
    "218zo3iJ2IL.jpg": "length",
    "213VIsNlvzL.jpg": "height",
    "21+quvMwZSL.jpg": "weight",
    "217+y-mckBL.jpg": "weight",
    "211EIgVhPEL.jpg": "voltage",
    "218tBdpDGPS.jpg": "length",
    "21-V2Kx5BVL.jpg": "length"
}

y_dict = {
    "219gbqQt+ML.jpg": "12 cm",
    "218vf17tHkL.jpg": "250 mg",
    "21-VzxP3BDL.jpg": "200 ml",
    "217V+UhIrHL.jpg": "5 cm",
    "11j0F4QOiFL.jpg": "2.75 inches",
    "211sXYcOHcL.jpg": "8 cm",
    "218zo3iJ2IL.jpg": "44.2 cm",
    "213VIsNlvzL.jpg": "11 cm",
    "21+quvMwZSL.jpg": "1.6 lbs",
    "217+y-mckBL.jpg": "400 mg",
    "211EIgVhPEL.jpg": "3.7 V",
    "218tBdpDGPS.jpg": "104.5 inches",
    "21-V2Kx5BVL.jpg": "80 inches"
}


In [None]:

class ImageDataset(Dataset):
    def __init__(self, image_dir, x_dict, y_dict, transform=None):
        self.image_dir = image_dir
        self.transform = transform
        self.images = list(x_dict.keys())
        self.question = list(x_dict.values())
        self.answer = list(y_dict.values())
        self.pre_finetune_text = '''Given the image, what is the''' # to stay consistent for finetuning
        assert type(self.answer) == list, "Answer should be a list of strings"
        

    def __len__(self):
        return len(self.question)

    def __getitem__(self, idx):
        img_name = os.path.join(self.image_dir, self.image_files[idx])
        image = Image.open(img_name)
        if self.transform:
            image = self.transform(image)
        return image


# Create dataset
dataset = ImageDataset("images/test", x_dict, y_dict, processor)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_steps=100,
    save_steps=500,
    eval_steps=500,
    save_total_limit=3,
    num_train_epochs=3,
    output_dir="./donut-finetuned-docvqa",
    fp16=True,  # Enable mixed precision training
)

# Define data collator
def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    labels = torch.stack([item['labels'] for item in batch])
    pixel_values = torch.stack([item['pixel_values'] for item in batch])
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
        'pixel_values': pixel_values,}


In [None]:


# Define trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=processor.tokenizer,
    data_collator=collate_fn,
)


In [None]:
trainer.train()


In [8]:
import re
from transformers import DonutProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, DatasetDict
import torch
from PIL import Image
from torch.utils.data import Dataset
import os

In [9]:
# Load pretrained processor and model
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa").to(device)



In [10]:
x_dict = {
    "219gbqQt+ML.jpg": "height",
    "218vf17tHkL.jpg": "weight",
    "21-VzxP3BDL.jpg": "item_volume",
    "217V+UhIrHL.jpg": "length",
    "11j0F4QOiFL.jpg": "height",
    "211sXYcOHcL.jpg": "height",
    "218zo3iJ2IL.jpg": "length",
    "213VIsNlvzL.jpg": "height",
    "21+quvMwZSL.jpg": "weight",
    "217+y-mckBL.jpg": "weight",
    "211EIgVhPEL.jpg": "voltage",
    "218tBdpDGPS.jpg": "length",
    "21-V2Kx5BVL.jpg": "length"
}

y_dict = {
    "219gbqQt+ML.jpg": "12 cm",
    "218vf17tHkL.jpg": "250 mg",
    "21-VzxP3BDL.jpg": "200 ml",
    "217V+UhIrHL.jpg": "5 cm",
    "11j0F4QOiFL.jpg": "2.75 inches",
    "211sXYcOHcL.jpg": "8 cm",
    "218zo3iJ2IL.jpg": "44.2 cm",
    "213VIsNlvzL.jpg": "11 cm",
    "21+quvMwZSL.jpg": "1.6 lbs",
    "217+y-mckBL.jpg": "400 mg",
    "211EIgVhPEL.jpg": "3.7 V",
    "218tBdpDGPS.jpg": "104.5 inches",
    "21-V2Kx5BVL.jpg": "80 inches"
}

In [11]:
def process_example(example):
    image = Image.open(example['image_path']).convert("RGB")
    question = example['question']
    answer = example['answer']

    # Prepare input for the model
    task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
    prompt = task_prompt.replace("{user_input}", question)
    
    # Tokenize input question
    input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids.squeeze()

    # Process image
    pixel_values = processor(image, return_tensors="pt").pixel_values.squeeze()

    # Tokenize answer as label
    labels = processor.tokenizer(answer, add_special_tokens=False, return_tensors="pt").input_ids.squeeze()

    return {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
        "labels": labels
    }

In [12]:
class ImageDataset(Dataset):
    def __init__(self, image_dir, x_dict, y_dict, transform=None):
        self.image_dir = image_dir
        self.transform = transform
        self.images = list(x_dict.keys())
        self.question = list(x_dict.values())
        self.answer = list(y_dict.values())
        self.pre_finetune_text = '''Given the image, what is the''' # to stay consistent for finetuning
        assert type(self.answer) == list, "Answer should be a list of strings"
        

    def __len__(self):
        return len(self.question)

    def __getitem__(self, idx):
        img_name = os.path.join(self.image_dir, self.image_files[idx])
        image = Image.open(img_name)
        if self.transform:
            image = self.transform(image)
        return image

In [13]:
dataset = ImageDataset("images/test", x_dict, y_dict)

In [14]:

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=2,  # Adjust batch size based on available memory
    per_device_eval_batch_size=2,
    logging_steps=100,
    save_steps=500,
    eval_steps=500,
    save_total_limit=3,
    num_train_epochs=3,
    output_dir="./donut-finetuned-docvqa"
)

# Define trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # eval_dataset=val_dataset,
    tokenizer=processor.tokenizer,
    # data_collator=processor.data_collator
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [15]:
trainer.train()


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/arjun/.netrc


  0%|          | 0/21 [00:00<?, ?it/s]

AttributeError: 'ImageDataset' object has no attribute 'image_files'

In [None]:
model.save_pretrained("./donut-finetuned-docvqa")
processor.save_pretrained("./donut-finetuned-docvqa")

In [None]:
banana

In [5]:
import os
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from transformers import DonutProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Load pretrained processor and model
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa").to(device)

x_dict = {
    "219gbqQt+ML.jpg": "height",
    "218vf17tHkL.jpg": "weight",
    "21-VzxP3BDL.jpg": "item_volume",
    "217V+UhIrHL.jpg": "length",
    "11j0F4QOiFL.jpg": "height",
    "211sXYcOHcL.jpg": "height",
    "218zo3iJ2IL.jpg": "length",
    "213VIsNlvzL.jpg": "height",
    "21+quvMwZSL.jpg": "weight",
    "217+y-mckBL.jpg": "weight",
    "211EIgVhPEL.jpg": "voltage",
    "218tBdpDGPS.jpg": "length",
    "21-V2Kx5BVL.jpg": "length"
}

y_dict = {
    "219gbqQt+ML.jpg": "12 cm",
    "218vf17tHkL.jpg": "250 mg",
    "21-VzxP3BDL.jpg": "200 ml",
    "217V+UhIrHL.jpg": "5 cm",
    "11j0F4QOiFL.jpg": "2.75 inches",
    "211sXYcOHcL.jpg": "8 cm",
    "218zo3iJ2IL.jpg": "44.2 cm",
    "213VIsNlvzL.jpg": "11 cm",
    "21+quvMwZSL.jpg": "1.6 lbs",
    "217+y-mckBL.jpg": "400 mg",
    "211EIgVhPEL.jpg": "3.7 V",
    "218tBdpDGPS.jpg": "104.5 inches",
    "21-V2Kx5BVL.jpg": "80 inches"
}




In [6]:


# Define trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=processor.tokenizer,
    data_collator=collate_fn,
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [7]:
model.save_pretrained("./donut-finetuned-docvqa")
processor.save_pretrained("./donut-finetuned-docvqa")

KeyboardInterrupt: 