In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

In [None]:
!pip install datasets


In [17]:
from datasets import load_dataset

dataset = load_dataset("flaviagiammarino/vqa-rad")


In [18]:
import torch
import io
import pickle
import pandas as pd
from transformers import BlipProcessor, BlipForQuestionAnswering
from torch.utils.data import DataLoader
from tqdm import tqdm
from PIL import Image
from transformers import AutoModelForCausalLM, BitsAndBytesConfig


In [None]:
!pip install peft

In [None]:
!pip install -U bitsandbytes


In [19]:
import torch
from transformers import BitsAndBytesConfig
from transformers import AutoProcessor, AutoModelForVision2Seq,AutoModel
from peft import LoraConfig
from transformers import AutoModelForCausalLM


lora_config = LoraConfig(
r=8,
lora_alpha=8,
lora_dropout=0.1,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Adjust based on inspected names
bias="none",
init_lora_weights="gaussian"
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)


from transformers import AutoProcessor, AutoModelForPreTraining

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
model = AutoModelForPreTraining.from_pretrained("llava-hf/llava-1.5-7b-hf",
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
)

model.add_adapter(lora_config)
model.enable_adapters()


# Move model to the correct device


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [20]:
from datasets import load_dataset

dataset = load_dataset("flaviagiammarino/vqa-rad")

# Split the dataset: 80% for training, 20% for validation
train_test_split = dataset['train'].train_test_split(test_size=0.2, seed=42)

# Rename the splits
train_dataset = train_test_split['train']
valid_dataset = train_test_split['test']

datasets = {
    'train': train_dataset,
    'valid': valid_dataset
}

# Print sizes to confirm
print(f"Train size: {len(train_dataset)}")
print(f"Validation size: {len(valid_dataset)}")


Train size: 1434
Validation size: 359


In [None]:
dataset['train']

Dataset({
    features: ['image', 'question', 'answer'],
    num_rows: 1793
})

In [None]:
datasets['train'] = datasets['train'].filter(lambda example: example['question'] is not None and example['answer'] is not None and example['image'] is not None)
dataset['test'] = dataset['test'].filter(lambda example: example['question'] is not None and example['answer'] is not None and example['image'] is not None)
datasets['valid'] = datasets['valid'].filter(lambda example: example['question'] is not None and example['answer'] is not None and example['image'] is not None)


Filter:   0%|          | 0/1434 [00:00<?, ? examples/s]

Filter:   0%|          | 0/451 [00:00<?, ? examples/s]

Filter:   0%|          | 0/359 [00:00<?, ? examples/s]

In [21]:
from datasets import load_dataset
from torch.utils.data import Dataset
from PIL import Image
import random

# Define the VQADataset class
class VQADataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        question = self.dataset[idx]['question']
        answer = self.dataset[idx]['answer']
        image = self.dataset[idx]['image']  # Assuming it's a PIL image

        return {
            "image": image,
            "query": {"en": question},
            "answers": [answer]
        }


# Define the MyDataCollator class
class MyDataCollator:
    def __init__(self, processor):
        self.processor = processor
        self.image_token = '<image>'
        self.end_of_utterance_token = '<end_of_utterance>'
        self.pad_token_id = getattr(processor, 'pad_token_id', 0)  # Default to 0 if not found

    def __call__(self, examples):
        texts = []
        for example in examples:
            question = example["query"]['en']
            answer = random.choice(example["answers"])

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Answer briefly."},
                        {"type": "text", "text": self.image_token},
                        {"type": "text", "text": question}
                    ]
                },
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": answer},
                        {"type": "text", "text": self.end_of_utterance_token}
                    ]
                }
            ]

            text = self.processor.apply_chat_template(messages, add_generation_prompt=False)

            if isinstance(text, list):
                text = " ".join(str(item) for item in text)
            else:
                text = str(text)

            texts.append(text.strip())

        text_batch = self.processor(text=texts, return_tensors="pt", padding=True)

        # Prepare the batch dictionary
        batch = {
            "input_ids": text_batch["input_ids"],
            "attention_mask": text_batch["attention_mask"]
        }

        # Create labels
        labels = batch["input_ids"].clone()
        labels[labels == self.pad_token_id] = -100  # Ignore padding in loss calculation
        batch["labels"] = labels

        return batch




dataset = load_dataset("flaviagiammarino/vqa-rad")

train_dataset = VQADataset(dataset=datasets['train'], processor=processor)
test_dataset = VQADataset(dataset=dataset['test'], processor=processor)
valid_dataset = VQADataset(dataset=datasets['valid'], processor=processor)

# Data collator for batching
data_collator = MyDataCollator(processor=processor)


In [30]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = "IDEFICS_DocVQA2_8",
    learning_rate = 4e-5,
    fp16 = True,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    gradient_accumulation_steps = 12,
    dataloader_pin_memory = False,
    save_total_limit = 3,
    evaluation_strategy ="steps",
    save_strategy = "steps",
    eval_steps = 100,
    save_steps = 25,
    max_steps = 500,
    logging_steps = 5,
    remove_unused_columns = False,
    push_to_hub=False,
    label_names = ["labels"],
    load_best_model_at_end = False,
    report_to = "none",
    optim = "paged_adamw_8bit",
)



In [31]:
from transformers import  Trainer

trainer = Trainer(
    model = model,
    args = training_args,
    data_collator=data_collator,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [32]:
trainer.train()


Step,Training Loss,Validation Loss
100,1.154,1.216516
200,0.7802,0.885901
300,0.6953,0.873809
400,0.6556,0.886627
500,0.6177,0.894474




TrainOutput(global_step=500, training_loss=1.0637857599258422, metrics={'train_runtime': 4325.1172, 'train_samples_per_second': 5.549, 'train_steps_per_second': 0.116, 'total_flos': 4.048365391331328e+16, 'train_loss': 1.0637857599258422, 'epoch': 16.71309192200557})

In [33]:
test_results = trainer.evaluate(test_dataset)

# Print the evaluation results
print(test_results)

{'eval_loss': 0.8413650393486023, 'eval_runtime': 41.4427, 'eval_samples_per_second': 10.882, 'eval_steps_per_second': 2.727, 'epoch': 16.71309192200557}
