In [None]:
from google.colab import drive

# Attach the user's Google Drive to the Colab filesystem at /content/drive.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

In [None]:
# Install notebook dependencies with the %pip magic so they land in the
# environment of the running kernel (a `!pip` subshell may target another one).
# NOTE(review): versions are unpinned; pin them once a known-good set is confirmed.
%pip install -q datasets
%pip install -q peft
%pip install -q -U bitsandbytes


In [None]:
from datasets import load_dataset

# Fetch VQA-RAD from the Hugging Face Hub; its 'train' and 'test' splits are used below.
dataset = load_dataset("flaviagiammarino/vqa-rad")

# Hold out 20% of the training split for validation. The fixed seed keeps the
# partition identical from run to run.
train_test_split = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_dataset, valid_dataset = train_test_split['train'], train_test_split['test']

# Lookup table of the raw train/validation partitions, keyed by split name.
datasets = dict(train=train_dataset, valid=valid_dataset)

# Print sizes to confirm
print(
    f"Train size: {len(train_dataset)}",
    f"Validation size: {len(valid_dataset)}",
    f"Test size: {len(dataset['test'])}",
    sep="\n",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.91k [00:00<?, ?B/s]

(…)-00000-of-00001-eb8844602202be60.parquet:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

(…)-00000-of-00001-e5bc3d208bb4deeb.parquet:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1793 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/451 [00:00<?, ? examples/s]

Train size: 1434
Validation size: 359
Test size: 451


In [None]:
import torch
import io
import pickle
import pandas as pd
from transformers import BlipProcessor, BlipForQuestionAnswering
from torch.utils.data import DataLoader
from tqdm import tqdm
from PIL import Image
from transformers import AutoModelForCausalLM, BitsAndBytesConfig


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import random

from transformers import BlipForQuestionAnswering

DEVICE = "cuda:0"

# Load the BLIP (not BLIP-2) processor and model. The checkpoint is a visual
# question answering checkpoint, so it is loaded with the question-answering
# head: the question is encoded together with the image and the decoder is
# trained to produce the answer.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large").to(DEVICE)


class VQADataset(Dataset):
    """Torch ``Dataset`` view over a VQA split with image/question/answer rows.

    Args:
        dataset: Indexable split whose rows expose the keys ``'image'``,
            ``'question'`` and ``'answer'``.
        processor: Stored on the instance; not used by this class itself.
    """

    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        """Return the number of rows in the wrapped split."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return one example as ``{"image", "query": {"en": ...}, "answers": [...]}``."""
        # Fetch the row once instead of once per field, so the image is not
        # materialised three times for every example.
        row = self.dataset[idx]

        return {
            "image": row['image'],
            "query": {"en": row['question']},
            "answers": [row['answer']],
        }


class MyDataCollator:
    """Turn a list of ``VQADataset`` examples into a BLIP question-answering batch.

    Args:
        processor: Processor used for both image preprocessing and tokenization.
        max_length: Fixed token length the answers are padded/truncated to.
        question_max_length: Fixed token length the questions are
            padded/truncated to. Kept separate from ``max_length`` so that
            questions are not cut down to the (short) answer length.
    """

    def __init__(self, processor, max_length=10, question_max_length=32):
        self.processor = processor
        self.max_length = max_length  # Answer (decoder side) length
        self.question_max_length = question_max_length  # Question (encoder side) length

    def __call__(self, examples):
        """Collate ``examples`` into tensors.

        Returns:
            dict with ``pixel_values``, ``input_ids`` / ``attention_mask``
            (question), ``decoder_input_ids`` / ``decoder_attention_mask``
            (answer) and ``labels`` (answer ids with padding set to -100).

        Raises:
            ValueError: If any question or answer is not a string.
        """
        images = [example["image"] for example in examples]
        questions = [example["query"]["en"] for example in examples]
        answers = [example["answers"][0] for example in examples]

        # Fail fast on malformed rows before doing any preprocessing work.
        if not all(isinstance(q, str) for q in questions):
            raise ValueError("All questions must be strings.")
        if not all(isinstance(a, str) for a in answers):
            raise ValueError("All answers must be strings.")

        image_inputs = self.processor(images=images, return_tensors="pt", padding=True)
        pixel_values = image_inputs['pixel_values']

        # Questions: fixed-size padding/truncation on the encoder side.
        question_inputs = self.processor(
            text=questions,
            return_tensors="pt",
            padding="max_length",
            max_length=self.question_max_length,
            truncation=True,
        )

        # Answers: fixed-size padding/truncation on the decoder side.
        answer_inputs = self.processor(
            text=answers,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
        )
        decoder_input_ids = answer_inputs['input_ids']
        decoder_attention_mask = answer_inputs['attention_mask']

        # Labels are a masked copy: padding becomes -100 so the loss ignores it,
        # while the decoder inputs keep real token ids (-100 is not a valid
        # embedding index, so labels must not double as decoder inputs).
        # NOTE(review): decoder inputs start with whatever first token the
        # tokenizer emits; confirm this matches the start token used by
        # `model.generate` before relying on generated answers.
        labels = decoder_input_ids.masked_fill(
            decoder_input_ids == self.processor.tokenizer.pad_token_id, -100
        )

        return {
            "pixel_values": pixel_values,
            "input_ids": question_inputs['input_ids'],
            "attention_mask": question_inputs['attention_mask'],
            "decoder_input_ids": decoder_input_ids,
            "decoder_attention_mask": decoder_attention_mask,
            "labels": labels,
        }




# Create dataset objects for each split.
# `dataset` (raw Hub splits) and `datasets` (train/valid partition) are both
# defined by the data-loading cell, so all three wrappers are built from that
# single load instead of fetching the dataset a second time.
train_dataset = VQADataset(dataset=datasets['train'], processor=processor)
test_dataset = VQADataset(dataset=dataset['test'], processor=processor)
valid_dataset = VQADataset(dataset=datasets['valid'], processor=processor)




preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/524 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/4.59k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

In [None]:
# Training setup
import inspect

# Training setup
# The evaluation-schedule keyword is `eval_strategy` in newer transformers
# releases and `evaluation_strategy` in older ones. Dependencies are installed
# unpinned, so pick whichever name the installed TrainingArguments accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="Salesforce",
    # NOTE(review): 2e-4 is applied to every model parameter here (no adapter
    # is attached in this notebook) — confirm this rate is intended.
    learning_rate=2e-4,
    fp16=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,  # effective train batch of 16 per device
    dataloader_pin_memory=False,
    save_total_limit=3,
    save_strategy="steps",
    eval_steps=100,
    save_steps=25,
    # max_steps=100,
    num_train_epochs=10,
    logging_steps=5,
    # The collator consumes custom example dicts, so Trainer must not prune columns.
    remove_unused_columns=False,
    # No compute_metrics is supplied, so only the loss is needed at eval time;
    # skip accumulating model outputs across the whole evaluation set.
    prediction_loss_only=True,
    push_to_hub=False,
    report_to="none",
    optim="paged_adamw_8bit",
    **{_eval_strategy_key: "steps"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=MyDataCollator(processor),
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)

# Train the model
trainer.train()


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
100,3.1178,3.823364
200,2.9325,3.745755
300,3.2429,3.691884
400,3.0708,3.71363
500,2.7338,3.6409
600,2.6875,3.675929
700,2.5372,3.673109
800,2.6156,3.642132


TrainOutput(global_step=890, training_loss=2.84189529847563, metrics={'train_runtime': 2221.4266, 'train_samples_per_second': 6.455, 'train_steps_per_second': 0.401, 'total_flos': 8.450352807684342e+18, 'train_loss': 2.84189529847563, 'epoch': 9.9302649930265})

In [None]:
# Evaluate on the held-out test split. The metrics get their own name so that
# `test_dataset` stays a dataset and this cell can be re-run.
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
test_metrics

{'eval_loss': 3.2363553047180176,
 'eval_runtime': 22.9748,
 'eval_samples_per_second': 19.63,
 'eval_steps_per_second': 9.837,
 'epoch': 9.9302649930265}