In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [9]:
!wget https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-images-small.tar
!tar -xf abo-images-small.tar
!rm abo-images-small.tar

--2025-05-11 07:03:19--  https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-images-small.tar
Resolving amazon-berkeley-objects.s3.amazonaws.com (amazon-berkeley-objects.s3.amazonaws.com)... 16.182.108.241, 3.5.10.150, 3.5.2.206, ...
Connecting to amazon-berkeley-objects.s3.amazonaws.com (amazon-berkeley-objects.s3.amazonaws.com)|16.182.108.241|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3253381120 (3.0G) [application/x-tar]
Saving to: ‘abo-images-small.tar’


2025-05-11 07:04:26 (46.9 MB/s) - ‘abo-images-small.tar’ saved [3253381120/3253381120]



In [2]:
import os
import json
import random
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pickle
import shutil


In [3]:

with open("/kaggle/input/amazon-berkley-vqa/train.json", "r") as f:
    questions_data = json.load(f)

In [4]:
torch.manual_seed(42)
random.seed(42)

In [5]:
all_entries = []
for img_path, qas in list(questions_data.items()):
    for qa in qas:
        all_entries.append({
            "image_path": img_path,
            "question": qa["question"],
            "answer": qa["answer"]
        })


In [10]:
train_data, val_data = train_test_split(all_entries, test_size=0.2, random_state=42)

In [11]:
class CustomVQADataset(Dataset):
    def __init__(self, data, processor):
        self.data = data
        self.processor = processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        image = Image.open(item["image_path"]).convert("RGB")
        question = item["question"]
        answer = item["answer"].lower()

        # Process the image and question
        encoding = self.processor(image, question, padding="max_length", truncation=True, return_tensors="pt")
        labels = self.processor.tokenizer.encode(answer, max_length=8, padding="max_length", truncation=True, return_tensors='pt')

        encoding["labels"] = labels
        for k, v in encoding.items():
            encoding[k] = v.squeeze(0)

        return encoding

In [12]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
base_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

trainable params: 1,179,648 || all params: 385,852,220 || trainable%: 0.3057


In [13]:
train_dataset = CustomVQADataset(train_data, processor)
val_dataset = CustomVQADataset(val_data, processor)

train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=12, shuffle=False, pin_memory=True)

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.nn.DataParallel(model)
model.to(device)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
scaler = torch.cuda.amp.GradScaler()

  scaler = torch.cuda.amp.GradScaler()


In [23]:
num_epochs = 30
patience = 3
min_eval_loss = float("inf")
early_stopping = 0
tracking = []

for epoch in range(1, num_epochs):
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        input_ids = batch['input_ids'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(input_ids=input_ids,
                            pixel_values=pixel_values,
                            attention_mask=attention_mask,
                            labels=labels)
        loss = outputs.loss
        if loss.ndim > 0:
            loss = loss.mean() 
        train_loss += loss.item()

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            input_ids = batch['input_ids'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(input_ids=input_ids,
                                pixel_values=pixel_values,
                                attention_mask=attention_mask,
                                labels=labels)
            loss = outputs.loss
            if loss.ndim > 0:
                loss = loss.mean() 

            val_loss += loss.item()

    train_loss /= len(train_loader)
    val_loss /= len(val_loader)
    tracking.append((train_loss, val_loss))

    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    # Early stopping
    if val_loss < min_eval_loss:
        min_eval_loss = val_loss
        early_stopping = 0
        model.module.save_pretrained("blip-lora-vqa")
        shutil.make_archive("blip-lora-vqa", 'zip', "blip-lora-vqa")
        print("Saved best model.")
    else:
        early_stopping += 1
        if early_stopping >= patience:
            print("Early stopping triggered.")
            break

    scheduler.step()

with open("training_tracking.pkl", "wb") as f:
    pickle.dump(tracking, f)

Epoch 2 Training: 100%|██████████| 2035/2035 [33:20<00:00,  1.02it/s]
Validating: 100%|██████████| 509/509 [04:53<00:00,  1.74it/s]


Epoch 2 | Train Loss: 6.2358 | Val Loss: 6.2116
Saved best model.


Epoch 3 Training: 100%|██████████| 2035/2035 [33:21<00:00,  1.02it/s]
Validating: 100%|██████████| 509/509 [04:54<00:00,  1.73it/s]


Epoch 3 | Train Loss: 6.1939 | Val Loss: 6.1959
Saved best model.


Epoch 4 Training: 100%|██████████| 2035/2035 [33:17<00:00,  1.02it/s]
Validating: 100%|██████████| 509/509 [04:52<00:00,  1.74it/s]


Epoch 4 | Train Loss: 6.1755 | Val Loss: 6.1905
Saved best model.


Epoch 5 Training: 100%|██████████| 2035/2035 [33:20<00:00,  1.02it/s]
Validating: 100%|██████████| 509/509 [04:53<00:00,  1.73it/s]


Epoch 5 | Train Loss: 6.1629 | Val Loss: 6.1859
Saved best model.


Epoch 6 Training: 100%|██████████| 2035/2035 [33:16<00:00,  1.02it/s]
Validating: 100%|██████████| 509/509 [04:53<00:00,  1.73it/s]


Epoch 6 | Train Loss: 6.1531 | Val Loss: 6.1846
Saved best model.


Epoch 7 Training: 100%|██████████| 2035/2035 [33:19<00:00,  1.02it/s]
Validating: 100%|██████████| 509/509 [04:53<00:00,  1.74it/s]


Epoch 7 | Train Loss: 6.1447 | Val Loss: 6.1826
Saved best model.


Epoch 8 Training: 100%|██████████| 2035/2035 [33:17<00:00,  1.02it/s]
Validating: 100%|██████████| 509/509 [04:54<00:00,  1.73it/s]


Epoch 8 | Train Loss: 6.1375 | Val Loss: 6.1820
Saved best model.


Epoch 9 Training: 100%|██████████| 2035/2035 [33:13<00:00,  1.02it/s]
Validating: 100%|██████████| 509/509 [04:53<00:00,  1.74it/s]


Epoch 9 | Train Loss: 6.1312 | Val Loss: 6.1834


Epoch 10 Training: 100%|██████████| 2035/2035 [33:13<00:00,  1.02it/s]
Validating: 100%|██████████| 509/509 [04:52<00:00,  1.74it/s]


Epoch 10 | Train Loss: 6.1261 | Val Loss: 6.1821


Epoch 11 Training:   4%|▍         | 77/2035 [01:15<32:01,  1.02it/s]


KeyboardInterrupt: 

In [19]:
model.module.save_pretrained("blip-lora-vqa")
print("Saved best model.")

Saved best model.


In [24]:
from transformers import BlipForQuestionAnswering, BlipProcessor
from peft import PeftModel, PeftConfig
import torch
from tqdm import tqdm
import torch.nn.functional as F

# Load processor and base model
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
base_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

# Load LoRA config and wrap model
peft_model = PeftModel.from_pretrained(base_model, "blip-lora-vqa")
peft_model.eval()
peft_model.to(device)

# Exact match evaluation
total = 0
correct = 0

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Evaluating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        pixel_values = batch['pixel_values'].to(device)

        outputs = peft_model.generate(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            max_new_tokens=10
        )

        predictions = processor.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        gold_answers = processor.tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)

        for pred, gold in zip(predictions, gold_answers):
            total += 1
            if pred.strip().lower() == gold.strip().lower():
                correct += 1

accuracy = correct / total
print(f"Exact Match Accuracy: {accuracy:.4f}")


Evaluating: 100%|██████████| 509/509 [13:22<00:00,  1.58s/it]

Exact Match Accuracy: 0.2950





In [25]:
for i in range(len(predictions)):
    print(predictions[i], gold_answers[i])

yes yes
yes yes
blue blue
beige yellow
yes yes
ten amazon
beige green
fabric fabric
handmade chair


In [20]:
!zip -r blip-lora-vqa.zip blip-lora-vqa/


  adding: blip-lora-vqa/ (stored 0%)
  adding: blip-lora-vqa/README.md (deflated 66%)
  adding: blip-lora-vqa/adapter_config.json (deflated 54%)
  adding: blip-lora-vqa/adapter_model.safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 7%)


In [44]:
!ls /kaggle/input/blip-epoch-10/pytorch/test/1/blip-lora-vqa

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


adapter_config.json  adapter_model.safetensors	README.md
