# Lora finetuning for blip-vqa-base

In [1]:
!pip install kaggle



In [2]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 67 bytes


Downloading the dataset from Kaggle

In [3]:
!kaggle datasets download "nitinrajesh/berkley-for-vqa"

Dataset URL: https://www.kaggle.com/datasets/nitinrajesh/berkley-for-vqa
License(s): unknown
Downloading berkley-for-vqa.zip to /content
 64% 127M/198M [00:00<00:00, 1.33GB/s]
100% 198M/198M [00:00<00:00, 1.34GB/s]


In [4]:
!unzip berkley-for-vqa.zip -d berkley-for-vqa

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: berkley-for-vqa/berkley-for-vqa/images/d3/d3999b77.jpg  
  inflating: berkley-for-vqa/berkley-for-vqa/images/d3/d39a3d15.jpg  
  inflating: berkley-for-vqa/berkley-for-vqa/images/d3/d39c0811.jpg  
  inflating: berkley-for-vqa/berkley-for-vqa/images/d3/d39cb97f.jpg  
  inflating: berkley-for-vqa/berkley-for-vqa/images/d3/d3a1659f.jpg  
  inflating: berkley-for-vqa/berkley-for-vqa/images/d3/d3a1cc64.jpg  
  inflating: berkley-for-vqa/berkley-for-vqa/images/d3/d3a55252.jpg  
  inflating: berkley-for-vqa/berkley-for-vqa/images/d3/d3a57347.jpg  
  inflating: berkley-for-vqa/berkley-for-vqa/images/d3/d3a72098.jpg  
  inflating: berkley-for-vqa/berkley-for-vqa/images/d3/d3ab60d0.jpg  
  inflating: berkley-for-vqa/berkley-for-vqa/images/d3/d3ad60be.jpg  
  inflating: berkley-for-vqa/berkley-for-vqa/images/d3/d3b2fc19.jpg  
  inflating: berkley-for-vqa/berkley-for-vqa/images/d3/d3b4742b.jpg  
  inflating: berkley-for-

In [26]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import csv
import torch
from PIL import Image
from torch.optim import AdamW
import json
from tqdm import tqdm
from transformers import Trainer, TrainingArguments
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig, TaskType
from sklearn.model_selection import train_test_split

In [7]:
import transformers
from transformers import BlipProcessor, BlipForQuestionAnswering
print(transformers.__version__)

4.51.3


*Load blip-vqa-base model*

Custom blip class to absorb 'input_embeds'

In [8]:
class CustomBlipForVQA(BlipForQuestionAnswering):
    def forward(
        self,
        input_ids=None,
        pixel_values=None,
        attention_mask=None,
        labels=None,
        **kwargs  # This is important to absorb unexpected args like `inputs_embeds`
    ):
        # You can print or log here to debug
        return super().forward(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            labels=labels
        )

In [9]:
# Load model directly
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = CustomBlipForVQA.from_pretrained("Salesforce/blip-vqa-base")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def pipe(image_path, question):
  image = Image.open(image_path).resize((196,196))
  inputs = processor(image, question, return_tensors="pt")
  inputs.to(device)

  generated_ids = model.generate(pixel_values=inputs["pixel_values"], input_ids=inputs["input_ids"])

  # Decode the generated IDs to text
  predicted_answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

  return predicted_answer


_Prepare dataset_

In [None]:
!ls /content/

berkley-for-vqa  berkley-for-vqa.zip  drive  sample_data


In [11]:
datapath = '/content/berkley-for-vqa/berkley-for-vqa/'
csv_file_path = datapath + 'vqa_qa_cleaned.csv'

dataset = []

count = 0

with open(csv_file_path, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in tqdm(reader, desc="Dataset loaded "):
      count += 1
      # if count < 20000:
      #   continue
      if count > 14000:
        break
      image_path = datapath+"images/"+row["path"]
      image = Image.open(image_path).convert("RGB").resize((196,196))
      for elem in {row['q1']:row['a1'],
                row['q2']:row['a2'],
                row['q3']:row['a3'],
                'What type of product is in the image?':row['product_type'].lower().replace('_',' ')}.items():
                dataset.append({'img':image.copy(), 'question':elem[0], 'answer':elem[1].lower()})


print(len(dataset))
print(dataset[7])

Dataset loaded : 14000it [00:27, 512.32it/s]

55364
{'img': <PIL.Image.Image image mode=RGB size=196x196 at 0x7F9938926310>, 'question': 'What type of product is in the image?', 'answer': 'home bed and bath'}





In [12]:
# Define Lora config
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.QUESTION_ANS,
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,179,648 || all params: 385,852,220 || trainable%: 0.3057


*Setting up Lora*

In [13]:
for name, module in model.named_modules():
    print(name)


base_model
base_model.model
base_model.model.vision_model
base_model.model.vision_model.embeddings
base_model.model.vision_model.embeddings.patch_embedding
base_model.model.vision_model.encoder
base_model.model.vision_model.encoder.layers
base_model.model.vision_model.encoder.layers.0
base_model.model.vision_model.encoder.layers.0.self_attn
base_model.model.vision_model.encoder.layers.0.self_attn.dropout
base_model.model.vision_model.encoder.layers.0.self_attn.qkv
base_model.model.vision_model.encoder.layers.0.self_attn.projection
base_model.model.vision_model.encoder.layers.0.layer_norm1
base_model.model.vision_model.encoder.layers.0.mlp
base_model.model.vision_model.encoder.layers.0.mlp.activation_fn
base_model.model.vision_model.encoder.layers.0.mlp.fc1
base_model.model.vision_model.encoder.layers.0.mlp.fc2
base_model.model.vision_model.encoder.layers.0.layer_norm2
base_model.model.vision_model.encoder.layers.1
base_model.model.vision_model.encoder.layers.1.self_attn
base_model.mod

*Dataset handler class*

In [21]:
class VQADataset(Dataset):
    def __init__(self, data, processor):
        self.data = data
        self.processor = processor

    def __getitem__(self, idx):
        item = self.data[idx]

        image = item["img"]
        question = item["question"]
        answer = item["answer"]

        encoding = processor(
            image,
            question,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=32,
        )

        labels = processor.tokenizer(
            answer,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=10,
        ).input_ids

        # Flatten and return
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "pixel_values": encoding["pixel_values"].squeeze(0),
            "labels": labels.squeeze(0)
        }

        return inputs

    def __len__(self):
        return len(self.data)




# Create dataset and dataloader
train_data, eval_data = train_test_split(dataset, test_size=0.2, random_state=42)
train_dataset = VQADataset(train_data, processor)
eval_dataset = VQADataset(eval_data, processor)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=4, shuffle=True)

In [14]:
print(len(train_dataset))
print(train_dataset[5])

49827
{'input_ids': tensor([ 101, 2054, 2828, 1997, 3430, 2515, 1996, 2553, 3711, 2000, 2022, 2081,
        2013, 1029,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]), 'pixel_values': tensor([[[1.9011, 1.9011, 1.9011,  ..., 1.9157, 1.9157, 1.9157],
         [1.9011, 1.9011, 1.9011,  ..., 1.9157, 1.9157, 1.9157],
         [1.9011, 1.9011, 1.9011,  ..., 1.9157, 1.9157, 1.9157],
         ...,
         [1.9303, 1.9303, 1.9303,  ..., 1.9157, 1.9157, 1.9157],
         [1.9303, 1.9303, 1.9303,  ..., 1.9157, 1.9157, 1.9157],
         [1.9303, 1.9303, 1.9303,  ..., 1.9157, 1.9157, 1.9157]],

        [[2.0599, 2.0599, 2.0599,  ..., 2.0599, 2.0599, 2.0599],
         [2.0599, 2.0599, 2.0599,  ..., 2.0599, 2.0599, 2.0599],
         [2.0599, 2.0599, 2.0599,  ..., 2.0599, 2.0599, 2.0599],
         ...,

In [15]:
# Custom collate fn

def collate_fn(batch):
    pixel_values = torch.stack([item["pixel_values"] for item in batch])
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])

    labels = [item["labels"] for item in batch]  # list of 1D tensors
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=processor.tokenizer.pad_token_id)

    # Replace pad token IDs with -100 for loss masking
    labels_padded[labels_padded == processor.tokenizer.pad_token_id] = -100

    return {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels_padded,
    }


### Training Lora

_Custom trainer_ (to fix `num_items_in_batch` issue)

In [17]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        if 'num_items_in_batch' in kwargs:
            kwargs.pop('num_items_in_batch')  # Remove unsupported kwarg

        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.loss_fn(logits, labels)

        return (loss, outputs) if return_outputs else loss

    def loss_fn(self, logits, labels):
        import torch.nn.functional as F
        return F.cross_entropy(logits, labels)


*Custom trainer to fix 'input_embeds' issue*

In [18]:
class CustomBlipTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        if 'num_items_in_batch' in kwargs:
            kwargs.pop('num_items_in_batch')  # Remove unsupported kwarg

        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.loss_fn(logits, labels)
        # Remove unwanted keys
        inputs = {k: v for k, v in inputs.items() if k in ["input_ids", "attention_mask", "pixel_values", "labels"]}

        # Forward pass
        outputs = model(**inputs)
        loss = outputs.loss

        return (loss, outputs) if return_outputs else loss


In [22]:
drive_path = '/content/drive/My Drive/Datasets/'

training_args = TrainingArguments(
    output_dir=drive_path+"/blip-lora-vqa",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=1e-4,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    remove_unused_columns=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor,
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [21]:
print("PyTorch version:", torch.__version__)
print("Transformers version:", transformers.__version__)

PyTorch version: 2.6.0+cu124
Transformers version: 4.51.3


In [23]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,6.6981,No log
2,6.6202,No log
3,6.4988,No log
4,6.5978,No log
5,6.5723,No log


TrainOutput(global_step=13845, training_loss=6.6221280501266815, metrics={'train_runtime': 6835.8349, 'train_samples_per_second': 32.396, 'train_steps_per_second': 2.025, 'total_flos': 1.437923645268864e+16, 'train_loss': 6.6221280501266815, 'epoch': 5.0})

In [None]:
!cp -r /content/vilt-lora-vqa '/content/drive/My Drive/Datasets'

In [39]:
# Load PEFT config from checkpoint
peft_model_id = "/content/drive/My Drive/Datasets/blip-lora-vqa/checkpoint-13845"
config = PeftConfig.from_pretrained(peft_model_id)

# Load base model
base_model = BlipForQuestionAnswering.from_pretrained(config.base_model_name_or_path)

# Attach LoRA adapters
model = PeftModel.from_pretrained(base_model, peft_model_id)

# Load processor
processor = BlipProcessor.from_pretrained(config.base_model_name_or_path)

model.eval()


PeftModelForQuestionAnswering(
  (base_model): LoraModel(
    (model): BlipForQuestionAnswering(
      (vision_model): BlipVisionModel(
        (embeddings): BlipVisionEmbeddings(
          (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
        )
        (encoder): BlipEncoder(
          (layers): ModuleList(
            (0-11): 12 x BlipEncoderLayer(
              (self_attn): BlipAttention(
                (dropout): Dropout(p=0.0, inplace=False)
                (qkv): Linear(in_features=768, out_features=2304, bias=True)
                (projection): Linear(in_features=768, out_features=768, bias=True)
              )
              (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (mlp): BlipMLP(
                (activation_fn): GELUActivation()
                (fc1): Linear(in_features=768, out_features=3072, bias=True)
                (fc2): Linear(in_features=3072, out_features=768, bias=True)
              )
            

In [40]:
import torch
import random

def pipe(image, question):

  # Preprocess inputs
  inputs = processor(images=image, text=question, return_tensors="pt")

  with torch.no_grad():
    output = model.generate(**inputs)

  # Decode the generated token IDs
  answer = processor.tokenizer.decode(output[0], skip_special_tokens=True)

  return answer

In [52]:
idx = random.randint(0,len(dataset))
print(dataset[idx])
pred = pipe(dataset[idx]['img'], dataset[idx]['question'])
ans = dataset[idx]['answer']
print("Predicted answer:", pred, "\nActual answer: ", ans)




{'img': <PIL.Image.Image image mode=RGB size=196x196 at 0x7F99353B4C90>, 'question': 'What is the primary color of the package?', 'answer': 'green'}
Predicted answer: green 
Actual answer:  green


In [None]:
# Assuming `model` is your PEFT-wrapped model
drivepath = '/content/drive/My Drive/Datasets/'
model.save_pretrained(drivepath+"vilt-lora-saved-model")
processor.save_pretrained(drivepath+"vilt-lora-saved-model")


[]

In [None]:
datapath

'/content/berkley-for-vqa/berkley-for-vqa/'