In [1]:
import numpy as np
import pandas as pd

In [2]:
final_df = pd.read_csv('/kaggle/input/finale-dataset/train_dataset.csv')
test_df = pd.read_csv('/kaggle/input/finale-dataset/test_dataset.csv')

In [3]:
# final_df= final_df.head(38000)

In [4]:
final_df.head()

Unnamed: 0,image_id,question,answer,path
0,71BN3iMoGkL,What are the figurines sitting on?,Bench,9d/9dd2d3f1.jpg
1,71AH7yyOgCL,What brand is the mobile phone case?,Solimo,f3/f3ab1fbf.jpg
2,71ag2dwtVfL,What brand is this?,AmazonBasics,62/62075c07.jpg
3,61yqMXIGHVL,What lining do these envelopes have?,Bubble,2d/2d483c93.jpg
4,71d57C76BpL,What is the shade shape?,Drum,b8/b8b510a2.jpg


In [5]:
len(final_df)

63639

In [6]:
len(test_df)

15910

## LORA MODEL

In [7]:
!pip install -q peft accelerate


In [8]:
import os
import torch
import pandas as pd
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from transformers import BlipProcessor, BlipForQuestionAnswering, get_scheduler
from peft import get_peft_model, LoraConfig, TaskType, PeftModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")

model = BlipForQuestionAnswering.from_pretrained(
    "Salesforce/blip-vqa-base"
).to(device)

_orig_forward = model.forward

def _forward_no_inputs_embeds(*args, **kwargs):
    kwargs.pop("inputs_embeds", None)
    kwargs.pop("decoder_inputs_embeds", None)  # Use `None` as default
    return _orig_forward(*args, **kwargs)

model.forward = _forward_no_inputs_embeds

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# df = pd.read_csv("/kaggle/input/finale-dataset/train_dataset.csv")  # assumes columns: 'path', 'question', 'answer'

class BlipVQADataset(Dataset):
    def __init__(self, df, image_dir, processor):
        self.df = df
        self.image_dir = image_dir
        self.processor = processor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.image_dir, row['path'])
        image = Image.open(image_path).convert("RGB")
        question = str(row["question"])
        answer = str(row["answer"])

        inputs = self.processor(images=image, text=question, return_tensors="pt", padding='max_length', max_length=128, truncation=True)
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        inputs["labels"] = processor.tokenizer(answer, return_tensors="pt", padding='max_length', max_length=32, truncation=True).input_ids.squeeze(0)
        return inputs

def collate_fn(batch):
    pixel_values = torch.stack([item["pixel_values"] for item in batch])
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    labels = torch.nn.utils.rnn.pad_sequence(
        [item["labels"] for item in batch], batch_first=True, padding_value=-100
    )
    return {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

image_dir = "/kaggle/input/vr-dataset/small/small/"
dataset = BlipVQADataset(final_df, image_dir, processor)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn, num_workers=2)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
lr_scheduler = get_scheduler("cosine", optimizer=optimizer, num_warmup_steps=100, num_training_steps=len(dataloader)*3)

scaler = torch.cuda.amp.GradScaler()

model.train()
for epoch in range(2):
    total_loss = 0
    for step, batch in enumerate(dataloader):
        pixel_values = batch["pixel_values"].to(device, dtype=torch.float16)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            # Correct forward pass without 'inputs_embeds'
            outputs = model(
                pixel_values=pixel_values,  # Image features
                input_ids=input_ids,        # Text input (question)
                attention_mask=attention_mask,  # Attention mask
                labels=labels               # Labels for the answer
            )
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()

        total_loss += loss.item()
        if step % 10 == 0:
            print(f"Epoch {epoch+1}, Step {step}, Loss: {loss.item():.4f}")

    print(f"Epoch {epoch+1} complete. Avg Loss: {total_loss / len(dataloader):.4f}")

save_path = "/kaggle/working/blip_lora_adapter2"
model.save_pretrained(save_path)
print(f"LoRA adapter saved at: {save_path}")


2025-05-18 05:27:25.130534: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747546045.158007     100 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747546045.165568     100 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Device: cuda
trainable params: 2,359,296 || all params: 387,031,868 || trainable%: 0.6096


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1, Step 0, Loss: 10.4506
Epoch 1, Step 10, Loss: 10.5323
Epoch 1, Step 20, Loss: 10.5198
Epoch 1, Step 30, Loss: 10.3073
Epoch 1, Step 40, Loss: 10.3607
Epoch 1, Step 50, Loss: 10.1615
Epoch 1, Step 60, Loss: 9.9935
Epoch 1, Step 70, Loss: 9.8585
Epoch 1, Step 80, Loss: 9.6610
Epoch 1, Step 90, Loss: 9.4874
Epoch 1, Step 100, Loss: 9.5259
Epoch 1, Step 110, Loss: 9.3614
Epoch 1, Step 120, Loss: 9.2823
Epoch 1, Step 130, Loss: 9.1261
Epoch 1, Step 140, Loss: 9.0051
Epoch 1, Step 150, Loss: 9.0074
Epoch 1, Step 160, Loss: 8.9601
Epoch 1, Step 170, Loss: 8.8785
Epoch 1, Step 180, Loss: 8.8061
Epoch 1, Step 190, Loss: 8.7187
Epoch 1, Step 200, Loss: 8.5628
Epoch 1, Step 210, Loss: 8.7300
Epoch 1, Step 220, Loss: 8.6064
Epoch 1, Step 230, Loss: 8.6122
Epoch 1, Step 240, Loss: 8.4885
Epoch 1, Step 250, Loss: 8.4364
Epoch 1, Step 260, Loss: 8.5511
Epoch 1, Step 270, Loss: 8.4695
Epoch 1, Step 280, Loss: 8.4925
Epoch 1, Step 290, Loss: 8.4464
Epoch 1, Step 300, Loss: 8.5051
Epoch 1, Step

In [None]:
!zip -r /kaggle/working/blip_lora_adapter.zip /kaggle/working/blip_lora_adapter2


### MODEL DEPLOYED ON HUGGING FACE

# INFERENCE - LORA MODEL

In [9]:
import os
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
from peft import PeftModel
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")

model = BlipForQuestionAnswering.from_pretrained(
    "Salesforce/blip-vqa-base",
    device_map='auto'
)
model = PeftModel.from_pretrained(model, "bk45/blip-vqa-finetuned").to(device)
model.eval()

adapter_config.json:   0%|          | 0.00/721 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/18.9M [00:00<?, ?B/s]

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): BlipForQuestionAnswering(
      (vision_model): BlipVisionModel(
        (embeddings): BlipVisionEmbeddings(
          (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
        )
        (encoder): BlipEncoder(
          (layers): ModuleList(
            (0-11): 12 x BlipEncoderLayer(
              (self_attn): BlipAttention(
                (dropout): Dropout(p=0.0, inplace=False)
                (qkv): Linear(in_features=768, out_features=2304, bias=True)
                (projection): Linear(in_features=768, out_features=768, bias=True)
              )
              (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (mlp): BlipMLP(
                (activation_fn): GELUActivation()
                (fc1): Linear(in_features=768, out_features=3072, bias=True)
                (fc2): Linear(in_features=3072, out_features=768, bias=True)
              )
              (layer

In [10]:
class BlipVQATestDataset(Dataset):
    def __init__(self, df, image_dir, processor):
        self.df = df
        self.image_dir = image_dir
        self.processor = processor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.image_dir, row['path'])
        image = Image.open(image_path).convert("RGB")
        question = str(row["question"])

        inputs = self.processor(
            images=image,
            text=question,
            return_tensors="pt",
            padding='max_length',
            max_length=128,
            truncation=True
        )
        return {k: v.squeeze(0) for k, v in inputs.items()}

In [11]:
image_dir = "/kaggle/input/vr-dataset/small/small/"

# Prepare DataLoader
test_dataset = BlipVQATestDataset(test_df, image_dir, processor)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=2)

In [12]:
predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        pixel_values = batch["pixel_values"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model.generate(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=10,
            use_cache=True
        )

        answers = processor.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        predictions.extend(answers)

# Save or print predictions
test_df["predicted_answer"] = predictions
test_df.head(10)

Unnamed: 0,image_id,question,answer,path,predicted_answer
0,71OQ4BiD3uL,What is the product type?,Bucket,f0/f0d50e0a.jpg,hand
1,81CtH3N+0NL,What kind of closure does it have?,Lace-up,52/521668c5.jpg,lace
2,8176aHTRP3L,What is the primary color?,Multi,43/438d87ff.jpg,pink
3,71yC+5KLN-L,What material is the cover made of?,Hard,30/304914ac.jpg,hard
4,71pP-5SEtyL,What material is the case made of?,Silicone,40/40d177f8.jpg,canvas
5,71A8O6XIQGL,What is the shoe's style?,Ballet,10/104a3774.jpg,heels
6,61NxifiMbuL,Is warranty included?,No,42/4213f453.jpg,no
7,71sqlEJ6F1L,What is the case's material?,Silicon,3c/3ce01595.jpg,canvas
8,81UrYa7KFEL,What is the heat level?,Medium,b6/b613a781.jpg,medium
9,71gGW9pDoHL,What is the shoe's style?,Sneakers,e2/e2f5d987.jpg,sneakers


In [13]:
test_df.to_csv("vqa_predictions.csv", index=False)


## METRICS

In [16]:
from sklearn.metrics import accuracy_score
test_df["Answer_clean"] = test_df["answer"].astype(str).str.lower().str.strip()
test_df["Pred_clean"] = test_df["predicted_answer"].astype(str).str.lower().str.strip()

acc = accuracy_score(test_df["Answer_clean"], test_df["Pred_clean"])
print(f"Exact Match Accuracy: {acc:.4f}")


Exact Match Accuracy: 0.2864


In [15]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [18]:
from bert_score import score

In [19]:
filtered_df = test_df[(test_df["Answer_clean"] != "") & (test_df["Pred_clean"] != "")]

P, R, F1 = score(
    filtered_df["Pred_clean"].tolist(),
    filtered_df["Answer_clean"].tolist(),
    lang="en",
    verbose=True
)

print(f"BERTScore - Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/39 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/249 [00:00<?, ?it/s]

done in 5.29 seconds, 3006.91 sentences/sec
BERTScore - Precision: 0.9552, Recall: 0.9344, F1: 0.9438


In [20]:
! pip install nltk



In [21]:
import nltk
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

In [22]:
from nltk.corpus import wordnet as wn

def wups_score(preds, refs, threshold=0.9):
    def compute_wups(pred, ref):
        pred_synsets = wn.synsets(pred)
        ref_synsets = wn.synsets(ref)

        if not pred_synsets or not ref_synsets:
            return 0.0  # no synsets found → assume unrelated

        max_score = max(wn.wup_similarity(p, r) or 0.0 for p in pred_synsets for r in ref_synsets)
        return max_score

    scores = []
    for pred, ref in zip(preds, refs):
        pred = pred.lower().strip()
        ref = ref.lower().strip()

        score = compute_wups(pred, ref)
        score = score if score >= threshold else 0.0  # apply threshold
        scores.append(score)

    return sum(scores) / len(scores) if scores else 0.0


In [23]:
refs = test_df["Answer_clean"].tolist()
preds = test_df["Pred_clean"].tolist()

# Calculate WUPS@0.0 (lenient) and WUPS@0.9 (strict)
wups_00 = wups_score(preds, refs, threshold=0.0)
wups_09 = wups_score(preds, refs, threshold=0.9)

final_score = 0.5 * wups_00 + 0.5 * wups_09

# Print results
print(f"WUPS @0.0 (lenient): {wups_00:.4f}")
print(f"WUPS @0.9 (strict): {wups_09:.4f}")
print(f"Final Weighted WUPS Score: {final_score:.4f}")


WUPS @0.0 (lenient): 0.5673
WUPS @0.9 (strict): 0.3064
Final Weighted WUPS Score: 0.4369


In [24]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight & fast

preds = test_df["predicted_answer"].astype(str).tolist()
refs = test_df["answer"].astype(str).tolist()

pred_embeds = model.encode(preds, convert_to_tensor=True)
ref_embeds = model.encode(refs, convert_to_tensor=True)

cos_sim = util.cos_sim(pred_embeds, ref_embeds).diagonal()

sbert_score = cos_sim.mean().item()
print(f"SBERT Metric: {sbert_score:.4f}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/498 [00:00<?, ?it/s]

Batches:   0%|          | 0/498 [00:00<?, ?it/s]

SBERT Metric: 0.5630


In [25]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [26]:
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize

meteor_scores = [
    meteor_score([word_tokenize(ref)], word_tokenize(pred))
    for ref, pred in zip(test_df["answer"].astype(str), test_df["predicted_answer"].astype(str))
]

avg_meteor = sum(meteor_scores) / len(meteor_scores)
print(f"Average METEOR Score: {avg_meteor:.4f}")


Average METEOR Score: 0.1496
