In [None]:
import numpy as np
import pandas as pd

In [28]:
final_df = pd.read_csv('/kaggle/input/finale-dataset/train_dataset.csv')
test_df = pd.read_csv('/kaggle/input/finale-dataset/test_dataset.csv')

In [29]:
# final_df= final_df.head(38000)

In [30]:
final_df.head()

Unnamed: 0,image_id,question,answer,path
0,71BN3iMoGkL,What are the figurines sitting on?,Bench,9d/9dd2d3f1.jpg
1,71AH7yyOgCL,What brand is the mobile phone case?,Solimo,f3/f3ab1fbf.jpg
2,71ag2dwtVfL,What brand is this?,AmazonBasics,62/62075c07.jpg
3,61yqMXIGHVL,What lining do these envelopes have?,Bubble,2d/2d483c93.jpg
4,71d57C76BpL,What is the shade shape?,Drum,b8/b8b510a2.jpg


In [31]:
len(final_df)

63639

In [32]:
len(test_df)

15910

## BASE MODEL

In [33]:
!pip install -q peft accelerate


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [34]:
!pip install transformers torch torchvision pillow


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [35]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import pandas as pd
from sklearn.metrics import accuracy_score
from bert_score import score

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model & processor
model_name = "Salesforce/blip-vqa-base"
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForQuestionAnswering.from_pretrained(model_name).to(device)
model.eval()

BlipForQuestionAnswering(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-05, e

In [36]:
class VQADataset(Dataset):
    def __init__(self, df, image_dir, processor):
        self.df = df.reset_index(drop=True)
        self.image_dir = image_dir
        self.processor = processor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = os.path.join(self.image_dir, row["path"])
        image = Image.open(img_path).convert("RGB")
        question = row["question"]

        inputs = self.processor(
            image,
            question,
            return_tensors="pt",
            padding="max_length",
            max_length=128,
            truncation=True
        )
        return {
            "pixel_values": inputs["pixel_values"].squeeze(0),
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "answer": row["answer"]
        }

def collate_fn(batch):
    pixel_values = torch.stack([item["pixel_values"] for item in batch])
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    answers = [item["answer"] for item in batch]
    return {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "answer": answers
    }



image_dir = "/kaggle/input/vr-dataset/small/small" 

test_dataset = VQADataset(test_df, image_dir, processor)
test_loader = DataLoader(test_dataset, batch_size=8, collate_fn=collate_fn)


In [37]:
predictions = []
true_answers = []

with torch.no_grad():
    for batch in test_loader:
        pixel_values = batch["pixel_values"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model.generate(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=10,
            use_cache=True
        )

        preds = processor.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        predictions.extend(preds)
        true_answers.extend(batch["answer"])

In [38]:
test_df["predicted_answer"] = predictions
test_df["Answer_clean"] = test_df["answer"].astype(str).str.lower().str.strip()
test_df["Pred_clean"] = test_df["predicted_answer"].astype(str).str.lower().str.strip()



In [39]:
test_df.head()

Unnamed: 0,image_id,question,answer,path,predicted_answer,Answer_clean,Pred_clean
0,71OQ4BiD3uL,What is the product type?,Bucket,f0/f0d50e0a.jpg,no,bucket,no
1,81CtH3N+0NL,What kind of closure does it have?,Lace-up,52/521668c5.jpg,sliding,lace-up,sliding
2,8176aHTRP3L,What is the primary color?,Multi,43/438d87ff.jpg,blue,multi,blue
3,71yC+5KLN-L,What material is the cover made of?,Hard,30/304914ac.jpg,metal,hard,metal
4,71pP-5SEtyL,What material is the case made of?,Silicone,40/40d177f8.jpg,metal,silicone,metal


## METRICS

In [40]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(test_df["Answer_clean"], test_df["Pred_clean"])
print(f"Exact Match Accuracy: {acc:.4f}")


Exact Match Accuracy: 0.1725


In [15]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [41]:
from bert_score import score

In [42]:
filtered_df = test_df[(test_df["Answer_clean"] != "") & (test_df["Pred_clean"] != "")]

P, R, F1 = score(
    filtered_df["Pred_clean"].tolist(),
    filtered_df["Answer_clean"].tolist(),
    lang="en",
    verbose=True
)

print(f"BERTScore - Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/40 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/249 [00:00<?, ?it/s]

done in 4.92 seconds, 3236.22 sentences/sec
BERTScore - Precision: 0.9517, Recall: 0.9251, F1: 0.9371


In [43]:
! pip install nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [44]:
import nltk
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [45]:
from nltk.corpus import wordnet as wn

def wups_score(preds, refs, threshold=0.9):
    def compute_wups(pred, ref):
        pred_synsets = wn.synsets(pred)
        ref_synsets = wn.synsets(ref)

        if not pred_synsets or not ref_synsets:
            return 0.0  # no synsets found → assume unrelated

        max_score = max(wn.wup_similarity(p, r) or 0.0 for p in pred_synsets for r in ref_synsets)
        return max_score

    scores = []
    for pred, ref in zip(preds, refs):
        pred = pred.lower().strip()
        ref = ref.lower().strip()

        score = compute_wups(pred, ref)
        score = score if score >= threshold else 0.0  # apply threshold
        scores.append(score)

    return sum(scores) / len(scores) if scores else 0.0


In [46]:
refs = test_df["Answer_clean"].tolist()
preds = test_df["Pred_clean"].tolist()

# Calculate WUPS@0.0 (lenient) and WUPS@0.9 (strict)
wups_00 = wups_score(preds, refs, threshold=0.0)
wups_09 = wups_score(preds, refs, threshold=0.9)

final_score = 0.5 * wups_00 + 0.5 * wups_09

# Print results
print(f"WUPS @0.0 (lenient): {wups_00:.4f}")
print(f"WUPS @0.9 (strict): {wups_09:.4f}")
print(f"Final Weighted WUPS Score: {final_score:.4f}")


WUPS @0.0 (lenient): 0.4538
WUPS @0.9 (strict): 0.2116
Final Weighted WUPS Score: 0.3327


In [47]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight & fast

preds = test_df["predicted_answer"].astype(str).tolist()
refs = test_df["answer"].astype(str).tolist()

pred_embeds = model.encode(preds, convert_to_tensor=True)
ref_embeds = model.encode(refs, convert_to_tensor=True)

cos_sim = util.cos_sim(pred_embeds, ref_embeds).diagonal()

sbert_score = cos_sim.mean().item()
print(f"SBERT Metric: {sbert_score:.4f}")


Batches:   0%|          | 0/498 [00:00<?, ?it/s]

Batches:   0%|          | 0/498 [00:00<?, ?it/s]

SBERT Metric: 0.4584


In [48]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [49]:
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize

meteor_scores = [
    meteor_score([word_tokenize(ref)], word_tokenize(pred))
    for ref, pred in zip(test_df["answer"].astype(str), test_df["predicted_answer"].astype(str))
]

avg_meteor = sum(meteor_scores) / len(meteor_scores)
print(f"Average METEOR Score: {avg_meteor:.4f}")


Average METEOR Score: 0.0994
