In [1]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert_score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.0.0->bert_score)
  

In [2]:
import torch
import os
import random
import pandas as pd
import numpy as np
import torch.nn as nn
from PIL import Image
from tqdm import tqdm
from transformers import BlipProcessor, BlipForQuestionAnswering
from torch.utils.data import Dataset, DataLoader
import bert_score
import transformers

2025-05-14 06:01:55.247286: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747202515.435676      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747202515.488831      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
def process_dataset(input_csv_path, image_base_dir=None,
                    output_csv='vqa_dataset_processed.csv', seed=None):
    """Flatten a per-image VQA CSV into one row per question/answer pair.

    Every input row must provide the columns ``path``, ``vqa_description``,
    ``product_type`` and the pairs ``q1``/``a1`` .. ``q4``/``a4``. A pair is
    kept only when both its question and its answer are present. When
    ``product_type`` is present, one extra pair is appended whose question is
    drawn at random from a fixed list of paraphrases. Answers are lower-cased.

    Args:
        input_csv_path: CSV file to read.
        image_base_dir: Optional directory joined in front of each ``path``.
        output_csv: Destination CSV. Defaults to the historical relative
            path, which resolves against the current working directory.
        seed: Optional seed for choosing the product-type question. ``None``
            keeps using the module-level ``random`` state.

    Returns:
        None. The flattened table (columns ``image_path``, ``description``,
        ``question``, ``answer``) is written to ``output_csv``.
    """
    output_columns = ['image_path', 'description', 'question', 'answer']

    # Read everything as text: an answer column made up only of numbers would
    # otherwise be parsed as int/float and break the .lower() calls below.
    df = pd.read_csv(input_csv_path, dtype=str)
    print(f"Processing dataset with {df.shape[0]} entries")

    # A private generator makes the paraphrase choice reproducible on request
    # without touching the global random state.
    chooser = random.Random(seed) if seed is not None else random

    product_type_questions = [
        "What is the product type?",
        "Can you identify the type of product?",
        "What kind of product is this?",
        "Identify the product category.",
        "What sort of item is this?",
        "Could you tell me the product type?",
        "What's the category of this product?"
    ]

    processed_data = []
    for _, row in df.iterrows():
        image_path = row['path']
        full_image_path = (os.path.join(image_base_dir, image_path)
                           if image_base_dir else image_path)
        description = row['vqa_description']

        qa_pairs = [(row[f'q{i}'], row[f'a{i}'].lower()) for i in range(1, 5)
                    if pd.notna(row[f'q{i}']) and pd.notna(row[f'a{i}'])]

        if pd.notna(row['product_type']):
            # NOTE(review): product types are only lower-cased, so values such
            # as "storage_box" keep their underscore in the reference answer.
            qa_pairs.append((chooser.choice(product_type_questions),
                             row['product_type'].lower()))

        processed_data.extend(
            {
                'image_path': full_image_path,
                'description': description,
                'question': question,
                'answer': answer,
            }
            for question, answer in qa_pairs
        )

    # Explicit columns keep the header even when no pair survived, so the
    # output file can always be read back.
    processed_df = pd.DataFrame(processed_data, columns=output_columns)
    processed_df.to_csv(output_csv, index=False)
    print(f"Saved {len(processed_data)} QA pairs to {output_csv}")

In [4]:
class VQADataset(Dataset):
    """Map-style dataset over the flattened question/answer CSV.

    Each item is ``(image, question, answer, image_path)`` where ``image`` is
    an RGB PIL image and ``answer`` is lower-cased and stripped.
    """

    def __init__(self, csv_path):
        """Load the CSV at ``csv_path``; it must contain ``image_path``,
        ``question`` and ``answer`` columns."""
        # Keep questions/answers as the literal text in the file. With the
        # pandas defaults, answers like "n/a" or "null" become NaN and purely
        # numeric answers become numbers, both of which break .lower().
        self.df = pd.read_csv(
            csv_path,
            dtype={'question': str, 'answer': str},
            keep_default_na=False,
        )

    def __len__(self):
        """Return the number of question/answer rows."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, question, answer, image_path)`` for row ``idx``.

        If the image cannot be opened or decoded, a blank white 224x224 RGB
        image is substituted so evaluation can continue.
        """
        row = self.df.iloc[idx]
        image_path = row['image_path']
        try:
            # convert() yields an independent in-memory image, so the file
            # handle can be released as soon as the block exits.
            with Image.open(image_path) as handle:
                image = handle.convert('RGB')
        except Exception:
            # Best-effort fallback, but unlike a bare ``except`` this lets
            # KeyboardInterrupt/SystemExit through.
            image = Image.new('RGB', (224, 224), color='white')
        return image, row['question'], row['answer'].lower().strip(), image_path

In [5]:
def collate(batch):
    """Transpose a batch of 4-field samples into four parallel lists.

    ``batch`` is a sequence of ``(image, question, answer, image_path)``
    tuples; the result is the tuple ``(images, questions, answers,
    image_paths)``, each a plain list in batch order.
    """
    image_col, question_col, answer_col, path_col = (
        list(field) for field in zip(*batch)
    )
    return image_col, question_col, answer_col, path_col

In [6]:
def create_dataloader(dataset_path, batch_size=16, num_workers=4):
    """Build the evaluation ``DataLoader`` for the CSV at ``dataset_path``.

    Rows are served in file order (no shuffling) and batched with ``collate``,
    so each batch is four parallel Python lists rather than stacked tensors.
    """
    loader_options = dict(
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
        collate_fn=collate,
    )
    return DataLoader(VQADataset(dataset_path), **loader_options)

In [7]:
def load_model_and_processor(model_id, device):
    """Load the BLIP processor and question-answering model for ``model_id``.

    The processor is requested with ``use_fast=True``; the model is moved to
    ``device`` before being returned. Returns ``(processor, model)``.
    """
    blip_processor = BlipProcessor.from_pretrained(model_id, use_fast=True)
    vqa_model = BlipForQuestionAnswering.from_pretrained(model_id).to(device)
    return blip_processor, vqa_model

In [8]:
def evaluate(model, processor, dataloader, device):
    """Generate an answer for every sample and collect it beside its reference.

    Each batch from ``dataloader`` must be ``(images, questions, answers,
    image_paths)``. The batch is encoded by ``processor``, moved to
    ``device``, and decoded from ``model.generate`` (at most 20 new tokens).
    Predictions are stripped and lower-cased; if the decoded text contains
    ``"Answer:"``, only what follows its last occurrence is kept.

    Returns:
        ``(img_paths, questions, predictions, references)`` as four parallel
        lists in dataloader order.
    """
    all_paths = []
    all_questions = []
    all_predictions = []
    all_references = []

    model.eval()
    with torch.inference_mode():
        for images, ques, answers, batch_img_paths in tqdm(dataloader):
            encoded = processor(images=images, text=ques, return_tensors="pt", padding=True)
            on_device = {}
            for key, tensor in encoded.items():
                on_device[key] = tensor.to(device)

            output_ids = model.generate(**on_device, max_new_tokens=20, min_length=1)
            decoded = processor.batch_decode(output_ids, skip_special_tokens=True)

            # rpartition returns the whole text as the tail when the marker
            # is absent, so one expression covers both cases.
            cleaned = [text.rpartition("Answer:")[2].strip().lower() for text in decoded]

            all_paths.extend(batch_img_paths)
            all_questions.extend(ques)
            all_predictions.extend(cleaned)
            all_references.extend(answers)

    return all_paths, all_questions, all_predictions, all_references

In [9]:
def print_parameter_count(model):
    """Print the summed element count of everything in ``model.parameters()``."""
    total_params = 0
    for param in model.parameters():
        total_params += param.numel()
    print(f"Parameters : {total_params:,}")

In [10]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [11]:
# Hugging Face checkpoint being evaluated.
model_id = "Salesforce/blip-vqa-base"

# Read-only Kaggle inputs: the QA metadata CSV and the image root that the
# CSV's relative paths are joined onto.
input_csv_path = "/kaggle/input/abo-metadata/vqa_qa_cleaned.csv"
image_base_dir = "/kaggle/input/abo-dataset/small"

# Where the flattened one-row-per-question CSV is read back from.
processed_dataset_path = "/kaggle/working/vqa_dataset_processed.csv"

In [12]:
# Flatten the metadata CSV into one row per question/answer pair on disk.
process_dataset(input_csv_path=input_csv_path, image_base_dir=image_base_dir)

Processing dataset with 28974 entries
Saved 129125 QA pairs to vqa_dataset_processed.csv


In [13]:
# Reload the flattened CSV for a quick visual check.
df = pd.read_csv(filepath_or_buffer=processed_dataset_path)

In [14]:
# Show the last rows to confirm the image_path/description/question/answer layout.
df.tail()

Unnamed: 0,image_path,description,question,answer
129120,/kaggle/input/abo-dataset/small/b3/b3f05fda.jpg,the image features a black storage box with wh...,Can you identify the type of product?,storage_box
129121,/kaggle/input/abo-dataset/small/4f/4f6ec573.jpg,the image shows a silver pressure cooker with ...,What is the color of the pressure cooker's han...,black
129122,/kaggle/input/abo-dataset/small/4f/4f6ec573.jpg,the image shows a silver pressure cooker with ...,What is the color of the background in the image?,white
129123,/kaggle/input/abo-dataset/small/4f/4f6ec573.jpg,the image shows a silver pressure cooker with ...,Is the lid open or closed in the image?,closed
129124,/kaggle/input/abo-dataset/small/4f/4f6ec573.jpg,the image shows a silver pressure cooker with ...,Could you tell me the product type?,pressure_cooker


In [15]:
# Default batch size and worker count; rows are served in file order.
dataloader = create_dataloader(dataset_path=processed_dataset_path)

In [16]:
# Fetch the checkpoint named by model_id and place the model on the chosen device.
processor, model = load_model_and_processor(model_id=model_id, device=device)

preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

In [17]:
# Report the size of the loaded model.
print_parameter_count(model=model)

Parameters : 384,672,572


In [18]:
# Full inference pass over the dataset; the recorded run took about an hour.
img_paths, questions, predictions, references = evaluate(
    model=model,
    processor=processor,
    dataloader=dataloader,
    device=device,
)

100%|██████████| 8071/8071 [1:02:41<00:00,  2.15it/s]


In [19]:
# Limit transformers logging to errors for the cells that follow.
transformers.logging.set_verbosity_error()

In [20]:
# Exact-match accuracy: the share of predictions that equal their reference
# string character for character.
exact_matches = [pred == ref for pred, ref in zip(predictions, references)]
accuracy = exact_matches.count(True) / len(exact_matches)

In [21]:
# F1 Score calculation
# Token-level F1 over the *sets* of whitespace-separated tokens in each
# prediction/reference pair (a repeated token counts once), averaged over
# all pairs.
pred_tokens = [set(p.split()) for p in predictions]
ref_tokens = [set(r.split()) for r in references]

f1s = []
for p, r in zip(pred_tokens, ref_tokens):
    overlap = len(p & r)
    precision = overlap / len(p) if p else 0.0
    recall = overlap / len(r) if r else 0.0
    # Exact harmonic mean. An epsilon in the denominator would pull every
    # non-zero score (including perfect matches) slightly below its true value;
    # the only zero-denominator case is "no overlap", handled explicitly.
    f1 = 2 * precision * recall / (precision + recall) if overlap else 0.0
    f1s.append(f1)

# Guard the average so an empty evaluation yields 0.0 instead of raising.
avg_f1 = sum(f1s) / len(f1s) if f1s else 0.0

In [22]:
# BERTScore calculation
# Scores predictions against references with the library's default English
# model; only the mean of the per-pair F1 values is kept.
# NOTE(review): scores are not baseline-rescaled here, so they may sit in a
# narrow high range — confirm before comparing with accuracy/F1.
bert_device = device.type
P, R, F1 = bert_score.score(
    cands=predictions,
    refs=references,
    lang="en",
    device=bert_device,
    batch_size=(16 if torch.cuda.is_available() else 8),
)
bert_score_val = F1.mean().item()

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]



In [23]:
# One metric per line, four decimal places each.
print(
    f"Accuracy: {accuracy:.4f}",
    f"F1 Score: {avg_f1:.4f}",
    f"BERTScore: {bert_score_val:.4f}",
    sep="\n",
)

Accuracy: 0.2183
F1 Score: 0.2252
BERTScore: 0.9527


In [24]:
# Collect the parallel evaluation lists into one table, one row per question.
result = pd.DataFrame(
    dict(
        image_path=img_paths,
        question=questions,
        prediction=predictions,
        reference=references,
    )
)

In [25]:
# Persist the per-question predictions without the row index.
result.to_csv(path_or_buf='result.csv', index=False)