In [1]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert_score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.0.0->bert_score)
  

In [2]:
import os
import warnings

import bert_score
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import ViltProcessor, ViltForQuestionAnswering

2025-05-18 16:58:08.387217: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747587488.575607      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747587488.629986      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
def process_dataset(input_csv_path, image_base_dir=None,
                    output_csv='vqa_dataset_processed.csv', num_qa_pairs=4):
    """Flatten a wide QA table into one row per (image, question, answer).

    The input CSV must have a ``path`` column plus ``q1..qN`` / ``a1..aN``
    columns, where N is ``num_qa_pairs``. Each non-missing question/answer
    pair becomes one output row with a lower-cased answer.

    Args:
        input_csv_path: CSV file to read.
        image_base_dir: Optional directory joined in front of every ``path``.
        output_csv: Where the flattened table is written (index omitted).
        num_qa_pairs: Number of ``q<i>``/``a<i>`` column pairs to scan.

    Returns:
        None. The result is written to ``output_csv`` and a summary is printed.
    """
    # Read every column as text: with type inference an all-numeric answer
    # column (counts, for example) would be parsed as numbers and .lower()
    # would fail. Missing cells are still read as NaN.
    df = pd.read_csv(input_csv_path, dtype=str)
    print(f"Processing dataset with {df.shape[0]} entries")

    processed_data = []

    for _, row in df.iterrows():
        image_path = row['path']
        if image_base_dir:
            full_image_path = os.path.join(image_base_dir, image_path)
        else:
            full_image_path = image_path

        for i in range(1, num_qa_pairs + 1):
            question, answer = row[f'q{i}'], row[f'a{i}']
            # Skip slots where either half of the pair is missing.
            if pd.notna(question) and pd.notna(answer):
                processed_data.append({
                    'image_path': full_image_path,
                    'question': question,
                    'answer': answer.lower()
                })

    # Fix the column set so an empty result still produces a CSV with a header
    # that can be read back.
    processed_df = pd.DataFrame(processed_data,
                                columns=['image_path', 'question', 'answer'])
    processed_df.to_csv(output_csv, index=False)
    print(f"Saved {len(processed_data)} QA pairs to {output_csv}")

In [4]:
# Wide-format QA CSV that is handed to process_dataset.
input_csv_path = "/kaggle/input/abo-metadata/vqa_qa_cleaned.csv"

In [5]:
# Flatten the QA table, prefixing every image path with the image root directory.
process_dataset(input_csv_path, image_base_dir="/kaggle/input/abo-dataset/small")

Processing dataset with 28974 entries
Saved 100151 QA pairs to vqa_dataset_processed.csv


In [6]:
# Reload the flattened QA table for a quick visual check.
# NOTE(review): process_dataset writes this file under a relative name, so this absolute
# path assumes the working directory is /kaggle/working — confirm when running elsewhere.
df = pd.read_csv("/kaggle/working/vqa_dataset_processed.csv")

In [7]:
# Show the last rows of the flattened table as the cell output.
df.tail()

Unnamed: 0,image_path,question,answer
100146,/kaggle/input/abo-dataset/small/b3/b3f05fda.jpg,What is the background color of the image?,white
100147,/kaggle/input/abo-dataset/small/b3/b3f05fda.jpg,Are there any other objects in the image besid...,no
100148,/kaggle/input/abo-dataset/small/4f/4f6ec573.jpg,What is the color of the pressure cooker's han...,black
100149,/kaggle/input/abo-dataset/small/4f/4f6ec573.jpg,What is the color of the background in the image?,white
100150,/kaggle/input/abo-dataset/small/4f/4f6ec573.jpg,Is the lid open or closed in the image?,closed


In [8]:
class VQADataset(Dataset):
    """Dataset over a CSV with ``image_path``, ``question`` and ``answer`` columns.

    Each item is ``(image, question, answer)``: an RGB PIL image, the question
    as stored, and the answer lower-cased and stripped of surrounding whitespace.
    """

    def __init__(self, csv_path):
        """Load the whole CSV at ``csv_path`` into memory."""
        self.df = pd.read_csv(csv_path)

    def __len__(self):
        """Return the number of rows (QA pairs) in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, question, answer)`` for the row at position ``idx``.

        If the image cannot be loaded, a warning is emitted and a blank white
        224x224 RGB image is returned in its place so iteration can continue.
        """
        row = self.df.iloc[idx]
        # Look the path up outside the try block so a missing column surfaces
        # as a KeyError instead of being masked by the image fallback.
        image_path = row['image_path']
        try:
            # The context manager closes the file handle as soon as the pixel
            # data has been converted into a new image.
            with Image.open(image_path) as img:
                image = img.convert('RGB')
        except Exception as exc:
            # Still best-effort for any load failure, but no longer swallows
            # KeyboardInterrupt/SystemExit and no longer hides the failure.
            warnings.warn(
                f"Could not load image {image_path!r} ({exc}); using a blank placeholder"
            )
            image = Image.new('RGB', (224, 224), color='white')
        return image, row['question'], row['answer'].lower().strip()

In [9]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [10]:
# Checkpoint identifier passed to from_pretrained for both the processor and the model.
model_id = "dandelin/vilt-b32-finetuned-vqa"

In [11]:
# Load the input processor and the question-answering model from the same checkpoint.
processor = ViltProcessor.from_pretrained(model_id)
model = ViltForQuestionAnswering.from_pretrained(model_id)

preprocessor_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/136k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/470M [00:00<?, ?B/s]

In [12]:
def print_parameter_count(model):
    """Print the total number of elements across ``model.parameters()``.

    The count is formatted with thousands separators.
    """
    total_params = 0
    for param in model.parameters():
        total_params += param.numel()
    print(f"Parameters : {total_params:,}")

In [13]:
# Print the parameter count of the loaded checkpoint.
print_parameter_count(model)

Parameters : 117,588,537


In [14]:
# Move the model to the selected device before inference.
model = model.to(device)

model.safetensors:   0%|          | 0.00/470M [00:00<?, ?B/s]

In [15]:
# Flattened QA CSV used to build the evaluation dataset.
# NOTE(review): same file that the earlier inspection cell reads with a literal path.
dataset_path = "/kaggle/working/vqa_dataset_processed.csv"

In [16]:
# Build the evaluation dataset from the flattened QA CSV.
dataset = VQADataset(dataset_path)

def collate_fn(batch):
    """Regroup a batch of ``(image, question, answer)`` samples by field.

    Returns a 3-tuple of lists: all images, all questions, all answers, each
    in the order the samples appeared in ``batch``.
    """
    images, questions, answers = (list(column) for column in zip(*batch))
    return images, questions, answers

# Batched loader over the dataset; collate_fn keeps each batch as plain Python lists.
# NOTE(review): batches hold PIL images and strings rather than tensors, so pin_memory
# likely has no effect here — confirm.
dataloader = DataLoader(
    dataset,
    batch_size=16,  # 16 samples per batch
    shuffle=False,  # iterate in dataset order, without shuffling
    num_workers=4,  # four loader worker processes
    pin_memory=True,
    collate_fn=collate_fn
)

In [17]:
def evaluate(model, processor, dataloader, device):
    """Run ``model`` over every batch and collect its top-scoring answers.

    Each batch from ``dataloader`` is an ``(images, questions, answers)``
    triple. The processor output is moved to ``device``, the model's logits
    are arg-maxed over the last dimension, and each index is mapped to a label
    through ``model.config.id2label``.

    Returns:
        Three lists aligned by position: questions, predicted labels, and the
        reference answers as supplied by the dataloader.
    """
    all_questions = []
    all_predictions = []
    all_references = []

    model.eval()

    # No autograd tracking is needed for pure inference.
    with torch.inference_mode():
        for images, questions, answers in tqdm(dataloader, position=0, leave=True):
            encoded = processor(images=images, text=questions, return_tensors="pt",
                                padding=True, truncation=True, max_length=40)
            encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

            # One index per sample: the highest-scoring answer class.
            best_ids = model(**encoded).logits.argmax(dim=-1).tolist()
            id2label = model.config.id2label
            batch_labels = [id2label[label_id] for label_id in best_ids]

            all_questions.extend(questions)
            all_predictions.extend(batch_labels)
            all_references.extend(answers)

    return all_questions, all_predictions, all_references

In [18]:
# Run the full evaluation pass and keep questions, predictions and references.
ques, predictions, references = evaluate(model, processor, dataloader, device)

100%|██████████| 6260/6260 [29:35<00:00,  3.52it/s]


In [19]:
# Restrict transformers logging to error level.
# NOTE(review): presumably meant to quiet model-loading messages in the BERTScore cell — confirm.
transformers.logging.set_verbosity_error()

In [20]:
# Exact-match accuracy: share of predictions that equal their reference string.
exact_matches = [predicted == expected for predicted, expected in zip(predictions, references)]
accuracy = exact_matches.count(True) / len(exact_matches)

In [21]:
# Token-level F1, computed on the sets of whitespace-separated tokens of each pair.
pred_tokens = [set(prediction.split()) for prediction in predictions]
ref_tokens = [set(reference.split()) for reference in references]

f1s = []
for p, r in zip(pred_tokens, ref_tokens):
    precision = len(p & r) / len(p) if p else 0.0
    recall = len(p & r) / len(r) if r else 0.0
    # The 1e-8 term keeps the division defined when precision and recall are both zero.
    f1 = 2 * precision * recall / (precision + recall + 1e-8)
    f1s.append(f1)

avg_f1 = sum(f1s) / len(f1s)

In [22]:
# BERTScore calculation
# Only the device type string (e.g. "cuda" or "cpu") is passed on; any device index is dropped.
bert_device = device.type
# The result is unpacked as P, R, F1; only the mean of F1 is kept as the reported score.
# NOTE(review): rescale_with_baseline is not passed, so scores are presumably raw and may sit
# in a narrow high range — confirm against the bert_score docs before comparing with accuracy/F1.
P, R, F1 = bert_score.score(predictions, references, lang="en", 
                           batch_size=16 if torch.cuda.is_available() else 8, 
                           device=bert_device)
bert_score_val = F1.mean().item()

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

In [23]:
# Report the three metrics, each formatted to four decimal places.
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {avg_f1:.4f}")
print(f"BERTScore: {bert_score_val:.4f}")

Accuracy: 0.2695
F1 Score: 0.2703
BERTScore: 0.9843


In [24]:
# Collect per-sample questions, predictions and references into one table.
result = pd.DataFrame({
    'question': ques,
    'prediction': predictions,
    'reference': references
})

In [25]:
# Write the per-sample results to result.csv (relative path), omitting the index column.
result.to_csv('result.csv', index=False)