In [2]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert_score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.0.0->bert_score)
  

In [3]:
import torch
import gc
import os
import random
import pandas as pd
import numpy as np
import torch.nn as nn
from PIL import Image
from tqdm import tqdm
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
import bert_score
import transformers

2025-05-14 11:21:20.809805: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747221681.158124      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747221681.223025      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
def _normalize_answer(value):
    """Return ``value`` as lower-cased text, tolerating non-string cells.

    pandas parses all-numeric columns as numbers, so an answer such as ``4``
    arrives as an int (or as ``4.0`` when the column also contains blanks).
    Calling ``.lower()`` directly on such a cell raises ``AttributeError``.
    """
    if isinstance(value, float) and value.is_integer():
        # Undo the int -> float promotion pandas applies to columns holding NaN.
        value = int(value)
    return str(value).lower()


def process_dataset(input_csv_path, image_base_dir=None,
                    output_csv='vqa_dataset_processed.csv', seed=None):
    """Flatten the per-image QA table into one CSV row per question/answer pair.

    The input CSV is read for the columns ``path``, ``vqa_description``,
    ``q1``..``q4``, ``a1``..``a4`` and ``product_type``. Every non-missing
    ``(q_i, a_i)`` pair becomes one output row; when ``product_type`` is present
    an additional row is emitted whose question is drawn at random from a fixed
    list of paraphrases and whose answer is the product type. Answers are
    lower-cased.

    Args:
        input_csv_path: Path of the CSV to read.
        image_base_dir: Optional directory joined in front of each ``path`` value.
        output_csv: Destination of the flattened CSV. Defaults to the file name
            the function has always written, relative to the working directory.
        seed: Optional seed for the product-type question choice. ``None`` keeps
            using the global ``random`` state.

    Returns:
        None. The result is written to ``output_csv`` with the columns
        ``image_path``, ``description``, ``question`` and ``answer``.
    """
    df = pd.read_csv(input_csv_path)
    print(f"Processing dataset with {df.shape[0]} entries")

    # A private generator makes the question choice reproducible when a seed is
    # given; otherwise fall back to the module-level generator.
    rng = random.Random(seed) if seed is not None else random

    product_type_questions = [
        "What is the object type in the image?",
        "Identify the type of object in the image?",
        "What kind of object is in the image?",
        "Identify the object category.",
        "What sort of item is in the image?",
        "Could you tell me the object type?",
        "What's the category of the object in the image?"
    ]

    processed_data = []

    for _, row in df.iterrows():
        image_path = row['path']
        if image_base_dir:
            full_image_path = os.path.join(image_base_dir, image_path)
        else:
            full_image_path = image_path

        description = row['vqa_description']

        # Keep only the question slots where both the question and answer exist.
        qa_pairs = [
            (row[f'q{i}'], _normalize_answer(row[f'a{i}']))
            for i in range(1, 5)
            if pd.notna(row[f'q{i}']) and pd.notna(row[f'a{i}'])
        ]

        if pd.notna(row['product_type']):
            product_type_question = rng.choice(product_type_questions)
            product_type_answer = _normalize_answer(row['product_type'])
            qa_pairs.append((product_type_question, product_type_answer))

        for question, answer in qa_pairs:
            processed_data.append({
                'image_path': full_image_path,
                'description': description,
                'question': question,
                'answer': answer
            })

    # Naming the columns explicitly keeps the header in the file even when no
    # QA pair was produced, so the CSV can still be read back.
    processed_df = pd.DataFrame(
        processed_data,
        columns=['image_path', 'description', 'question', 'answer'],
    )
    processed_df.to_csv(output_csv, index=False)
    print(f"Saved {len(processed_data)} QA pairs to {output_csv}")

In [5]:
class VQADataset(Dataset):
    """Dataset over a flattened QA CSV with one (image, question, answer) per row.

    The CSV must provide the columns ``image_path``, ``question`` and ``answer``.
    """

    def __init__(self, csv_path):
        # Read every column as text and switch off pandas' default NA sniffing:
        # otherwise answers such as "n/a" or "null" are parsed as NaN and purely
        # numeric answers as numbers, and both break ``.lower()`` in __getitem__.
        self.df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, question, normalised answer, image_path)`` for row ``idx``.

        If the image cannot be opened or decoded, a blank white 224x224 RGB
        image is returned instead and a warning is emitted.
        """
        row = self.df.iloc[idx]
        image_path = row['image_path']
        try:
            # The context manager closes the file handle once the pixels have
            # been copied into the converted image.
            with Image.open(image_path) as source_image:
                image = source_image.convert('RGB')
        except Exception as exc:
            # Keep the best-effort fallback, but make the substitution visible
            # and let KeyboardInterrupt / SystemExit propagate.
            import warnings
            warnings.warn(
                f"Could not load image {image_path!r} ({exc!r}); using a blank placeholder."
            )
            image = Image.new('RGB', (224, 224), color='white')
        return image, row['question'], row['answer'].lower().strip(), image_path

In [6]:
def collate(batch):
    """Transpose a batch of 4-tuples into four parallel lists.

    Each sample is ``(image, question, answer, image_path)``; the result is the
    tuple ``(images, questions, answers, image_paths)`` with every field as a list.
    """
    columns = [list(column) for column in zip(*batch)]
    images, questions, answers, img_paths = columns
    return images, questions, answers, img_paths

In [7]:
def create_dataloader(dataset_path, batch_size=16, num_workers=4):
    """Build an unshuffled DataLoader over the QA CSV at ``dataset_path``.

    Args:
        dataset_path: CSV file handed to ``VQADataset``.
        batch_size: Number of samples per batch.
        num_workers: Number of loader worker processes.

    Returns:
        A ``DataLoader`` that yields batches assembled by ``collate``.
    """
    loader_options = dict(
        batch_size=batch_size,
        shuffle=False,  # keep CSV order so outputs line up with the input rows
        num_workers=num_workers,
        pin_memory=True,
        collate_fn=collate,
    )
    return DataLoader(VQADataset(dataset_path), **loader_options)

In [8]:
def load_model_and_processor(model_id, device):
    """Load the BLIP-2 processor and model for ``model_id``.

    Placement depends on the hardware that is actually available:

    * two or more CUDA devices: the vision tower, Q-Former and query tokens go
      on ``cuda:0`` and the language model plus its projection on ``cuda:1``
      (the split this notebook was originally run with), in float16;
    * exactly one CUDA device: the whole model goes on ``cuda:0`` in float16;
    * no CUDA (or ``device`` is a CPU device): the whole model stays on the CPU
      in float32.

    Args:
        model_id: Hub identifier or local path of the checkpoint.
        device: Target device (``torch.device`` or string) chosen by the caller.

    Returns:
        ``(processor, model)``.
    """
    device = torch.device(device)
    gpu_count = torch.cuda.device_count() if device.type == "cuda" else 0

    if gpu_count >= 2:
        device_map = {
            "vision_model": "cuda:0",
            "qformer": "cuda:0",
            "query_tokens": "cuda:0",
            "language_model": "cuda:1",
            "language_projection": "cuda:1"
        }
        dtype = torch.float16
    elif gpu_count == 1:
        # The empty key maps the entire model onto a single device.
        device_map = {"": "cuda:0"}
        dtype = torch.float16
    else:
        device_map = {"": "cpu"}
        # Use full precision on the CPU rather than float16.
        dtype = torch.float32

    processor = Blip2Processor.from_pretrained(model_id, use_fast=True)
    model = Blip2ForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=dtype,
        device_map=device_map,
    )
    return processor, model

In [9]:
def evaluate(model, processor, dataloader, device, cleanup_after_batch=2000):
    """Generate an answer for every question in ``dataloader``.

    Each batch is expected to be ``(images, questions, answers, image_paths)``.
    Questions are wrapped as ``"Question: <q> Answer:"`` before being passed to
    the processor; decoded outputs are reduced to the text after the last
    ``"Answer:"`` marker (if any), stripped and lower-cased.

    Args:
        model: Model exposing ``eval()`` and ``generate(**inputs, ...)``.
        processor: Callable that builds the model inputs and offers ``batch_decode``.
        dataloader: Iterable of batches as described above.
        device: Device the processor outputs are moved to.
        cleanup_after_batch: From this batch index onwards, per-batch tensors are
            deleted and the garbage collector / CUDA cache are flushed after
            every batch.

    Returns:
        Four parallel lists: image paths, the original questions, predictions
        and reference answers.
    """
    img_paths, questions, predictions, references = [], [], [], []
    model.eval()

    with torch.inference_mode():
        for batch_idx, (images, raw_questions, answers, batch_img_paths) in enumerate(tqdm(dataloader)):
            prompts = [f"Question: {q} Answer:" for q in raw_questions]
            inputs = processor(images=images, text=prompts, return_tensors="pt", padding=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            generated_ids = model.generate(
                **inputs,
                max_new_tokens=10,
                min_length=1,
                num_beams=3,
                do_sample=False,
                repetition_penalty=1.5,
                length_penalty=0.6,
                early_stopping=True
            )

            generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

            # split() returns the whole string when the marker is absent, so no
            # separate "marker missing" branch is needed.
            batch_preds = [t.split("Answer:")[-1].strip().lower() for t in generated_texts]

            img_paths.extend(batch_img_paths)
            # Record the questions exactly as received instead of stripping the
            # prompt wrapper back out, which would also remove "Question: " or
            # " Answer:" if they occur inside a question.
            questions.extend(raw_questions)
            predictions.extend(batch_preds)
            references.extend(answers)

            if batch_idx >= cleanup_after_batch:
                del inputs, generated_ids, generated_texts, batch_preds, prompts
                gc.collect()
                torch.cuda.empty_cache()

    return img_paths, questions, predictions, references

In [10]:
def print_parameter_count(model):
    """Print the total number of elements across ``model.parameters()``."""
    total_params = 0
    for parameter in model.parameters():
        total_params += parameter.numel()
    print(f"Parameters : {total_params:,}")

In [11]:
# Use the GPU when CUDA is available; otherwise run on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cuda


In [12]:
# Raw per-image QA table passed to process_dataset().
input_csv_path = "/kaggle/input/abo-metadata/vqa_qa_cleaned.csv"
# Directory that process_dataset() joins in front of each row's image path.
image_base_dir = "/kaggle/input/abo-dataset/small"
# Flattened QA CSV read by create_dataloader().
# NOTE(review): process_dataset() writes to the relative name
# 'vqa_dataset_processed.csv', so this absolute path only matches when the
# notebook's working directory is /kaggle/working — confirm.
processed_dataset_path = "/kaggle/working/vqa_dataset_processed.csv"
# Checkpoint identifier handed to load_model_and_processor().
model_id = "Salesforce/blip2-flan-t5-xl"

In [13]:
# Flatten the raw table into one row per question/answer pair and write it to disk.
process_dataset(input_csv_path, image_base_dir)

Processing dataset with 28974 entries
Saved 129125 QA pairs to vqa_dataset_processed.csv


In [14]:
# Build the evaluation loader with the default batch size and worker count.
dataloader = create_dataloader(processed_dataset_path)

In [15]:
# Load the processor and model weights for the configured checkpoint.
processor, model = load_model_and_processor(model_id, device)

preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/128k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/5.81G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [16]:
# Report the total number of parameters of the loaded model.
print_parameter_count(model)

Parameters : 3,942,446,592


In [17]:
# Generate predictions for the whole dataset. This is the expensive step: the
# recorded run took about 4h25m for 8071 batches.
img_paths, questions, predictions, references = evaluate(model, processor, dataloader, device)

100%|██████████| 8071/8071 [4:25:13<00:00,  1.97s/it]  


In [18]:
# Restrict transformers logging to errors for the remaining cells
# (presumably to quiet the model load triggered by BERTScore below).
transformers.logging.set_verbosity_error()

In [19]:
# Accuracy calculation: fraction of predictions that equal their reference exactly.
exact_matches = [prediction == reference
                 for prediction, reference in zip(predictions, references)]
accuracy = exact_matches.count(True) / len(exact_matches)

In [20]:
# F1 Score calculation: overlap between the sets of whitespace-separated tokens
# of each prediction and its reference, averaged over all examples.
pred_tokens = [set(text.split()) for text in predictions]
ref_tokens = [set(text.split()) for text in references]

f1s = []
for pred_set, ref_set in zip(pred_tokens, ref_tokens):
    overlap = len(pred_set & ref_set)
    precision = overlap / len(pred_set) if pred_set else 0.0
    recall = overlap / len(ref_set) if ref_set else 0.0
    # The small epsilon keeps the division defined when both terms are zero.
    f1 = 2 * precision * recall / (precision + recall + 1e-8)
    f1s.append(f1)

avg_f1 = sum(f1s) / len(f1s)

In [21]:
# BERTScore calculation: mean F1 over all prediction/reference pairs.
bert_device = device.type
# Larger batches when CUDA is available, smaller ones otherwise.
bert_batch_size = 16 if torch.cuda.is_available() else 8
P, R, F1 = bert_score.score(
    predictions,
    references,
    lang="en",
    batch_size=bert_batch_size,
    device=bert_device,
)
bert_score_val = F1.mean().item()

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]



In [22]:
# Report the three metrics, each formatted to four decimal places.
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {avg_f1:.4f}")
print(f"BERTScore: {bert_score_val:.4f}")

Accuracy: 0.1217
F1 Score: 0.1434
BERTScore: 0.8975


In [23]:
# Collect the parallel evaluation lists into one table, one row per QA pair.
result = pd.DataFrame({
        'image_path': img_paths,
        'question': questions,
        'prediction': predictions,
        'reference': references
    })

In [24]:
# Save the per-example predictions, without the index column.
result.to_csv('result_blip2_flan_t5_xl.csv', index=False)

In [25]:
# for name, _ in model.named_modules():
#     print(name)