In [1]:
!pip install datasets peft torch torchvision 
!pip install -q transformers evaluate scikit-learn pillow

Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12

In [2]:
import os
import sys
import subprocess

# Uninstall any existing bitsandbytes build
print("Uninstalling old bitsandbytes...")
subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", "bitsandbytes"], check=True)

# Install bitsandbytes >=0.43.2 which adds upstream support for CUDA 12.4+
print("Installing bitsandbytes>=0.43.2 for CUDA 12.4 support...")
subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "bitsandbytes>=0.43.2"], check=True)

# Ensure the CUDA 12.4 runtime path is on LD_LIBRARY_PATH
cuda_lib="/usr/local/cuda-12.4/lib64"
prev_ld = os.environ.get("LD_LIBRARY_PATH", "")
os.environ["LD_LIBRARY_PATH"] = f"{cuda_lib}:{prev_ld}"
print(f"🎯 LD_LIBRARY_PATH set to include {cuda_lib}")

# Diagnostic: confirm bitsandbytes is seeing CUDA 12.4
print("🔍 Running bitsandbytes diagnostic...")
res = subprocess.run([sys.executable, "-m", "bitsandbytes"], capture_output=True, text=True)
print(res.stdout)
if "CUDA" not in res.stdout:
    raise RuntimeError("bitsandbytes did not detect CUDA—check your LD_LIBRARY_PATH and installation.")

Uninstalling old bitsandbytes...




Installing bitsandbytes>=0.43.2 for CUDA 12.4 support...
Collecting bitsandbytes>=0.43.2
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 76.1/76.1 MB 24.4 MB/s eta 0:00:00
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.5
🎯 LD_LIBRARY_PATH set to include /usr/local/cuda-12.4/lib64
🔍 Running bitsandbytes diagnostic...
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++ BUG REPORT INFORMATION ++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++ OTHER +++++++++++++++++++++++++++
CUDA specs: CUDASpecs(highest_compute_capability=(6, 0), cuda_version_string='124', cuda_version_tuple=(12, 4))
PyTorch settings found: CUDA_VERSION=124, Highest Compute Capability: (6, 0).
To manually override the PyTorch CUDA version plea

In [3]:
import os
import ast
import pandas as pd
from PIL import Image
import torch
from sklearn.model_selection import train_test_split 
import evaluate                                        
from transformers import AutoProcessor, AutoModelForVision2Seq , BitsAndBytesConfig, get_linear_schedule_with_warmup, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel, TaskType
from datasets import Dataset

from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering, Trainer, TrainingArguments
from accelerate import Accelerator
from transformers.data.data_collator import default_data_collator
from transformers import Blip2ForConditionalGeneration, Blip2Processor, AutoModelForVision2Seq

CSV_PATH = "/kaggle/input/vr-csv-cleaned/kaggle/working/cleaned_csvs/vqa_train_cleaned.csv"
train_df = pd.read_csv(CSV_PATH)

train_df = train_df[["image_path", "question", "answer"]].reset_index(drop=True)
print(f"Total examples: {len(train_df)}")

CSV_PATH = "/kaggle/input/vr-csv-cleaned/kaggle/working/cleaned_csvs/vqa_val_cleaned.csv"
val_df = pd.read_csv(CSV_PATH)

val_df = val_df[["image_path", "question", "answer"]].reset_index(drop=True)
print(f"Total examples: {len(val_df)}")

2025-05-17 17:53:32.398379: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747504412.598571      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747504412.667062      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Total examples: 265183
Total examples: 36845


In [4]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id ="Salesforce/blip2-opt-2.7b"
base_model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    # load_in_4bit=True,
    quantization_config=quantization_config,
    device_map = "auto"
)

processor = AutoProcessor.from_pretrained(model_id)
if processor.tokenizer.pad_token is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

In [5]:
from tqdm import tqdm
preds, refs = [], []
image_paths = []
q, a = [], []
count=0
for row in tqdm(val_df[:10000].itertuples(), total = 10000):
    img = Image.open(row.image_path).convert("RGB")
    prompt = f"Based on the image, answer the following question with a single word. Question: {row.question} Answer:"
    inputs = processor(images=img, text=prompt, return_tensors="pt").to(device)
    pixel_values   = inputs["pixel_values"]
    input_ids      = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    out = base_model.generate(
        pixel_values=pixel_values,
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=10
    )
    text = processor.batch_decode(out, skip_special_tokens=True)[0].strip()
    # print("text:",text)
    pred = text.split()[-1].rstrip(".,;:!?") if text else ""
    preds.append(pred.lower())
    refs.append(row.answer)
    image_paths.append(row.image_path)
    q.append(row.question)
    a.append(row.answer)

100%|██████████| 10000/10000 [49:31<00:00,  3.37it/s]


In [6]:
# ──────────────────────────────────────────────────────────────────────────────
# 6. Compute Metrics (and identify mismatches)
# ──────────────────────────────────────────────────────────────────────────────
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import evaluate
# from bert_score import BERTScorer # Keep if you add BERTScore later
# from rouge_score import rouge_scorer # Keep if you add ROUGE later
# from rapidfuzz.distance import Levenshtein # Keep if you add Levenshtein later
from sentence_transformers import SentenceTransformer, util # Keep if you add SBERT later
import numpy as np
import pandas as pd # Ensure pandas is imported if df is used



# normalize case
preds_l = [str(p).lower().strip() for p in preds] # Added str() and strip() for robustness
refs_l  = [str(r).lower().strip() for r in refs]   # Added str() and strip()

# # 1) TOKEN-LEVEL EXACT MATCH → binary labels
y_pred_bin = [int(p == r) for p, r in zip(preds_l, refs_l)]
y_true_bin = [1]*len(refs_l)            # reference is always “correct” class

acc = accuracy_score(y_true_bin, y_pred_bin)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true_bin, y_pred_bin, average="binary", zero_division=0
)
print(f"Exact-match Accuracy: {acc:.3f}")
print(f"Exact-match Precision: {prec:.3f}") # Will be 1.0 if acc > 0, else 0.0
print(f"Exact-match Recall:    {rec:.3f}")    # Will be same as acc
print(f"Exact-match F1:        {f1:.3f}\n")  

Exact-match Accuracy: 0.471
Exact-match Precision: 1.000
Exact-match Recall:    0.471
Exact-match F1:        0.641



In [7]:
df = pd.DataFrame({
    'image_path': image_paths,
    'question': q,
    'answer': a,
    'prediction': preds,
    'clean_pred': preds_l
})
df.to_csv("blip2_evaluation_results.csv")

In [8]:
df.head()

Unnamed: 0,image_path,question,answer,prediction,clean_pred
0,/kaggle/input/vr-img-meta/categorized_data/CEL...,Is the cover transparent?,no,yes,yes
1,/kaggle/input/vr-img-meta/categorized_data/SHO...,Are laces present?,no,yes,yes
2,/kaggle/input/vr-img-meta/categorized_data/WAL...,Are there lines?,yes,yes,yes
3,/kaggle/input/vr-img-meta/categorized_data/HOM...,Is there text on the laundry bags?,yes,no,no
4,/kaggle/input/vr-img-meta/categorized_data/FIN...,Are there two earrings?,yes,yes,yes


In [9]:
!pip install git+https://github.com/google-research/bleurt.git
!pip install scikit-learn evaluate bert-score rouge-score \
            sentence-transformers rapidfuzz


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting git+https://github.com/google-research/bleurt.git
  Cloning https://github.com/google-research/bleurt.git to /tmp/pip-req-build-mbf64hcr
  Running command git clone --filter=blob:none --quiet https://github.com/google-research/bleurt.git /tmp/pip-req-build-mbf64hcr
  Resolved https://github.com/google-research/bleurt.git to commit cebe7e6f996b40910cfaa520a63db47807e3bf5c
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: BLEURT
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone
  Created wheel for BLEURT: filename=BLEURT-0.0.2-py3-none-any.whl size=16456766 sha256=b17be0470659f253252286a09daa2bd92a1f7d131a5032ebb3c6ab738d3d6a18
  Stored in directory: /tmp/pip-ephem-wheel-cache-xv24ym37/wheels/30/af/34/e148007788b060e4c76e7ecf68e70c692dff0f2632e62ac454
Successfully built BLEURT
Installing collected packages: BLEURT
Successfully installed BLEURT-0.0.2


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m75.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=16bcc9d82a431c01ffa2493f5419955384ad3

In [10]:
from bert_score import BERTScorer # Keep if you add BERTScore later
from rouge_score import rouge_scorer # Keep if you add ROUGE later
from rapidfuzz.distance import Levenshtein # Keep if you add Levenshtein later
from sentence_transformers import SentenceTransformer, util # Keep if you add SBERT later


In [11]:
# --- ROUGE ---
print("\n--- ROUGE Scores ---")
rouge_eval_scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
rouge1_scores = []
rougeL_scores = []
for pred, ref in zip(preds_l, refs_l):
    if not pred or not ref: # Handle empty strings if any
        rouge1_scores.append(0.0)
        rougeL_scores.append(0.0)
        continue
    scores = rouge_eval_scorer.score(ref, pred) # Target, Prediction
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

if rouge1_scores:
    print(f"Average ROUGE-1 F1: {np.mean(rouge1_scores):.3f}")
    print(f"Average ROUGE-L F1: {np.mean(rougeL_scores):.3f}")
else:
    print("No ROUGE scores to compute (empty predictions or references).")


# --- BERTScore ---
print("\n--- BERTScore ---")
try:
    bert_eval_scorer = BERTScorer(lang="en", rescale_with_baseline=True, device=device)
    filtered_preds_l = [p for p, r in zip(preds_l, refs_l) if p and r]
    filtered_refs_l = [r for p, r in zip(preds_l, refs_l) if p and r]

    if filtered_preds_l and filtered_refs_l:
        P, R, F1_bert = bert_eval_scorer.score(filtered_preds_l, filtered_refs_l)
        print(f"Average BERTScore Precision: {P.mean():.3f}")
        print(f"Average BERTScore Recall:    {R.mean():.3f}")
        print(f"Average BERTScore F1:        {F1_bert.mean():.3f}")
    else:
        print("Not enough valid (non-empty) prediction/reference pairs for BERTScore.")
except Exception as e:
    print(f"Could not compute BERTScore: {e}")



--- ROUGE Scores ---
Average ROUGE-1 F1: 0.477
Average ROUGE-L F1: 0.477

--- BERTScore ---


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average BERTScore Precision: 0.876
Average BERTScore Recall:    0.864
Average BERTScore F1:        0.868


In [12]:
# --- Levenshtein Normalized Similarity ---
print("\n--- Levenshtein Normalized Similarity ---")
lev_similarities = []
for pred, ref in zip(preds_l, refs_l):
    if not pred and not ref: # both empty, perfect match
        similarity = 1.0
    elif not pred or not ref: # one empty, other not, zero similarity
        similarity = 0.0
    else:
        similarity = Levenshtein.normalized_similarity(pred, ref)
    lev_similarities.append(similarity)

if lev_similarities:
    print(f"Average Levenshtein Normalized Similarity: {np.mean(lev_similarities):.3f}")
else:
    print("No Levenshtein similarities to compute.")


# --- Sentence Transformer Cosine Similarity ---
print("\n--- Sentence-BERT Cosine Similarity ---")
try:
    sbert_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
    valid_pairs_indices = [i for i, (p, r) in enumerate(zip(preds_l, refs_l)) if p and r]
    sbert_preds = [preds_l[i] for i in valid_pairs_indices]
    sbert_refs = [refs_l[i] for i in valid_pairs_indices]

    if sbert_preds and sbert_refs:
        embeddings_preds = sbert_model.encode(sbert_preds, convert_to_tensor=True)
        embeddings_refs = sbert_model.encode(sbert_refs, convert_to_tensor=True)

        cosine_scores_sbert = util.cos_sim(embeddings_preds, embeddings_refs)
        pair_similarities = [cosine_scores_sbert[i, i].item() for i in range(len(sbert_preds))]
        print(f"Average Sentence-BERT Cosine Similarity: {np.mean(pair_similarities):.3f}")
    else:
        print("Not enough valid (non-empty) prediction/reference pairs for Sentence-BERT similarity.")
except Exception as e:
    print(f"Could not compute Sentence-BERT similarity: {e}")


--- Levenshtein Normalized Similarity ---
Average Levenshtein Normalized Similarity: 0.529

--- Sentence-BERT Cosine Similarity ---


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Average Sentence-BERT Cosine Similarity: 0.729
