# BLIP Baseline Evaluation (No Fine-Tuning)

In [None]:
# Dependencies
# Quietly upgrades the packages listed below. `--no-deps` tells pip not to
# install their dependencies, so those presumably come from the packages
# already present in the runtime image — TODO confirm.
# NOTE(review): sentence_transformers and sklearn are imported by the cosine
# similarity cell further down but are not in this list; presumably they are
# pre-installed as well — verify.
!pip install --upgrade --quiet transformers bert-score pandas tqdm datasets accelerate peft bitsandbytes pillow trl --no-deps

In [None]:
# importing modules

import pandas as pd
import numpy as np
import os
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
from bert_score import score

### Uploaded the required datasets for evaluating the baseline model:
#### - abo-small
#### - vqa-test-dataset
#### - vqa-val-dataset

  ### Both curated datasets contain ~1900 products, with 4-5 question-answer pairs for each image.

In [None]:

# Read the test and validation CSVs, then stack them into one dataframe
test_csv = "/kaggle/input/vqa-test-dataset/blip_vqa_test.csv"
val_csv = "/kaggle/input/vqa-val-dataset/blip_vqa_val.csv"

frames = [pd.read_csv(test_csv), pd.read_csv(val_csv)]
df1 = frames[1]  # the validation split stays available under its own name

# Validation rows are appended below the test rows; ignore_index=True
# renumbers the combined index from 0
df = pd.concat(frames, ignore_index=True)

# Sanity check: combined shape, then the number of entries in main_image_id
print(f"Combined dataframe shape: {df.shape}")
print(df['main_image_id'].size)

### The pre-trained BLIP model and its processor are loaded from the transformers library

In [None]:
# Load the pre-trained BLIP VQA checkpoint (no fine-tuning) and its processor
checkpoint = "Salesforce/blip-vqa-base"

# Use the GPU when one is visible to torch, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

processor = BlipProcessor.from_pretrained(checkpoint)
model = BlipForQuestionAnswering.from_pretrained(checkpoint)
model = model.to(device)  # place the model on the selected device

### Now, for each dataset row, we take its image_path, join it onto image_root, and pass the resulting image to the model as the image input. Along with this, we pass the row's "question", generate the output, and store it.

In [None]:
# Run the pre-trained model over every row of the dataset.
# preds collects the decoded model answers, refs the ground-truth answers.
image_root = "/kaggle/input/abo-small/images/small"  # prefix joined onto each row's image_path
preds, refs = [], []

for rel_path, question, answer in zip(df["image_path"], df["question"], df["answer"]):
    # Build the full image path; backslashes are swapped for forward slashes
    # (presumably the CSV stores Windows-style separators — TODO confirm)
    full_path = os.path.join(image_root, rel_path).replace("\\", "/")
    rgb_image = Image.open(full_path).convert("RGB")

    # Encode the image/question pair, generate an answer and decode it to text
    batch = processor(rgb_image, question, return_tensors="pt").to(device)
    generated = model.generate(**batch)
    answer_text = processor.decode(generated[0], skip_special_tokens=True)

    preds.append(answer_text.strip())
    # str() so that non-string answers can be stripped and compared as text
    refs.append(str(answer).strip())

## EVALUATION

### We use 3 evaluation metrics throughout the project; see the "Evaluation Metrics" section of the report for their description and purpose:
#### - Exact Match Accuracy
#### - BERTScore
#### - Semantic Cosine Similarity

In [None]:
# Exact-match accuracy: the fraction of predictions that equal their
# reference once both strings are lower-cased
hits = [pred.lower() == ref.lower() for pred, ref in zip(preds, refs)]
exact_match = sum(hits) / len(preds)
print(f"Exact-Match Accuracy : {exact_match * 100:.2f}%")

### This value comes out to be 41.22%, which is expected given that we are checking for an "exact match" between the predictions and the ground truths: the comparison is case-insensitive (both strings are lower-cased), but any other difference in wording counts as a miss, even when the answer is semantically correct.

In [None]:
# BERTScore between predictions and references (English, with
# rescale_with_baseline enabled); only the mean F1 is reported
P, R, F1 = score(cands=preds, refs=refs, lang="en", rescale_with_baseline=True)
mean_f1 = F1.mean().item()
print(f"BERTScore F1 :         {mean_f1 * 100:.2f}%")

### This value comes out to be 68.53%

In [None]:
# Computing Semantic cosine similarity (via SentenceTransformers)
from sentence_transformers import SentenceTransformer
# NOTE(review): cosine_similarity is no longer called in this cell; the import
# is retained in case other cells rely on it.
from sklearn.metrics.pairwise import cosine_similarity

# Embed references and predictions with the same sentence-embedding model so
# that row i of emb_true lines up with row i of emb_pred
st_model = SentenceTransformer("all-MiniLM-L6-v2")
emb_true = st_model.encode(refs, convert_to_numpy=True)
emb_pred = st_model.encode(preds, convert_to_numpy=True)

# Only the matched (reference i, prediction i) pairs are needed, so compute the
# cosine row by row instead of building the full NxN similarity matrix and
# taking its diagonal (which costs O(N^2) time and memory for N values).
pair_dots  = np.einsum("ij,ij->i", emb_true, emb_pred)
pair_norms = np.linalg.norm(emb_true, axis=1) * np.linalg.norm(emb_pred, axis=1)
# Pairs containing a zero-norm embedding score 0 rather than dividing by zero
cos_scores = np.divide(
    pair_dots,
    pair_norms,
    out=np.zeros_like(pair_dots),
    where=pair_norms > 0,
)
print(f"\nAvg. semantic cosine similarity: {cos_scores.mean():.4f}")

### Finally, the average semantic cosine similarity between predictions and ground truths comes out to 0.7190 (71.90%)