# PaliGemma2 - 3Billion BASELINE EVALUATION (NO FINE-TUNING)

In [None]:
# Dependencies
# Upgrades (quietly) the third-party packages this notebook relies on.
# NOTE(review): `--no-deps` tells pip to skip the listed packages' own
# dependencies, so whatever they require must already be present in the
# runtime image — confirm this is intended.
!pip install --upgrade --quiet transformers sentence-transformers bert-score pandas tqdm datasets accelerate peft bitsandbytes pillow trl --no-deps

In [None]:
# Importing modules
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image
import torch
import pandas as pd
import os
import numpy as np
from huggingface_hub import login


### To use the PaliGemma model, we must first create a Hugging Face account, accept the model's terms and conditions, and then create an access token. With that access token, we can load the model and use it for either inference or fine-tuning.

### The following function, 'paligemma_load', logs in to Hugging Face using the access token and then loads the model 'google/paligemma-3b-pt-224' and its processor, returning both. We load the model in lower precision ("float16") to accommodate memory constraints.

In [None]:
# Load function
def paligemma_load(model_id="google/paligemma-3b-pt-224", token=None, revision="float16"):
    """Authenticate with the Hugging Face Hub and load PaliGemma for inference.

    Args:
        model_id: Hub repository of the checkpoint to load. The same id is
            used for both the model and its processor.
        token: Hugging Face access token. When omitted, the ``HF_TOKEN``
            environment variable is used instead. If neither is set, no
            explicit login is attempted and the download relies on whatever
            credentials are already configured for ``huggingface_hub``.
        revision: Repository revision (branch) holding the weights.

    Returns:
        A ``(model, processor)`` tuple; the model is loaded in float16 and
        switched to evaluation mode.
    """
    # SECURITY: access tokens must never be committed to source. Supply the
    # token through the HF_TOKEN environment variable (or the `token`
    # argument); any token that was previously hard-coded here should be
    # revoked.
    hf_token = token or os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    model = PaliGemmaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float16,  # half precision to fit memory constraints
        device_map='auto',
        revision=revision,
    ).eval()
    processor = AutoProcessor.from_pretrained(model_id, use_fast=True)
    return model, processor

### The following function, 'paligemma_inference', takes the model and processor returned by the previous function, together with the image path and question from a row of our input dataframe. It passes these to the PaliGemma model along with the prompt "Answer the question in exactly one word", and returns the decoded output generated by the model.

In [None]:
# Inference function
def paligemma_inference(img_path,question_text,model,processor):
    """Ask the model one question about one image and return its answer text.

    Args:
        img_path: Path of the image file to open.
        question_text: Question appended to the one-word-answer prompt.
        model: Loaded generation model (as returned by ``paligemma_load``).
        processor: Matching processor used to build inputs and decode output.

    Returns:
        The decoded newly generated tokens (prompt tokens excluded), with
        special tokens removed.
    """
    # Image.open is lazy and keeps the file open. The context manager closes
    # the handle, and convert("RGB") loads the pixels into a detached copy
    # while normalising grayscale/RGBA/palette images to three channels.
    with Image.open(img_path) as img_file:
        image = img_file.convert("RGB")

    text = f'<image> Answer the question in exactly one word:{question_text}'
    model_inputs = processor(text=text,
                             images=image,
                             return_tensors="pt").to(model.device)

    # generate() returns prompt + continuation; remember where the prompt ends
    input_len = model_inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        # Greedy decoding (do_sample=False) keeps the evaluation deterministic
        generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)

    new_tokens = generation[0][input_len:]
    return processor.decode(new_tokens, skip_special_tokens=True)

### Initially, we planned to split the data into train:test:val in the ratio 80:10:10, using the train and val splits for fine-tuning and only the test split for the baseline inference. Later, we changed this and used both the test and val splits to evaluate both the baseline and the fine-tuned models.
### Thus, the code below concatenates the two dataframes as they are.

In [None]:
# Read the test and validation CSVs and stack them into one evaluation frame
test_csv_path = "/kaggle/input/vqa-test-dataset/blip_vqa_test.csv"
val_csv_path = "/kaggle/input/vqa-val-dataset/blip_vqa_val.csv"

test_split = pd.read_csv(test_csv_path)
df1 = pd.read_csv(val_csv_path)

# Validation rows go below the test rows; the index is renumbered from 0
df = pd.concat([test_split, df1], ignore_index=True)

# Number of rows in the combined frame
print(df['main_image_id'].size)

### Now, we load the model by invoking its respective function. Next, we run the inference loop on our dataframe wherein we call the previously defined inference function and store the prediction and ground-truths. At the end, we convert it to a Dataframe to be used for evaluation

In [None]:
model, processor = paligemma_load()

image_root = "/kaggle/input/abo-small/images/small"

# One record per dataframe row: the reference answer next to the model's
# prediction for that row's image and question.
results = [
    {
        "ground_truth": record['answer'],
        "prediction": paligemma_inference(
            # Join onto the image root and normalise any backslashes
            img_path=os.path.join(image_root, record['image_path']).replace("\\","/"),
            question_text=record['question'],
            model=model,
            processor=processor,
        ),
    }
    for _, record in df.iterrows()
]

results_df = pd.DataFrame(results)
print(results_df.head(20))


### While running the evaluation metric at first, we found that some rows had no predictions at all, which led to a type-error between str and null type. Thus, we fill the null values with empty string to resolve this.

In [None]:
# Metrics need plain strings: replace missing values with '' and cast to str
y_true, y_pred = (
    results_df[column].fillna('').astype(str)
    for column in ('ground_truth', 'prediction')
)


In [None]:
# Sorted list of every distinct string seen in either series
labels = sorted(set(y_true) | set(y_pred))

## EVALUATION

### We use 3 evaluation metrics for our entire project; see the "Evaluation Metrics" section of the report for their description and purpose:
### - Exact Match Accuracy
### - BERTScore
### - Semantic Cosine Similarity

In [None]:
# Exact-match accuracy: share of rows where the prediction string equals
# the ground-truth string
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_true=y_true, y_pred=y_pred)
print(f"Exact-match Accuracy: {acc*100:.4f}%")


### This value comes out to be 0.06% which is exceptionally poor

In [None]:
# BERTScore between predictions (candidates) and ground truths (references)
from bert_score import score as bert_score

bert_options = dict(lang="en", rescale_with_baseline=True)
P, R, F1 = bert_score(cands=list(y_pred), refs=list(y_true), **bert_options)

# Report the mean F1 over all rows
print(f"\nBERTScore →   F1: {F1.mean():.4f}")

### The mean BERTScore F1, our correctness metric, comes out to 70.88%

In [None]:
# Calculating Semantic Cosine Similarity score
from sentence_transformers import SentenceTransformer

st_model = SentenceTransformer("all-MiniLM-L6-v2")
emb_true = st_model.encode(y_true.tolist(), convert_to_numpy=True)
emb_pred = st_model.encode(y_pred.tolist(), convert_to_numpy=True)

# Only the similarity of each prediction with its OWN ground truth is needed.
# Computing it row by row avoids materialising the full NxN similarity matrix
# (quadratic in time and memory) just to read its diagonal.
pair_dots = np.einsum("ij,ij->i", emb_true, emb_pred)
pair_norms = np.linalg.norm(emb_true, axis=1) * np.linalg.norm(emb_pred, axis=1)
# A zero-length embedding has a zero dot product too; dividing by 1 instead of
# 0 yields a similarity of 0 rather than NaN.
pair_norms[pair_norms == 0] = 1.0
cos_scores = pair_dots / pair_norms
print(f"\nAvg. semantic cosine similarity: {cos_scores.mean()*100:.4f}")


### The average semantic cosine similarity, our measure of the model's preciseness, comes out to 69.47%