In [1]:
# Mount Google Drive into the Colab runtime; every project path below lives under this mount point.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Cell 1 — Imports, paths, knobs
import os, json, math, re, random, time
from typing import List

import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

# --- Paths: everything is rooted at the project folder on the mounted Drive ---
PROJ        = "/content/drive/MyDrive/VQAScore_Project"
DATA_CSV    = f"{PROJ}/data/genai_bench_subset.csv"
RAW_SCORES  = f"{PROJ}/results/raw_scores.csv"       # from improved Phase 3
TEMPLATES   = f"{PROJ}/data/templates.json"
TEMPLATE_OUT_CSV = f"{PROJ}/results/template_scores.csv"
NEGATION_OUT_CSV = f"{PROJ}/results/negation_results.csv"

# --- Runtime knobs ---
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed every RNG this notebook can touch (stdlib, NumPy and torch) so that a
# Restart & Run All reproduces the same subset and the same scores.
SEED   = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

print("Project:", PROJ)
print("Device:", device)
print("Data CSV:", DATA_CSV)
print("Raw scores CSV:", RAW_SCORES)


Project: /content/drive/MyDrive/VQAScore_Project
Device: cuda
Data CSV: /content/drive/MyDrive/VQAScore_Project/data/genai_bench_subset.csv
Raw scores CSV: /content/drive/MyDrive/VQAScore_Project/results/raw_scores.csv


In [3]:
# Cell 2 — Load base data + Phase 3 scores
# Read the benchmark subset and the per-image scores written by Phase 3.
df_data = pd.read_csv(DATA_CSV)
df_scores = pd.read_csv(RAW_SCORES)

print("Data rows:", len(df_data))
print("Raw scores rows:", len(df_scores))

# Inner-join on "id" so each kept row carries both its metadata and its scores.
# Only the score-related columns are pulled from the Phase 3 file; any name
# clash on the right-hand side gets the "_base" suffix.
base = pd.merge(
    df_data,
    df_scores[["id", "VQAScore", "CLIPScore", "template"]],
    how="inner",
    on="id",
    suffixes=("", "_base"),
)
print("Joined rows:", len(base))
base.head(5)

Data rows: 1341
Raw scores rows: 1341
Joined rows: 1341


Unnamed: 0,id,prompt,category,image_path,VQAScore,CLIPScore,template
0,604,"The moon casts a soft glow on the garden, a ra...",compositional,/content/drive/MyDrive/VQAScore_Project/data/g...,0.553988,0.265516,"Does this figure show ""The moon casts a soft g..."
1,147,One content rabbit and six tired turtles.,compositional,/content/drive/MyDrive/VQAScore_Project/data/g...,0.331985,0.290892,"Does this figure show ""One content rabbit and ..."
2,824,Two computers equipped with unique circular di...,compositional,/content/drive/MyDrive/VQAScore_Project/data/g...,0.598782,0.311411,"Does this figure show ""Two computers equipped ..."
3,14,A book with glowing runes floating beside a my...,compositional,/content/drive/MyDrive/VQAScore_Project/data/g...,0.512693,0.284609,"Does this figure show ""A book with glowing run..."
4,1332,"A person with a bright scarf, and no hat in th...",negation,/content/drive/MyDrive/VQAScore_Project/data/g...,0.177241,0.334264,"Does this figure show ""A person with a bright ..."


In [4]:
# Cell 3 — Define / load templates for the study
# We'll load existing templates.json if present and extend/normalize them,
# but you can also just hardcode them here.

templates_list = []
if os.path.exists(TEMPLATES):
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(TEMPLATES, encoding="utf-8") as templates_file:
        templates_cfg = json.load(templates_file)
    # Accept either {"templates": [...]} or a bare top-level list of templates.
    if isinstance(templates_cfg, dict):
        templates_list = list(templates_cfg.get("templates", []))
    elif isinstance(templates_cfg, list):
        templates_list = list(templates_cfg)

# Ensure we have at least a few distinct phrasings
default_templates = [
    'Does this figure show "{}"? Please answer yes or no.',
    'Is there "{}" in this image? Please answer yes or no.',
    'Does the photo depict "{}"? Please answer yes or no.',
    'Is this picture about "{}"? Please answer yes or no.'
]
# Merge and deduplicate; dict.fromkeys keeps the first occurrence and the original order
templates_list = list(dict.fromkeys(templates_list + default_templates))

print("Templates to evaluate:")
for i, t in enumerate(templates_list):
    print(f"{i+1}. {t}")

Templates to evaluate:
1. Does this figure show "{}"? Please answer yes or no.
2. Is there "{}" in this image? Please answer yes or no.
3. Does the photo depict "{}"? Please answer yes or no.
4. Is this picture about "{}"? Please answer yes or no.


In [5]:
# Cell 4 — Choose a manageable subset for template study

MAX_PER_CATEGORY = 250  # adjust if you have more time/GPU

if "category" not in base.columns:
    # If category didn't get carried over, recompute a simple heuristic.
    def negation_flag(prompt: str) -> bool:
        """Return True when the prompt contains one of a few whole-word negation cues.

        The prompt is coerced with str() first so NaN / non-string cells do not
        raise, matching the negation_flag used for the negation subset later on.
        """
        p = str(prompt).lower()
        terms = [" no ", " not ", " without ", " except ", " minus "]
        # Padding with spaces lets cues at the very start or end match as whole words.
        return any(t in f" {p} " for t in terms) or p.startswith("no ")

    def guess_category(prompt: str) -> str:
        """Bucket a prompt as 'negation', 'compositional' or 'simple' via keyword cues."""
        if negation_flag(prompt):
            return "negation"
        text = str(prompt).lower()
        compositional_markers = [",", " and ", " with ", " beside ", " behind ", " wearing ", " holding "]
        if any(m in text for m in compositional_markers):
            return "compositional"
        return "simple"

    base["category"] = base["prompt"].apply(guess_category)

# Draw up to MAX_PER_CATEGORY rows per category; the fixed random_state keeps
# the subset identical across reruns.
subsets = []
for cat in sorted(base["category"].unique()):
    df_cat = base[base["category"] == cat]
    n = min(MAX_PER_CATEGORY, len(df_cat))
    subsets.append(df_cat.sample(n=n, random_state=SEED))
# ignore_index=True already yields a fresh 0..N-1 index, so no extra reset is needed.
study_df = pd.concat(subsets, ignore_index=True)

print("Template study subset size:", len(study_df))
print("By category:")
print(study_df["category"].value_counts())
study_df.head(5)

Template study subset size: 750
By category:
category
compositional    250
negation         250
simple           250
Name: count, dtype: int64


Unnamed: 0,id,prompt,category,image_path,VQAScore,CLIPScore,template
0,350,"A notebook lies open in the grass, with sketch...",compositional,/content/drive/MyDrive/VQAScore_Project/data/g...,0.630685,0.314134,"Does this figure show ""A notebook lies open in..."
1,1042,A woolen sweater drying on a wooden rack in th...,compositional,/content/drive/MyDrive/VQAScore_Project/data/g...,0.653569,0.33939,"Does this figure show ""A woolen sweater drying..."
2,1014,"Glistening, silver utensils neatly arranged be...",compositional,/content/drive/MyDrive/VQAScore_Project/data/g...,0.675765,0.308376,"Does this figure show ""Glistening, silver uten..."
3,608,"Amidst a winter wonderland, a rabbit scurries ...",compositional,/content/drive/MyDrive/VQAScore_Project/data/g...,0.586996,0.313988,"Does this figure show ""Amidst a winter wonderl..."
4,1191,A meadow aglow with fireflies under a starry sky.,compositional,/content/drive/MyDrive/VQAScore_Project/data/g...,0.688468,0.316688,"Does this figure show ""A meadow aglow with fir..."


In [None]:
# Cell 5 — Load the same VQA model

VQA_MODEL_ID = "Salesforce/instructblip-flan-t5-xl"

# Half precision when running on the GPU, full precision on CPU.
if device == "cuda":
    DTYPE = torch.float16
else:
    DTYPE = torch.float32

vqa_processor = AutoProcessor.from_pretrained(VQA_MODEL_ID)

# Load the weights in the chosen dtype, move them to the target device and
# switch to evaluation mode.
vqa_model = AutoModelForVision2Seq.from_pretrained(VQA_MODEL_ID, torch_dtype=DTYPE)
vqa_model = vqa_model.to(device)
vqa_model = vqa_model.eval()

print("Loaded VQA model:", VQA_MODEL_ID, "on", device)

In [7]:
# Should be same as in phase 3 notebook
@torch.inference_mode()
def vqa_yes_probability(image: Image.Image, prompt: str, template: str) -> float:
    """
    Score how strongly the VQA model answers "Yes" to a templated question.

    The question is built as ``template.format(prompt)``. The model is run
    once with "Yes" as the label and once with "No"; each returned loss is
    treated as a negative log-likelihood and the two are normalized:

        p_yes = exp(-L_yes) / (exp(-L_yes) + exp(-L_no))

    Args:
        image: PIL image to ask about.
        prompt: Text inserted into the template's placeholder.
        template: Format string with a single ``{}`` slot for the prompt.

    Returns:
        A float in [0, 1]; higher means "Yes" is preferred over "No".
    """
    question = template.format(prompt)

    # The (image, question) pair is encoded a single time and reused for both answers.
    encoded = vqa_processor(images=image, text=question, return_tensors="pt").to(device)

    def _answer_nll(answer: str) -> float:
        # Loss of the model when forced to produce `answer` as its label sequence.
        label_ids = vqa_processor.tokenizer(answer, return_tensors="pt").input_ids.to(device)
        forward = vqa_model(**encoded, labels=label_ids)
        return float(forward.loss.item())

    nll_yes = _answer_nll("Yes")
    nll_no = _answer_nll("No")

    # Un-normalized likelihoods of the two answers.
    weight_yes = math.exp(-nll_yes)
    weight_no = math.exp(-nll_no)

    # Guard against both likelihoods underflowing to zero.
    total = weight_yes + weight_no
    if not total > 0:
        total = 1e-8

    return weight_yes / total

In [None]:
# Cell 7 — Run template study: recompute VQAScore for each template on the subset
records = []
n_skipped = 0  # rows dropped because their image could not be found
start = time.time()

for _, row in tqdm(study_df.iterrows(), total=len(study_df), desc="Template study"):
    img_path = row["image_path"]
    prompt   = row["prompt"]
    cat      = row["category"]
    base_vqa = row.get("VQAScore", None)
    base_clip = row.get("CLIPScore", None)

    # A missing image cannot be scored; count it so the drop is visible afterwards.
    if not isinstance(img_path, str) or not os.path.exists(img_path):
        n_skipped += 1
        continue

    # Close the file handle as soon as the pixels are decoded; convert() returns
    # a new in-memory image that stays valid after the file is closed.
    with Image.open(img_path) as img_file:
        image = img_file.convert("RGB")

    for tmpl_id, tmpl in enumerate(templates_list):
        # Pass the RAW template: vqa_yes_probability fills in the prompt itself.
        # Pre-formatting here and formatting again inside the scorer breaks on
        # prompts that contain "{" or "}".
        score = vqa_yes_probability(image, prompt, tmpl)
        records.append({
            "id": row["id"],
            "prompt": prompt,
            "category": cat,
            "image_path": img_path,
            # Filled-in question text, same content this column held before.
            "template": tmpl.format(prompt),
            "VQAScore_template": score,
            "VQAScore_base": base_vqa,
            "CLIPScore": base_clip,
            # Template identity, so results can be grouped per phrasing.
            "template_id": tmpl_id,
            "template_pattern": tmpl,
        })

    # optional: small cleanup to be nice to VRAM
    del image
    torch.cuda.empty_cache()

template_df = pd.DataFrame(records)
# Make sure the results folder exists before writing.
os.makedirs(os.path.dirname(TEMPLATE_OUT_CSV), exist_ok=True)
template_df.to_csv(TEMPLATE_OUT_CSV, index=False)

elapsed = time.time() - start
print(f"Saved template scores to: {TEMPLATE_OUT_CSV}")
print(f"Skipped rows (missing image): {n_skipped}")
print(f"Elapsed: {elapsed/60:.1f} minutes")
template_df.head(5)

In [9]:
# Cell 8 — Basic template variance analysis (per prompt)
# For every (id, prompt, category, image) group, summarize how much the
# VQAScore moves across the different question phrasings.

if template_df.empty:
    print("template_df is empty; skipping variance summary.")
else:
    group_keys = ["id", "prompt", "category", "image_path"]
    score_stats = {
        "VQA_mean": ("VQAScore_template", "mean"),
        "VQA_std": ("VQAScore_template", "std"),
        "VQA_min": ("VQAScore_template", "min"),
        "VQA_max": ("VQAScore_template", "max"),
    }
    grouped = template_df.groupby(group_keys)
    agg = grouped.agg(**score_stats).reset_index()

    print("Template variance summary (head):")
    display(agg.head(10))

    # Persisted so Phase 5 can plot it without recomputing.
    agg.to_csv(f"{PROJ}/results/template_variance_summary.csv", index=False)

Template variance summary (head):


Unnamed: 0,id,prompt,category,image_path,VQA_mean,VQA_std,VQA_min,VQA_max
0,1,A photographer capturing a fleeting moment in ...,simple,/content/drive/MyDrive/VQAScore_Project/data/g...,0.622868,0.021136,0.591723,0.638835
1,3,A man shaping clay on a wheel in a cluttered w...,simple,/content/drive/MyDrive/VQAScore_Project/data/g...,0.712773,0.026898,0.681727,0.736021
2,6,"A fairy dancing lightly atop a blooming, moonl...",compositional,/content/drive/MyDrive/VQAScore_Project/data/g...,0.621747,0.030027,0.593609,0.661486
3,8,"A ghostly ship sailing on a fog-shrouded, moon...",compositional,/content/drive/MyDrive/VQAScore_Project/data/g...,0.696213,0.032402,0.670177,0.74055
4,9,A sorcerer's hat casting shadows over a clutte...,compositional,/content/drive/MyDrive/VQAScore_Project/data/g...,0.341342,0.024971,0.324235,0.378459
5,10,A pair of winged boots resting on a cloud in t...,simple,/content/drive/MyDrive/VQAScore_Project/data/g...,0.520314,0.028391,0.495606,0.558327
6,14,A book with glowing runes floating beside a my...,compositional,/content/drive/MyDrive/VQAScore_Project/data/g...,0.54868,0.046951,0.5083,0.605799
7,18,"A unicorn grazing peacefully in a radiant, rai...",compositional,/content/drive/MyDrive/VQAScore_Project/data/g...,0.644592,0.019978,0.627038,0.669313
8,20,A lantern casting dim light in a haunted forest.,simple,/content/drive/MyDrive/VQAScore_Project/data/g...,0.571669,0.043636,0.534127,0.627495
9,21,A cloak of invisibility draped over a chair in...,simple,/content/drive/MyDrive/VQAScore_Project/data/g...,0.410647,0.005507,0.403097,0.416322


In [10]:
# Cell 9 — Negation subset: use Phase 3 scores and flag negation prompts
def negation_flag(prompt: str) -> bool:
    """Return True if the prompt contains a whole-word negation cue.

    The prompt is stringified and lower-cased, then padded with a space on
    both sides so cues at the very start or end are matched as whole words.
    """
    text = str(prompt).lower()
    padded = f" {text} "
    for cue in (" no ", " not ", " without ", " except ", " minus "):
        if cue in padded:
            return True
    return text.startswith("no ")

# Work on a copy so the Phase 3 frame itself is left untouched, and tag each
# row with the keyword-based negation heuristic.
df_scores_full = df_scores.copy()
prompt_has_negation = df_scores_full["prompt"].apply(negation_flag)
df_scores_full["negation_flag"] = prompt_has_negation

# Keep only the flagged rows, renumbered from zero.
flagged_mask = df_scores_full["negation_flag"].eq(True)
neg_df = df_scores_full.loc[flagged_mask].reset_index(drop=True)
print("Negation subset size:", len(neg_df))

neg_df.to_csv(NEGATION_OUT_CSV, index=False)
neg_df.head(10)

Negation subset size: 293


Unnamed: 0,id,prompt,category,image_path,VQAScore,CLIPScore,model_vqa,model_clip,template,negation_flag
0,1332,"A person with a bright scarf, and no hat in th...",negation,/content/drive/MyDrive/VQAScore_Project/data/g...,0.177241,0.334264,instructblip-flan-t5-xl,ViT-B-32,"Does this figure show ""A person with a bright ...",True
1,205,"There are some apples on the table, no oranges.",negation,/content/drive/MyDrive/VQAScore_Project/data/g...,0.642881,0.28756,instructblip-flan-t5-xl,ViT-B-32,"Does this figure show ""There are some apples o...",True
2,442,"A glass with no water, only ice melting.",negation,/content/drive/MyDrive/VQAScore_Project/data/g...,0.257952,0.271265,instructblip-flan-t5-xl,ViT-B-32,"Does this figure show ""A glass with no water, ...",True
3,1298,a shoe rack without any red pairs of shoes on it.,negation,/content/drive/MyDrive/VQAScore_Project/data/g...,0.426322,0.300298,instructblip-flan-t5-xl,ViT-B-32,"Does this figure show ""a shoe rack without any...",True
4,1265,"In a room, there is only a table, but no chairs.",negation,/content/drive/MyDrive/VQAScore_Project/data/g...,0.610453,0.289228,instructblip-flan-t5-xl,ViT-B-32,"Does this figure show ""In a room, there is onl...",True
5,1541,It's raining outside the house and a mother is...,negation,/content/drive/MyDrive/VQAScore_Project/data/g...,0.478041,0.280618,instructblip-flan-t5-xl,ViT-B-32,"Does this figure show ""It's raining outside th...",True
6,445,"A ball with no bounce, lying still.",negation,/content/drive/MyDrive/VQAScore_Project/data/g...,0.319121,0.219709,instructblip-flan-t5-xl,ViT-B-32,"Does this figure show ""A ball with no bounce, ...",True
7,477,"A basket full of apples, but no oranges.",negation,/content/drive/MyDrive/VQAScore_Project/data/g...,0.530236,0.310106,instructblip-flan-t5-xl,ViT-B-32,"Does this figure show ""A basket full of apples...",True
8,945,"Two LED table lamps on a table, the illuminate...",negation,/content/drive/MyDrive/VQAScore_Project/data/g...,0.596903,0.318271,instructblip-flan-t5-xl,ViT-B-32,"Does this figure show ""Two LED table lamps on ...",True
9,449,A beach without any footprints in the sand.,negation,/content/drive/MyDrive/VQAScore_Project/data/g...,0.181263,0.285131,instructblip-flan-t5-xl,ViT-B-32,"Does this figure show ""A beach without any foo...",True


In [11]:
# Cell 10 — Quick sanity for negation (no plots yet; those go in Phase 5)
if len(neg_df) == 0:
    print("No negation prompts detected by heuristic; you may need to adjust negation_flag.")
else:
    # One descriptive-statistics table per metric, printed in order.
    sections = [
        ("CLIPScore stats on negation prompts:", "CLIPScore"),
        ("\nVQAScore stats on negation prompts:", "VQAScore"),
    ]
    for header, column in sections:
        print(header)
        print(neg_df[column].describe())

CLIPScore stats on negation prompts:
count    293.000000
mean       0.300645
std        0.026550
min        0.219709
25%        0.284606
50%        0.298834
75%        0.314837
max        0.405980
Name: CLIPScore, dtype: float64

VQAScore stats on negation prompts:
count    293.000000
mean       0.464283
std        0.150393
min        0.093017
25%        0.361165
50%        0.498047
75%        0.570331
max        0.795658
Name: VQAScore, dtype: float64


In [None]:
# Cell 11 — Phase 4 summary
import pprint

# Collect the headline numbers and output locations produced by this phase.
summary = dict(
    template_study_rows=int(len(template_df)),
    template_out_csv=TEMPLATE_OUT_CSV,
    negation_rows=int(len(neg_df)),
    negation_out_csv=NEGATION_OUT_CSV,
    vqa_model=VQA_MODEL_ID,
    num_templates=len(templates_list),
)
pprint.pprint(summary)

{'negation_out_csv': '/content/drive/MyDrive/VQAScore_Project/results/negation_results.csv',
 'negation_rows': 293,
 'num_templates': 4,
 'template_out_csv': '/content/drive/MyDrive/VQAScore_Project/results/template_scores.csv',
 'template_study_rows': 3000,
 'vqa_model': 'Salesforce/instructblip-flan-t5-xl'}

✅ Phase 4 complete. Next: Phase 5 — Analysis & plotting (correlations, histograms, etc.).
