### Installation

In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

### Unsloth

In [None]:
!pip install transformers[qwen] # Install transformers with qwen support



In [None]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch
from peft import PeftModel

# 2. LOAD MODEL & ADAPTERS
print("Loading Model...")
# Load Base Model
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen3-VL-8B-Instruct-unsloth-bnb-4bit",
    load_in_4bit=True,
    use_gradient_checkpointing="unsloth",
)

# Load Your Fine-Tuned Adapters
model = PeftModel.from_pretrained(
    model,
    "Adyakanta/qwen3_finetuned_adapters_1"
)
FastVisionModel.for_inference(model)

Loading Model...
==((====))==  Unsloth 2025.11.4: Fast Qwen3_Vl patching. Transformers: 4.57.2.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_config.json: 0.00B [00:00, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/205M [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3VLForConditionalGeneration(
      (model): Qwen3VLModel(
        (visual): Qwen3VLVisionModel(
          (patch_embed): Qwen3VLVisionPatchEmbed(
            (proj): Conv3d(3, 1152, kernel_size=(2, 16, 16), stride=(2, 16, 16))
          )
          (pos_embed): Embedding(2304, 1152)
          (rotary_pos_emb): Qwen3VLVisionRotaryEmbedding()
          (blocks): ModuleList(
            (0-26): 27 x Qwen3VLVisionBlock(
              (norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
              (norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
              (attn): Qwen3VLVisionAttention(
                (qkv): lora.Linear(
                  (base_layer): Linear(in_features=1152, out_features=3456, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_

In [None]:
!pip install datasets
from datasets import load_dataset,Dataset



In [None]:
dataset2 = load_dataset("Adyakanta/test_flickr30k", split="test")

README.md:   0%|          | 0.00/469 [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/14.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
dataset2 = dataset2.remove_columns(['sentids', 'split', 'img_id', 'filename'])

In [None]:
#100 test inference
import pandas as pd
from tqdm import tqdm
import torch
from transformers import TextStreamer

# Shuffle and take first 100 samples (already applied as per your dataset)
# dataset = dataset.shuffle(seed=3407).take(100)

# Enable inference mode
FastVisionModel.for_inference(model)

# Initialize list to store results
results = []

# Set up text streamer (optional for console output)


# Iterate over dataset
for sample in tqdm(dataset2):
    image = sample["image"]
    caption = sample["caption"]  # Original ground truth

    instruction = "Write a descriptive caption for this image."
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": instruction}
        ]}
    ]

    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

    # Tokenize with image and text
    inputs = tokenizer(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt"
    ).to("cuda")

    # Generate prediction
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            use_cache=True,
            do_sample=True,
            temperature=0.8,
            top_p=0.9,
            top_k=50,
            min_p=0.0,
        )

    # Decode model output
    generated_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)
    # Store result
    results.append({
        "image": image,
        "original_response": caption,
        "generated_response": generated_response
    })

# Create DataFrame from results
df = pd.DataFrame(results)

# Save to disk (optionally use df.to_csv if you convert images to paths or skip them)
df.to_pickle("vlm_inference_results.pkl")

print("Inference complete. DataFrame with image, caption, and generated response created.")


100%|██████████| 100/100 [23:35<00:00, 14.16s/it]

Inference complete. DataFrame with image, caption, and generated response created.





# **UPLOAD TO HF**

In [None]:
#to upload to hf
from huggingface_hub import login
login()
df.to_csv("rando.csv", index="False")
dataset = Dataset.from_pandas(pd.read_csv("rando.csv"))
dataset.push_to_hub("Adyakanta/Qwen3_FT_test_zero_shot_sampled")

# **Conversion of A List of Strings to List of Lists**

In [1]:
from datasets import load_dataset,Dataset
dataset=load_dataset('Adyakanta/Qwen3_FT_test_zero_shot_sampled',split="train",download_mode="force_redownload")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/401 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/39.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

In [2]:
df = dataset.to_pandas()

In [3]:
import pandas as pd
import re
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

# Function to clean, tokenize, and remove the prompt string
def clean_and_split_caption(caption):
    if pd.isna(caption):
        return []

    # Remove unwanted labels like "user", "assistant", etc.
    caption = re.sub(r'\buser\b|\bassistant\b', '', caption, flags=re.IGNORECASE).strip()

    # Normalize whitespaces and newlines
    caption = re.sub(r'\s+', ' ', caption).strip()

    # Tokenize into individual sentences
    sentences = sent_tokenize((caption))

    # Remove the specific instruction sentence if present
    sentences = [s.strip() for s in sentences if s.strip()!='system You are a helpful .' and s.strip() != 'Write a descriptive caption for this image.']

    return str(sentences)

# Apply to your DataFrame
df['second_assistant_response'] = df['generated_response'].apply(clean_and_split_caption)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [4]:
df['second_assistant_response'][0]

'["[\'A little girl in an Elmo costume holding a guitar.", "\', \'A little girl in a Elmo costume playing a guitar\', \'A child dressed up as Elmo is playing a guitar.", "\', \'A toddler in an Elmo costume playing a guitar.", "\', \'A toddler in an Elmo costume playing a guitar.\']"]'

In [5]:
lm=pd.Series(dataset['original_response'])

In [6]:
lm1=pd.Series(df['second_assistant_response'])

In [7]:
# Convert each string in the column to a list of lists
df['lol_of_genrated_response'] = df['second_assistant_response'].apply(
    lambda x: [[item.strip().strip("'")] for item in x.strip("[]").split("', ")]
)

In [8]:
df['lol_of_original_response'] = lm.apply(
    lambda x: [[item.strip().strip("'")] for item in x.strip("[]").split("', ")]
)

In [9]:
df.drop(columns=['original_response'], inplace=True)
df.drop(columns=['generated_response'], inplace=True)
df.drop(columns=['second_assistant_response'], inplace=True)
df.drop(columns=['Unnamed: 0'], inplace=True)

In [10]:
df['lol_of_original_response'][0]


[['The baby his a Cookie Monster outfit on he is happy he his a Gutter its red'],
 ['"A child wearing a shirt decorated with Elmo\'s face holds a red ukulele.", \'A toddler wearing an Elmo Christmas outfit holds a red guitar.'],
 ['A child wearing a santa elmo suit holding a red guitar.'],
 ['A child in a Elmo suit is playing the guitar.']]

In [11]:
df['lol_of_genrated_response'][0]


[['"[\'A little girl in an Elmo costume holding a guitar.", "'],
 ['A little girl in a Elmo costume playing a guitar'],
 ['A child dressed up as Elmo is playing a guitar.", "'],
 ['A toddler in an Elmo costume playing a guitar.", "'],
 ['A toddler in an Elmo costume playing a guitar.\']"']]

In [12]:
df.to_csv('Qwen3_FT_zs_sampled.csv', index=False)
print("DataFrame with all metrics saved to qwen3_FT_zs.csv")

DataFrame with all metrics saved to qwen3_FT_zs.csv


**---------------------------------------------------------------------**

# **Type 1: BLEU-1 and BLEU-2**

In [None]:
# ==========================================
# ZERO-SHOT: CALCULATE MAX BLEU-1 & BLEU-2
# ==========================================
import pandas as pd
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize

# 1. Ensure NLTK resources are available
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Download necessary nltk data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

nltk.download('punkt_tab')

# 2. Define Smoothing (Standard for academic reporting)
chencherry = SmoothingFunction()

def calculate_max_bleu_scores(generated_raw, original_raw):
    """
    Calculates MAX BLEU-1 and BLEU-2 for a single image (Best-of-N).
    Parses stringified lists if necessary.
    """
    # --- A. Parse Data (Handle string vs list) ---
    if isinstance(generated_raw, str):
        generated_list = eval(generated_raw)
    else:
        generated_list = generated_raw

    if isinstance(original_raw, str):
        original_list = eval(original_raw)
    else:
        original_list = original_raw

    # --- B. Flatten Lists (Handle [['cap']] vs ['cap']) ---
    hyps = []
    for item in generated_list:
        # If item is a list like ['caption'], take the string inside
        if isinstance(item, list):
            hyps.append(item[0])
        else:
            hyps.append(item)

    refs = []
    for item in original_list:
        if isinstance(item, list):
            refs.append(item[0])
        else:
            refs.append(item)

    # --- C. Tokenize ---
    # References: List of lists of tokens [[w1, w2], [w3, w4]]
    ref_tokens = [word_tokenize(r.lower()) for r in refs]

    # Hypotheses: List of lists of tokens
    hyp_tokens_list = [word_tokenize(h.lower()) for h in hyps]

    b1_scores = []
    b2_scores = []

    # --- D. Calculate Best-of-N Score ---
    for hyp_tokens in hyp_tokens_list:
        # Calculate score for this specific candidate against ALL references
        b1 = sentence_bleu(ref_tokens, hyp_tokens, weights=(1, 0, 0, 0), smoothing_function=chencherry.method1)
        b2 = sentence_bleu(ref_tokens, hyp_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=chencherry.method1)

        b1_scores.append(b1)
        b2_scores.append(b2)

    # Return the MAX score (The single best caption's score)
    return np.max(b1_scores) if b1_scores else 0.0, np.max(b2_scores) if b2_scores else 0.0

print("Calculating MAX BLEU scores for Zero-Shot (Sampled)...")

max_b1_list = []
max_b2_list = []

# Iterate over the dataframe (assuming 'df' exists from your notebook)
for index, row in df.iterrows():
    try:
        # Using the specific column names from your file (note the 'genrated' typo)
        gen_data = row['lol_of_genrated_response']
        orig_data = row['lol_of_original_response']

        b1, b2 = calculate_max_bleu_scores(gen_data, orig_data)
        max_b1_list.append(b1)
        max_b2_list.append(b2)
    except Exception as e:
        print(f"Error at row {index}: {e}")
        max_b1_list.append(0.0)
        max_b2_list.append(0.0)

# Add results to dataframe
df['BLEU_1_Max'] = max_b1_list
df['BLEU_2_Max'] = max_b2_list

# --- REPORT ---
print("\n" + "="*40)
print("   ZERO-SHOT SAMPLED RESULTS (MAX METRIC)")
print("="*40)

print(f"Mean Max BLEU-1: {np.mean(max_b1_list):.4f}")
print(f"Mean Max BLEU-2: {np.mean(max_b2_list):.4f}")

print("\nDetailed Statistics:")
print(df[['BLEU_1_Max', 'BLEU_2_Max']].describe())

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Calculating MAX BLEU scores for Zero-Shot (Sampled)...

   ZERO-SHOT SAMPLED RESULTS (MAX METRIC)
Mean Max BLEU-1: 0.7241
Mean Max BLEU-2: 0.5561

Detailed Statistics:
       BLEU_1_Max  BLEU_2_Max
count  100.000000  100.000000
mean     0.724106    0.556070
std      0.091286    0.119122
min      0.500000    0.277350
25%      0.664802    0.489746
50%      0.727273    0.570295
75%      0.769231    0.627782
max      1.000000    0.904534


# **BLEU-3 AND BLEU-4 SCORES**

In [None]:
# ==========================================
import pandas as pd
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize

# 1. Ensure NLTK resources
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
    nltk.download('punkt_tab')

# 2. Define Smoothing
chencherry = SmoothingFunction()

def calculate_type1_advanced_bleu(generated_raw, original_raw):
    """
    Calculates MAX BLEU-3 and BLEU-4 using TYPE 1 Logic (Multi-Reference).
    Compares each generated caption against the FULL LIST of references.
    """
    # --- A. Parse & Flatten Data ---
    # Handle stringified lists if necessary
    if isinstance(generated_raw, str):
        generated_list = eval(generated_raw)
    else:
        generated_list = generated_raw

    if isinstance(original_raw, str):
        original_list = eval(original_raw)
    else:
        original_list = original_raw

    # Flatten to simple lists of strings ['cap1', 'cap2']
    hyps = []
    for item in generated_list:
        hyps.append(item[0] if isinstance(item, list) else item)

    refs = []
    for item in original_list:
        refs.append(item[0] if isinstance(item, list) else item)

    # --- B. Tokenize ---
    # References: List of lists of tokens [[w1, w2], [w3, w4]] (Pool of references)
    ref_tokens = [word_tokenize(r.lower()) for r in refs]

    # Hypotheses: List of lists of tokens
    hyp_tokens_list = [word_tokenize(h.lower()) for h in hyps]

    b3_scores = []
    b4_scores = []

    # --- C. Calculate Score (Type 1 Logic) ---
    for hyp_tokens in hyp_tokens_list:
        # Compare ONE hypothesis against ALL references (Standard BLEU)

        # BLEU-3 (Weights: 1/3 each for 1,2,3-grams)
        b3 = sentence_bleu(
            ref_tokens, hyp_tokens,
            weights=(0.333, 0.333, 0.333, 0),
            smoothing_function=chencherry.method1
        )

        # BLEU-4 (Weights: 1/4 each for 1,2,3,4-grams)
        b4 = sentence_bleu(
            ref_tokens, hyp_tokens,
            weights=(0.25, 0.25, 0.25, 0.25),
            smoothing_function=chencherry.method1
        )

        b3_scores.append(b3)
        b4_scores.append(b4)

    # Return the MAX score (Best-of-N)
    return np.max(b3_scores) if b3_scores else 0.0, np.max(b4_scores) if b4_scores else 0.0

print("Calculating Zero-Shot BLEU-3 & BLEU-4 (Type 1)...")

max_b3_list = []
max_b4_list = []

for index, row in df.iterrows():
    try:
        # Adjust column names if they are different in your dataframe
        gen_data = row['lol_of_genrated_response']
        orig_data = row['lol_of_original_response']

        b3, b4 = calculate_type1_advanced_bleu(gen_data, orig_data)
        max_b3_list.append(b3)
        max_b4_list.append(b4)
    except Exception as e:
        print(f"Error at row {index}: {e}")
        max_b3_list.append(0.0)
        max_b4_list.append(0.0)

# Add to dataframe
df['BLEU_3_Max'] = max_b3_list
df['BLEU_4_Max'] = max_b4_list

# --- REPORT ---
print("\n" + "="*40)
print("   ZERO-SHOT RESULTS (BLEU-3 & BLEU-4)")
print("="*40)

print(f"Mean Max BLEU-3: {np.mean(max_b3_list):.4f}")
print(f"Mean Max BLEU-4: {np.mean(max_b4_list):.4f}")

print("\nDetailed Statistics:")
print(df[['BLEU_3_Max', 'BLEU_4_Max']].describe())

Calculating Zero-Shot BLEU-3 & BLEU-4 (Type 1)...

   ZERO-SHOT RESULTS (BLEU-3 & BLEU-4)
Mean Max BLEU-3: 0.4313
Mean Max BLEU-4: 0.3260

Detailed Statistics:
       BLEU_3_Max  BLEU_4_Max
count  100.000000  100.000000
mean     0.431297    0.325995
std      0.139329    0.156725
min      0.125152    0.056626
25%      0.329103    0.214770
50%      0.435534    0.324194
75%      0.519896    0.425593
max      0.868376    0.844693


In [None]:
# ==========================================
# 1. PREPROCESSING: STANDARDIZE COLUMNS
# ==========================================
import pandas as pd
import ast

def safe_eval(val):
    """Converts stringified lists "['a', 'b']" back to actual lists."""
    if isinstance(val, str):
        try:
            return eval(val)
        except:
            return [val]
    return val

def normalize_to_list(val):
    """Ensures data is a simple list of strings: ['cap1', 'cap2']"""
    val = safe_eval(val)

    clean_list = []
    if isinstance(val, list):
        for item in val:
            # Handle nested lists [['cap']] -> 'cap'
            if isinstance(item, list):
                clean_list.append(item[0])
            else:
                clean_list.append(item)
    else:
        clean_list.append(str(val))
    return clean_list

print("Standardizing Zero-Shot columns...")

# Adapt to your specific Zero-Shot column names
# Note: Handling the typo 'genrated' from your previous file
df['generated_response_formatted'] = df['lol_of_genrated_response'].apply(normalize_to_list)
df['original_response_formatted'] = df['lol_of_original_response'].apply(normalize_to_list)

print("Done! Columns 'generated_response_formatted' and 'original_response_formatted' created.")
print(f"Sample Gen: {df['generated_response_formatted'][0]}")

Standardizing Zero-Shot columns...
Done! Columns 'generated_response_formatted' and 'original_response_formatted' created.
Sample Gen: ['"[\'A little girl in an Elmo costume holding a guitar.", "', 'A little girl in a Elmo costume playing a guitar', 'A child dressed up as Elmo is playing a guitar.", "', 'A toddler in an Elmo costume playing a guitar.", "', 'A toddler in an Elmo costume playing a guitar.\']"']


# **ROGUE L/1/2 METRIC**


In [None]:
# ==========================================
# 2. CALCULATE MAX ROUGE SCORES
# ==========================================
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter
from tqdm import tqdm

def get_ngrams(text_tokens, n):
    return Counter(ngrams(text_tokens, n))

def calculate_rouge_n(candidate_tokens, reference_tokens, n):
    cand_ngrams = get_ngrams(candidate_tokens, n)
    ref_ngrams = get_ngrams(reference_tokens, n)
    candidate_count = sum(cand_ngrams.values())
    reference_count = sum(ref_ngrams.values())
    overlap = sum((cand_ngrams & ref_ngrams).values())

    if candidate_count == 0: precision = 0.0
    else: precision = overlap / candidate_count

    if reference_count == 0: recall = 0.0
    else: recall = overlap / reference_count

    if precision + recall == 0: return 0.0
    return 2 * precision * recall / (precision + recall)

def lcs_length(X, Y):
    m, n = len(X), len(Y)
    L = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0: L[i][j] = 0
            elif X[i-1] == Y[j-1]: L[i][j] = L[i-1][j-1] + 1
            else: L[i][j] = max(L[i-1][j], L[i][j-1])
    return L[m][n]

def calculate_rouge_l(candidate_tokens, reference_tokens):
    lcs = lcs_length(candidate_tokens, reference_tokens)
    if len(candidate_tokens) == 0: precision = 0.0
    else: precision = lcs / len(candidate_tokens)
    if len(reference_tokens) == 0: recall = 0.0
    else: recall = lcs / len(reference_tokens)
    if precision + recall == 0: return 0.0
    return 2 * precision * recall / (precision + recall)

print("Calculating Zero-Shot ROUGE scores...")

max_r1, max_r2, max_rl = [], [], []

for index, row in tqdm(df.iterrows(), total=len(df)):
    gens = row['generated_response_formatted']
    origs = row['original_response_formatted']

    ref_tokens_list = [word_tokenize(r.lower()) for r in origs]
    hyp_tokens_list = [word_tokenize(h.lower()) for h in gens]

    best_r1, best_r2, best_rl = 0.0, 0.0, 0.0

    for hyp_tokens in hyp_tokens_list:
        curr_r1, curr_r2, curr_rl = 0.0, 0.0, 0.0
        for ref_tokens in ref_tokens_list:
            curr_r1 = max(curr_r1, calculate_rouge_n(hyp_tokens, ref_tokens, 1))
            curr_r2 = max(curr_r2, calculate_rouge_n(hyp_tokens, ref_tokens, 2))
            curr_rl = max(curr_rl, calculate_rouge_l(hyp_tokens, ref_tokens))

        best_r1 = max(best_r1, curr_r1)
        best_r2 = max(best_r2, curr_r2)
        best_rl = max(best_rl, curr_rl)

    max_r1.append(best_r1)
    max_r2.append(best_r2)
    max_rl.append(best_rl)

df['ROUGE_1_Max'] = max_r1
df['ROUGE_2_Max'] = max_r2
df['ROUGE_L_Max'] = max_rl

print("\n" + "="*40)
print(f"Mean Max ROUGE-1: {np.mean(max_r1):.4f}")
print(f"Mean Max ROUGE-2: {np.mean(max_r2):.4f}")
print(f"Mean Max ROUGE-L: {np.mean(max_rl):.4f}")
print("="*40)

Calculating Zero-Shot ROUGE scores...


100%|██████████| 100/100 [00:00<00:00, 287.44it/s]


Mean Max ROUGE-1: 0.5955
Mean Max ROUGE-2: 0.3696
Mean Max ROUGE-L: 0.5659





# **CIDER,METEOR SCORES**

In [None]:
# ==========================================
# 3. CALCULATE MAX METEOR & CIDEr SCORES
# ==========================================
import numpy as np
import math
from collections import defaultdict
import nltk
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize

try:
    nltk.data.find('corpora/wordnet.zip')
except LookupError:
    nltk.download('wordnet')
    nltk.download('omw-1.4')

# --- CIDEr Scorer Class ---
class CiderScorer:
    def __init__(self, df, n=4, sigma=6.0):
        self.n = n
        self.sigma = sigma
        self.crefs = []
        self.document_frequency = defaultdict(float)
        self.ref_len = 0
        for refs in df['original_response_formatted']:
            self.crefs.append(refs)
            for ngram in self._counts2grams(refs):
                self.document_frequency[ngram] += 1
        self.ref_len = len(self.crefs)

    def _counts2grams(self, text_list):
        counts = defaultdict(int)
        for text in text_list:
            tokens = word_tokenize(text.lower())
            for i in range(1, self.n + 1):
                for ngram in nltk.ngrams(tokens, i):
                    counts[ngram] += 1
        return counts

    def _compute_vec(self, text_tokens, doc_freq, ref_len):
        vec = [defaultdict(float) for _ in range(self.n)]
        length = len(text_tokens)
        counts = defaultdict(int)
        for i in range(1, self.n + 1):
            for ngram in nltk.ngrams(text_tokens, i):
                counts[ngram] += 1
        for i in range(self.n):
            for ngram, count in counts.items():
                if len(ngram) == i + 1:
                    df = doc_freq.get(ngram, 0)
                    idf = np.log(max(1.0, ref_len) / (df + 1.0))
                    vec[i][ngram] = float(count) * idf
        return vec, length

    def _sim(self, vec_hyp, vec_ref, len_hyp, len_ref):
        delta = float(len_hyp - len_ref)
        val = np.array([0.0] * self.n)
        for i in range(self.n):
            denom_hyp = np.sqrt(sum([x**2 for x in vec_hyp[i].values()]))
            denom_ref = np.sqrt(sum([x**2 for x in vec_ref[i].values()]))
            if denom_hyp * denom_ref > 0:
                common = set(vec_hyp[i].keys()) & set(vec_ref[i].keys())
                numerator = sum([vec_hyp[i][k] * vec_ref[i][k] for k in common])
                val[i] = (numerator) / (denom_hyp * denom_ref)
            val[i] *= np.e**(-(delta**2) / (2 * self.sigma**2))
        return val

    def compute_score(self, hypothesis, references):
        hyp_tokens = word_tokenize(hypothesis.lower())
        ref_tokens_list = [word_tokenize(r.lower()) for r in references]
        vec_refs, len_refs = [], []
        for ref_tokens in ref_tokens_list:
            v, l = self._compute_vec(ref_tokens, self.document_frequency, self.ref_len)
            vec_refs.append(v)
            len_refs.append(l)
        vec_hyp, len_hyp = self._compute_vec(hyp_tokens, self.document_frequency, self.ref_len)
        scores = []
        for i in range(len(references)):
            scores.append(self._sim(vec_hyp, vec_refs[i], len_hyp, len_refs[i]))
        avg_score = np.mean(scores, axis=0)
        return np.sum(avg_score) / self.n * 10.0

# --- Execution ---
print("Calculating METEOR and CIDEr...")
cider_scorer = CiderScorer(df)
max_meteor, max_cider = [], []

for index, row in tqdm(df.iterrows(), total=len(df)):
    orig_texts = row['original_response_formatted']
    gen_texts = row['generated_response_formatted']
    orig_tokens = [word_tokenize(r) for r in orig_texts]

    m_scores = []
    c_scores = []

    for gen_text in gen_texts:
        gen_token = word_tokenize(gen_text)
        m_scores.append(meteor_score(orig_tokens, gen_token))
        c_scores.append(cider_scorer.compute_score(gen_text, orig_texts))

    max_meteor.append(max(m_scores) if m_scores else 0.0)
    max_cider.append(max(c_scores) if c_scores else 0.0)

df['METEOR_Max'] = max_meteor
df['CIDEr_Max'] = max_cider

print("\n" + "="*40)
print(f"Mean Max METEOR: {np.mean(max_meteor):.4f}")
print(f"Mean Max CIDEr:  {np.mean(max_cider):.4f}")
print("="*40)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Calculating METEOR and CIDEr...


100%|██████████| 100/100 [00:05<00:00, 19.34it/s]


Mean Max METEOR: 0.6073
Mean Max CIDEr:  0.9196





# **BERT, CLIP AND REF-CLIP SCORES**

In [None]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [None]:
# ==========================================
# FIX & CALCULATE ZERO-SHOT SEMANTIC SCORES
# ==========================================
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from bert_score import BERTScorer
from transformers import CLIPProcessor, CLIPModel
from datasets import load_dataset
from PIL import Image
import io

# --- 1. FIX: RELOAD IMAGES FROM SOURCE ---
print("Reloading images from Hugging Face (to fix CSV missing data)...")
# Load the exact dataset used for testing
ds_source = load_dataset("Adyakanta/test_flickr30k", split="test")

# Ensure the dataframe matches the dataset length
if len(df) == len(ds_source):
    print("Success! Attaching real images to the dataframe...")
    df['image'] = ds_source['image']
else:
    print(f"Warning: DataFrame length ({len(df)}) != Dataset length ({len(ds_source)}).")
    print("Attempting to match by index (first 100)...")
    df['image'] = ds_source.select(range(len(df)))['image']

# --- 2. LOAD MODELS ---
print("Loading Metric Models...")
device = "cuda" if torch.cuda.is_available() else "cpu"
bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# --- 3. SCORING FUNCTIONS ---
def get_best_bert_score(candidates, references):
    cands = [c for c in candidates if isinstance(c, str) and c.strip()]
    refs = [r for r in references if isinstance(r, str) and r.strip()]
    if not cands or not refs: return 0.0
    try:
        P, R, F1 = bert_scorer.score(cands, [refs] * len(cands))
        return F1.max().item()
    except: return 0.0

def get_clip_scores(image, candidates, references):
    cands = [c for c in candidates if isinstance(c, str) and c.strip()]
    refs = [r for r in references if isinstance(r, str) and r.strip()]
    if not cands: return 0.0, 0.0

    try:
        inputs_img = clip_processor(images=image, return_tensors="pt").to(device)
        inputs_cand = clip_processor(text=cands, return_tensors="pt", padding=True, truncation=True).to(device)

        with torch.no_grad():
            img_embeds = clip_model.get_image_features(**inputs_img)
            cand_embeds = clip_model.get_text_features(**inputs_cand)

            img_embeds /= img_embeds.norm(dim=-1, keepdim=True)
            cand_embeds /= cand_embeds.norm(dim=-1, keepdim=True)

            # CLIPScore
            clip_score = (torch.matmul(img_embeds, cand_embeds.t()).max().item()) * 100

            # RefCLIPScore
            refclip_score = 0.0
            if refs:
                inputs_ref = clip_processor(text=refs, return_tensors="pt", padding=True, truncation=True).to(device)
                ref_embeds = clip_model.get_text_features(**inputs_ref)
                ref_embeds /= ref_embeds.norm(dim=-1, keepdim=True)
                refclip_score = (torch.matmul(cand_embeds, ref_embeds.t()).max().item()) * 100

        return clip_score, refclip_score
    except Exception as e:
        print(f"CLIP Error: {e}")
        return 0.0, 0.0

# --- 4. RUN CALCULATION ---
print("Starting Score Calculation...")
bert_scores, clip_scores, refclip_scores = [], [], []

for index, row in tqdm(df.iterrows(), total=len(df)):
    # Parse text columns
    try:
        gen_raw = row['lol_of_genrated_response']
        orig_raw = row['lol_of_original_response']

        gens = eval(gen_raw) if isinstance(gen_raw, str) else gen_raw
        origs = eval(orig_raw) if isinstance(orig_raw, str) else orig_raw

        # Flatten
        gens = [x[0] if isinstance(x, list) else x for x in gens]
        origs = [x[0] if isinstance(x, list) else x for x in origs]

        # Calculate BERT (Does not need image)
        b_score = get_best_bert_score(gens, origs)

        # Calculate CLIP (Needs image)
        image = row['image']
        if isinstance(image, Image.Image):
            c_score, rc_score = get_clip_scores(image.convert("RGB"), gens, origs)
        else:
            c_score, rc_score = 0.0, 0.0

    except Exception as e:
        print(f"Row {index} error: {e}")
        b_score, c_score, rc_score = 0.0, 0.0, 0.0

    bert_scores.append(b_score)
    clip_scores.append(c_score)
    refclip_scores.append(rc_score)

df['BERTScore_Max'] = bert_scores
df['CLIPScore_Max'] = clip_scores
df['RefCLIPScore_Max'] = refclip_scores

# --- 5. RESULTS ---
print("\n" + "="*40)
print("   ZERO-SHOT SEMANTIC RESULTS")
print("="*40)
print(f"Mean Max BERTScore:    {np.mean(bert_scores):.4f}")
print(f"Mean Max CLIPScore:    {np.mean(clip_scores):.4f}")
print(f"Mean Max RefCLIPScore: {np.mean(refclip_scores):.4f}")

Reloading images from Hugging Face (to fix CSV missing data)...


README.md:   0%|          | 0.00/469 [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/14.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Success! Attaching real images to the dataframe...
Loading Metric Models...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Starting Score Calculation...


100%|██████████| 100/100 [00:08<00:00, 12.48it/s]


   ZERO-SHOT SEMANTIC RESULTS
Mean Max BERTScore:    0.5784
Mean Max CLIPScore:    32.3236
Mean Max RefCLIPScore: 86.8757





# **DISTINCT-1 AND DISTINCT-2**

In [None]:
# ==========================================
# ZERO-SHOT: CALCULATE DISTINCT-1 & DISTINCT-2
# ==========================================
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# Ensure NLTK resources
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
    nltk.download('punkt_tab')

# --- Function to calculate Distinct-N ---
def calculate_group_distinct_n(captions_list, n):
    """
    Calculates Distinct-N for a group of captions.
    Formula: (Unique N-grams) / (Total N-grams)
    """
    all_ngrams = []
    for caption in captions_list:
        # Tokenize
        tokens = word_tokenize(caption.lower())
        if len(tokens) < n:
            continue
        # Generate n-grams
        row_ngrams = list(ngrams(tokens, n))
        all_ngrams.extend(row_ngrams)

    if len(all_ngrams) == 0:
        return 0.0

    return len(set(all_ngrams)) / len(all_ngrams)

print("Calculating Zero-Shot Diversity Metrics...")

dist1_scores = []
dist2_scores = []
all_generated_text_corpus = []

# Iterate over the Zero-Shot DataFrame (assuming 'df')
for index, row in df.iterrows():
    try:
        # Get generated captions
        # Ensure we are using the standardized list format from preprocessing
        # If 'generated_response_formatted' exists (from Cell 1), use it.
        # Otherwise, try parsing the original raw column.
        if 'generated_response_formatted' in df.columns:
            gens = row['generated_response_formatted']
        else:
            # Fallback parsing if preprocessing wasn't run
            raw = row['lol_of_genrated_response']
            gens = eval(raw) if isinstance(raw, str) else raw
            # Flatten if needed [['cap']] -> ['cap']
            gens = [x[0] if isinstance(x, list) else x for x in gens]

        # Add to global corpus
        all_generated_text_corpus.extend(gens)

        # Calculate Intra-Image Diversity
        d1 = calculate_group_distinct_n(gens, 1)
        d2 = calculate_group_distinct_n(gens, 2)

        dist1_scores.append(d1)
        dist2_scores.append(d2)

    except Exception as e:
        print(f"Error at row {index}: {e}")
        dist1_scores.append(0.0)
        dist2_scores.append(0.0)

# Add scores to DataFrame
df['Distinct_1_Image'] = dist1_scores
df['Distinct_2_Image'] = dist2_scores

# --- Calculate Corpus-Level Diversity ---
corpus_d1 = calculate_group_distinct_n(all_generated_text_corpus, 1)
corpus_d2 = calculate_group_distinct_n(all_generated_text_corpus, 2)

# --- REPORT ---
print("\n" + "="*40)
print("   ZERO-SHOT DIVERSITY ANALYSIS")
print("="*40)

print("1. PER-IMAGE DIVERSITY (Average variation within 5 captions):")
print(f"   Mean Distinct-1: {np.mean(dist1_scores):.4f}")
print(f"   Mean Distinct-2: {np.mean(dist2_scores):.4f}")

print("\n2. CORPUS-LEVEL DIVERSITY (Global Vocabulary Richness):")
print(f"   Corpus Distinct-1: {corpus_d1:.4f}")
print(f"   Corpus Distinct-2: {corpus_d2:.4f}")

print("\nDetailed Stats:")
print(df[['Distinct_1_Image', 'Distinct_2_Image']].describe())

Calculating Zero-Shot Diversity Metrics...

   ZERO-SHOT DIVERSITY ANALYSIS
1. PER-IMAGE DIVERSITY (Average variation within 5 captions):
   Mean Distinct-1: 0.3885
   Mean Distinct-2: 0.6228

2. CORPUS-LEVEL DIVERSITY (Global Vocabulary Richness):
   Corpus Distinct-1: 0.0897
   Corpus Distinct-2: 0.3113

Detailed Stats:
       Distinct_1_Image  Distinct_2_Image
count        100.000000        100.000000
mean           0.388470          0.622797
std            0.055113          0.085472
min            0.262500          0.400000
25%            0.352354          0.563913
50%            0.384552          0.629101
75%            0.430972          0.687500
max            0.532468          0.800000


# **Complete Scores for all the 100 Test Images**

In [None]:
!pip install bert_score



In [None]:
import pandas as pd
from tqdm import tqdm
import torch
from transformers import TextStreamer

In [None]:
!pip install bert_score torchmetrics rouge_score pycocoevalcap
!pip install git+https://github.com/openai/CLIP.git
!pip install rouge_score


Collecting torchmetrics
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycocoevalcap
  Downloading pycocoevalcap-1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycocoevalcap-1.2-py3-none-any.whl (104.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created 

In [None]:
import pandas as pd
import numpy as np
import torch
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer
from bert_score import BERTScorer
from transformers import CLIPProcessor, CLIPModel
import nltk

# --- 1. Setup ---
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

device = "cuda" if torch.cuda.is_available() else "cpu"

# --- 2. Load Models ---
print("Loading Evaluation Models...")
bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True, device=device)

clip_model_name = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(clip_model_name).to(device)
clip_processor = CLIPProcessor.from_pretrained(clip_model_name)

scorer_rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
cc = SmoothingFunction()

# --- 3. Helper Functions ---
def get_clip_score_image_cosine(image, candidate, model, processor, device):
    """
    Returns Cosine Similarity (Image-Text). Range approx 0.2 - 0.4 for COCO-style.
    """
    try:
        inputs = processor(text=[candidate[:77]], images=image, return_tensors="pt", padding=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # Extract features and normalize
        img_embeds = outputs.image_embeds / outputs.image_embeds.norm(dim=-1, keepdim=True)
        text_embeds = outputs.text_embeds / outputs.text_embeds.norm(dim=-1, keepdim=True)
        cosine_sim = (img_embeds @ text_embeds.T).item()
        return max(0.0, cosine_sim)
    except Exception as e:
        return 0.0

def get_refclip_score_text_cosine(refs, candidate, model, processor, device):
    """
    Returns Cosine Similarity (Text-Text) between Candidate and References. Range approx 0.7 - 0.9.
    """
    scores = []
    try:
        # Candidate Embedding
        cand_inputs = processor(text=[candidate[:77]], return_tensors="pt", padding=True).to(device)
        with torch.no_grad():
            cand_outputs = model.get_text_features(**cand_inputs)
        cand_embed = cand_outputs / cand_outputs.norm(dim=-1, keepdim=True)

        # Reference Embeddings
        for r in refs:
            ref_inputs = processor(text=[r[:77]], return_tensors="pt", padding=True).to(device)
            with torch.no_grad():
                ref_outputs = model.get_text_features(**ref_inputs)
            ref_embed = ref_outputs / ref_outputs.norm(dim=-1, keepdim=True)

            # Cosine
            sim = (cand_embed @ ref_embed.T).item()
            scores.append(max(0.0, sim))

        return max(scores) if scores else 0.0
    except Exception as e:
        return 0.0

def calculate_distinct_n(captions, n):
    total_ngrams = 0
    unique_ngrams = set()
    for cap in captions:
        words = word_tokenize(cap.lower())
        if len(words) < n: continue
        ngrams = list(zip(*[words[i:] for i in range(n)]))
        total_ngrams += len(ngrams)
        unique_ngrams.update(ngrams)
    return len(unique_ngrams) / total_ngrams if total_ngrams > 0 else 0

# --- 4. Main Evaluation Loop ---
results_data = []
# Assuming 'df' is already loaded with your 100 images
print(f"Starting Evaluation on {len(df)} images...")

for index, row in df.iterrows():
    candidates = row['generated_response_formatted']
    refs = row['original_response_formatted']
    if isinstance(refs, str): refs = [refs]
    image = row['image']

    # Tokenize Refs
    refs_tok = [word_tokenize(r.lower()) for r in refs]

    best_scores = {
        'BLEU-1': 0.0, 'BLEU-2': 0.0, 'BLEU-3': 0.0, 'BLEU-4': 0.0,
        'ROUGE-1': 0.0, 'ROUGE-2': 0.0, 'ROUGE-L': 0.0,
        'METEOR': 0.0, 'BERTScore': 0.0, 'CLIPScore': 0.0, 'RefCLIPScore': 0.0
    }

    for cand in candidates:
        cand_clean = cand.lower()
        cand_tok = word_tokenize(cand_clean)

        # --- Lexical ---
        b1 = sentence_bleu(refs_tok, cand_tok, weights=(1, 0, 0, 0), smoothing_function=cc.method1)
        b2 = sentence_bleu(refs_tok, cand_tok, weights=(0.5, 0.5, 0, 0), smoothing_function=cc.method1)
        b3 = sentence_bleu(refs_tok, cand_tok, weights=(0.33, 0.33, 0.33, 0), smoothing_function=cc.method1)
        b4 = sentence_bleu(refs_tok, cand_tok, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=cc.method1)
        met = meteor_score(refs_tok, cand_tok)

        # --- ROUGE ---
        r1, r2, rl = 0, 0, 0
        for r in refs:
            s = scorer_rouge.score(r.lower(), cand_clean)
            r1 = max(r1, s['rouge1'].fmeasure)
            r2 = max(r2, s['rouge2'].fmeasure)
            rl = max(rl, s['rougeL'].fmeasure)

        # --- Semantic ---
        # BERT (0-1)
        try:
            P, R, F1 = bert_scorer.score([cand], [refs])
            bert_val = F1.mean().item()
        except: bert_val = 0.0

        # CLIP (Image-Text Cosine)
        clip_val = get_clip_score_image_cosine(image, cand, clip_model, clip_processor, device)

        # RefCLIP (Text-Text Cosine)
        refclip_val = get_refclip_score_text_cosine(refs, cand, clip_model, clip_processor, device)

        # --- Update Max ---
        best_scores['BLEU-1'] = max(best_scores['BLEU-1'], b1)
        best_scores['BLEU-2'] = max(best_scores['BLEU-2'], b2)
        best_scores['BLEU-3'] = max(best_scores['BLEU-3'], b3)
        best_scores['BLEU-4'] = max(best_scores['BLEU-4'], b4)
        best_scores['ROUGE-1'] = max(best_scores['ROUGE-1'], r1)
        best_scores['ROUGE-2'] = max(best_scores['ROUGE-2'], r2)
        best_scores['ROUGE-L'] = max(best_scores['ROUGE-L'], rl)
        best_scores['METEOR'] = max(best_scores['METEOR'], met)
        best_scores['BERTScore'] = max(best_scores['BERTScore'], bert_val)

        # Store scaled (0-100)
        best_scores['CLIPScore'] = max(best_scores['CLIPScore'], clip_val * 100)
        best_scores['RefCLIPScore'] = max(best_scores['RefCLIPScore'], refclip_val * 100)

    # CIDEr & Diversity
    cid = row.get('CIDEr_Max', 0.0)
    dist1 = calculate_distinct_n(candidates, 1)
    dist2 = calculate_distinct_n(candidates, 2)

    # Save
    row_dict = {'image_id': index}
    row_dict.update(best_scores)
    row_dict['CIDEr'] = cid
    row_dict['Distinct-1'] = dist1
    row_dict['Distinct-2'] = dist2

    results_data.append(row_dict)

# --- 5. Output ---
df_metrics = pd.DataFrame(results_data)
output_file = 'qwen3_FT_zs_full_metrics.csv'
df_metrics.to_csv(output_file, index=False)
print(f"Saved: {output_file}")
print("Mean Scores:")
print(df_metrics.mean(numeric_only=True))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading Evaluation Models...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting Evaluation on 100 images...
Saved: qwen3_FT_zs_full_metrics.csv
Mean Scores:
image_id        49.500000
BLEU-1           0.724106
BLEU-2           0.556070
BLEU-3           0.434304
BLEU-4           0.325995
ROUGE-1          0.666457
ROUGE-2          0.448317
ROUGE-L          0.636952
METEOR           0.607328
BERTScore        0.578431
CLIPScore       32.122709
RefCLIPScore    87.030998
CIDEr            0.919568
Distinct-1       0.388470
Distinct-2       0.622797
dtype: float64
