In [1]:
!pip install transformers datasets peft torch accelerate pillow bitsandbytes 

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1

In [2]:
import os
import json
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import Blip2ForConditionalGeneration, Blip2Processor, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType

2025-05-12 07:15:09.944681: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747034110.174783      55 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747034110.241668      55 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
class VQADataset(Dataset):
    """Visual question answering samples read from per-image JSON annotation files.

    Every ``*.json`` file found (recursively) under ``json_dirs`` must contain an
    ``"image_path"`` string and a ``"qa_pairs"`` list whose items carry
    ``"question"`` and ``"answer"`` entries. Each question/answer pair becomes
    one sample that points at the image on disk.

    Args:
        json_dirs: A single directory (``str`` or ``os.PathLike``) or an
            iterable of directories to scan for annotation files.
        base_img_path: Directory that the (normalised) relative image paths
            are joined onto.
        processor: Callable processor exposing a ``tokenizer`` attribute; it is
            called with ``images=`` / ``text=`` in ``__getitem__``.
        max_question_length: Length the question is padded/truncated to.
        max_answer_length: Length the answer token ids are padded/truncated to.
    """

    def __init__(self, json_dirs, base_img_path, processor,
                 max_question_length=128, max_answer_length=10):
        self.samples = []
        self.processor = processor
        self.base_img_path = base_img_path
        self.max_question_length = max_question_length
        self.max_answer_length = max_answer_length

        print(f"📁 Scanning JSON folders: {json_dirs}")

        # A lone path (str or Path-like) is treated as a one-element list.
        if isinstance(json_dirs, (str, os.PathLike)):
            json_dirs = [json_dirs]

        for dir_path in json_dirs:
            for root, dirs, files in os.walk(dir_path):
                # os.walk yields entries in arbitrary order. Sorting both the
                # sub-directories (in place, which steers the walk) and the
                # files keeps the sample order identical across runs, which
                # index-based resume logic relies on.
                dirs.sort()
                for file in sorted(files):
                    if not file.endswith(".json"):
                        continue

                    json_path = os.path.join(root, file)
                    with open(json_path, "r", encoding="utf-8") as f:
                        data = json.load(f)

                    # Annotations may carry Windows separators.
                    image_rel_path = data["image_path"].replace("\\", "/")

                    # Keep only the part below the "abo-images-small/" folder
                    # so the path can be re-rooted at base_img_path.
                    if "abo-images-small/" in image_rel_path:
                        image_rel_path = image_rel_path.split("abo-images-small/", 1)[1]

                    image_path = os.path.join(base_img_path, image_rel_path)

                    for qa in data["qa_pairs"]:
                        self.samples.append({
                            "image_path": image_path,
                            "question": qa["question"],
                            "answer": qa["answer"]
                        })

        print(f"✅ Loaded {len(self.samples)} samples.")

    def __len__(self):
        """Return the number of question/answer samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the processed model inputs for sample ``idx``.

        The result is the processor output for the image and question with the
        leading batch dimension squeezed away, plus a ``"labels"`` entry holding
        the answer token ids. Padding positions in ``"labels"`` keep the
        tokenizer's pad id (they are not replaced by -100).
        """
        sample = self.samples[idx]

        # convert() returns a new, fully loaded image, so the file handle can
        # be released as soon as the conversion is done.
        with Image.open(sample["image_path"]) as raw_image:
            image = raw_image.convert("RGB")

        inputs = self.processor(
            images=image,
            text=sample["question"],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_question_length
        )

        labels = self.processor.tokenizer(
            sample["answer"],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_answer_length
        ).input_ids

        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        inputs["labels"] = labels.squeeze(0)

        # Drop inputs_embeds if the processor happened to return it.
        inputs.pop("inputs_embeds", None)

        return inputs

In [4]:
def compute_metrics(pred, tokenizer=None):
    """Compute case-insensitive exact-match accuracy between predictions and labels.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids`` (token-id
            arrays). ``predictions`` may also be a tuple whose first element
            is the array, or a 3-D logits array; logits are reduced with
            ``argmax`` over the last axis.
        tokenizer: Optional tokenizer providing ``batch_decode`` and
            ``pad_token_id``. When omitted, the tokenizer of the
            ``Salesforce/blip-vqa-base`` processor is loaded once and reused
            on later calls.

    Returns:
        ``{"accuracy": float}`` with the fraction of decoded predictions that
        equal the decoded label after stripping and lower-casing. ``0.0`` is
        returned when there are no labels.
    """
    import numpy as np

    if tokenizer is None:
        # Loading a processor touches the disk/network, so do it only once
        # and keep it on the function object for subsequent evaluations.
        tokenizer = getattr(compute_metrics, "_tokenizer", None)
        if tokenizer is None:
            # The checkpoint is a BLIP (v1) model, so use the matching processor.
            from transformers import BlipProcessor
            tokenizer = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base").tokenizer
            compute_metrics._tokenizer = tokenizer

    preds = pred.predictions
    labels = pred.label_ids

    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    # (batch, seq, vocab) logits -> (batch, seq) token ids.
    if preds.ndim == 3:
        preds = preds.argmax(axis=-1)

    # Negative ids (e.g. the -100 ignore index) cannot be decoded; map them to
    # the pad token so skip_special_tokens removes them.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = 0
    preds = np.where(preds < 0, pad_id, preds)
    labels = np.where(labels < 0, pad_id, labels)

    pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

    correct = sum(p.strip().lower() == l.strip().lower() for p, l in zip(pred_texts, label_texts))
    acc = correct / len(label_texts) if label_texts else 0.0
    print(f"✅ Accuracy: {acc:.4f}")
    return {"accuracy": acc}

In [5]:
from transformers import BlipProcessor, BlipForQuestionAnswering, TrainingArguments, Trainer
import torch
from torch.utils.data import random_split
from datasets import load_dataset

In [6]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=1b43c5660a13ec74a88c95cc114b441f9fab3345315a235c6f52d4a9f34695f0
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [7]:
!pip install rouge-score nltk sacrebleu --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
pip install bert-score nltk rouge-score fuzzywuzzy python-Levenshtein


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m10.3 MB/s[0m eta [36m

In [9]:
import nltk
# Fetch the WordNet corpus and the Open Multilingual WordNet data.
# NOTE(review): presumably required by the NLTK METEOR metric imported in a
# later cell — confirm. The value of the last call is the cell's output.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

In [10]:
!git clone https://github.com/neulab/BARTScore.git

Cloning into 'BARTScore'...
remote: Enumerating objects: 220, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 220 (delta 18), reused 14 (delta 14), pack-reused 194 (from 1)[K
Receiving objects: 100% (220/220), 101.98 MiB | 23.45 MiB/s, done.
Resolving deltas: 100% (47/47), done.
Updating files: 100% (192/192), done.


In [11]:
%cd BARTScore


/kaggle/working/BARTScore


In [12]:
!pip install transformers sentencepiece rouge-score




In [13]:
import torch
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


In [14]:
import sys
# Make the cloned BARTScore repository importable. The membership check keeps
# the cell idempotent: re-running it does not pile up duplicate entries.
if '/kaggle/working/BARTScore' not in sys.path:
    sys.path.append('/kaggle/working/BARTScore')

In [15]:
!pip install bert-score
!pip install scikit-learn



V8 TEST

In [16]:
import os
import json
import time
import numpy as np
import torch
from torch.amp import autocast
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import BlipProcessor, BlipForQuestionAnswering
from accelerate import Accelerator
from sklearn.metrics import f1_score
from bert_score import score as bert_score_fn
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import single_meteor_score
from fuzzywuzzy import fuzz
from bart_score import BARTScorer
import nltk
from nltk.tokenize import word_tokenize
import string
from nltk.translate.bleu_score import SmoothingFunction


# NLTK resources used by the metrics below. 'punkt_tab' is requested as well
# because recent NLTK releases make word_tokenize look up that package instead
# of the pickled 'punkt' models; downloading both works on old and new
# versions alike.
for nltk_resource in ('wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)


# NOTE(review): presumably set to silence the tokenizers fork warning once
# DataLoader workers are started — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Accelerate chooses the execution device; everything below uses `device`.
accelerator = Accelerator()
device = accelerator.device


# Load the fine-tuned checkpoint; fall back to the public base checkpoint when
# that fails so the rest of the notebook can still be exercised.
load_path = "/kaggle/input/blip-finetunedmodel-versions/model_latest_v8"
try:
    model = BlipForQuestionAnswering.from_pretrained(load_path)
    processor = BlipProcessor.from_pretrained(load_path)
except Exception as e:
    print(f"Could not load model/processor from {load_path}. Falling back to default BLIP model for demonstration.")
    # Surface the actual reason and make it explicit that every metric
    # reported below then describes the base model, not the fine-tuned one.
    print(f"Load error: {e!r}")
    print("WARNING: metrics below are computed with Salesforce/blip-vqa-base, not the fine-tuned checkpoint.")
    model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
    processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")

# Let Accelerate place/wrap the model for the selected device.
model = accelerator.prepare(model)

# BARTScore is optional: when the scorer cannot be built, `bart_scorer` stays
# None and the metric is skipped later on.
bart_scorer = None
try:
    # Only CUDA is forwarded to the scorer; every other device type maps to CPU.
    bart_scorer_device = device.type if device.type == 'cuda' else 'cpu'
    bart_scorer = BARTScorer(
        checkpoint='facebook/bart-large-cnn',
        device=bart_scorer_device,
    )
    print(f"BARTScorer initialized on {bart_scorer_device}.")
except Exception as e:
    bart_scorer = None
    print(f"Could not initialize BARTScorer: {e}. BARTScore will not be calculated.")


# Annotation folder and image root of the held-out test split.
test_json_dir = "/kaggle/input/master-test/test_dataset"
test_image_dir = "/kaggle/input/abo-dataset"
test_dataset = VQADataset(test_json_dir, test_image_dir, processor)


# Evaluation runs in a fixed, unshuffled order so that batch indices stay
# meaningful for the resume logic further down.
batch_size = 100
num_workers = 14
loader_options = {
    "batch_size": batch_size,
    "shuffle": False,
    "num_workers": num_workers,
    "pin_memory": True,
    # Persistent workers are only valid when worker processes exist.
    "persistent_workers": num_workers > 0,
}
test_loader = DataLoader(test_dataset, **loader_options)


# Directory and file that hold the evaluation checkpoint.
resume_dir = "/kaggle/working/vqa_eval_output_2"
os.makedirs(resume_dir, exist_ok=True)
resume_file = os.path.join(resume_dir, "eval_resume.json")


print(resume_dir)

# Fresh-start state; replaced below only when a valid checkpoint is found.
processed_indices = set()
predicted_all, true_all = [], []
correct = total = 0

if os.path.exists(resume_file):
    try:
        with open(resume_file, "r", encoding="utf-8") as f:
            resume_data = json.load(f)

        loaded_indices = set(resume_data["indices"])
        loaded_predicted = list(resume_data["predicted"])
        loaded_true = list(resume_data["true"])
        loaded_correct = int(resume_data["correct"])
        loaded_total = int(resume_data["total"])

        # The evaluation loop resumes by skipping the first
        # len(processed_indices) batches, so the checkpoint is only usable when
        # the processed batches are exactly 0..k-1 and the per-sample lists
        # agree with the stored sample count.
        if loaded_indices != set(range(len(loaded_indices))):
            raise ValueError("processed batch indices are not a contiguous prefix")
        if not (len(loaded_predicted) == len(loaded_true) == loaded_total):
            raise ValueError("prediction/label counts do not match the stored total")
    except (OSError, ValueError, KeyError, TypeError) as e:
        # json.JSONDecodeError is a ValueError, so corrupt files land here too.
        print("Error decoding resume file. Starting evaluation from scratch.")
        print(f"Reason: {e!r}")
    else:
        processed_indices = loaded_indices
        predicted_all = loaded_predicted
        true_all = loaded_true
        correct = loaded_correct
        total = loaded_total
        print(f"Resume data loaded: {len(predicted_all)} predictions, {len(true_all)} true values. Processed batches: {len(processed_indices)}")
else:
    print("Default values initialized.")

# Number of leading batches that are already done and will be skipped.
initial_batches_processed = len(processed_indices)

def _save_eval_state(path, indices, predicted, true, n_correct, n_total):
    """Write the evaluation progress to ``path`` as JSON, atomically.

    The state is first written to a sibling temporary file and then moved into
    place, so an interruption during the write cannot leave a truncated
    checkpoint behind.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as f:
        json.dump({
            "indices": sorted(indices),
            "predicted": predicted,
            "true": true,
            "correct": n_correct,
            "total": n_total
        }, f)
    os.replace(tmp_path, path)


model.eval()
batch_count = len(test_loader)
start_time = time.time()

# When resuming, iterate only over the samples that are still pending instead
# of loading (and discarding) every already-processed batch. The loader is
# unshuffled, so batch k always covers samples [k * batch_size, (k + 1) * batch_size).
if initial_batches_processed > 0:
    first_pending_sample = min(initial_batches_processed * batch_size, len(test_dataset))
    pending_samples = torch.utils.data.Subset(
        test_dataset, range(first_pending_sample, len(test_dataset))
    )
    eval_loader = DataLoader(
        pending_samples,
        batch_size=batch_size,
        shuffle=False,
        num_workers=test_loader.num_workers,
        pin_memory=test_loader.pin_memory,
    )
else:
    eval_loader = test_loader

with torch.no_grad():

    # `initial` makes the bar start at the resumed position; the iteration
    # itself advances it, so no manual update() calls are needed.
    pbar = tqdm(eval_loader, total=batch_count, initial=initial_batches_processed, desc="Evaluating")
    for batch_idx, batch in enumerate(pbar, start=initial_batches_processed):

        pixel_values = batch["pixel_values"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        answers = batch["labels"]

        # Greedy decoding under half-precision autocast.
        with autocast(device_type=device.type, dtype=torch.float16):
            generated_ids = model.generate(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask,
                max_length=20,
                do_sample=False,
                num_beams=1
            )

        predicted_answers = processor.batch_decode(generated_ids, skip_special_tokens=True)
        # The reference answers arrive as token ids; decode them back to text.
        decoded_answers = processor.tokenizer.batch_decode(answers, skip_special_tokens=True)

        for pred, true_ans_str in zip(predicted_answers, decoded_answers):
            pred_clean = pred.strip().lower()
            true_clean = true_ans_str.strip().lower()

            predicted_all.append(pred_clean)
            true_all.append(true_clean)

            if pred_clean == true_clean:
                correct += 1
            total += 1

        processed_indices.add(batch_idx)

        # Checkpoint whenever the running sample count has just crossed a
        # multiple of 1000, and always after the last batch.
        crossed_checkpoint = total >= 1000 and total % 1000 < batch_size
        if crossed_checkpoint or batch_idx == batch_count - 1:
            _save_eval_state(resume_file, processed_indices, predicted_all, true_all, correct, total)


# Final save, also covering the case in which the loop body never ran.
_save_eval_state(resume_file, processed_indices, predicted_all, true_all, correct, total)
print("[Final Save] Evaluation state saved.")

end_time = time.time()
elapsed = end_time - start_time

if total == 0:
    print("\nNo samples were processed. Cannot calculate metrics.")
    # In a notebook, exit() only schedules a kernel shutdown and lets the cell
    # keep running (straight into a division by zero). Raising SystemExit stops
    # execution here, in a notebook and in a plain script alike.
    raise SystemExit

exact_match = 100 * correct / total

# Defaults reported when a metric cannot be computed.
bert_score_f1 = bert_score_precision = bert_score_recall = 0.0
avg_bleu = avg_rouge_l = avg_meteor = 0.0
avg_jaccard = avg_dice = avg_lcs = avg_token_overlap = avg_fuzzy = 0.0
avg_vqa_acc = 0.0
avg_vccs = 0.0
bart_score_value = 0.0

# Several metrics need the same strings tokenised; memoise the result so each
# distinct string is tokenised once. The cached lists are shared, so callers
# must not mutate them.
_token_cache = {}


def _tokens(text):
    """Return ``word_tokenize(text)``, cached per distinct string."""
    cached = _token_cache.get(text)
    if cached is None:
        cached = word_tokenize(text)
        _token_cache[text] = cached
    return cached


def jaccard_sim(a, b):
    """Jaccard similarity of the token sets of ``a`` and ``b`` (0.0 if both are empty)."""
    set_a, set_b = set(_tokens(a)), set(_tokens(b))
    union = set_a | set_b
    return len(set_a & set_b) / len(union) if union else 0.0


def dice_sim(a, b):
    """Sørensen–Dice coefficient of the token sets of ``a`` and ``b`` (0.0 if both are empty)."""
    set_a, set_b = set(_tokens(a)), set(_tokens(b))
    return 2 * len(set_a & set_b) / (len(set_a) + len(set_b)) if set_a or set_b else 0.0


def lcs(a_tokens, b_tokens):
    """Length of the longest common subsequence of two token sequences, as a float."""
    # Only the previous row of the DP table is needed at any time.
    prev_row = [0] * (len(b_tokens) + 1)
    for a_tok in a_tokens:
        curr_row = [0]
        for j, b_tok in enumerate(b_tokens):
            if a_tok == b_tok:
                curr_row.append(prev_row[j] + 1)
            else:
                curr_row.append(max(prev_row[j + 1], curr_row[j]))
        prev_row = curr_row
    return float(prev_row[-1])


def token_overlap(a, b):
    """Shared distinct tokens over all distinct tokens of ``a`` and ``b``.

    This is the same quantity as ``jaccard_sim``; it is kept as a separately
    reported metric.
    """
    return jaccard_sim(a, b)


def visual_contextual_consistency_score(ground_truth, predicted_answer):
    """Jaccard overlap of lower-cased tokens after dropping punctuation tokens.

    Returns 1.0 when both token sets are empty and 0.0 when exactly one is.
    """
    gt_tokens = _tokens(ground_truth.lower())
    pred_tokens = _tokens(predicted_answer.lower())

    # A token is dropped when it occurs inside the string.punctuation string.
    set_gt = {word for word in gt_tokens if word not in string.punctuation}
    set_pred = {word for word in pred_tokens if word not in string.punctuation}

    if not set_gt and not set_pred:
        return 1.0
    if not set_gt or not set_pred:
        return 0.0

    return len(set_gt & set_pred) / len(set_gt | set_pred)


if len(predicted_all) > 0 and len(true_all) > 0:
    P, R, F1 = bert_score_fn(predicted_all, true_all, lang="en", verbose=False)
    bert_score_f1 = F1.mean().item()
    bert_score_precision = P.mean().item()
    bert_score_recall = R.mean().item()

    # (reference, hypothesis) pairs reused by every per-sample metric below.
    pairs = list(zip(true_all, predicted_all))

    smoother = SmoothingFunction().method1

    bleu_scores = [
        sentence_bleu([_tokens(ref)], _tokens(hyp), smoothing_function=smoother)
        for ref, hyp in pairs
    ]
    avg_bleu = np.mean(bleu_scores)

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l_scores = [scorer.score(ref, hyp)["rougeL"].fmeasure for ref, hyp in pairs]
    avg_rouge_l = np.mean(rouge_l_scores)

    meteor_scores = [single_meteor_score(_tokens(ref), _tokens(hyp)) for ref, hyp in pairs]
    avg_meteor = np.mean(meteor_scores)

    jaccard_scores = [jaccard_sim(a, b) for a, b in pairs]
    avg_jaccard = np.mean(jaccard_scores)

    dice_scores = [dice_sim(a, b) for a, b in pairs]
    avg_dice = np.mean(dice_scores)

    # LCS length normalised by the longer of the two token sequences.
    lcs_ratios = []
    for a, b in pairs:
        a_tokens, b_tokens = _tokens(a), _tokens(b)
        longest = max(len(a_tokens), len(b_tokens))
        lcs_ratios.append(lcs(a_tokens, b_tokens) / longest if longest > 0 else 0)
    avg_lcs = np.mean(lcs_ratios)

    token_overlaps = [token_overlap(a, b) for a, b in pairs]
    avg_token_overlap = np.mean(token_overlaps)

    fuzzy_scores = [fuzz.ratio(a, b)/100 for a, b in pairs]
    avg_fuzzy = np.mean(fuzzy_scores)

    vqa_accs = [1.0 if p == t else 0.0 for p, t in zip(predicted_all, true_all)]
    avg_vqa_acc = np.mean(vqa_accs)

    vccs_scores = [visual_contextual_consistency_score(gt, pred) for gt, pred in pairs]
    avg_vccs = np.mean(vccs_scores)

    if bart_scorer is not None:
        try:
            bart_scores_list = bart_scorer.score(predicted_all, true_all, batch_size=4)
            bart_score_value = np.mean(bart_scores_list)
        except Exception as e:
            print(f"Error calculating BARTScore: {e}")
            bart_score_value = 0.0
    else:
        print("BARTScorer not initialized, skipping BARTScore calculation.")

# Build the report once so that the console output and the saved file cannot
# drift apart.
report_lines = [
    "📊 Evaluation Metrics:",
    f"✅ Exact Match (EM): {exact_match:.2f}%",
    f"🤖 BERTScore - Precision: {bert_score_precision:.4f}",
    f"🤖 BERTScore - Recall:    {bert_score_recall:.4f}",
    f"🤖 BERTScore - F1:        {bert_score_f1:.4f}",
    f"🔹 BLEU Score:            {avg_bleu:.4f}",
    f"🔹 ROUGE-L:               {avg_rouge_l:.4f}",
    f"🔹 METEOR:                {avg_meteor:.4f}",
    f"🔹 Jaccard Similarity:    {avg_jaccard:.4f}",
    f"🔹 Sørensen–Dice Coefficient: {avg_dice:.4f}",
    f"🔹 LCS Ratio:             {avg_lcs:.4f}",
    f"🔹 Token-Level Overlap:   {avg_token_overlap:.4f}",
    f"🔹 Fuzzy Matching Score:  {avg_fuzzy:.4f}",
    f"🔹 VQA Accuracy:          {avg_vqa_acc*100:.2f}%",
    f"🔹 Visual-Contextual Consistency Score (VCCS): {avg_vccs:.4f}",
    f"🔹 BARTScore:             {bart_score_value:.4f}",
    "",
    f"Total elapsed time: {elapsed:.2f} seconds",
]
report_text = "\n".join(report_lines)

print(f"\n{report_text}")


metrics_file_path = os.path.join(resume_dir, "evaluation_metrics.txt")

# The report contains emoji and other non-ASCII characters, so the encoding is
# fixed explicitly instead of depending on the platform default.
with open(metrics_file_path, "w", encoding="utf-8") as f:
    f.write(report_text + "\n")

print(f"\nAll metrics saved to: {metrics_file_path}")

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

BARTScorer initialized on cuda.
📁 Scanning JSON folders: /kaggle/input/master-test/test_dataset




✅ Loaded 482036 samples.
/kaggle/working/vqa_eval_output_2
Default values initialized.


Evaluating: 100%|██████████| 4821/4821 [4:21:05<00:00,  3.25s/it]  


[Final Save] Evaluation state saved.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



📊 Evaluation Metrics:
✅ Exact Match (EM): 19.97%
🤖 BERTScore - Precision: 0.9544
🤖 BERTScore - Recall:    0.9345
🤖 BERTScore - F1:        0.9435
🔹 BLEU Score:            0.0369
🔹 ROUGE-L:               0.2085
🔹 METEOR:                0.1083
🔹 Jaccard Similarity:    0.2044
🔹 Sørensen–Dice Coefficient: 0.2067
🔹 LCS Ratio:             0.2051
🔹 Token-Level Overlap:   0.2044
🔹 Fuzzy Matching Score:  0.3778
🔹 VQA Accuracy:          19.97%
🔹 Visual-Contextual Consistency Score (VCCS): 0.2043
🔹 BARTScore:             -5.8681

Total elapsed time: 15665.46 seconds

All metrics saved to: /kaggle/working/vqa_eval_output_2/evaluation_metrics.txt
