In [None]:
!pip install -q -U "transformers>=4.41.0" "datasets>=2.18.0" "accelerate>=0.28.0" "peft>=0.10.0" "bitsandbytes>=0.41.3"

In [None]:
import os
import torch
from datasets import load_from_disk, Dataset
from peft import PeftModel
from tqdm import tqdm
from collections import Counter
import re
import random
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from transformers import (
    PaliGemmaForConditionalGeneration,
    PaliGemmaProcessor,
    BitsAndBytesConfig
)

In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Read the secret stored under the name "HF_TOKEN" through Kaggle's secrets
# client, so the token itself never appears in the notebook source.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

# Log in to the Hugging Face Hub with that token.
# NOTE(review): presumably needed so the later from_pretrained calls can
# download the base model and the adapter — confirm.
login(token=hf_token)

In [None]:
def process_image(image):
    """Convert a NumPy array or a PIL image into an RGB ``PIL.Image.Image``.

    Args:
        image: Either a ``numpy.ndarray`` laid out as ``(H, W)``,
            ``(H, W, 1)``, ``(H, W, 3)`` or ``(H, W, 4)``, or a
            ``PIL.Image.Image`` in any mode.

    Returns:
        PIL.Image.Image: The image in ``RGB`` mode. A PIL image that is
        already ``RGB`` is returned as-is.

    Raises:
        ValueError: If ``image`` is neither an ndarray nor a PIL image, or if
            it is an ndarray whose shape is not one of the supported layouts.
    """
    if isinstance(image, np.ndarray):
        if image.ndim == 2:
            # Single-channel (grayscale) array.
            return Image.fromarray(image, mode='L').convert('RGB')
        if image.ndim == 3:
            channels = image.shape[2]
            if channels == 1:
                # Drop only the channel axis; a bare squeeze() would also
                # remove H or W when either of them happens to be 1.
                return Image.fromarray(image.squeeze(axis=2), mode='L').convert('RGB')
            if channels == 3:
                return Image.fromarray(image, mode='RGB')
            if channels == 4:
                # Discard the alpha channel.
                return Image.fromarray(image, mode='RGBA').convert('RGB')
        # Previously such arrays were silently returned unconverted; fail
        # loudly instead, in the same way as for unsupported types.
        raise ValueError(f"Hình dạng mảng ảnh không được hỗ trợ: {image.shape}")
    if isinstance(image, Image.Image):
        return image if image.mode == 'RGB' else image.convert('RGB')
    raise ValueError(f"Định dạng ảnh không được hỗ trợ: {type(image)}")

def normalize_answer(s):
    """Normalize an answer string for VQA-style exact matching.

    Steps, in order: lowercase, replace the standalone articles
    ``a`` / ``an`` / ``the`` with a space, delete ASCII punctuation, and
    collapse all whitespace runs into single spaces (also trimming the ends).

    Args:
        s (str): Raw answer text.

    Returns:
        str: The normalized answer.
    """
    s = s.lower()
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    # `[`, `]` and `-` must be escaped inside the character class. With an
    # unescaped `]` the class ends early and the remainder of the pattern is
    # parsed as an alternation, so no punctuation would be removed at all.
    s = re.sub(r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]', '', s)
    s = ' '.join(s.split())
    return s

In [None]:
def vqa_accuracy(predicted_answers, ground_truth_answers_list):
    """Score predictions with the ``min(matches / 3, 1)`` VQA accuracy rule.

    Each prediction and each of its reference answers is passed through
    ``normalize_answer``. A prediction's score is the number of references
    equal to it divided by 3, capped at 1.0, and 0.0 when no reference matches.

    Args:
        predicted_answers: Iterable of predicted answer strings.
        ground_truth_answers_list: Iterable of reference-answer lists, paired
            positionally with ``predicted_answers`` via ``zip``.

    Returns:
        tuple: ``(accuracy, detailed_scores)``, where ``accuracy`` is the mean
        of the per-sample scores (0.0 when there are no samples) and
        ``detailed_scores`` is the list of per-sample scores.
    """
    detailed_scores = []
    for prediction, references in zip(predicted_answers, ground_truth_answers_list):
        key = normalize_answer(prediction)
        votes = Counter(map(normalize_answer, references))
        detailed_scores.append(min(votes[key] / 3.0, 1.0) if key in votes else 0.0)

    if not detailed_scores:
        return 0.0, detailed_scores
    return sum(detailed_scores) / len(detailed_scores), detailed_scores


In [None]:
# Directory handed to load_from_disk to load the evaluation dataset.
dataset_path = "/kaggle/input/vqa-v2/vqav2/dataset_arrow" 
# Identifier passed to PeftModel.from_pretrained (the adapter being evaluated).
model_name = "gintorikj/paligemma_vqav2_10pc"
# Identifier used to load both the base model and its processor.
base_model_id = "google/paligemma-3b-pt-224"


In [None]:
# Load the dataset saved at dataset_path and report how many samples it holds.
ds = load_from_disk(dataset_path)
print(f"Tổng số mẫu: {len(ds)}")

In [None]:
# Take a subset for evaluation: the first eval_subset_size rows, or the whole
# dataset when it has fewer rows than that.
# NOTE(review): the evaluation cell later keeps only the first 1000 processed
# samples, so rows beyond that are preprocessed but never scored — confirm
# this is intended.
eval_subset_size = 5000
ds_eval = ds.select(range(min(eval_subset_size, len(ds))))
print(f"Sử dụng {len(ds_eval)} mẫu để đánh giá.")

def prepare_vqa_data(dataset):
    """Turn raw dataset rows into evaluation records.

    Rows without an ``'answers'`` key are skipped. For every other row the
    reference answers are read from ``row['answers'][i]['answer']`` and the
    image is passed through ``process_image``.

    Args:
        dataset: Iterable of mapping-like rows with ``'image'``,
            ``'question'`` and (optionally) ``'answers'`` entries.

    Returns:
        list[dict]: One dict per kept row with the keys ``'image'``,
        ``'question'`` and ``'ground_truth_answers'``.
    """
    def _to_record(row):
        # Collect the references first, then convert the image.
        references = [entry['answer'] for entry in row['answers']]
        return {
            'image': process_image(row['image']),
            'question': row['question'],
            'ground_truth_answers': references,
        }

    return [_to_record(row) for row in dataset if 'answers' in row]

# Build the in-memory evaluation records from the selected subset and report
# how many rows were kept (rows without an 'answers' key are dropped).
processed_data = prepare_vqa_data(ds_eval)
print(f"Đã xử lý {len(processed_data)} mẫu có câu trả lời")

In [None]:
# Quantization settings: 4-bit loading with the "nf4" quantization type and
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
# Load the base model with the quantization config above; device_map {"": 0}
# assigns the whole model to device index 0.
base_model = PaliGemmaForConditionalGeneration.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
    torch_dtype=torch.bfloat16
)
# The processor comes from the base model id, not from the adapter repo.
processor = PaliGemmaProcessor.from_pretrained(base_model_id)
# Wrap the base model with the PEFT adapter identified by model_name and
# switch to evaluation mode.
model = PeftModel.from_pretrained(base_model, model_name)
model.eval()

# Device the processed inputs are moved to in the evaluation loop; index 0
# matches the device_map used above.
device = "cuda:0"
# Evaluate on at most the first 1000 processed samples.
eval_size = min(1000, len(processed_data))
eval_data = processed_data[:eval_size]
print(f"Bắt đầu đánh giá trên {len(eval_data)} mẫu")

In [None]:
# Parallel lists: predictions[i] is scored against ground_truth_answers_list[i].
predictions = []
ground_truth_answers_list = []

# Gradients are not needed for inference.
with torch.no_grad():
    # One sample per generate() call (no batching).
    for item in tqdm(eval_data, desc="Đang đánh giá"):
        # The prompt is the literal prefix "answer " followed by the question.
        # NOTE(review): presumably this matches the prompt format used when the
        # adapter was fine-tuned — confirm.
        prompt = "answer " + item["question"]
        inputs = processor(
            text=prompt,
            images=item["image"],
            return_tensors="pt"
        ).to(device)
        
        # Deterministic decoding (do_sample=False), at most 10 new tokens.
        generated_ids = model.generate(**inputs, max_new_tokens=10, do_sample=False)
        # Skip the first input_ids-length tokens so that only the part after
        # the prompt is decoded.
        generated_text = processor.decode(generated_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
        predictions.append(generated_text)
        ground_truth_answers_list.append(item["ground_truth_answers"])


# Overall accuracy plus the per-sample scores (reused by the examples cell).
vqa_acc, individual_scores = vqa_accuracy(predictions, ground_truth_answers_list)

# Summary report; accuracy is printed as a percentage with two decimals.
print(f"KẾT QUẢ ĐÁNH GIÁ VQA")
print(f"Model: {model_name}")
print(f"Dataset: VQAv2")
print(f"Số mẫu được đánh giá: {len(eval_data)}")
print(f"VQA Accuracy: {vqa_acc * 100:.2f}%")

In [None]:
# Show up to five randomly chosen evaluation samples: the image with its
# question as the title, followed by the prediction, the reference answers
# and the per-sample VQA score.
num_examples = min(5, len(eval_data))
# Use a dedicated, seeded generator so that Restart & Run All always displays
# the same examples and the global `random` state is left untouched.
example_rng = random.Random(42)
random_indices = example_rng.sample(range(len(eval_data)), num_examples)

print(f"\nVí dụ dự đoán (hiển thị ảnh):")
for i in random_indices:

    item = eval_data[i]
    pred = predictions[i]
    score = individual_scores[i]
    gt_answers = ground_truth_answers_list[i]

    plt.figure(figsize=(6, 6))
    plt.imshow(item['image'])
    plt.title(f"Câu hỏi: {item['question']}\n", fontsize=12)
    plt.axis('off')
    plt.show()

    print(f"Model dự đoán: {pred}")
    print(f"Đáp án gốc (Ground Truth): {gt_answers}")
    print(f"Điểm VQA cho câu này: {score:.2f}")