In [None]:
!pip install -q git+https://github.com/huggingface/peft.git transformers bitsandbytes datasets accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install evaluate
!pip install nltk
!pip install rouge
!pip install pycocoevalcap

In [None]:
import torch
from PIL import Image
from tqdm import tqdm
import pickle
from transformers import AutoProcessor, Blip2ForConditionalGeneration

# The processor and the BLIP-2 weights are both pulled from the same Hub repo.
_vip_checkpoint = "sooh-j/VQA-for-VIP"

processor = AutoProcessor.from_pretrained(_vip_checkpoint)
model_v = Blip2ForConditionalGeneration.from_pretrained(
    _vip_checkpoint,
    device_map="auto",  # device placement is left to the library
)

In [None]:
import os
from shutil import copyfile

# Kaggle input directory that holds the VQA dataset interface files.
lib_PATH = '/kaggle/input/vizwiz-dataset'

from os import listdir
from os.path import isfile, join

# Keep only the regular files that sit directly inside lib_PATH
# (sub-directories are skipped).
lib_files = list(filter(lambda name: isfile(join(lib_PATH, name)), listdir(lib_PATH)))

# Copy every file into ../working — presumably so that the `preprocessing`,
# `prepare_data_eval` and `vqa` modules imported right after can be found.
for lib_f in lib_files:
    source_file = join(lib_PATH, lib_f)
    target_file = join("../working", lib_f)
    copyfile(src=source_file, dst=target_file)

# import all our functions
from preprocessing import *
from prepare_data_eval import *
from vqa import *

# Number of examples taken from each evaluation split; change it in one place
# to evaluate on a smaller or larger subset.
N_EVAL_SAMPLES = 100

# Only evaluation subsets are built here; the 'train' splits of `vizwiz_data`
# and `kvqa_data` are not used by this notebook.
# NOTE(review): both datasets slice the 'test' split but are paired with the
# *_VALIDATION_PATH image directory — confirm this pairing is intended.

#-------------------------------load VIZWIZ dataset--------------------------#

vizwiz_data, VIZWIZ_TRAIN_PATH, VIZWIZ_VALIDATION_PATH = load_dataset_vizwiz("/kaggle/input/vizwiz")

vizwiz_valid_dataset = VQADataset(
    dataset=vizwiz_data['test'][:N_EVAL_SAMPLES],
    processor=processor,
    img_path=VIZWIZ_VALIDATION_PATH,
)

#-------------------------------load KVQA dataset--------------------------#

kvqa_data, KVQA_TRAIN_PATH, KVQA_VALIDATION_PATH = load_dataset_kvqa("/kaggle/input/vqa-blind-ko")

kvqa_valid_dataset = VQADataset(
    dataset=kvqa_data['test'][:N_EVAL_SAMPLES],
    processor=processor,
    img_path=KVQA_VALIDATION_PATH,
)

In [None]:
import torch
from torch.utils.data import ConcatDataset
from nltk.translate.bleu_score import sentence_bleu
import nltk
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from tqdm import tqdm
# Alias for the `model_v` checkpoint; the evaluation loop and the demo cell
# further down refer to it as `model`.
model = model_v
# NOTE(review): nothing visible in this notebook uses WordNet (BLEU does not
# need it) — confirm whether this download is still required.
nltk.download('wordnet')

# BLEU Score 
def calculate_bleu(reference, candidate):
    """Return the smoothed sentence-level BLEU score of one generated answer.

    Answers in this notebook are short, so an unsmoothed 4-gram BLEU is
    (almost) always zero and NLTK emits a warning for every sample. Smoothing
    keeps the score informative for short texts.

    Args:
        reference: Iterable of reference answer strings. A single string is
            also accepted and treated as one reference (otherwise it would be
            iterated character by character).
        candidate: The generated answer string.

    Returns:
        The BLEU score as a float; ``0.0`` when the candidate has no tokens.
    """
    if isinstance(reference, str):
        reference = [reference]

    reference_tokens = [ref.split() for ref in reference]
    candidate_tokens = candidate.split()

    # Nothing to score: avoid calling into NLTK with an empty hypothesis.
    if not candidate_tokens:
        return 0.0

    # Imported locally so the function carries its own dependency; nltk is
    # already used by this notebook.
    from nltk.translate.bleu_score import SmoothingFunction

    return sentence_bleu(
        reference_tokens,
        candidate_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

def predict(model, image, question):
    """Generate an answer to ``question`` about ``image`` with ``model``.

    Uses the module-level ``processor`` to build the model inputs and to decode
    the generated ids.

    Generation samples (``do_sample=True``), so repeated calls can return
    different answers unless the random seed is fixed by the caller.

    Args:
        model: Model exposing ``generate(**inputs, ...)``.
        image: Image passed to the processor as ``images=``.
        question: Question text inserted into the prompt.

    Returns:
        The decoded text with surrounding whitespace removed.
    """
    # Send the inputs to wherever the model actually lives instead of assuming
    # "cuda:0"; fall back to the previous heuristic when the model does not
    # expose a `device` attribute.
    device = getattr(model, "device", None)
    if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"

    prompt = f"Question: {question}, Answer:"
    processed = processor(images=image, text=prompt, return_tensors="pt").to(device)

    # The generated ids are decoded directly; moving them to `device` again is
    # unnecessary.
    output = model.generate(
        **processed,
        max_new_tokens=20,
        temperature=0.5,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.2,
    )

    # Strip so leading/trailing whitespace does not leak into printed output
    # or downstream comparisons.
    return processor.decode(output[0], skip_special_tokens=True).strip()

# Evaluate on the VizWiz and KVQA subsets as one concatenated dataset.
combined_dataset = ConcatDataset([vizwiz_valid_dataset, kvqa_valid_dataset])

references, candidates = [], []

# Every item unpacks to (image, question, answer); the ground-truth answer and
# the model's prediction are collected at the same index.
for image, question, answer in tqdm(combined_dataset):
    predicted_answer = predict(model, image, question)
    references += [answer]
    candidates += [predicted_answer]

# Sentence-level BLEU for each (reference, prediction) pair, then the mean.
bleu_scores = [
    calculate_bleu([references[idx]], candidates[idx])
    for idx in range(len(candidates))
]
avg_bleu_score = sum(bleu_scores) / len(bleu_scores)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def calculate_metrics_with_similarity(predicted_list, ground_truth_list, threshold=0.3, pairwise=False):
    """Score predictions against ground truth using TF-IDF cosine similarity.

    A prediction is counted as a match when its cosine similarity reaches
    ``threshold``.

    NOTE(review): by default a prediction matches if it is similar to ANY
    ground-truth string, not necessarily the one at the same index. For aligned
    question/answer lists this can over-count; pass ``pairwise=True`` to compare
    each prediction only with the ground truth at the same position.

    Args:
        predicted_list: Sequence of predicted answer strings.
        ground_truth_list: Sequence of ground-truth answer strings.
        threshold: Minimum cosine similarity for a match.
        pairwise: When true, compare element ``i`` of both lists only with each
            other; both lists must then have the same length.

    Returns:
        Dict with ``precision``, ``recall``, ``f1_score`` and ``accuracy``.
        ``precision`` and ``accuracy`` are both matches / number of
        predictions; ``recall`` is matches / number of ground-truth entries.
        All values are ``0.0`` when either list is empty.

    Raises:
        ValueError: If ``pairwise`` is true and the lists differ in length.
    """
    predicted_list = list(predicted_list)
    ground_truth_list = list(ground_truth_list)

    if pairwise and len(predicted_list) != len(ground_truth_list):
        raise ValueError(
            "pairwise=True requires predicted_list and ground_truth_list "
            "to have the same length"
        )

    # With an empty side there is nothing to match; previously an empty
    # ground-truth list crashed inside cosine_similarity before the
    # zero-division guards could take effect.
    if not predicted_list or not ground_truth_list:
        return {'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0, 'accuracy': 0.0}

    vectorizer = TfidfVectorizer().fit(predicted_list + ground_truth_list)
    predicted_vectors = vectorizer.transform(predicted_list)
    ground_truth_vectors = vectorizer.transform(ground_truth_list)

    # One matrix call instead of a Python loop over rows:
    # similarities[i, j] compares prediction i with ground truth j.
    similarities = cosine_similarity(predicted_vectors, ground_truth_vectors)
    if pairwise:
        matched = np.diag(similarities) >= threshold
    else:
        matched = (similarities >= threshold).any(axis=1)
    true_positive = int(matched.sum())

    precision = true_positive / len(predicted_list)
    recall = true_positive / len(ground_truth_list)
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0
    accuracy = true_positive / len(predicted_list)

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'accuracy': accuracy
    }

# Similarity-based metrics for the predictions of `model` (the `model_v`
# checkpoint): predictions are passed first, ground-truth answers second.
metrics = calculate_metrics_with_similarity( candidates, references)
print(metrics)
# Mean of the per-sample BLEU scores computed in the evaluation cell above.
print(f"Average BLEU Score: {avg_bleu_score}")

----

In [None]:
from datasets import load_dataset
import torch
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle
from transformers import AutoProcessor, Blip2ForConditionalGeneration

# NOTE(review): the processor is (re)loaded from the "sooh-j/VQA-for-VIP" repo
# while the weights below come from "Salesforce/blip2-opt-2.7b" — confirm that
# sharing this processor with the baseline model is intended.
_processor_repo = "sooh-j/VQA-for-VIP"
_baseline_checkpoint = "Salesforce/blip2-opt-2.7b"

processor = AutoProcessor.from_pretrained(_processor_repo)
model_blip = Blip2ForConditionalGeneration.from_pretrained(
    _baseline_checkpoint,
    device_map="auto",  # device placement is left to the library
)


In [None]:
references_blip, candidates_blip = [], []

# Same evaluation loop as for `model`, this time answering with `model_blip`.
for image, question, answer in tqdm(combined_dataset):
    predicted_answer_blip = predict(model_blip, image, question)
    references_blip += [answer]
    candidates_blip += [predicted_answer_blip]

# Sentence-level BLEU for each (reference, prediction) pair, then the mean.
bleu_scores_blip = [
    calculate_bleu([references_blip[idx]], candidates_blip[idx])
    for idx in range(len(candidates_blip))
]
avg_bleu_score_blip = sum(bleu_scores_blip) / len(bleu_scores_blip)

# Similarity-based metrics: predictions first, ground-truth answers second.
metrics_blip = calculate_metrics_with_similarity(candidates_blip, references_blip)
print(metrics_blip)
print(f"Average BLEU Score: {avg_bleu_score_blip}")

In [None]:
import requests
from PIL import Image
import os
import skimage.io as io
import matplotlib.pyplot as plt
from io import BytesIO
import base64

# Kept for any later cell that still reads `device`; the loop below relies on
# `predict`, which handles device placement itself.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Qualitative check: each image is paired with the question at the same index.
input_images = [
    "/kaggle/input/dataset-fortest/VizWiz_train_00000000.jpg",
    "/kaggle/input/dataset-fortest/airpod.jpeg"
               ]
input_questions = [
    "What's the name of this product?",
    "what is this?"
                  ]

for img, question in zip(input_images, input_questions):
    if os.path.isfile(img):
        image = Image.open(img).convert('RGB')
    else:
        # Anything that is not a local file is treated as a URL. The timeout
        # and status check make a bad link fail fast with a clear error.
        response = requests.get(img, stream=True, timeout=30)
        response.raise_for_status()
        image = Image.open(response.raw).convert('RGB')

    # Show the image that is actually fed to the model (previously local files
    # were read from disk a second time just for display), without axes in
    # both branches.
    plt.imshow(image)
    plt.axis('off')

    # Reuse the shared helper so the demo uses exactly the same prompt and
    # generation settings as the evaluation loops.
    text_output = predict(model, image, question)

    print(f"Q : {question}, A : {text_output}")
    plt.figtext(1, 0.5, f"Q : {question}\nA : {text_output}", fontsize=14)
    plt.show()