In [1]:
!pip install -qU git+https://github.com/huggingface/transformers.git
!pip install -qU peft
from huggingface_hub import notebook_login
#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
notebook_login()

  from .autonotebook import tqdm as notebook_tqdm


ImportError: The `notebook_login` function can only be used in a notebook (Jupyter or Colab) and you need the `ipywidgets` module: `pip install ipywidgets`.

In [2]:
import torch
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
import requests
from PIL import Image

# The processor comes from the base PaliGemma checkpoint, the weights from the
# fine-tuned VQA-Med repository.
# NOTE(review): the recorded output of this cell shows the model load failing with
# "OSError: ... does not appear to have a file named config.json". Confirm that the
# repository hosts a full checkpoint (it may contain adapter weights only) before
# relying on this cell.
PALIGEMMA_PROCESSOR_ID = "google/paligemma-3b-pt-224"
PALIGEMMA_MODEL_ID = "MahmoudRox/Paligemma_VQAMED2019"

processor = AutoProcessor.from_pretrained(PALIGEMMA_PROCESSOR_ID)
model = PaliGemmaForConditionalGeneration.from_pretrained(PALIGEMMA_MODEL_ID)

# VQA with selective prediction: return the generated answer only when the model is confident
def vqa_with_selective_prediction(image_path, question, threshold=0.5):
    """Answer `question` about an image, abstaining when confidence is too low.

    The module-level `model` is used as a generative model: the answer is the
    greedily decoded continuation of the prompt, and the confidence is the
    product of the probabilities of the generated tokens (including the
    end-of-sequence token when one is produced).

    Args:
        image_path: URL of the image to download.
        question: Text prompt passed to the processor together with the image.
        threshold: Minimum confidence required to return the answer.

    Returns:
        A tuple ``(answer, confidence)``. ``answer`` is the decoded text, or
        the string "Not confident enough to answer" when ``confidence`` is
        below ``threshold``. ``confidence`` is a float in [0, 1].

    Raises:
        requests.HTTPError: If the image download returns an error status.
    """
    # Fail loudly on a bad download instead of handing an error page to PIL.
    response = requests.get(image_path, stream=True, timeout=30)
    response.raise_for_status()
    image = Image.open(response.raw).convert("RGB")

    encoding = processor(text=question, images=image, return_tensors="pt")
    # The generated sequence starts with the prompt; remember where the answer begins.
    prompt_length = encoding["input_ids"].shape[-1]

    # Greedy decoding keeps the result deterministic; scores are kept so that
    # the probability of every generated token can be recovered afterwards.
    with torch.no_grad():
        generation = model.generate(
            **encoding,
            max_new_tokens=20,
            do_sample=False,
            output_scores=True,
            return_dict_in_generate=True,
        )

    answer_tokens = generation.sequences[0, prompt_length:]
    predicted_answer = processor.decode(answer_tokens, skip_special_tokens=True).strip()

    # Log-probability of each generated token; their sum is the sequence log-probability.
    token_log_probs = model.compute_transition_scores(
        generation.sequences, generation.scores, normalize_logits=True
    )
    confidence = token_log_probs[0].sum().exp().item()

    # Selective prediction based on the threshold
    if confidence >= threshold:
        return predicted_answer, confidence
    return "Not confident enough to answer", confidence

# Demo: ask one question about an image hosted on radiopaedia.org and only accept
# the answer when the reported confidence is at least 0.7.
image_path = "https://prod-images-static.radiopaedia.org/images/9289883/1c20962e46c92ee83a3f551adb24fa_big_gallery.jpg"
question = "Which part of the body is in the picture?"
answer, confidence = vqa_with_selective_prediction(
    image_path=image_path,
    question=question,
    threshold=0.7,
)
print(f"Answer: {answer}, Confidence: {confidence:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


OSError: MahmoudRox/Paligemma_VQAMED2019 does not appear to have a file named config.json. Checkout 'https://huggingface.co/MahmoudRox/Paligemma_VQAMED2019/tree/main' for available files.

In [None]:
import torch
from transformers import ViltProcessor, ViltForQuestionAnswering
import requests
from PIL import Image

# The processor and the model both come from the same fine-tuned VQA checkpoint.
VILT_CHECKPOINT_ID = "dandelin/vilt-b32-finetuned-vqa"
processor = ViltProcessor.from_pretrained(VILT_CHECKPOINT_ID)
model = ViltForQuestionAnswering.from_pretrained(VILT_CHECKPOINT_ID)

# Identity calibration (scale 1, bias 0) with one entry per answer class.
# These are placeholders for demonstration; real values should be learned on validation data.
scaling_vector = torch.ones(model.config.num_labels)
bias_vector = torch.zeros(model.config.num_labels)

# Vector scaling: per-class affine calibration of the logits
def apply_vector_scaling(logits, scaling_vector, bias_vector):
    """Return ``logits * scaling_vector + bias_vector``.

    The operands are combined with the ordinary ``*`` and ``+`` operators, so
    the usual broadcasting rules of the operand types apply.

    Args:
        logits: Raw scores to calibrate.
        scaling_vector: Multiplicative factor applied to the logits.
        bias_vector: Additive offset applied after scaling.

    Returns:
        The calibrated logits.
    """
    return logits * scaling_vector + bias_vector

# VQA with selective prediction over the top-k answers, using vector-scaling calibration
def vqa_with_selective_prediction(image_path, question, scaling_vector, bias_vector, threshold=0.5, top_k=4):
    """Return the `top_k` most probable answers, masking those below `threshold`.

    Uses the module-level `processor` and `model`. Note that this definition
    reuses the name of a function defined earlier in the notebook and replaces
    it once this cell has run.

    Args:
        image_path: URL of the image to download.
        question: Question passed to the processor together with the image.
        scaling_vector: Per-class scale used by `apply_vector_scaling`.
        bias_vector: Per-class bias used by `apply_vector_scaling`.
        threshold: Minimum probability for an answer to be reported.
        top_k: Number of candidate answers to return (capped at the number of
            classes). Defaults to 4.

    Returns:
        A list of ``(answer, confidence)`` tuples ordered from most to least
        probable. ``answer`` is replaced by "Not confident enough to answer"
        when ``confidence`` is below ``threshold``.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
        requests.HTTPError: If the image download returns an error status.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # Fail loudly on a bad download instead of handing an error page to PIL.
    response = requests.get(image_path, stream=True, timeout=30)
    response.raise_for_status()
    image = Image.open(response.raw).convert("RGB")
    encoding = processor(image, question, return_tensors="pt")

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        outputs = model(**encoding)

    # Apply vector scaling, then turn the calibrated logits into probabilities.
    calibrated_logits = apply_vector_scaling(outputs.logits, scaling_vector, bias_vector)
    probs = torch.softmax(calibrated_logits, dim=1)

    # torch.topk keeps the class dimension (shape: batch x k). torch.max over
    # dim=1 would drop it and only ever yield the single best class.
    k = min(top_k, probs.shape[1])
    top_probs, top_classes = torch.topk(probs, k=k, dim=1)

    # Selective prediction based on the threshold (first and only batch item).
    results = []
    for class_id, confidence in zip(top_classes[0].tolist(), top_probs[0].tolist()):
        if confidence >= threshold:
            results.append((model.config.id2label[class_id], confidence))
        else:
            results.append(("Not confident enough to answer", confidence))

    return results

# Demo: ask one question about a COCO validation image and print the ranked answers.
image_path = "http://images.cocodataset.org/val2017/000000039769.jpg"
question = "What is in the background?"
threshold = 0.0  # Confidence threshold; raise it to make low-probability answers abstain
answers = vqa_with_selective_prediction(
    image_path,
    question,
    scaling_vector,
    bias_vector,
    threshold=threshold,
)
for idx, (answer, confidence) in enumerate(answers, start=1):
    print(f"Answer {idx}: {answer}, Confidence: {confidence:.2f}")