[![Labellerr](https://storage.googleapis.com/labellerr-cdn/%200%20Labellerr%20template/notebook.webp)](https://www.labellerr.com)

# BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation

---

[![labellerr](https://img.shields.io/badge/Labellerr-BLOG-black.svg)](https://www.labellerr.com/blog/)
[![Youtube](https://img.shields.io/badge/Labellerr-YouTube-b31b1b.svg)](https://www.youtube.com/@Labellerr)
[![Github](https://img.shields.io/badge/Labellerr-GitHub-green.svg)](https://github.com/Labellerr/Hands-On-Learning-in-Computer-Vision)
[![Scientific Paper](https://img.shields.io/badge/Official-Paper-blue.svg)](https://arxiv.org/abs/2201.12086)


## Installing Required Libraries

In [None]:
# Core dependencies: PyTorch, Hugging Face Transformers, and Pillow (image loading).
%pip install torch transformers pillow
# Accelerate backs the `device_map="auto"` model loading used later in this notebook.
%pip install accelerate

## Importing Libraries

In [None]:
import functools
import os
from io import BytesIO

import requests
import torch
from IPython.display import display
from PIL import Image
from transformers import AutoModelForVisualQuestionAnswering, AutoProcessor
from transformers.utils import logging

# Restrict transformers logging to errors so that warnings and info messages
# do not clutter the notebook output.
logging.set_verbosity_error()


## Helper Function

In [None]:
def show_image(source):
    """
    Display an image from a URL or a local file path.

    Failures are reported with a printed message instead of being raised, so
    a bad image does not interrupt the rest of the notebook.

    Args:
        source (str): The URL or local file path of the image.

    Returns:
        None. The image is rendered through IPython's ``display``.
    """
    try:
        if source.startswith(("http://", "https://")):
            # Load image from URL. requests applies no timeout by default, so
            # set one to keep an unresponsive host from hanging the cell.
            response = requests.get(source, timeout=30)
            response.raise_for_status()  # Raise exception for bad response
            img = Image.open(BytesIO(response.content))
        elif os.path.exists(source):
            # Load image from local file path
            img = Image.open(source)
        else:
            raise ValueError("Invalid source. Provide a valid URL or local file path.")

        display(img)

    except Exception as e:
        # Deliberately broad: this is a best-effort preview helper, and any
        # failure (network, decoding, bad argument) should only be reported.
        print(f"Error displaying image: {e}")

## Implementing BLIP

In [None]:
@functools.lru_cache(maxsize=1)
def _load_blip_components():
    """Load the BLIP VQA processor and model once and reuse them across calls."""
    model_id = "Salesforce/blip-vqa-base"
    # Half precision is only used when a CUDA device is available; otherwise
    # fall back to float32 so the model also runs on CPU-only machines.
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForVisualQuestionAnswering.from_pretrained(
        model_id,
        torch_dtype=dtype,
        device_map="auto",
    )
    return processor, model


def _load_blip_image(source):
    """Open the image at a URL or local file path and return it in RGB mode."""
    if source.startswith(("http://", "https://")):
        # Use the decoded response body (not the raw socket stream), fail on
        # HTTP errors, and bound the wait with a timeout.
        response = requests.get(source, timeout=30)
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert("RGB")
    if os.path.exists(source):
        # convert() reads the pixel data, so the file can be closed right away.
        with Image.open(source) as img:
            return img.convert("RGB")
    raise ValueError("Invalid source. Provide a valid URL or local file path.")


def blip(ques: str, img_url: str) -> str:
    """
    Perform visual question answering using the BLIP model.

    The processor and model are loaded on the first call and cached, so
    repeated questions do not reload the checkpoint.

    Args:
        ques (str): The question to ask about the image.
        img_url (str): The URL or local file path of the image.

    Returns:
        str: The answer decoded from the model's generated tokens.

    Raises:
        ValueError: If ``img_url`` is neither an http(s) URL nor an existing path.
        requests.RequestException: If the image download fails.
    """
    # Resolve the image first so an invalid source fails before the
    # (expensive) model load is attempted.
    image = _load_blip_image(img_url)
    processor, model = _load_blip_components()

    # Move the inputs to wherever the model was actually placed, and to its
    # dtype, instead of assuming a CUDA device and float16.
    inputs = processor(images=image, text=ques, return_tensors="pt").to(
        model.device, model.dtype
    )

    output = model.generate(**inputs)
    return processor.batch_decode(output, skip_special_tokens=True)[0]

In [None]:
# Alternative implementation using the transformers `pipeline` API, kept for
# reference. To use it, also run `from transformers import pipeline`.
#
# def blip(ques: str, img: str) -> str:
#     """
#     Perform visual question answering using the BLIP model.
#
#     Args:
#         ques (str): The question to ask about the image.
#         img (str): The URL or local file path of the image.
#
#     Returns:
#         str: The answer to the question.
#     """
#     blip_pipeline = pipeline(
#         task="visual-question-answering",
#         model="Salesforce/blip-vqa-base",
#         torch_dtype=torch.float16,
#         device=0
#     )
#
#     answer = blip_pipeline(question=ques, image=img)[0]['answer']
#     return answer

In [None]:
# Sample image hosted in the Hugging Face documentation-images dataset.
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
# Preview the image before asking the model about it.
show_image(url)

In [None]:
# Ask a single question about the image at `url`. As the last expression in
# the cell, the returned answer is shown as the cell output.
blip("What is the weather in this image?", url)

In [None]:
# Second test image, served from Flickr's static image host.
url1 = "https://farm9.staticflickr.com/8198/8233776747_b27f40f3c2_z.jpg"
# Preview the image before asking the model about it.
show_image(url1)

In [None]:
# Ask a counting question about the image at `url1`; the returned answer is
# shown as the cell output.
question = "how many animals in this image?"
blip(question, url1)

In [None]:
# Questions to ask about the image at `url1`.
ques_list = [
    "What is the weather in this image?",
    "how many animals in this image?",
    "which animal is in the image?",
    "what type of terrain in the image?",
    "any flowers in the image?",
    "which time of day it is"]

# Query the model once per question and print each question/answer pair.
for ques in ques_list:
    print(f"Question: {ques}")
    answer = blip(ques, url1)
    print(f"Answer: {answer}\n")

In [None]:
# Third test image, served from the i.pinimg.com image host.
url2 = 'https://i.pinimg.com/1200x/c4/01/99/c40199e777e9467353f41432c351c90a.jpg'
# Preview the image before asking the model about it.
show_image(url2)

In [None]:
# Questions for the image at `url2`: counting, object naming, reading text,
# and a presence check.
ques_list = [
    "Numbers of posters in this image",
    "Name of the device in this image",
    "On right-side poster, what is written on it?",
    "Any plant in this image?"
    ]

# Query the model once per question and print each question/answer pair.
for ques in ques_list:
    print(f"Question: {ques}")
    answer = blip(ques, url2)
    print(f"Answer: {answer}\n")

In [None]:
# Fourth test image, served from the i.pinimg.com image host.
url3 = "https://i.pinimg.com/1200x/0b/41/71/0b417194ea4f479af82c1269b96a81d2.jpg"
# Preview the image before asking the model about it.
show_image(url3)

In [None]:
# Questions for the image at `url3`: counting, color, and reading the value
# and currency written on the coins.
ques_list = [
    "Numbers of coins in this image",
    "what is the color of coins in this image",
    "Value written on the coin",
    "which currency does the coins belong to?",
    "which currency is written on the coin?"
    ]

# Query the model once per question and print each question/answer pair.
for ques in ques_list:
    print(f"Question: {ques}")
    answer = blip(ques, url3)
    print(f"Answer: {answer}\n")

In [None]:
# Fifth test image, served from the i.pinimg.com image host.
url4 = "https://i.pinimg.com/736x/f9/0a/08/f90a0858d9271593f2be424cd62b38ba.jpg"
# Preview the image before asking the model about it.
show_image(url4)

In [None]:
# Questions for the image at `url4`: vehicle type, color and brand, people,
# location, time of day, and reading the plate.
ques_list = [
    "which vehicle is in the image?",
    "what is the color of the vehicle?",
    "what is the brand of vehicle?",
    "Numbers of person in the image?",
    "where is the persons in the image?",
    "which place is in the image?",
    "what time of day is in the image",
    "what is the van plate vehicle ID?"
    ]

# Query the model once per question and print each question/answer pair.
for ques in ques_list:
    print(f"Question: {ques}")
    answer = blip(ques, url4)
    print(f"Answer: {answer}\n")