In [None]:
"""

The one method using CLIP, a model developed by OpenAI, evaluate how well a generated caption (BLIP) matches an image.
Loads the CLIP model and its preprocessing function.
The image is preprocessed to be suitable for CLIP (using the CLIP preprocessing function).
Both the image and the generated caption are encoded into feature vectors using CLIP’s image and text encoders.
Computes the cosine similarity between the image features and the text features to determine how closely the caption matches the image.
Outputs the generated caption and its similarity score with the image, indicating the relevance of the caption to the image content.

"""

In [1]:
# Using BLIP to generate a caption of the image
# Local image

from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests

# Load the model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load your image
image_path = "dataset/train/image29.png"
image = Image.open(image_path)

# Preprocess the image
inputs = processor(image, return_tensors="pt")

# Generate the caption
outputs = model.generate(**inputs)
caption = processor.decode(outputs[0], skip_special_tokens=True)

print(f"Generated Caption: {caption}")




Generated Caption: a black background with a white and blue pattern


In [2]:
# url image

from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests

# Load the model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load an image from a URL
image_url = "https://gateway.fxhash.xyz/ipfs/Qmcg7DaHFPcgRyZ3pbwuTgNAS9CP1U7zp5HkjFfKJ9FYWZ"
image = Image.open(requests.get(image_url, stream=True).raw)

# Preprocess the image
inputs = processor(image, return_tensors="pt")

# Generate the caption
outputs = model.generate(**inputs)
caption = processor.decode(outputs[0], skip_special_tokens=True)

print(f"Generated Caption: {caption}")


Generated Caption: a colorful background with a grid pattern


In [3]:
# Using CLIP to check the BLIP generated caption

import clip
import torch

# Load CLIP model and preprocess
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, clip_preprocess = clip.load("ViT-B/32", device)

# Preprocess the image for CLIP
clip_image_input = clip_preprocess(image).unsqueeze(0).to(device)

# Encode image and generated description
with torch.no_grad():
    image_features = clip_model.encode_image(clip_image_input)
    text_features = clip_model.encode_text(clip.tokenize([caption]).to(device))

# Compute similarity
similarity = (image_features @ text_features.T).item()

print(f"Caption: {caption}")
print(f"Similarity with image: {similarity:.4f}")


Caption: a black background with a white and blue pattern
Similarity with image: 29.5569


In [3]:
# Trying to use BLIP on a larger sample

from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests

# Load the model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

caption_occurrences = {}

# Iterate over all images
for i in range(1, 51):
    image_path = f"dataset/ipfs/image{i}.png"
    image = Image.open(image_path)

    prompt = "Describe this in one word:"

    # Preprocess the image
    inputs = processor(image, return_tensors="pt")

    # Generate the caption
    outputs = model.generate(**inputs, max_length=4, num_beams=5)  # max_length=3 to encourage short output
    caption = processor.decode(outputs[0], skip_special_tokens=True)

    # Update the caption occurrences dictionary
    if caption in caption_occurrences:
        caption_occurrences[caption] += 1
    else:
        caption_occurrences[caption] = 1

# Print the final caption occurrences
print("Caption Occurrences:", caption_occurrences)


Caption Occurrences: {'red tulip': 1, 'a black and': 6, 'an illustration of': 1, 'an image of': 10, 'a colorful abstract': 1, 'a red and': 2, 'a black background': 1, 'green abstract background': 1, 'a cartoon character': 1, 'a brown and': 1, 'a drawing of': 4, 'a group of': 1, 'black and white': 1, 'three black and': 1, 'a bunch of': 1, 'a computer generated': 2, 'blue wood texture': 1, 'the cover of': 1, 'a blue background': 1, 'a circle with': 1, 'a green and': 2, 'a painting of': 1, 'a chart showing': 1, 'this is a': 1, 'a picture of': 1, 'a colorful pattern': 1, 'a set of': 1, 'a poster with': 1, 'the logo for': 1, 'an abstract pattern': 1}


In [6]:
sorted_captions = dict(sorted(caption_occurrences.items(), key=lambda item: item[1], reverse=True))

# Print the sorted dictionary
for caption, count in sorted_captions.items():
    print(f"{caption}: {count}")

an image of: 10
a black and: 6
a drawing of: 4
a red and: 2
a computer generated: 2
a green and: 2
red tulip: 1
an illustration of: 1
a colorful abstract: 1
a black background: 1
green abstract background: 1
a cartoon character: 1
a brown and: 1
a group of: 1
black and white: 1
three black and: 1
a bunch of: 1
blue wood texture: 1
the cover of: 1
a blue background: 1
a circle with: 1
a painting of: 1
a chart showing: 1
this is a: 1
a picture of: 1
a colorful pattern: 1
a set of: 1
a poster with: 1
the logo for: 1
an abstract pattern: 1
