In [1]:
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Step 1: BLIP model for description
import functools


@functools.lru_cache(maxsize=None)
def _load_blip(model_name):
    """Load the BLIP processor/model pair for `model_name`, once per name.

    The pair is memoised so that repeated captions reuse the objects already
    in memory instead of calling `from_pretrained` again on every request.
    """
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name)
    return processor, model


def generate_caption(image, model_name="Salesforce/blip-image-captioning-base",
                     max_length=50, num_beams=5):
    """Return a caption for `image` produced by a BLIP captioning model.

    Args:
        image: Image object handed to the processor via its `images=` argument.
        model_name: Checkpoint identifier passed to `from_pretrained`.
        max_length: Forwarded to `model.generate`.
        num_beams: Forwarded to `model.generate` (beam search width).

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    processor, model = _load_blip(model_name)

    inputs = processor(images=image, return_tensors="pt")

    # Beam search with early stopping; length and beam count are tunable so
    # callers do not need to copy this function to try other settings.
    out = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )

    return processor.decode(out[0], skip_special_tokens=True)

# Step 2: Caption an image file and print the description
def describe_image(image_path):
    """Open the image at `image_path`, caption it and print the caption.

    Args:
        image_path: Path of the image file handed to `Image.open`.

    Returns:
        None. The caption is written to stdout as ``Description: <caption>``.
    """
    # Pillow keeps the underlying file open until the image is closed; the
    # with-block releases the handle as soon as the caption has been produced.
    with Image.open(image_path) as image:
        caption = generate_caption(image)

    # Print the result
    print(f"Description: {caption}")

# Smoke test: caption one sample image and print the result.
# NOTE(review): absolute, machine-specific Windows path — adjust it when running elsewhere.
image_path = r"C:\Users\Lenovo\Downloads\9.jpg"  # path of the image to caption
describe_image(image_path)

Description: two boys playing a game of foo foo foo


In [2]:
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Step 1: BLIP model for description
import functools


@functools.lru_cache(maxsize=None)
def _load_blip(model_name):
    """Return the cached (processor, model) pair for `model_name`.

    `from_pretrained` runs only the first time a given name is requested;
    later calls get the same objects back.
    """
    return (
        BlipProcessor.from_pretrained(model_name),
        BlipForConditionalGeneration.from_pretrained(model_name),
    )


def generate_caption(image, model_name="Salesforce/blip-image-captioning-base",
                     max_length=100, num_beams=10):
    """Caption `image` with a BLIP model and return the decoded text.

    Args:
        image: Image object handed to the processor via its `images=` argument.
        model_name: Checkpoint identifier passed to `from_pretrained`.
        max_length: Forwarded to `model.generate`.
        num_beams: Forwarded to `model.generate` (beam search width).

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    processor, model = _load_blip(model_name)

    inputs = processor(images=image, return_tensors="pt")

    # Larger max_length / num_beams than the first experiment; both remain
    # overridable per call.
    out = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )

    return processor.decode(out[0], skip_special_tokens=True)

# Step 2: Caption an image file and print the description
def describe_image(image_path):
    """Caption the image stored at `image_path` and print the caption.

    Args:
        image_path: Path of the image file handed to `Image.open`.

    Returns:
        None. The caption is written to stdout as ``Description: <caption>``.
    """
    # The context manager closes the file handle that Image.open would
    # otherwise leave open after the caption is generated.
    with Image.open(image_path) as image:
        caption = generate_caption(image)

    # Print the result
    print(f"Description: {caption}")

# Smoke test: caption a second sample image and print the result.
# NOTE(review): absolute, machine-specific Windows path — adjust it when running elsewhere.
image_path = r"C:\Users\Lenovo\Downloads\19.jpg"  # path of the image to caption
describe_image(image_path)

Description: a mouse eating peanuts on the ground


In [3]:
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Step 1: BLIP-large model for description
import functools


@functools.lru_cache(maxsize=None)
def _load_blip(model_name):
    """Load and memoise the BLIP processor and model for `model_name`.

    Caching avoids re-running `from_pretrained` on every caption request.
    """
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name)
    return processor, model


def generate_caption(image, model_name="Salesforce/blip-image-captioning-large",
                     max_length=150, num_beams=10):
    """Return the BLIP caption for `image`.

    Args:
        image: Image object handed to the processor via its `images=` argument.
        model_name: Checkpoint identifier passed to `from_pretrained`
            (defaults to the large captioning checkpoint).
        max_length: Forwarded to `model.generate`.
        num_beams: Forwarded to `model.generate` (beam search width).

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    processor, model = _load_blip(model_name)

    inputs = processor(images=image, return_tensors="pt")

    # Using beam search and a generous max_length for more detail
    out = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )

    return processor.decode(out[0], skip_special_tokens=True)

# Step 2: Caption an image file and print the description
def describe_image(image_path):
    """Print the generated caption for the image file at `image_path`.

    Args:
        image_path: Path of the image file handed to `Image.open`.

    Returns:
        None. The caption is written to stdout as ``Description: <caption>``.
    """
    # Scope the opened image to a with-block so its file handle is closed
    # once captioning finishes instead of lingering until garbage collection.
    with Image.open(image_path) as image:
        caption = generate_caption(image)

    # Print the result
    print(f"Description: {caption}")

# Smoke test: caption the sample image with the definitions from this cell.
# NOTE(review): absolute, machine-specific Windows path — adjust it when running elsewhere.
image_path = r"C:\Users\Lenovo\Downloads\9.jpg"  # path of the image to caption
describe_image(image_path)

Description: there are two boys playing a game of foo - pong in a large room


In [11]:
import numpy as np
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

# Set NumPy to print full arrays without truncation (configured once; the
# cell previously repeated both this call and the numpy import).
np.set_printoptions(threshold=np.inf)

# Load a pre-trained ResNet model
# NOTE(review): recent torchvision releases favour the `weights=` argument over
# `pretrained=True` — confirm the installed version before switching.
model = models.resnet50(pretrained=True)
# Drop the last child module (the final classification layer) so a forward
# pass yields the embedding vector instead of class scores
model = torch.nn.Sequential(*list(model.children())[:-1])
# Switch to evaluation mode before running inference
model.eval()

# Function to preprocess image and get embedding vector
def get_embedding(image_path):
    """Return the embedding of the image at `image_path` as a NumPy array.

    The image is resized, center-cropped to 224 pixels, converted to a tensor
    and normalized, then passed through the module-level `model` with
    gradient tracking disabled.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        The model output with singleton dimensions squeezed out, moved to the
        CPU and converted to a NumPy array.
    """
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Normalize is configured with 3-element mean/std, so force three channels:
    # grayscale, palette or RGBA files would otherwise not match. The
    # with-block also closes the file handle once the pixels have been copied.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    image_tensor = preprocess(image).unsqueeze(0)  # add a leading batch dimension
    with torch.no_grad():
        embedding = model(image_tensor)
    return embedding.squeeze().cpu().numpy()

# Path to your uploaded image
# NOTE(review): absolute, machine-specific Windows path — adjust it when running elsewhere.
image_path = r"C:\Users\Lenovo\Downloads\9.jpg" 

# Get the full embedding vector for the uploaded image
embedding_vector = get_embedding(image_path)
# The array prints without "..." elision because this cell raises NumPy's
# print threshold via np.set_printoptions(threshold=np.inf).
print("Full Embedding vector:", embedding_vector)

Full Embedding vector: [3.72799896e-02 4.67579037e-01 4.64814752e-01 1.35329032e+00
 2.30826259e+00 3.74844015e-01 5.88892460e-01 4.73825067e-01
 1.54108852e-01 5.14076054e-01 3.82819772e-01 1.49778083e-01
 6.02035761e-01 9.57301617e-01 1.74077094e+00 9.61226165e-01
 8.32294643e-01 2.11850658e-01 5.69806635e-01 8.86199176e-01
 3.72115076e-01 6.02786690e-02 6.06580436e-01 8.47634792e-01
 1.64498961e+00 5.22034645e-01 1.15653801e+00 4.18297291e-01
 3.59784365e-01 6.84525490e-01 1.09895360e+00 9.27941084e-01
 9.45944190e-02 3.55075449e-01 2.45852217e-01 3.22423488e-01
 4.38990951e-01 1.21769153e-01 6.44762635e-01 1.23460078e+00
 2.20255852e-01 3.29721689e-01 7.75120735e-01 1.93196848e-01
 6.37170076e-01 1.18521601e-01 1.33197367e-01 4.92387474e-01
 1.05393445e+00 3.72688562e-01 3.96920107e-02 5.13233900e-01
 3.60321015e-01 7.71278143e-01 1.87820685e+00 5.48000276e-01
 5.72869144e-02 3.61663669e-01 1.39267325e-01 4.48866308e-01
 5.49519420e-01 4.09216583e-01 4.90267903e-01 1.55291423e-01
 

In [15]:
import torch
import clip
from PIL import Image

# Load the CLIP model together with its matching image preprocessing pipeline
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Load and preprocess the image; the with-block closes the file handle as soon
# as the input tensor has been built.
image_path = r"C:\Users\Lenovo\Downloads\9.jpg"  # Replace with the path to your uploaded image
with Image.open(image_path) as image:
    image_input = preprocess(image).unsqueeze(0).to(device)

# Define a set of potential text descriptions
text_descriptions = [
    "two boys are playing foosball",
    "a cat sitting on a chair",
    "a car on the road",
    "a beautiful landscape with mountains",
    "a person playing soccer",
    "a busy street in the city",
    "a painting of flowers",
    "a futuristic robot",
    "a person eating food",
]

# Convert the list of text descriptions into tensor format for the CLIP model
text_inputs = clip.tokenize(text_descriptions).to(device)

# Use the CLIP model to encode the image and text descriptions
with torch.no_grad():
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text_inputs)

# Normalize the image and text features so their dot product is a cosine similarity
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)

# Scale the cosine similarities by 100 before the softmax, as in CLIP's
# reference zero-shot example. Without the scale the inputs all lie in [-1, 1]
# and the resulting probabilities are close to uniform. The argmax (and hence
# the printed description) is unaffected by the scaling.
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

# Identify the description with the highest similarity
best_description_idx = similarity.argmax().item()
best_description = text_descriptions[best_description_idx]

# Output the best matching description
print(f"Best matching description: {best_description}")

Best matching description: two boys are playing foosball


In [32]:
import torch
import clip
from PIL import Image
import numpy as np

# Load the CLIP model together with its matching image preprocessing pipeline
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device)

# List of text descriptions
descriptions = [
    "two boys are playing foosball",
    "a cat sitting on a chair",
    "a car on the road",
    "a beautiful landscape with mountains",
    "a person playing soccer",
    "a busy street in the city",
    "a painting of flowers",
    "a futuristic robot",
    "a person eating food"
]

# Tokenize and get embeddings for the text descriptions (one row per description)
text_tokens = clip.tokenize(descriptions).to(device)
with torch.no_grad():
    text_embeddings = model.encode_text(text_tokens).cpu().numpy()

# Preprocess the image; the with-block closes the file handle once the input
# tensor has been built.
image_path = r"C:\Users\Lenovo\Downloads\9.jpg"  # Replace with your image path
with Image.open(image_path) as source_image:
    image = preprocess(source_image).unsqueeze(0).to(device)

# Get image embedding using CLIP (a single row, because of the unsqueeze above)
with torch.no_grad():
    image_embedding = model.encode_image(image).cpu().numpy()

# Cosine similarity between the image embedding and each text embedding.
# The image embedding is flattened to 1-D so the dot products and the per-text
# norms are both vectors with one entry per description. Dividing a column of
# dot products by that norm vector would broadcast to a square matrix, and the
# flat argmax of that matrix is not a valid description index.
text_norms = np.linalg.norm(text_embeddings, axis=1)
image_norm = np.linalg.norm(image_embedding)
cosine_similarities = (text_embeddings @ image_embedding.ravel()) / (text_norms * image_norm)

# Get the description with the highest cosine similarity
best_match_index = int(np.argmax(cosine_similarities))
best_match_description = descriptions[best_match_index]

print(f"The description that best matches the embedding is: {best_match_description}")

The description that best matches the embedding is: two boys are playing foosball


In [42]:
import torch
import clip

# CLIP runs on the GPU when one is available, otherwise on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device)

# Single-element batch holding the sentence to embed
sentence = ["two boys are playing foosball"]

# Tokenize first, then move the token tensor onto the chosen device
text_tokens = clip.tokenize(sentence)
text_tokens = text_tokens.to(device)

# Encode without gradient tracking, then bring the result back as a NumPy array
with torch.no_grad():
    text_embedding = model.encode_text(text_tokens)
text_embedding = text_embedding.cpu().numpy()

# Show the sentence on one line followed by its embedding
print(f"Embedding vector for the sentence: {sentence}", text_embedding, sep="\n")

Embedding vector for the sentence: ['two boys are playing foosball']
[[-3.38123113e-01 -1.62967950e-01 -3.08804184e-01 -2.28086159e-01
   1.28168881e-01  4.30843309e-02  2.58815527e-01 -4.85642999e-01
  -7.34556735e-01 -4.54408556e-01  3.10918063e-01  1.95236057e-01
   5.33548333e-02 -1.42320618e-01 -1.02312148e-01  2.84891486e-01
  -4.26563501e-01  6.40503690e-02 -2.08458215e-01  5.03991485e-01
  -1.73545614e-01  2.72256613e-01  5.18820882e-01 -4.85504746e-01
  -1.73425719e-01 -2.99286753e-01 -3.19148540e-01  4.85366434e-01
  -1.86221376e-02 -5.62713831e-04  2.39443064e-01  2.17152178e-01
  -3.21758002e-01  1.15541980e-01 -9.28702712e-01 -3.26769382e-01
   4.30913270e-02  2.44705901e-01 -9.50826928e-02 -1.79305077e-01
  -6.20288542e-03  1.56562135e-01  4.25591432e-02 -1.37059063e-01
   2.87152290e-01  3.40816289e-01 -7.54240677e-02  2.83642888e-01
  -3.00065100e-01 -9.58792940e-02 -2.89157569e-01  3.81159604e-01
  -6.26199469e-02  3.57278019e-01 -1.36891559e-01 -1.34838289e-02
  -2.67