In [2]:
import torch
from transformers import Blip2Model, Blip2Processor
from tqdm import tqdm
import numpy as np
from torch.utils.data import DataLoader
from datasets import Flickr30k
from torch.utils.data import Subset


# Load BLIP-2 model and processor
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def encode_images(dataloader):
    encodings = []
    for images, _ in tqdm(dataloader, desc="Encoding images"):
        inputs = processor(images=images, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.get_image_features(**inputs)
            # Use the pooled output of the vision model
            image_features = outputs.pooler_output
        encodings.append(image_features.cpu().numpy())
    return np.vstack(encodings)

def encode_text(captions):
    encodings = []
    for caption_group in tqdm(captions, desc="Encoding text"):
        caption = caption_group[0]  # Take the first caption for each image
        inputs = processor(text=caption, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.get_text_features(**inputs)
            # Use the last hidden state of the last token as the text feature
            text_features = outputs.logits[:, -1, :]
        encodings.append(text_features.cpu().numpy())
    return np.vstack(encodings)
    
def calculate_recall(similarities, k_values):
    recalls = {}
    for k in k_values:
        top_k = np.argsort(-similarities, axis=1)[:, :k]
        recall_at_k = np.mean([1 if i in row else 0 for i, row in enumerate(top_k)])
        recalls[f'R@{k}'] = recall_at_k
    return recalls

def perform_retrieval(dataset, batch_size=32):
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=lambda x: ([item[0] for item in x], [item[1] for item in x]))
    
    image_features = encode_images(dataloader)
    text_features = encode_text([dataset[i][1] for i in range(len(dataset))])

    # Normalize features
    image_features /= np.linalg.norm(image_features, axis=1, keepdims=True)
    text_features /= np.linalg.norm(text_features, axis=1, keepdims=True)

    # Calculate similarities
    similarities = np.dot(image_features, text_features.T)

    # Calculate metrics
    k_values = [1, 5, 10]
    i2t_recalls = calculate_recall(similarities, k_values)
    t2i_recalls = calculate_recall(similarities.T, k_values)

    return i2t_recalls, t2i_recalls




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:

# Load your Flickr30k dataset
flickr_dataset = Flickr30k(csv_file='./data/flickr30k/results.csv', 
                           img_dir='./data/flickr30k/images/')

# Create a subset of the first 1000 images
subset_indices = list(range(1000))
subset_dataset = Subset(flickr_dataset, subset_indices)

# Perform retrieval
i2t_recalls, t2i_recalls = perform_retrieval(subset_dataset)

print("Image-to-Text Retrieval:")
for k, v in i2t_recalls.items():
    print(f"{k}: {v:.4f}")

print("\nText-to-Image Retrieval:")
for k, v in t2i_recalls.items():
    print(f"{k}: {v:.4f}")

In [12]:
import torch
import torch.nn.functional as F
from transformers import Blip2Processor, Blip2Model
from datasets import Flickr30k
from torch.utils.data import Subset

# Load BLIP-2 model and processor
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Load a small subset of Flickr30k dataset
flickr_dataset = Flickr30k(csv_file='./data/flickr30k/results.csv', 
                           img_dir='./data/flickr30k/images/')
subset_size = 5  # Small subset for example
subset_dataset = Subset(flickr_dataset, range(subset_size))

# We'll define the projection layers inside the encode function after we know the correct dimensions

def encode_image_and_text(images, captions):
    # Process images
    image_inputs = processor(images=images, return_tensors="pt", padding=True).to(device)
    
    # Process text
    text_inputs = processor(text=captions, return_tensors="pt", padding=True).to(device)
    
    with torch.no_grad():
        # Get image features
        image_outputs = model.get_image_features(**image_inputs)
        image_embeds = image_outputs.last_hidden_state
        print(f"Image embeds shape: {image_embeds.shape}")
        
        # Get text features
        text_outputs = model.get_text_features(**text_inputs, output_hidden_states=True)
        text_embeds = text_outputs.hidden_states[-1]  # Use the last hidden state
        print(f"Text embeds shape: {text_embeds.shape}")
        
        # Define projection layers with correct dimensions
        vision_proj = torch.nn.Linear(image_embeds.shape[-1], 256).to(device)
        text_proj = torch.nn.Linear(text_embeds.shape[-1], 256).to(device)
        
        image_feats = F.normalize(vision_proj(image_embeds[:, 0, :]), dim=-1)
        text_feat = F.normalize(text_proj(text_embeds[:, 0, :]), dim=-1)
        
    return image_feats, text_feat

# Encode all images and texts in the subset
images, captions = zip(*[(item[0], item[1][0]) for item in subset_dataset])
image_feats, text_feats = encode_image_and_text(list(images), list(captions))

print(f"Image features shape: {image_feats.shape}")
print(f"Text features shape: {text_feats.shape}")

def compute_similarity(image_feats, text_feats):
    return torch.matmul(image_feats, text_feats.t())

# Compute similarity matrix
similarity_matrix = compute_similarity(image_feats, text_feats)

print(f"Similarity matrix shape: {similarity_matrix.shape}")

# Example: Retrieve text for the first image
image_idx = 0
scores = similarity_matrix[image_idx]
best_text_idx = scores.argmax().item()

print(f"For image {image_idx}, best matching text is {best_text_idx}")
print(f"Query image caption: {captions[image_idx]}")
print(f"Retrieved text: {captions[best_text_idx]}")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Image embeds shape: torch.Size([5, 257, 1408])
Text embeds shape: torch.Size([5, 21, 2560])
Image features shape: torch.Size([5, 256])
Text features shape: torch.Size([5, 256])
Similarity matrix shape: torch.Size([5, 5])
For image 0, best matching text is 0
Query image caption: Two young guys with shaggy hair look at their hands while hanging out in the yard .
Retrieved text: Two young guys with shaggy hair look at their hands while hanging out in the yard .


In [1]:
import torch
import torch.nn.functional as F
from transformers import Blip2Processor, Blip2Model
from datasets import Flickr30k
from torch.utils.data import Subset

# Load BLIP-2 model and processor
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Load a small subset of Flickr30k dataset
flickr_dataset = Flickr30k(csv_file='./data/flickr30k/results.csv', 
                           img_dir='./data/flickr30k/images/')
subset_size = 20  # Small subset for example
subset_dataset = Subset(flickr_dataset, range(subset_size))

# We'll define the projection layers inside the encode function after we know the correct dimensions

def encode_image_and_text(images, captions):
    # Process images
    image_inputs = processor(images=images, return_tensors="pt", padding=True).to(device)
    
    # Process text
    text_inputs = processor(text=captions, return_tensors="pt", padding=True).to(device)
    
    with torch.no_grad():
        # Get image features
        image_outputs = model.get_image_features(**image_inputs)
        image_embeds = image_outputs.last_hidden_state
        print(f"Image embeds shape: {image_embeds.shape}")
        
        # Get text features
        text_outputs = model.get_text_features(**text_inputs, output_hidden_states=True)
        text_embeds = text_outputs.hidden_states[-1]  # Use the last hidden state
        print(f"Text embeds shape: {text_embeds.shape}")
        
        # Define projection layers with correct dimensions
        vision_proj = torch.nn.Linear(image_embeds.shape[-1], 256).to(device)
        text_proj = torch.nn.Linear(text_embeds.shape[-1], 256).to(device)
        
        image_feats = F.normalize(vision_proj(image_embeds[:, 0, :]), dim=-1)
        text_feat = F.normalize(text_proj(text_embeds[:, 0, :]), dim=-1)
        
    return image_feats, text_feat

# Encode all images and texts in the subset
images, captions = zip(*[(item[0], item[1][0]) for item in subset_dataset])
image_feats, text_feats = encode_image_and_text(list(images), list(captions))

print(f"Image features shape: {image_feats.shape}")
print(f"Text features shape: {text_feats.shape}")

def compute_similarity(image_feats, text_feats):
    return torch.matmul(image_feats, text_feats.t())

# Compute similarity matrix
similarity_matrix = compute_similarity(image_feats, text_feats)

print(f"Similarity matrix shape: {similarity_matrix.shape}")

# Example: Retrieve text for the first image
image_idx = 0
scores = similarity_matrix[image_idx]
best_text_idx = scores.argmax().item()

print(f"For image {image_idx}, best matching text is {best_text_idx}")
print(f"Query image caption: {captions[image_idx]}")
print(f"Retrieved text: {captions[best_text_idx]}")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Image embeds shape: torch.Size([20, 257, 1408])
Text embeds shape: torch.Size([20, 25, 2560])
Image features shape: torch.Size([20, 256])
Text features shape: torch.Size([20, 256])
Similarity matrix shape: torch.Size([20, 20])
For image 0, best matching text is 0
Query image caption: Two young guys with shaggy hair look at their hands while hanging out in the yard .
Retrieved text: Two young guys with shaggy hair look at their hands while hanging out in the yard .


In [6]:
image_idx = 10
scores = similarity_matrix[image_idx]
best_text_idx = scores.argmax().item()

print(f"For image {image_idx}, best matching text is {best_text_idx}")
print(f"Query image caption: {captions[image_idx]}")
print(f"Retrieved text: {captions[best_text_idx]}")

For image 10, best matching text is 0
Query image caption: Five ballet dancers caught mid jump in a dancing studio with sunlight coming through a window .
Retrieved text: Two young guys with shaggy hair look at their hands while hanging out in the yard .


In [3]:
import torch
import torch.nn.functional as F
from transformers import Blip2Processor, Blip2Model
from datasets import Flickr30k
from torch.utils.data import Subset

# Load BLIP-2 model and processor
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Load Flickr30k dataset
flickr_dataset = Flickr30k(csv_file='./data/flickr30k/results.csv', 
                           img_dir='./data/flickr30k/images/')
subset_size = 5  # Small subset for example
subset_dataset = Subset(flickr_dataset, range(subset_size))

def encode_image_and_text(image, caption):
    # Process image
    image_inputs = processor(images=image, return_tensors="pt", padding=True).to(device)
    
    # Process text
    text_inputs = processor(text=caption, return_tensors="pt", padding=True).to(device)
    
    with torch.no_grad():
        # Get image features
        image_outputs = model.get_image_features(**image_inputs)
        image_embeds = image_outputs.last_hidden_state
        print(f"Image embeds shape: {image_embeds.shape}")
        
        # Get text features
        text_outputs = model.get_text_features(**text_inputs, output_hidden_states=True)
        text_embeds = text_outputs.hidden_states[-1]  # Use the last hidden state
        print(f"Text embeds shape: {text_embeds.shape}")
        
        # Define projection layers with correct dimensions
        vision_proj = torch.nn.Linear(image_embeds.shape[-1], 256).to(device)
        text_proj = torch.nn.Linear(text_embeds.shape[-1], 256).to(device)
        
        image_feats = F.normalize(vision_proj(image_embeds[:, 0, :]), dim=-1)
        text_feat = F.normalize(text_proj(text_embeds[:, 0, :]), dim=-1)
        
    return image_feats, text_feat

def compute_similarity(embedding1, embedding2):
    return F.cosine_similarity(embedding1, embedding2)

# Get embeddings for a single image-text pair
image, captions = subset_dataset[0]
text = captions[0]  # Use the first caption

image_embedding, text_embedding = encode_image_and_text(image, text)

similarity = compute_similarity(image_embedding, text_embedding)

print(f"Image embedding shape: {image_embedding.shape}")
print(f"Text embedding shape: {text_embedding.shape}")
print(f"Cosine similarity: {similarity.item():.4f}")

# Print the caption
print(f"\nCaption: {text}")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Image embeds shape: torch.Size([1, 257, 1408])
Text embeds shape: torch.Size([1, 20, 2560])
Image embedding shape: torch.Size([1, 256])
Text embedding shape: torch.Size([1, 256])
Cosine similarity: 0.0688

Caption: Two young guys with shaggy hair look at their hands while hanging out in the yard .
