In [None]:
# Import necessary libraries
from transformers import CLIPTokenizer 
from transformers import CLIPProcessor 
from transformers import CLIPModel     

import torch                           
import torch.nn.functional as F        
import numpy as np                     
import matplotlib.pyplot as plt        
import seaborn as sns                  

from PIL import Image  
import requests                       
from io import BytesIO                 

In [None]:
# Checkpoint identifier; the tokenizer and processor cells below load from the same name.
model_name = "openai/clip-vit-base-patch32"
# Load the pretrained CLIP model for that checkpoint.
model = CLIPModel.from_pretrained(model_name)

In [None]:
# Tokenizer belonging to the checkpoint named by `model_name`.
tokenizer = CLIPTokenizer.from_pretrained(model_name)

# Captions whose embeddings are compared in the cells below.
text = ["a donut", "a cookie", "an airplane", "a cat"]

# Encode all captions in one batch; padding=True pads them to a common length
# and return_tensors="pt" requests PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt", padding=True)

# Token IDs of the encoded batch.
input_ids = inputs["input_ids"]

print("Input IDs (shape):", input_ids.shape)
print(input_ids)

In [None]:
# Encode the tokenized captions with the CLIP text encoder. This is inference
# only, so gradient tracking is disabled with torch.no_grad().
with torch.no_grad():
    text_embeddings = model.get_text_features(**inputs)

# Print the embedding tensor's shape for inspection (indexed as 2-D further below).
print(text_embeddings.shape)

In [None]:
# Pairwise cosine similarity between all text embeddings. Inserting a singleton
# axis at position 1 on one operand and position 0 on the other lets the two
# broadcast against each other; reducing over dim=2 leaves a square matrix.
cosine_similarity = F.cosine_similarity(
    text_embeddings[:, None, :], text_embeddings[None, :, :], dim=2
).cpu().numpy()

# Draw on an explicit figure/axes pair (6 wide by 4 high).
fig, ax = plt.subplots(figsize=(6, 4))

# Annotated heatmap: each cell shows its similarity to two decimals, and both
# axes are ticked with the caption strings.
sns.heatmap(
    cosine_similarity,
    ax=ax,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    xticklabels=text,
    yticklabels=text,
)

# Title and axis labels.
ax.set_title("Cosine Similarity Matrix", fontsize=14)
ax.set_xlabel("Text Embeddings")
ax.set_ylabel("Text Embeddings")

# Render the figure.
plt.show()



In [None]:
def plot_images(images, labels):
    """Show images side by side in a single row, each titled with its label.

    Args:
        images: Sequence of images that ``Axes.imshow`` can draw.
        labels: Sequence of titles paired with ``images`` by position. If the
            lengths differ, only the pairs produced by ``zip`` are drawn.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    n = len(images)
    if n == 0:
        # plt.subplots(1, 0) raises ValueError, and there is nothing to draw.
        print("No images to display.")
        return

    # squeeze=False always yields a 2-D array of Axes. Without it a single
    # image (n == 1) returns a bare Axes object that cannot be iterated.
    fig, axes = plt.subplots(1, n, squeeze=False)

    for ax, img, lbl in zip(axes[0], images, labels):
        ax.imshow(img)
        ax.set_title(lbl)
        ax.axis("off")  # hide ticks and frame around each picture

    plt.tight_layout()
    plt.show()

In [None]:
# Processor for the same checkpoint as the model; used below to prepare images as model inputs.
processor = CLIPProcessor.from_pretrained(model_name)

In [None]:
# Caption -> URL of an example image for that caption.
image_urls = {
    "a donut": "https://learnopencv.com/wp-content/uploads/2025/03/donut.jpeg",
    "a cookie": "https://learnopencv.com/wp-content/uploads/2025/03/cookie.jpeg",
    "an airplane": "https://learnopencv.com/wp-content/uploads/2025/03/airplane.jpeg",
    "a cat": "https://learnopencv.com/wp-content/uploads/2025/03/cat.jpeg",
}

# Captions in insertion order (iterating a dict yields its keys).
labels = list(image_urls)

def load_image(url, timeout=10):
    """Download an image over HTTP and return it as an RGB PIL image.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The downloaded image converted to RGB mode.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts,
            or an HTTP error status (via ``raise_for_status``).
        OSError: If PIL cannot open the downloaded bytes as an image.
    """
    # Browser-like User-Agent; presumably some hosts reject the default client string.
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    return Image.open(BytesIO(response.content)).convert("RGB")

# Download every image, recording which labels actually loaded so that
# `images` and `labels` stay aligned even if a download fails.
images = []
loaded_labels = []

for label in labels:
    try:
        img = load_image(image_urls[label])
    except (requests.exceptions.RequestException, OSError) as e:
        # Network/HTTP failures and undecodable image data are reported and
        # skipped rather than aborting the whole cell.
        print(f"Failed to load {label}: {e}")
    else:
        images.append(img)
        loaded_labels.append(label)

# Keep only the labels with a matching image; otherwise titles and heatmap
# tick labels in later cells would be shifted against the images.
labels = loaded_labels

plot_images(images, labels)

In [None]:
# Turn the loaded images into PyTorch model inputs with the CLIP processor.
image_inputs = processor(images=images, return_tensors="pt")

# Inference only: encode the images without tracking gradients.
with torch.no_grad():
    image_embeddings = model.get_image_features(**image_inputs)

print(image_embeddings.shape)

# Pairwise cosine similarity between all image embeddings (broadcast one
# operand along axis 1 and the other along axis 0, reduce over dim=2).
img_similarity = F.cosine_similarity(
    image_embeddings[:, None, :],
    image_embeddings[None, :, :],
    dim=2,
).cpu().numpy()

# Annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    img_similarity,
    ax=ax,
    annot=True,
    cmap="coolwarm",
    xticklabels=labels,
    yticklabels=labels,
)
ax.set_xlabel("Image Embeddings")
ax.set_ylabel("Image Embeddings")
ax.set_title("CLIP Image-Image Similarity Heatmap")
plt.show()

In [None]:
# Text-to-image cosine similarity. Entry [i, j] compares caption i with image j:
# the first operand (text) varies along axis 0 and the second (images) along axis 1.
txt_image_similarity = F.cosine_similarity(
    text_embeddings[:, None, :],
    image_embeddings[None, :, :],
    dim=2,
).cpu().numpy()

# Heatmap rows (y-axis) are therefore the captions and columns (x-axis) the
# images, so each axis is ticked and labelled with what it actually holds.
plt.figure(figsize=(8, 6))
sns.heatmap(
    txt_image_similarity,
    annot=True,
    xticklabels=labels,  # one column per image
    yticklabels=text,    # one row per caption
    cmap="coolwarm",
)
plt.xlabel("Image Embeddings")
plt.ylabel("Text Embeddings")
plt.title("CLIP Image-Text Similarity Heatmap")
plt.show()