In [1]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, AutoProcessor, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# image loading and preprocessing
def load_and_preprocess_image(image_path, model_name="openai/clip-vit-base-patch32"):
    """Open an image file and run it through a CLIP processor.

    Args:
        image_path: Path of the image file; handed to ``PIL.Image.open``.
        model_name: Checkpoint identifier passed to
            ``CLIPProcessor.from_pretrained``. Defaults to the checkpoint
            this notebook has always used, so existing callers are unaffected.

    Returns:
        An ``(inputs, processor)`` tuple: the processor output for the image
        (requested with ``return_tensors="pt"``) and the processor instance
        itself so the caller can reuse it.
    """
    # The context manager releases the underlying file handle as soon as the
    # RGB copy exists instead of leaving that to garbage collection.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    processor = CLIPProcessor.from_pretrained(model_name)
    inputs = processor(images=image, return_tensors="pt")
    return inputs, processor

In [3]:
# image understanding with CLIP
def generate_image_embeddings(inputs, model_name="openai/clip-vit-base-patch32"):
    """Compute CLIP image features for already-preprocessed inputs.

    Args:
        inputs: Mapping that is unpacked as keyword arguments into
            ``CLIPModel.get_image_features``.
        model_name: Checkpoint identifier passed to
            ``CLIPModel.from_pretrained``. Defaults to the checkpoint this
            notebook has always used. NOTE(review): presumably this should
            match the checkpoint whose processor produced ``inputs``.

    Returns:
        An ``(image_features, model)`` tuple: the features returned by the
        model and the loaded model, so the caller can reuse it.
    """
    model = CLIPModel.from_pretrained(model_name)

    # Inference only: skip gradient tracking while computing the features.
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)

    return image_features, model

In [4]:
# caption matching (using CLIP text embeddings)
def match_captions(image_features, captions, clip_model, processor):
    """Rank captions by cosine similarity to an image embedding.

    The similarity is computed with ``torch`` so the function only depends on
    names imported at the top of the notebook, not on an import that happens
    in a later cell.

    Args:
        image_features: 2-D tensor of image embeddings; only the first row is
            compared against the captions.
        captions: Sequence of caption strings to score.
        clip_model: Object exposing ``get_text_features(**text_inputs)``.
        processor: Callable accepting ``text=``, ``return_tensors=`` and
            ``padding=`` and returning a mapping of model inputs.

    Returns:
        A ``(best_captions, similarities)`` tuple. ``best_captions`` holds
        every caption ordered from most to least similar; ``similarities`` is
        the matching list of float scores in the same order. Both are empty
        when ``captions`` is empty.
    """
    # Nothing to rank: avoid handing an empty batch to the processor.
    if len(captions) == 0:
        return [], []

    # 1. get text embeddings for the captions:
    text_inputs = processor(text=captions, return_tensors="pt", padding=True)
    with torch.no_grad():
        text_features = clip_model.get_text_features(**text_inputs)

    # 2. calculate cosine similarity between image and text features:
    #    L2-normalise both sides so a plain dot product is the cosine.
    image_unit = torch.nn.functional.normalize(
        image_features.detach().float().cpu(), dim=-1
    )
    text_unit = torch.nn.functional.normalize(
        text_features.detach().float().cpu(), dim=-1
    )
    scores = (image_unit[:1] @ text_unit.T)[0].tolist()

    # 3. find the best matching captions (highest similarity first):
    order = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    best_captions = [captions[i] for i in order]

    return best_captions, [scores[i] for i in order]

In [5]:
# main function
def image_captioning(image_path, candidate_captions):
    """Rank candidate captions for the image stored at ``image_path``.

    Chains the three helper steps: preprocess the image, embed it, then score
    the candidate captions against that embedding.

    Returns:
        The ``(best_captions, similarities)`` pair produced by
        ``match_captions``.
    """
    clip_inputs, clip_processor = load_and_preprocess_image(image_path)
    embedding, model = generate_image_embeddings(clip_inputs)

    ranked, scores = match_captions(
        embedding, candidate_captions, model, clip_processor
    )
    return ranked, scores

In [13]:
# Caption candidates (plain strings) that get scored against the photo.
candidate_captions = [
    "In between where you were and where you’re going.",
    "The city breathes when you slow down.",
    "Some journeys don’t need a destination.",
    "Moving forward, quietly.",
    "Where concrete learns to coexist with green.",
    "A pause in the rush of the city.",
    "Every step has a story.",
    "Finding calm in the middle of movement.",
    "Not lost, just moving.",
    "The art of going nowhere in particular."
]


In [14]:
from sklearn.metrics.pairwise import cosine_similarity 

# Score every candidate caption against the photo; results come back ordered
# from best to worst match.
best_captions, similarities = image_captioning(
    "street photo.jpg",
    candidate_captions
)

# get the top 5 results (fewer when fewer captions were supplied)
top_n = min(5, len(best_captions))
top_best_captions = best_captions[:top_n]
top_similarities = similarities[:top_n]

# Report the number of captions actually shown rather than a fixed "5".
print(f"Top {top_n} Best Captions:")
for rank, (caption, similarity) in enumerate(zip(top_best_captions, top_similarities), start=1):
    print(f"{rank}. {caption} (Similarity: {similarity:.4f})")

Top 5 Best Captions:
1. Where concrete learns to coexist with green. (Similarity: 0.2616)
2. A pause in the rush of the city. (Similarity: 0.2505)
3. The city breathes when you slow down. (Similarity: 0.2292)
4. Moving forward, quietly. (Similarity: 0.2143)
5. Every step has a story. (Similarity: 0.2128)
