In [1]:
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict, Counter
from typing import List
import numpy as np
import os

# --- STEP 1: BLIP Captioning Setup ---
device = "cuda" if torch.cuda.is_available() else "cpu"

blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)

def generate_caption(image, text=None):
    inputs = blip_processor(image, text, return_tensors="pt").to(device)
    out = blip_model.generate(**inputs, max_new_tokens=30)
    return blip_processor.decode(out[0], skip_special_tokens=True)

def generate_captions(
    images: List[Image.Image],
    batch_size: int = 16,
    max_new_tokens: int = 50
) -> List[str]:
    """
    Generate captions for a list of PIL Images using BLIP, in batches.

    Args:
      images: list of PIL.Image objects (or paths you open to PIL).
      batch_size: how many to process at once.
      max_new_tokens: length of generated captions.

    Returns:
      List of caption strings, in the same order as `images`.
    """
    all_captions = []
    for i in range(0, len(images), batch_size):
        batch_imgs = images[i : i + batch_size]
        # Processor will resize, normalize, and pad for you
        inputs = blip_processor(
            images=batch_imgs,
            return_tensors="pt",
            padding="max_length",   # pad to longest in batch
            truncation=True
        ).to(device)

        # generate captions for the batch
        generated = blip_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens
        )

        # decode each sequence to text
        batch_captions = [
            blip_processor.decode(g, skip_special_tokens=True)
            for g in generated
        ]
        all_captions.extend(batch_captions)

    return all_captions

# # --- Example usage ---
# if __name__ == "__main__":
#     # Suppose you have file paths:
#     paths = [
#                 'illustrations/deepclustering/Iterative.png', 
#                 'illustrations/deepclustering/Multi-Stage.png',
#                 'illustrations/deepclustering/Simultaneous.png',
#                 'illustrations/deepclustering/taxonomy.png',
#                 'illustrations/deepclustering/generative.png'
#              ]
#     imgs = [Image.open(p).convert("RGB") for p in paths]

#     captions = generate_captions(imgs, batch_size=8)
#     for img_path, cap in zip(paths, captions):
#         print(f"{img_path} → {cap}")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
paths = [
            'illustrations/deepclustering/Iterative.png', 
            'illustrations/deepclustering/Multi-Stage.png',
            'illustrations/deepclustering/Simultaneous.png',
            'illustrations/deepclustering/taxonomy.png',
            'illustrations/deepclustering/generative.png',
            '/Users/constantinseibold/workspace/CVPR_Tutorial/Identifying_Structure_In_Data/datasets/image_to_segment.png',
            '/Users/constantinseibold/workspace/CVPR_Tutorial/Identifying_Structure_In_Data/output.png',
            '/Users/constantinseibold/workspace/CVPR_Tutorial/Identifying_Structure_In_Data/clustering/image_clustering/71iPwWws0GL._AC_UF894,1000_QL80_.jpg',
            '/Users/constantinseibold/workspace/CVPR_Tutorial/Identifying_Structure_In_Data/clustering/image_clustering/media.media.a99c5814-33e6-46c5-8f69-6fd0c8c9a162.16x9_1024.jpg'
            ]
imgs = [Image.open(p).convert("RGB") for p in paths]

captions = generate_captions(imgs, batch_size=8)
captions

['a diagram of a clusterer and a clusterer model',
 'a diagram of a clusterer and clusterer module',
 'a diagram of a clusterer and clusterer module',
 'a diagram of a computer system with multiple levels of learning',
 'a white and blue sign with the words clustering module and a pink and blue sign with the words cluster',
 'araffes on a hill with a dog and a group of other animals',
 'a diagram of a line graph with different colored dots',
 'the big bang theory',
 'araffes sitting on a couch with a group of friends eating']

In [2]:
def label_clusters_by_captions(images, cluster_labels):
    cluster_to_captions = defaultdict(list)

    for img, cid in zip(images, cluster_labels):
        if isinstance(img, str):
            img = Image.open(img).convert("RGB")
        caption = generate_caption(img)
        cluster_to_captions[cid].append(caption)

    cluster_to_label = {}
    for cid, caps in cluster_to_captions.items():
        # Simple mode: most common noun/phrase
        all_words = " ".join(caps).lower().split()
        most_common = Counter(all_words).most_common(3)
        label = ", ".join([w for w, _ in most_common])
        cluster_to_label[cid] = label

    return cluster_to_label


In [None]:
from transformers import CLIPProcessor, CLIPModel

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

candidates = ["dog", "cat", "car", "nature", "city", "food", "portrait", "art", "sports"]

def label_clusters_by_clip(images, cluster_labels, candidate_labels):
    cluster_to_embeddings = defaultdict(list)

    for img, cid in zip(images, cluster_labels):
        if isinstance(img, str):
            img = Image.open(img).convert("RGB")
        inputs = clip_processor(images=img, return_tensors="pt").to(device)
        outputs = clip_model.get_image_features(**inputs)
        cluster_to_embeddings[cid].append(outputs.detach().cpu().numpy())

    text_inputs = clip_processor(text=candidate_labels, return_tensors="pt", padding=True).to(device)
    text_embeds = clip_model.get_text_features(**text_inputs).detach().cpu().numpy()

    cluster_to_label = {}
    for cid, embeds in cluster_to_embeddings.items():
        avg_embed = np.mean(embeds, axis=0)
        sims = cosine_similarity([avg_embed], text_embeds)[0]
        best_idx = int(np.argmax(sims))
        cluster_to_label[cid] = candidate_labels[best_idx]

    return cluster_to_label


RuntimeError: Failed to import transformers.models.clip.modeling_clip because of the following error (look up to see its traceback):
cannot import name 'auto_docstring' from 'transformers.utils' (/opt/anaconda3/envs/tutorial/lib/python3.11/site-packages/transformers/utils/__init__.py)

In [10]:
from  openai import OpenAI

def gpt_cluster_label(captions):
    prompt = (
        f"Captions:\n{chr(10).join(['- ' + c for c in captions])}\n\nLabel:"
    )

    client = OpenAI()

    response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                "role": "system",
                "content": [
                    {
                    "type": "text",
                    "text": "You are given a list of captions generated from a group of similar images. \
                            Please summarize the common theme of these images in a short, descriptive label.\n\n"
                    }
                ]
                },{
                "role": "user",
                "content": [
                    {
                    "type": "text",
                    "text": prompt
                    }
                ]
                }
            ]
    )
                    
    # response = client.responses.create(
    #     model="gpt-4.1",
    #     input=prompt
    # )

    print(response)

    # response = openai.ChatCompletion.create(
    #     model="gpt-4",
    #     messages=[{"role": "user", "content": prompt}],
    #     temperature=0.3,
    # )

    return response.choices[0].message.content

def label_clusters_by_gpt(images, cluster_labels):
    cluster_to_captions = defaultdict(list)

    for img, cid in zip(images, cluster_labels):
        if isinstance(img, str):
            img = Image.open(img).convert("RGB")
        caption = generate_caption(img)
        cluster_to_captions[cid].append(caption)

    cluster_to_label = {}
    for cid, caps in cluster_to_captions.items():
        label = gpt_cluster_label(caps)
        cluster_to_label[cid] = label

    return cluster_to_label


In [14]:
images = [
            'illustrations/deepclustering/Iterative.png', 
            'illustrations/deepclustering/Multi-Stage.png',
            'illustrations/deepclustering/Simultaneous.png',
            'illustrations/deepclustering/taxonomy.png',
            'illustrations/deepclustering/generative.png',
            '/Users/constantinseibold/workspace/CVPR_Tutorial/Identifying_Structure_In_Data/datasets/image_to_segment.png',
            '/Users/constantinseibold/workspace/CVPR_Tutorial/Identifying_Structure_In_Data/output.png',
            '/Users/constantinseibold/workspace/CVPR_Tutorial/Identifying_Structure_In_Data/clustering/image_clustering/71iPwWws0GL._AC_UF894,1000_QL80_.jpg',
            '/Users/constantinseibold/workspace/CVPR_Tutorial/Identifying_Structure_In_Data/clustering/image_clustering/media.media.a99c5814-33e6-46c5-8f69-6fd0c8c9a162.16x9_1024.jpg'
            ]

# images = ["/path/image1.jpg", "/path/image2.jpg", ...]
cluster_labels = [0, 0, 0,0,0, 1, 2, 3, 3]

# caption_labels = label_clusters_by_captions(images, cluster_labels)

imagenet_labels = label_clusters_by_imagenet(images, cluster_labels)

# clip_labels = label_clusters_by_clip(images, cluster_labels, candidate_labels=[
#     "portrait", "landscape", "architecture", "animals", "vehicles", "food", "sports", "technology"
# ])

# gpt_labels = label_clusters_by_gpt(images, cluster_labels)

print("Caption Labels:", caption_labels)
print("Imagenet Labels:", imagenet_labels)
# print("CLIP Labels:", clip_labels)
print("GPT Labels:", gpt_labels)


Caption Labels: {0: 'a, clusterer, and', 1: 'a, araffes, on', 2: 'a, diagram, of', 3: 'a, the, big'}
Imagenet Labels: {0: 'web site', 1: 'baboon', 2: 'envelope', 3: 'ice lolly'}
GPT Labels: {0: '"Clustering Models and Modules in Computer Systems"', 1: '"Animals Gathering on a Hill"', 2: 'Data Visualization', 3: 'Friendship and Relaxation'}


In [None]:
!

In [None]:
def clip_label_by_generated_captions(images, cluster_labels):
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("clip-ViT-B-32")

    cluster_embeds = defaultdict(list)
    cluster_captions = defaultdict(list)

    # Step 1: Generate captions
    for img, cid in zip(images, cluster_labels):
        if isinstance(img, str): img = Image.open(img).convert("RGB")
        caption = generate_caption(img)
        cluster_captions[cid].append(caption)
        cluster_embeds[cid].append(model.encode(caption))

    cluster_to_label = {}
    for cid, cap_list in cluster_captions.items():
        embeddings = model.encode(cap_list)
        mean_embedding = np.mean(embeddings, axis=0)
        sims = cosine_similarity([mean_embedding], embeddings)[0]
        best_idx = np.argmax(sims)
        cluster_to_label[cid] = cap_list[best_idx]  # Most central caption

    return cluster_to_label


In [None]:
import torchvision.transforms as T
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.ops import nms

det_model = fasterrcnn_resnet50_fpn(pretrained=True).eval()
transform = T.Compose([T.Resize((256, 256)), T.ToTensor()])

COCO_LABELS = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter',
    'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
    'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
    'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog',
    'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table',
    'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
    'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
    'teddy bear', 'hair drier', 'toothbrush'
]

def label_clusters_by_object_tags(images, cluster_labels, threshold=0.8):
    cluster_objects = defaultdict(list)

    for img, cid in zip(images, cluster_labels):
        if isinstance(img, str): img = Image.open(img).convert("RGB")
        inp = transform(img).unsqueeze(0)
        with torch.no_grad():
            preds = det_model(inp)[0]

        keep = preds["scores"] > threshold
        labels = preds["labels"][keep].tolist()
        label_names = [COCO_LABELS[i] for i in labels]
        cluster_objects[cid].extend(label_names)

    # Summarize by most frequent objects
    return {
        cid: ", ".join([w for w, _ in Counter(tags).most_common(3)])
        for cid, tags in cluster_objects.items()
    }


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def label_clusters_with_lda_on_captions(images, cluster_labels, n_topics=1):
    from nltk.corpus import stopwords
    import nltk
    nltk.download("stopwords")

    cluster_to_captions = defaultdict(list)
    for img, cid in zip(images, cluster_labels):
        if isinstance(img, str): img = Image.open(img).convert("RGB")
        cap = generate_caption(img)
        cluster_to_captions[cid].append(cap)

    cluster_to_label = {}
    stop_words = stopwords.words("english")

    for cid, caps in cluster_to_captions.items():
        vec = CountVectorizer(stop_words=stop_words, max_features=500)
        X = vec.fit_transform(caps)
        lda = LatentDirichletAllocation(n_components=n_topics).fit(X)
        words = vec.get_feature_names_out()
        topics = lda.components_
        topic_words = [words[i] for i in topics[0].argsort()[-3:][::-1]]
        cluster_to_label[cid] = ", ".join(topic_words)

    return cluster_to_label


In [13]:
from torchvision import models
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
import torchvision.transforms as T

resnet = models.resnet50(pretrained=True).eval()
imagenet_labels = {i: c.strip() for i, c in enumerate(open("imagenet_classes.txt"))}

preprocess = Compose([
    Resize(256),
    T.CenterCrop(224),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def label_clusters_by_imagenet(images, cluster_labels):
    cluster_preds = defaultdict(list)

    for img, cid in zip(images, cluster_labels):
        if isinstance(img, str): img = Image.open(img).convert("RGB")
        inp = preprocess(img).unsqueeze(0)
        with torch.no_grad():
            logits = resnet(inp)
        pred = logits.argmax(dim=1).item()
        cluster_preds[cid].append(imagenet_labels[pred])

    return {
        cid: Counter(preds).most_common(1)[0][0]
        for cid, preds in cluster_preds.items()
    }




In [None]:
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

def label_clusters_ctfidf(images, cluster_labels, generate_caption, top_n=3):
    """
    images: list of image paths or PIL Images
    cluster_labels: list of cluster IDs (same length as images)
    generate_caption: fn(image) -> str
    top_n: number of terms to return per cluster
    """
    # 1. Build pseudo-documents: {cluster_id: concatenated captions}
    docs = defaultdict(list)
    for img, cid in zip(images, cluster_labels):
        caption = generate_caption(img)
        docs[cid].append(caption.lower())

    # Now docs_strs is list of strings, one per cluster
    cluster_ids = list(docs.keys())
    docs_strs   = [" ".join(docs[cid]) for cid in cluster_ids]

    # 2. Compute TF-IDF where each document = one cluster
    #    min_df=1 to include rare words; you can tune stop_words, ngram_range, etc.
    vectorizer = TfidfVectorizer(
        stop_words="english",
        min_df=1,
        ngram_range=(1,2),   # unigrams + bigrams often help
    )
    X = vectorizer.fit_transform(docs_strs)   # shape: (n_clusters, n_terms)
    terms = vectorizer.get_feature_names_out()

    # 3. Extract top_n terms per cluster by highest TF-IDF score
    cluster_to_labels = {}
    for idx, cid in enumerate(cluster_ids):
        row = X[idx].toarray().flatten()
        top_indices = row.argsort()[::-1][:top_n]
        top_terms   = [terms[i] for i in top_indices]
        cluster_to_labels[cid] = top_terms

    return cluster_to_labels

# --- Example usage ---
if __name__ == "__main__":
    # dummy stubs:
    def generate_caption(img):
        # replace this with your BLIP/mPLUG/etc. captioner
        return "a photo of a cat and a dog"

    images = ["img1.jpg", "img2.jpg", "img3.jpg", "img4.jpg"]
    cluster_labels = [0, 0, 1, 1]

    labels = label_clusters_ctfidf(images, cluster_labels, generate_caption, top_n=3)
    print("Cluster labels:", labels)
    # e.g. Cluster labels: {0: ["cat", "dog", "photo"], 1: ["outdoor", "tree", "sky"]}
