## Cat Classification with CLIP Example

In [1]:
import torch
from PIL import Image
import requests
from transformers import AutoProcessor, CLIPModel
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

2025-07-01 07:04:15.989190: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751333656.071605 1889284 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751333656.097083 1889284 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1751333656.237303 1889284 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1751333656.237324 1889284 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1751333656.237326 1889284 computation_placer.cc:177] computation placer alr

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [4]:
def get_common_objects_and_concepts():
    """Curated list of common objects, animals, concepts, emotions, etc."""
    categories = {
        'objects': [
            'chair', 'table', 'car', 'bicycle', 'bottle', 'cup', 'fork', 'knife', 'spoon',
            'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'pizza',
            'donut', 'cake', 'bed', 'toilet', 'laptop', 'mouse', 'remote', 'keyboard',
            'cell phone', 'book', 'clock', 'scissors', 'teddy bear', 'hair dryer',
            'toothbrush', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
            'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
            'skateboard', 'surfboard', 'tennis racket', 'man', 'woman'
        ],
        'animals': [
            'person', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
            'giraffe', 'bird', 'chicken', 'duck', 'eagle', 'owl', 'fish', 'shark', 'whale',
            'dolphin', 'turtle', 'frog', 'snake', 'spider', 'bee', 'butterfly', 'lion',
            'tiger', 'fox', 'wolf', 'rabbit', 'hamster', 'mouse', 'rat'
        ],
        'clothing': [
            'hat', 'cap', 'helmet', 'glasses', 'sunglasses', 'shirt', 't-shirt', 'sweater',
            'jacket', 'coat', 'dress', 'skirt', 'pants', 'jeans', 'shorts', 'shoes',
            'sneakers', 'boots', 'sandals', 'socks', 'tie', 'scarf', 'gloves', 'belt',
            'watch', 'ring', 'necklace', 'earrings', 'bracelet'
        ],
        'emotions': [
            'happy', 'sad', 'angry', 'surprised', 'excited', 'calm', 'peaceful', 'joyful',
            'melancholy', 'nostalgic', 'anxious', 'confident', 'mysterious', 'dramatic',
            'romantic', 'energetic', 'serene', 'tense', 'playful', 'serious'
        ],
        'abstract_concepts': [
            'freedom', 'justice', 'peace', 'war', 'love', 'hate', 'beauty', 'ugliness',
            'truth', 'lie', 'innovation', 'tradition', 'progress', 'chaos', 'order',
            'simplicity', 'complexity', 'elegance', 'roughness', 'sophistication'
        ],
        'colors': [
            'red', 'blue', 'green', 'yellow', 'orange', 'purple', 'pink', 'brown',
            'black', 'white', 'gray', 'silver', 'gold', 'cyan', 'magenta', 'lime',
            'navy', 'maroon', 'olive', 'aqua'
        ],
        'styles': [
            'modern', 'vintage', 'classic', 'contemporary', 'abstract', 'realistic',
            'minimalist', 'ornate', 'rustic', 'elegant', 'casual', 'formal', 'artistic',
            'professional', 'creative', 'traditional', 'futuristic', 'retro'
        ],
        'activities': [
            'running', 'walking', 'jumping', 'dancing', 'singing', 'reading', 'writing',
            'cooking', 'eating', 'sleeping', 'working', 'playing', 'studying', 'exercising',
            'swimming', 'flying', 'driving', 'riding', 'climbing', 'surfing'
        ]
    }
    
    # Flatten all categories
    all_concepts = []
    for category, items in categories.items():
        if category == "objects":
            items = ['A photo of a ' + w for w in items]
        all_concepts.extend(items)
    
    return all_concepts, categories

In [5]:
words, categories=get_common_objects_and_concepts()

In [6]:
image = Image.open('n02123045_1955.jpg')

inputs = processor(images=image, return_tensors="pt").to(device)
with torch.no_grad():
    image_features = model.get_image_features(**inputs)
    # Normalize the embeddings
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)

In [13]:
all_text_features=[]
for i, word in enumerate(words):
    inputs=processor(text='A photo of a '+ word, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        text_features = model.get_text_features(**inputs)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        all_text_features.append(text_features)
all_text_features=torch.concat(all_text_features)
text_features

tensor([[ 2.2236e-02,  1.4586e-02,  1.2724e-02, -2.3739e-03, -2.9229e-02,
         -1.3596e-02, -2.4281e-02, -6.7802e-02, -1.9731e-02, -2.5305e-02,
          2.8716e-03, -5.2856e-03,  1.5286e-02,  1.1369e-02, -2.3888e-02,
          3.3212e-04,  2.9863e-03, -8.7809e-03,  1.4459e-02,  1.8365e-02,
          2.3312e-02,  3.8955e-02,  3.0983e-02,  1.0618e-02, -2.0888e-02,
          4.0665e-02, -3.7335e-02,  1.8763e-02, -2.1135e-02, -2.1885e-02,
         -2.1633e-03,  7.8178e-03,  2.5044e-02, -1.4591e-02, -5.6150e-02,
         -3.4918e-02,  4.7537e-02, -2.2206e-02, -5.5419e-03,  1.3660e-02,
         -3.0157e-02,  2.9535e-02, -1.1599e-03,  2.4999e-02,  2.5118e-02,
          1.0918e-02,  1.6904e-02, -7.6538e-03,  5.0484e-02,  1.0639e-02,
          2.4562e-03, -1.3320e-02,  4.0455e-02,  8.7078e-03,  1.2177e-02,
         -3.5788e-02, -7.3476e-03,  2.4452e-02, -1.2221e-02,  3.5828e-02,
          4.0692e-02,  3.6157e-03, -2.2837e-02, -5.8215e-03,  2.4653e-02,
         -1.2822e-03,  9.5265e-03,  6.

In [8]:
all_text_features.shape

torch.Size([208, 512])

In [9]:
similarities = torch.cosine_similarity(image_features, all_text_features, dim=1)

In [10]:
similarities.shape

torch.Size([208])

In [19]:
top_k = 200
top_k_indices = similarities.argsort(descending=True)[:top_k]

results = []
for idx in top_k_indices:
    emb = all_text_features[idx].cpu().tolist()
    emb_short = [round(emb[0], 2), round(emb[1], 2), round(emb[-1], 2)]
    results.append({
        'text': words[idx],
        'similarity': similarities[idx].item(),
        'embedding': emb_short
    })

import pandas as pd

df_results = pd.DataFrame(results)
df_results.to_csv('cat_similarity.csv', index=False)
