Install required packages and import libraries

In [None]:
# Install required packages
!pip install torchvision
!pip install sentence-transformers
!pip install tqdm

# PyTorch core
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset

# Torchvision for pretrained models and image processing
import torchvision
from torchvision import datasets, transforms, models

# Sentence-BERT for semantic text embeddings
from sentence_transformers import SentenceTransformer

# Utilities
import numpy as np
import random
from tqdm import tqdm
import os
from sklearn.metrics.pairwise import cosine_similarity

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchvision)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchvision)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchvision)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86

Download and preprocess the dataset

In [None]:
# Folder that torchvision downloads the dataset archives into
data_dir = "./data"

# Per-channel ImageNet statistics consumed by the Normalize step below
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Preprocessing pipeline, applied to every image in this order:
#   1. resize to 224x224 (standard input size for ResNet)
#   2. convert the image to a PyTorch tensor
#   3. normalise each channel with the ImageNet mean/std
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
]
transform = transforms.Compose(preprocessing_steps)

# Fetch (if needed) and open the Oxford-IIIT Pet dataset.
# target_types="category" makes each sample's target the class label
# rather than a segmentation mask.
dataset = datasets.OxfordIIITPet(
    root=data_dir,
    target_types="category",
    transform=transform,
    download=True,
)

# Report dataset size and number of classes
num_classes = len(dataset.classes)
print(f"✅ Dataset loaded with {len(dataset)} images and {num_classes} unique classes.")

100%|██████████| 792M/792M [00:21<00:00, 36.8MB/s]
100%|██████████| 19.2M/19.2M [00:01<00:00, 13.5MB/s]


✅ Dataset loaded with 3680 images and 37 unique classes.


Split classes into seen and unseen sets

In [None]:
import random
import json
from collections import defaultdict

# Get the full list of class names (breeds)
all_classes = dataset.classes
print(f"Total classes: {len(all_classes)}")

# Fix the random seed so the shuffle (and therefore the split) is reproducible
random.seed(42)

# Shuffle a copy: dataset.classes itself must keep its order, because a
# sample's integer label is used below as an index into all_classes
shuffled_classes = all_classes.copy()
random.shuffle(shuffled_classes)

# Define split ratio
num_seen = int(len(shuffled_classes) * 0.7)  # 70% seen, 30% unseen
seen_classes = shuffled_classes[:num_seen]
unseen_classes = shuffled_classes[num_seen:]

print(f"Seen classes ({len(seen_classes)}): {seen_classes}")
print(f"Unseen classes ({len(unseen_classes)}): {unseen_classes}")

# Save split to JSON for reuse
with open("class_split.json", "w") as f:
    json.dump({
        "seen": seen_classes,
        "unseen": unseen_classes
    }, f, indent=4)

# Create index mappings for class labels
class_to_idx = dataset.class_to_idx

# Per-sample integer labels. Indexing dataset[idx] opens, resizes and
# normalises the image just to hand back its label, so prefer the label list
# torchvision keeps on the dataset object.
# NOTE(review): `_labels` is a private attribute of OxfordIIITPet; if it is
# missing we fall back to the slow per-sample lookup.
dataset_labels = getattr(dataset, "_labels", None)
if dataset_labels is None:
    dataset_labels = [dataset[idx][1] for idx in range(len(dataset))]

# Separate dataset into seen and unseen based on labels
seen_class_set = set(seen_classes)  # O(1) membership test per sample
seen_indices = []
unseen_indices = []

for idx, label in enumerate(dataset_labels):
    if all_classes[label] in seen_class_set:
        seen_indices.append(idx)
    else:
        unseen_indices.append(idx)

# Every sample must land in exactly one of the two partitions
assert len(seen_indices) + len(unseen_indices) == len(dataset)

print(f"Images in seen set: {len(seen_indices)}")
print(f"Images in unseen set: {len(unseen_indices)}")
print("✅ Class split saved as 'class_split.json'")

Total classes: 37
Seen classes (25): ['Bengal', 'Maine Coon', 'English Cocker Spaniel', 'British Shorthair', 'Newfoundland', 'Ragdoll', 'Russian Blue', 'Beagle', 'Pomeranian', 'Samoyed', 'Sphynx', 'Shiba Inu', 'Siamese', 'Chihuahua', 'Egyptian Mau', 'Leonberger', 'Saint Bernard', 'Havanese', 'Yorkshire Terrier', 'Birman', 'Pug', 'Abyssinian', 'Wheaten Terrier', 'English Setter', 'Keeshond']
Unseen classes (12): ['American Pit Bull Terrier', 'Staffordshire Bull Terrier', 'Scottish Terrier', 'Miniature Pinscher', 'Basset Hound', 'Persian', 'Boxer', 'German Shorthaired', 'Great Pyrenees', 'Japanese Chin', 'American Bulldog', 'Bombay']
Images in seen set: 2484
Images in unseen set: 1196
✅ Class split saved as 'class_split.json'


Define class descriptions for seen and unseen classes

In [None]:
# Define descriptive sentences for each class.
# Maps breed name -> one-sentence natural-language description of the breed.
# The table is written to class_descriptions.json below; later cells encode
# these sentences with Sentence-BERT and look entries up by class name, so
# every key has to match a class name used by the dataset.
class_descriptions = {
    # Cat breeds
    "Abyssinian": "A sleek, short-haired cat with large ears and a warm reddish-brown coat.",
    "Bengal": "A cat with a muscular body and a distinctive spotted or marbled coat like a leopard.",
    "Birman": "A long-haired cat with blue eyes, white-gloved paws, and a cream-colored coat.",
    "Bombay": "A short-haired cat with a shiny black coat and striking copper eyes.",
    "British Shorthair": "A round-faced, dense-coated cat known for its calm nature.",
    "Egyptian Mau": "A spotted cat breed with a muscular body and a naturally worried expression.",
    "Maine Coon": "A large, fluffy cat with a bushy tail and tufted ears, known for its gentle temperament.",
    "Persian": "A cat with a flat face and long, flowing coat requiring frequent grooming.",
    "Ragdoll": "A large, affectionate cat that goes limp when held, with a semi-long coat.",
    "Russian Blue": "A slim, graceful cat with a short, silvery-blue coat and green eyes.",
    "Siamese": "A slender cat with a light-colored body and darker points on the ears, face, paws, and tail.",
    "Sphynx": "A nearly hairless cat with wrinkled skin and large ears.",
    # Dog breeds
    "American Bulldog": "A strong, muscular dog with a broad chest and confident demeanor.",
    "American Pit Bull Terrier": "A medium-sized dog with a solid build and a short coat, known for loyalty.",
    "Basset Hound": "A dog with long ears, droopy eyes, and a strong sense of smell.",
    "Beagle": "A small hound with a tricolor coat, floppy ears, and a keen nose.",
    "Boxer": "A strong, athletic dog with a square jaw and playful nature.",
    "Chihuahua": "A very small dog with big eyes and ears, known for bold personality.",
    "English Cocker Spaniel": "A dog with long, feathered ears and a silky coat.",
    "English Setter": "A medium-large dog with a speckled coat and friendly disposition.",
    "German Shorthaired": "A hunting dog with a short, spotted coat and high energy.",
    "Great Pyrenees": "A large, white fluffy dog bred to guard livestock.",
    "Havanese": "A small companion dog with a silky coat and cheerful personality.",
    "Japanese Chin": "A toy dog with a flat face, feathered tail, and elegant manner.",
    "Keeshond": "A medium-sized dog with a fox-like face and thick, grey coat.",
    "Leonberger": "A giant, gentle dog with a thick coat and lion-like mane.",
    "Miniature Pinscher": "A tiny, energetic dog with a sleek black and tan coat.",
    "Newfoundland": "A massive, water-loving dog with a thick coat and sweet nature.",
    "Pomeranian": "A tiny fluffy dog with a fox-like face and vibrant personality.",
    "Pug": "A small, wrinkled dog with a curled tail and expressive eyes.",
    "Saint Bernard": "A huge, friendly dog often used for alpine rescue, with a thick coat.",
    "Samoyed": "A fluffy white dog with a 'smiling' face and friendly nature.",
    "Scottish Terrier": "A small dog with a wiry coat and distinctive beard.",
    "Shiba Inu": "A small, alert Japanese dog with a curled tail and fox-like face.",
    "Staffordshire Bull Terrier": "A muscular dog with a short coat and strong loyalty.",
    "Wheaten Terrier": "A soft-coated dog with a happy disposition and shaggy fur.",
    "Yorkshire Terrier": "A tiny dog with long silky hair and a bold attitude."
}

import json

# Serialise the description table as 4-space-indented JSON, then write it out
# so later cells can reload it from disk
descriptions_json = json.dumps(class_descriptions, indent=4)
with open("class_descriptions.json", "w") as f:
    f.write(descriptions_json)

print("Descriptions saved successfully to class_descriptions.json")

Descriptions saved successfully to class_descriptions.json


Generating class embeddings using Sentence-BERT

In [None]:
from sentence_transformers import SentenceTransformer
import json
import torch

# Step 1: Load class descriptions from file
with open("class_descriptions.json", "r") as f:
    class_descriptions = json.load(f)

# Step 2: Load the Sentence-BERT model
# Bound to a dedicated name so the text encoder stays reachable after `model`
# is rebound to the image->text mapping network later in the notebook.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')  # Embedding size = 384
# Alias kept for backward compatibility with code that refers to the encoder as `model`
model = sbert_model

# Step 3: Generate embeddings
# class_names and descriptions are parallel lists (same order as the JSON keys)
class_names = list(class_descriptions.keys())
descriptions = [class_descriptions[class_name] for class_name in class_names]

# One embedding row per description (a NumPy array by default);
# normalize_embeddings=True L2-normalises each row, so dot product == cosine similarity
embeddings = sbert_model.encode(descriptions, normalize_embeddings=True)

# Step 4: Convert to a dictionary: class name -> 1-D torch tensor
class_embeddings = {
    class_name: torch.tensor(embedding) for class_name, embedding in zip(class_names, embeddings)
}

# Step 5: Optionally save to file for reuse
torch.save(class_embeddings, "class_embeddings.pt")
print("Class embeddings saved to class_embeddings.pt")

Class embeddings saved to class_embeddings.pt


Extracting features from seen class images

In [None]:
from tqdm import tqdm

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load ImageNet-pretrained ResNet18 and remove the final classification layer.
# `weights=` replaces the deprecated `pretrained=True` flag; IMAGENET1K_V1 is
# the checkpoint that flag resolved to. With fc replaced by Identity the
# network returns the pooled backbone features instead of class logits.
resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
resnet.fc = torch.nn.Identity()
resnet.eval()  # inference mode: fixed batch-norm statistics
resnet = resnet.to(device)

# Load class split from JSON
with open("class_split.json", "r") as f:
    split_data = json.load(f)
seen_classes = split_data["seen"]

# Map class names to indices from dataset (and back)
class_to_idx = dataset.class_to_idx
idx_to_class = {v: k for k, v in class_to_idx.items()}
seen_class_indices = [class_to_idx[c] for c in seen_classes]

# Per-sample integer labels. Iterating the dataset decodes and transforms
# every image just to read its label, so prefer the label list torchvision
# keeps on the dataset object.
# NOTE(review): `_labels` is a private attribute of OxfordIIITPet; if it is
# missing we fall back to the slow per-sample lookup.
dataset_labels = getattr(dataset, "_labels", None)
if dataset_labels is None:
    dataset_labels = [dataset[i][1] for i in range(len(dataset))]

# Get indices of samples from seen classes
seen_label_set = set(seen_class_indices)  # O(1) membership test per sample
seen_indices = [i for i, label in enumerate(dataset_labels) if label in seen_label_set]

# Create subset and DataLoader (shuffle=False keeps features aligned with seen_indices)
seen_subset = Subset(dataset, seen_indices)
seen_loader = DataLoader(seen_subset, batch_size=32, shuffle=False)

# Feature extraction
all_features = []  # one CPU tensor per batch
all_labels = []    # class-name string per sample, same order as the features

with torch.no_grad():
    for images, labels in tqdm(seen_loader, desc="Extracting features from seen classes"):
        images = images.to(device)
        features = resnet(images)
        all_features.append(features.cpu())

        # Convert numeric labels to class names (important for matching with embeddings)
        batch_class_names = [idx_to_class[int(label)] for label in labels]
        all_labels.extend(batch_class_names)

# Save extracted features and corresponding class names
torch.save({
    "features": torch.cat(all_features),
    "labels": all_labels
}, "seen_features.pt")


Extracting features from seen classes: 100%|██████████| 78/78 [04:04<00:00,  3.13s/it]


Training a mapping function

In [None]:
from torch.utils.data import TensorDataset

# Load class descriptions (kept under `class_desc` for later use)
with open("class_descriptions.json", "r") as f:
    class_desc = json.load(f)

# Embeddings for seen classes (same order as seen_classes).
# Looked up from the `class_embeddings` table computed in the embedding cell
# rather than re-encoded through `model`: `model` is rebound to the mapping
# network further down, so calling model.encode here would fail as soon as
# this cell is run a second time.
seen_class_embeddings = torch.stack(
    [class_embeddings[cls] for cls in seen_classes]
)  # Shape: (num_seen_classes, emb_dim)

# Load extracted features and labels for seen classes
data = torch.load("seen_features.pt")
features = data["features"]          # Tensor of shape (num_samples, feature_dim)
labels = data["labels"]              # List of class names (strings) for each sample

# Create target embeddings for each feature based on label
label_to_embedding = {cls: emb for cls, emb in zip(seen_classes, seen_class_embeddings)}

targets = torch.stack([label_to_embedding[label] for label in labels])

# Normalize features and targets for cosine similarity learning
features = nn.functional.normalize(features, dim=1)
targets = nn.functional.normalize(targets, dim=1)

# Each image feature needs exactly one target embedding
assert features.shape[0] == targets.shape[0], "features/targets row count mismatch"

# Dataset and DataLoader
train_dataset = TensorDataset(features, targets)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Define the mapping model: a simple linear layer
class MappingModel(nn.Module):
    """Affine projection from image-feature space to text-embedding space.

    Args:
        input_dim: size of each input feature vector.
        output_dim: size of each projected output vector.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # The attribute name determines the state-dict keys
        # ("linear.weight" / "linear.bias") used when saving and loading.
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Project a batch `x` of shape (batch, input_dim) to (batch, output_dim)."""
        projected = self.linear(x)
        return projected

input_dim = features.shape[1]  # e.g. 512 from ResNet
output_dim = targets.shape[1]  # e.g. 384 from Sentence-BERT

# Seed PyTorch's RNG so the weight initialisation and the DataLoader shuffle
# order repeat across runs (the class split is already seeded the same way)
torch.manual_seed(42)

# NOTE: this rebinds `model`, which earlier in the notebook referred to the
# Sentence-BERT encoder; from here on `model` is the image->text mapping network.
model = MappingModel(input_dim, output_dim).to(device)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 20

model.train()
for epoch in range(epochs):
    epoch_loss = 0.0  # sum of per-sample losses over the epoch
    for batch_features, batch_targets in train_loader:
        batch_features = batch_features.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        outputs = model(batch_features)
        # Targets are unit-length, so compare against unit-length predictions
        outputs = nn.functional.normalize(outputs, dim=1)  # Normalize output

        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()

        # loss is a batch mean; weight it by batch size so the final (possibly
        # smaller) batch does not skew the epoch average
        epoch_loss += loss.item() * batch_features.size(0)

    avg_loss = epoch_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")

# Save the trained mapping model
torch.save(model.state_dict(), "mapping_model.pth")


Epoch [1/20], Loss: 0.0015
Epoch [2/20], Loss: 0.0009
Epoch [3/20], Loss: 0.0007
Epoch [4/20], Loss: 0.0006
Epoch [5/20], Loss: 0.0005
Epoch [6/20], Loss: 0.0005
Epoch [7/20], Loss: 0.0004
Epoch [8/20], Loss: 0.0004
Epoch [9/20], Loss: 0.0004
Epoch [10/20], Loss: 0.0004
Epoch [11/20], Loss: 0.0004
Epoch [12/20], Loss: 0.0004
Epoch [13/20], Loss: 0.0004
Epoch [14/20], Loss: 0.0004
Epoch [15/20], Loss: 0.0003
Epoch [16/20], Loss: 0.0003
Epoch [17/20], Loss: 0.0003
Epoch [18/20], Loss: 0.0003
Epoch [19/20], Loss: 0.0003
Epoch [20/20], Loss: 0.0003


Testing with seen data

In [None]:
# Load trained mapping model
model = MappingModel(input_dim, output_dim).to(device)
# map_location keeps the load working if the checkpoint was saved on another device
model.load_state_dict(torch.load("mapping_model.pth", map_location=device))
model.eval()

all_preds = []
all_true = []

# Loop-invariant: (emb_dim, num_seen_classes) matrix on the compute device
seen_embeddings_t = seen_class_embeddings.T.to(device)

with torch.no_grad():
    for images, labels in seen_loader:
        images = images.to(device)
        features = resnet(images)
        # The mapping model was trained on L2-normalised image features, so
        # inference has to feed it inputs on the same scale
        features = nn.functional.normalize(features, dim=1)
        mapped_features = model(features)
        mapped_features = nn.functional.normalize(mapped_features, dim=1)

        # Similarity of each mapped image to every seen-class embedding
        similarities = torch.matmul(mapped_features, seen_embeddings_t)
        pred_indices = similarities.argmax(dim=1).cpu()

        preds = [seen_classes[i] for i in pred_indices]
        trues = [idx_to_class[label.item()] for label in labels]

        all_preds.extend(preds)
        all_true.extend(trues)

# Print a small sample of true vs predicted labels rather than one line per image
for t, p in list(zip(all_true, all_preds))[:10]:
    print(f"True: {t} \t Predicted: {p}")

from sklearn.metrics import accuracy_score
acc = accuracy_score(all_true, all_preds)
print(f"\nSeen class accuracy: {acc:.4f}")

True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Russian Blue
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssinian 	 Predicted: Abyssinian
True: Abyssini

Testing with unseen classes

In [None]:
# Step 1: Get indices of unseen class samples
unseen_class_indices = [class_to_idx[c] for c in unseen_classes]

# Per-sample integer labels. Iterating the dataset decodes and transforms
# every image just to read its label, so prefer the label list torchvision
# keeps on the dataset object.
# NOTE(review): `_labels` is a private attribute of OxfordIIITPet; if it is
# missing we fall back to the slow per-sample lookup.
dataset_labels = getattr(dataset, "_labels", None)
if dataset_labels is None:
    dataset_labels = [dataset[i][1] for i in range(len(dataset))]

unseen_label_set = set(unseen_class_indices)  # O(1) membership test per sample
unseen_indices = [i for i, label in enumerate(dataset_labels) if label in unseen_label_set]

# Step 2: Create DataLoader for unseen data
unseen_subset = Subset(dataset, unseen_indices)
unseen_loader = DataLoader(unseen_subset, batch_size=32, shuffle=False)

# Step 3: Prepare unseen class embeddings (rows in the same order as unseen_classes).
# Looked up from the `class_embeddings` table built in the embedding cell, so
# this cell does not need a live Sentence-BERT encoder object in the kernel.
unseen_class_embeddings = torch.stack(
    [class_embeddings[cls] for cls in unseen_classes]
).to(device)

# Step 4: Evaluate on unseen classes
all_preds = []
all_true = []

model.eval()
resnet.eval()

with torch.no_grad():
    for images, labels in tqdm(unseen_loader, desc="Testing on unseen classes"):
        images = images.to(device)
        features = resnet(images)
        # The mapping model was trained on L2-normalised image features, so
        # inference has to feed it inputs on the same scale
        features = nn.functional.normalize(features, dim=1)
        mapped_features = model(features)
        mapped_features = nn.functional.normalize(mapped_features, dim=1)

        # Similarity of each mapped image to every unseen-class embedding
        similarities = torch.matmul(mapped_features, unseen_class_embeddings.T)
        pred_indices = similarities.argmax(dim=1).cpu()

        preds = [unseen_classes[i] for i in pred_indices]
        all_preds.extend(preds)
        all_true.extend([idx_to_class[label.item()] for label in labels])

# Step 5: Accuracy and prints
from sklearn.metrics import accuracy_score
acc = accuracy_score(all_true, all_preds)
print(f"\n Unseen class accuracy: {acc:.4f}\n")

# Optional: Print true vs predicted
print("True Label vs Predicted Label (sample 10 rows):")
for true, pred in list(zip(all_true, all_preds))[:10]:
    print(f"True: {true:25s} → Pred: {pred}")

Testing on unseen classes: 100%|██████████| 38/38 [01:55<00:00,  3.04s/it]


 Unseen class accuracy: 0.1664

True Label vs Predicted Label (sample 10 rows):
True: American Bulldog          → Pred: Scottish Terrier
True: American Bulldog          → Pred: Scottish Terrier
True: American Bulldog          → Pred: Scottish Terrier
True: American Bulldog          → Pred: Scottish Terrier
True: American Bulldog          → Pred: Scottish Terrier
True: American Bulldog          → Pred: American Pit Bull Terrier
True: American Bulldog          → Pred: Scottish Terrier
True: American Bulldog          → Pred: American Pit Bull Terrier
True: American Bulldog          → Pred: Scottish Terrier
True: American Bulldog          → Pred: Japanese Chin



