In [2]:
import torch
from torchvision.datasets import Imagenette
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.nn.functional as F
from urllib.request import urlopen
from PIL import Image
from open_clip import create_model_from_pretrained, get_tokenizer
from tqdm import tqdm

In [3]:
## set dataset

transform = transforms.Compose([
    transforms.Resize((224, 224)), 
    transforms.ToTensor()         
])

dataset = Imagenette(root='./imagenette2-320', split='train', size='320px',transform=transform, download=True)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [4]:
## set model 

device = 'mps'
model, preprocess = create_model_from_pretrained('hf-hub:timm/ViT-B-16-SigLIP2')
model = model.to(device)
tokenizer = get_tokenizer('hf-hub:timm/ViT-B-16-SigLIP2')

In [5]:
## Extract Features 

features = []
labels = []

with torch.no_grad(), torch.cuda.amp.autocast():
    for i, (images, label) in enumerate(tqdm(dataloader)):
        features += [model.forward(images.to(device))[0]]
        labels += [label]
        
features = torch.concat(features).cpu().numpy()
labels = torch.concat(labels).cpu().numpy()


print('feature shape',features.shape)
print('labels shape',labels.shape)

  with torch.no_grad(), torch.cuda.amp.autocast():
  0%|          | 0/148 [00:00<?, ?it/s]

100%|██████████| 148/148 [01:06<00:00,  2.22it/s]


feature shape (9469, 768)
labels shape (9469,)


In [10]:
import numpy as np
import hdbscan
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import pairwise_distances
from finch import FINCH
from time import time

start_time = time()
# HDBSCAN clustering
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
hdbscan_labels = hdbscan_clusterer.fit_predict(features)
end_time = time()
hdbscan_time = end_time - start_time

# DBSCAN clustering
start_time = time()
dbscan_clusterer = DBSCAN(eps=0.3, min_samples=10)
dbscan_labels = dbscan_clusterer.fit_predict(features)
end_time = time()
dbscan_time = end_time - start_time

# KMeans clustering
start_time = time()
kmeans_clusterer = KMeans(n_clusters=10, random_state=42)
kmeans_labels = kmeans_clusterer.fit_predict(features)
end_time = time()
kmeans_time = end_time - start_time

# KMeans++ clustering (KMeans with k-means++ initialization)
start_time = time()
kmeans_pp_clusterer = KMeans(n_clusters=10, init='k-means++', random_state=42)
kmeans_pp_labels = kmeans_pp_clusterer.fit_predict(features)
end_time = time()
kmeans_pp_time = end_time - start_time

# FINCH clustering
start_time = time()
c, num_clusters, _ = FINCH(features)
finch_labels = c[:, -2]  # Get the final clustering result
end_time = time()
finch_time = end_time - start_time

times = {
    'HDBSCAN': hdbscan_time,
    'DBSCAN': dbscan_time,
    'KMeans': kmeans_time,
    'KMeans++': kmeans_pp_time,
    'FINCH': finch_time,
}

print('/n ########### /n ')

print("HDBSCAN Labels:", np.unique(hdbscan_labels))
print("DBSCAN Labels:", np.unique(dbscan_labels))
print("KMeans Labels:", np.unique(kmeans_labels))
print("KMeans++ Labels:", np.unique(kmeans_pp_labels))
print("FINCH Labels:", np.unique(finch_labels))



Partition 0: 1230 clusters
Partition 1: 142 clusters
Partition 2: 26 clusters
Partition 3: 10 clusters
Partition 4: 2 clusters
/n ########### /n 
HDBSCAN Labels: [-1  0  1  2  3  4  5  6  7  8  9 10]
DBSCAN Labels: [-1  0  1]
KMeans Labels: [0 1 2 3 4 5 6 7 8 9]
KMeans++ Labels: [0 1 2 3 4 5 6 7 8 9]
FINCH Labels: [0 1 2 3 4 5 6 7 8 9]


In [12]:
# compare clusterings
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score, silhouette_score

def evaluate_clustering(labels, gt_labels, features):
    valid_mask = labels != -1  # Ignore noise points (-1) for DBSCAN/HDBSCAN
    if np.any(valid_mask):
        labels = labels[valid_mask]
        gt_labels = gt_labels[valid_mask]
        features = features[valid_mask]

    ari = adjusted_rand_score(gt_labels, labels)
    nmi = normalized_mutual_info_score(gt_labels, labels)
    fmi = fowlkes_mallows_score(gt_labels, labels)
    silhouette = silhouette_score(features, labels) if len(set(labels)) > 1 else -1  # Silhouette needs >1 cluster

    return {"ARI": ari, "NMI": nmi, "FMI": fmi, "Silhouette": silhouette}

# Evaluate all clusterings
results = {
    "HDBSCAN": evaluate_clustering(hdbscan_labels, labels, features),
    "DBSCAN": evaluate_clustering(dbscan_labels, labels, features),
    "KMeans": evaluate_clustering(kmeans_labels, labels, features),
    "KMeans++": evaluate_clustering(kmeans_pp_labels, labels, features),
    "FINCH": evaluate_clustering(finch_labels, labels, features),
}

# Print results
for method, scores in results.items():
    print(f"{method} -> \
        Time: {times[method]:.4f}, \
        ARI: {scores['ARI']:.4f}, \
        NMI: {scores['NMI']:.4f}, \
        FMI: {scores['FMI']:.4f}, \
        Silhouette: {scores['Silhouette']:.4f}")


HDBSCAN ->         Time: 66.6139,         ARI: 0.9983,         NMI: 0.9980,         FMI: 0.9984,         Silhouette: 0.2549
DBSCAN ->         Time: 0.3103,         ARI: 0.0000,         NMI: 0.0000,         FMI: 0.7051,         Silhouette: 0.1419
KMeans ->         Time: 0.1533,         ARI: 0.8544,         NMI: 0.9390,         FMI: 0.8726,         Silhouette: 0.1768
KMeans++ ->         Time: 0.1475,         ARI: 0.8544,         NMI: 0.9390,         FMI: 0.8726,         Silhouette: 0.1768
FINCH ->         Time: 0.3468,         ARI: 0.8624,         NMI: 0.9412,         FMI: 0.8796,         Silhouette: 0.1735
