In [None]:
import clustering as cl
import numpy as np
import matplotlib.pyplot as plt
import umap
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
import torchvision
import torchvision.transforms as transforms




# Base locations. `directory` is the root under which torchvision keeps the
# CIFAR-100 files; `direc` is the folder holding the pre-computed feature file.
# NOTE(review): both are empty here, so the paths built below resolve to
# "/data" and "/cifar100_dino2giant_train_features.npy" at the filesystem
# root -- presumably placeholders to be filled in before running; confirm.
directory = ""
direc   = ""

# Pre-computed features for the training split, loaded through the project
# helper (assumed to be one row per training image -- checked further down).
features = cl.load_features(direc + "/cifar100_dino2giant_train_features.npy")
print(f"Features shape: {features.shape}")

# CIFAR-100 per-channel mean and std (for normalization)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5071, 0.4867, 0.4408],
                         std=[0.2675, 0.2565, 0.2761])
])

# The dataset object is only used here as the source of ground-truth labels.
train_dataset = torchvision.datasets.CIFAR100(root=directory + '/data',
                                               train=True,
                                               download=True,
                                               transform=transform)
labels = np.array(train_dataset.targets)
print(f"Total samples: {labels.shape[0]}")

# Fail fast if the feature file and the label vector are not aligned;
# every later cell indexes both arrays with the same masks.
assert features.shape[0] == labels.shape[0], (
    f"features has {features.shape[0]} rows but there are "
    f"{labels.shape[0]} labels"
)




In [None]:
# Rescale every feature vector with sklearn's `normalize` (default settings:
# unit L2 norm per row) before computing the embedding.
X_all_norm = normalize(features)

# Seeded 3-D UMAP embedding of the normalized features.
reducer = umap.UMAP(n_neighbors=10, n_components=3, random_state=42)
umap_all = reducer.fit_transform(X_all_norm)

# K-means in the embedded space: 100 clusters, 200 restarts, seeded.
kmeans_all = KMeans(n_clusters=100, n_init=200, max_iter=10000, random_state=42)
kmeans_all.fit(umap_all)
y_pred_all = kmeans_all.labels_

# Score the cluster assignment against the ground-truth labels.
acc = cl.clustering_accuracy(y_pred_all, labels)
# Return value is not used here -- presumably the helper reports ARI/NMI
# itself; TODO confirm.
cl.ari_nmi(y_pred_all, labels)
print(f"All clustering accuracy: {acc:.4f}")

In [16]:
# Per-cluster centroid: mean embedding of the points assigned to that cluster.
centroids = np.stack(
    [umap_all[y_pred_all == cluster_id].mean(axis=0)
     for cluster_id in range(kmeans_all.n_clusters)],
    axis=0,
)

# Euclidean distance from every point to the centroid of its own cluster.
offsets = umap_all - centroids[y_pred_all]
dists = np.linalg.norm(offsets, axis=1)

# "Core" points: within each cluster, those whose distance to the centroid
# is at or below that cluster's 25th percentile.
core_mask = np.zeros_like(y_pred_all, dtype=bool)
for cluster_id in range(kmeans_all.n_clusters):
    in_cluster = y_pred_all == cluster_id
    cluster_dists = dists[in_cluster]
    cutoff = np.percentile(cluster_dists, 25)
    core_mask[in_cluster] = cluster_dists <= cutoff

# Accuracy restricted to the core subset.
core_preds = y_pred_all[core_mask]
core_labels = labels[core_mask]
core_acc = cl.clustering_accuracy(core_preds, core_labels)

print(f"Core (25%) clustering accuracy: {core_acc:.4f}")



Core (25%) clustering accuracy: 0.8682


In [15]:
# Core subset at the 50th percentile: per cluster, keep the points whose
# centroid distance (`dists`, computed in an earlier cell) is at or below the
# cluster's median distance.
core_mask = np.zeros_like(y_pred_all, dtype=bool)
for cluster_id in range(kmeans_all.n_clusters):
    in_cluster = y_pred_all == cluster_id
    cluster_dists = dists[in_cluster]
    cutoff = np.percentile(cluster_dists, 50)
    core_mask[in_cluster] = cluster_dists <= cutoff

# Accuracy restricted to the core subset.
core_preds = y_pred_all[core_mask]
core_labels = labels[core_mask]
core_acc = cl.clustering_accuracy(core_preds, core_labels)

print(f"Core (50%) clustering accuracy: {core_acc:.4f}")



Core (50%) clustering accuracy: 0.8633


In [14]:
# Core subset at the 75th percentile: per cluster, keep the points whose
# centroid distance (`dists`, computed in an earlier cell) is at or below the
# cluster's 75th-percentile distance.
core_mask = np.zeros_like(y_pred_all, dtype=bool)
for cluster_id in range(kmeans_all.n_clusters):
    in_cluster = y_pred_all == cluster_id
    cluster_dists = dists[in_cluster]
    cutoff = np.percentile(cluster_dists, 75)
    core_mask[in_cluster] = cluster_dists <= cutoff

# Accuracy restricted to the core subset.
core_preds = y_pred_all[core_mask]
core_labels = labels[core_mask]
core_acc = cl.clustering_accuracy(core_preds, core_labels)

print(f"Core (75%) clustering accuracy: {core_acc:.4f}")



Core (75%) clustering accuracy: 0.8506


In [13]:
# Core subset at the 100th percentile: the cutoff is each cluster's maximum
# centroid distance (`dists`, computed in an earlier cell), so every point of
# the cluster passes the `<=` test.
core_mask = np.zeros_like(y_pred_all, dtype=bool)
for cluster_id in range(kmeans_all.n_clusters):
    in_cluster = y_pred_all == cluster_id
    cluster_dists = dists[in_cluster]
    cutoff = np.percentile(cluster_dists, 100)
    core_mask[in_cluster] = cluster_dists <= cutoff

# Accuracy restricted to the selected subset.
core_preds = y_pred_all[core_mask]
core_labels = labels[core_mask]
core_acc = cl.clustering_accuracy(core_preds, core_labels)

print(f"Core (100%) clustering accuracy: {core_acc:.4f}")



Core (100%) clustering accuracy: 0.8035
