In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import umap
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

import clustering as cl


# Base directory that holds the cached feature file and the CIFAR-10 download.
# Left empty here, which now means "relative to the current working directory".
directory = ""

# Paths are built with os.path.join: plain `directory + "/name"` turns an empty
# `directory` into an absolute path at the filesystem root ("/name").
features = cl.load_features(os.path.join(directory, "cifar10_dino2giant_features.npy"))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# CIFAR-10 mean and std (for normalization)
mean = [0.4914, 0.4822, 0.4465]
std = [0.2470, 0.2435, 0.2616]

# Image preprocessing attached to the dataset object. Only `targets` is read
# from the dataset in this cell, so the pipeline itself is not exercised here.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])
train_dataset = torchvision.datasets.CIFAR10(
    root=os.path.join(directory, 'data'),
    train=True,
    download=True,
    transform=transform,
)

# Ground-truth class ids of the training split, used as the reference labels
# for every clustering score below.
# NOTE(review): assumes the rows of `features` are in the same order as the
# CIFAR-10 training split — TODO confirm against how the .npy was produced.
train_labels = np.array(train_dataset.targets)
labels = train_labels





In [None]:
# Scale each feature row to unit length (sklearn's `normalize` defaults to the
# L2 norm per sample) before building the embedding.
X_all_norm = normalize(features)

# 3-D UMAP embedding of the normalised features; the seed is fixed.
reducer = umap.UMAP(n_components=3, n_neighbors=10, random_state=42)
umap_all = reducer.fit_transform(X_all_norm)

# K-means with 10 clusters on the embedding, seeded for repeatability.
kmeans_all = KMeans(n_clusters=10, n_init=200, max_iter=10000, random_state=42)
kmeans_all.fit(umap_all)
y_pred_all = kmeans_all.labels_

# Score the cluster assignment against the reference labels.
acc = cl.clustering_accuracy(y_pred_all, labels)
# NOTE(review): the return value is discarded here; presumably `ari_nmi`
# prints its metrics itself — confirm, otherwise they are lost.
cl.ari_nmi(y_pred_all, labels)
print(f"All clustering accuracy: {acc:.4f}")

In [7]:
# Centroid of every cluster in the UMAP embedding: the mean of the points that
# k-means assigned to it, stacked so that row k belongs to cluster k.
cluster_means = []
for cluster_id in range(kmeans_all.n_clusters):
    members = umap_all[y_pred_all == cluster_id]
    cluster_means.append(members.mean(axis=0))
centroids = np.vstack(cluster_means)

# Euclidean distance from each point to the centroid of its own cluster.
offsets = umap_all - centroids[y_pred_all]
dists = np.linalg.norm(offsets, axis=1)

# "Core" points: within each cluster, keep the points whose centroid distance
# is at or below that cluster's 25th-percentile distance.
core_mask = np.zeros(y_pred_all.shape, dtype=bool)
for cluster_id in range(kmeans_all.n_clusters):
    in_cluster = y_pred_all == cluster_id
    cutoff = np.percentile(dists[in_cluster], 25)
    core_mask |= in_cluster & (dists <= cutoff)

# Score only the core subset against the reference labels.
core_preds, core_labels = y_pred_all[core_mask], labels[core_mask]
core_acc = cl.clustering_accuracy(core_preds, core_labels)

print(f"Core (25%) clustering accuracy: {core_acc:.4f}")



Core (25%) clustering accuracy: 0.9918


In [11]:
# "Core" points at the 50% level: within each cluster, keep the points whose
# centroid distance is at or below that cluster's 50th-percentile distance.
# Relies on `dists` computed in the earlier centroid-distance cell.
# NOTE(review): same procedure as the other core-percentile cells, only the
# percentile differs — a shared helper taking the percentile would remove the
# duplication and the reuse of `core_*` names across cells.
core_mask = np.zeros(y_pred_all.shape, dtype=bool)
for cluster_id in range(kmeans_all.n_clusters):
    in_cluster = y_pred_all == cluster_id
    cutoff = np.percentile(dists[in_cluster], 50)
    core_mask |= in_cluster & (dists <= cutoff)

# Score only the core subset against the reference labels.
core_preds, core_labels = y_pred_all[core_mask], labels[core_mask]
core_acc = cl.clustering_accuracy(core_preds, core_labels)

print(f"Core (50%) clustering accuracy: {core_acc:.4f}")



Core (50%) clustering accuracy: 0.9928


In [9]:
# "Core" points at the 75% level: within each cluster, keep the points whose
# centroid distance is at or below that cluster's 75th-percentile distance.
# Relies on `dists` computed in the earlier centroid-distance cell.
# NOTE(review): same procedure as the other core-percentile cells, only the
# percentile differs — a shared helper taking the percentile would remove the
# duplication and the reuse of `core_*` names across cells.
core_mask = np.zeros(y_pred_all.shape, dtype=bool)
for cluster_id in range(kmeans_all.n_clusters):
    in_cluster = y_pred_all == cluster_id
    cutoff = np.percentile(dists[in_cluster], 75)
    core_mask |= in_cluster & (dists <= cutoff)

# Score only the core subset against the reference labels.
core_preds, core_labels = y_pred_all[core_mask], labels[core_mask]
core_acc = cl.clustering_accuracy(core_preds, core_labels)

print(f"Core (75%) clustering accuracy: {core_acc:.4f}")



Core (75%) clustering accuracy: 0.9929


In [10]:
# "Core" points at the 100% level: the cutoff is each cluster's maximum
# centroid distance (100th percentile) and the comparison is inclusive, so
# every point of every cluster is selected.
# Relies on `dists` computed in the earlier centroid-distance cell.
# NOTE(review): same procedure as the other core-percentile cells, only the
# percentile differs — a shared helper taking the percentile would remove the
# duplication and the reuse of `core_*` names across cells.
core_mask = np.zeros(y_pred_all.shape, dtype=bool)
for cluster_id in range(kmeans_all.n_clusters):
    in_cluster = y_pred_all == cluster_id
    cutoff = np.percentile(dists[in_cluster], 100)
    core_mask |= in_cluster & (dists <= cutoff)

# Score the selected subset against the reference labels.
core_preds, core_labels = y_pred_all[core_mask], labels[core_mask]
core_acc = cl.clustering_accuracy(core_preds, core_labels)

print(f"Core (100%) clustering accuracy: {core_acc:.4f}")



Core (100%) clustering accuracy: 0.9911
