In [None]:
import os, random, numpy as np, torch, scipy.sparse as sp
from collections import Counter
from sklearn.cluster import SpectralClustering
from sklearn.metrics import pairwise_distances_argmin_min, adjusted_rand_score
from sklearn.metrics.pairwise import rbf_kernel
import timm
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import math
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import DataLoader, Subset, Dataset
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from statistics import mean
from tqdm import tqdm
from torch.optim import lr_scheduler
from sklearn.metrics import accuracy_score, f1_score
import time
import os
import scipy.sparse as sp
import faiss
from scipy.spatial.distance import directed_hausdorff
from sklearn.metrics.pairwise import pairwise_distances

In [None]:
class NumpyDataset(Dataset):
    """Map-style dataset serving ``(image, label)`` pairs from two ``.npy`` files.

    Both arrays are read completely into memory with ``np.load`` when the
    dataset is constructed.

    Args:
        images_path: Path of the ``.npy`` file holding the images.
        labels_path: Path of the ``.npy`` file holding the labels.
        transform: Optional callable applied to the image tensor before it is
            returned.
    """

    def __init__(self, images_path, labels_path, transform=None):
        self.transform = transform
        self.images = np.load(images_path)
        self.labels = np.load(labels_path)

    def __len__(self):
        """Return the number of stored images."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(tensor, label)`` for sample ``idx``.

        The image is multiplied by 255 and truncated to ``uint8`` (presumably
        the stored values are floats in [0, 1] -- confirm against the data),
        2-D images are replicated into three identical channels, the last axis
        is moved to the front, and the result is rescaled to a float32 tensor
        in [0, 1]. The label is returned as a Python ``int``.
        """
        pixels = (self.images[idx] * 255).astype(np.uint8)
        if pixels.ndim == 2:
            # Single-plane image: copy it into three identical channels.
            pixels = np.repeat(pixels[:, :, np.newaxis], 3, axis=-1)
        channels_first = np.transpose(pixels, (2, 0, 1))
        tensor = torch.from_numpy(channels_first).float() / 255.0
        if self.transform:
            tensor = self.transform(tensor)
        return tensor, int(self.labels[idx])

# Per-channel mean / std used by both pipelines below; the values match the
# commonly used ImageNet normalisation statistics.
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

# Training pipeline: random flip, rotation, colour jitter and a random
# 224x224 crop, followed by channel normalisation.
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.RandomResizedCrop(size=224, scale=(0.8, 1.0), antialias=True),
    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
])

# Validation pipeline: deterministic resize to 224x224 plus the same
# channel normalisation, no augmentation.
val_transform = transforms.Compose([
    transforms.Resize(size=(224, 224), antialias=True),
    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
])

In [None]:
def build_knn_graph(features, k_nn=10):
    """Build an undirected k-nearest-neighbour graph with locally scaled weights.

    Every sample is connected to its ``k_nn`` nearest neighbours. The edge
    between ``i`` and ``j`` carries the weight
    ``exp(-d(i, j)**2 / (sigma_i * sigma_j))``, where ``sigma_i`` is the
    distance from ``i`` to the farthest neighbour retrieved for it. If either
    sigma is not positive the edge is still added, with weight ``0.0``.

    Args:
        features: 2-D array-like of shape ``(N, d)`` (it must expose
            ``.shape`` and be accepted by ``NearestNeighbors.fit``).
        k_nn: Requested number of neighbours per sample. It is capped at
            ``N - 1`` because a sample cannot have more neighbours than there
            are other samples.

    Returns:
        ``networkx.Graph`` with nodes ``0 .. N-1`` and a ``weight`` attribute
        on every edge. When ``N <= 1`` or ``k_nn < 1`` the graph has no edges.
    """
    from sklearn.neighbors import NearestNeighbors

    N = features.shape[0]
    G = nx.Graph()
    G.add_nodes_from(range(N))

    # kneighbors() raises ValueError when n_neighbors exceeds the number of
    # fitted samples, so never ask for more than the N - 1 other points.
    k_eff = min(k_nn, N - 1)
    if k_eff < 1:
        return G

    # +1 because querying the training points returns each point itself
    # (normally in column 0), which is skipped below.
    nbrs = NearestNeighbors(n_neighbors=k_eff + 1, algorithm='auto').fit(features)
    distances, indices = nbrs.kneighbors(features)
    # Local scale of each point: distance to its farthest retrieved neighbour.
    sigmas = distances[:, -1]

    for i in range(N):
        for j_idx, j in enumerate(indices[i, 1:], start=1):
            if sigmas[i] > 0 and sigmas[j] > 0:
                w = math.exp(-(distances[i, j_idx] ** 2) / (sigmas[i] * sigmas[j]))
            else:
                w = 0.0
            G.add_edge(i, int(j), weight=float(w))
    return G

from coreset_sc import CoresetSpectralClustering

import numpy as np
from scipy.sparse import csr_matrix

def convert_from_csr_matrix(matrix: csr_matrix):
    """Extract the raw CSR buffers of ``matrix`` as ``(data, indices, indptr)``.

    The column indices of ``matrix`` are sorted in place first. ``indices`` and
    ``indptr`` are returned as ``np.uintp`` copies; ``data`` is returned as
    float64 (a converted copy for float32 input, the matrix's own buffer for
    float64 input).

    Raises:
        ValueError: If the matrix values are neither float32 nor float64.
    """
    matrix.sort_indices()

    value_dtype = matrix.data.dtype
    if value_dtype != np.float32 and value_dtype != np.float64:
        raise ValueError("Data type not supported, expected float32 or float64.")

    # float64 values are handed back without copying; float32 is widened.
    values = matrix.data if value_dtype == np.float64 else matrix.data.astype(np.float64)
    col_indices = matrix.indices.astype(np.uintp)
    row_pointers = matrix.indptr.astype(np.uintp)
    return values, col_indices, row_pointers

In [None]:
def train_model_subset(model, train_loader, val_loader, device,
                       optimizer, scheduler, scaler, num_epochs=20,
                       accumulation_steps=4, patience=5, class_weights=None):
    """Train ``model`` with autocast, gradient accumulation and early stopping.

    Each epoch runs one pass over ``train_loader`` followed by one pass over
    ``val_loader``. The loss is (optionally class-weighted) cross-entropy.
    Gradients are accumulated over ``accumulation_steps`` batches before the
    optimizer steps; a final, shorter group at the end of an epoch is also
    applied so no batch is discarded. Training stops early once the validation
    loss has failed to improve for ``patience`` consecutive epochs. The best
    weights are not restored: the model keeps the state of the last epoch run.

    Args:
        model: Network to train; moved to ``device``.
        train_loader: Iterable of ``(inputs, targets)`` batches supporting
            ``len()``.
        val_loader: Iterable of ``(inputs, targets)`` batches supporting
            ``len()``.
        device: Device the model and batches are moved to.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Scheduler whose ``step`` is called with the epoch's
            validation loss.
        scaler: Gradient scaler providing ``scale``, ``step`` and ``update``.
        num_epochs: Maximum number of epochs.
        accumulation_steps: Number of batches whose gradients are accumulated
            per optimizer step.
        patience: Number of non-improving epochs tolerated before stopping.
        class_weights: Optional weight tensor passed to ``CrossEntropyLoss``.

    Returns:
        dict mapping ``'train_loss'``, ``'train_acc'``, ``'train_f1'``,
        ``'val_loss'``, ``'val_acc'`` and ``'val_f1'`` to lists holding one
        value per completed epoch (F1 is the weighted average).
    """
    from sklearn.metrics import accuracy_score, f1_score

    logs = {k: [] for k in ['train_loss', 'train_acc', 'train_f1', 'val_loss', 'val_acc', 'val_f1']}
    model.to(device)
    best_val = float('inf')
    wait = 0
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    num_batches = len(train_loader)

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        running = 0
        all_y = []
        all_p = []
        optimizer.zero_grad()
        for i, (x, y) in enumerate(tqdm(train_loader, desc=f"Train {epoch+1}/{num_epochs}")):
            x, y = x.to(device), y.to(device)
            with torch.cuda.amp.autocast():
                out = model(x)
                # Divide so the accumulated gradient averages over the group.
                loss = criterion(out, y) / accumulation_steps
            scaler.scale(loss).backward()
            # Step at the end of every full group AND on the last batch of the
            # epoch. Without the second condition the trailing batches would be
            # thrown away, and a loader with fewer than `accumulation_steps`
            # batches would never update the model.
            if (i + 1) % accumulation_steps == 0 or (i + 1) == num_batches:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
            # Undo the division so the logged loss is the per-batch loss.
            running += loss.item() * accumulation_steps
            preds = out.argmax(1)
            all_y.extend(y.cpu().numpy())
            all_p.extend(preds.cpu().numpy())
        train_loss = running / len(train_loader)
        train_acc = accuracy_score(all_y, all_p)
        train_f1 = f1_score(all_y, all_p, average='weighted')

        # ---- validation pass ----
        model.eval()
        vl = 0
        vy = []
        vp = []
        with torch.no_grad():
            for x, y in tqdm(val_loader, desc="Validate"):
                x, y = x.to(device), y.to(device)
                with torch.cuda.amp.autocast():
                    out = model(x)
                    loss = criterion(out, y)
                vl += loss.item()
                prs = out.argmax(1)
                vy.extend(y.cpu().numpy())
                vp.extend(prs.cpu().numpy())
        val_loss = vl / len(val_loader)
        val_acc = accuracy_score(vy, vp)
        val_f1 = f1_score(vy, vp, average='weighted')

        logs['train_loss'].append(train_loss)
        logs['train_acc'].append(train_acc)
        logs['train_f1'].append(train_f1)
        logs['val_loss'].append(val_loss)
        logs['val_acc'].append(val_acc)
        logs['val_f1'].append(val_f1)

        scheduler.step(val_loss)

        # ---- early stopping on validation loss ----
        if val_loss < best_val:
            best_val = val_loss
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                break
    return logs

In [None]:
def _select_class_coreset(feats_c, desired, k_nn):
    """Choose ``desired`` row positions of ``feats_c`` via coreset spectral clustering.

    Falls back to uniform random sampling when the clustering raises
    ``ValueError`` or returns fewer than ``desired`` indices, and pads with
    randomly drawn unused rows when the returned indices contain duplicates.
    Callers are expected to pass ``1 < desired < len(feats_c)``.
    """
    n = len(feats_c)

    # A point has at most n - 1 other points to be linked to; asking the
    # neighbour search for more raises, so cap k for small classes.
    graph = build_knn_graph(feats_c, k_nn=min(k_nn, n - 1))
    adjacency = nx.adjacency_matrix(graph, weight='weight').tocsr()
    adjacency.setdiag(1.0)
    data, ind, ptr = convert_from_csr_matrix(adjacency)
    affinity = csr_matrix((data, ind, ptr), shape=adjacency.shape)

    model_cs = CoresetSpectralClustering(
        num_clusters=desired,
        coreset_ratio=desired / n,
        k_over_sampling_factor=5.0,
        shift=0.01,
        full_labels=True,
        ignore_warnings=True
    )
    try:
        model_cs.fit(affinity)
        core = model_cs.coreset_indices_.astype(int)
    except ValueError:
        core = None

    if core is None or len(core) < desired:
        # Clustering failed or produced too small a coreset: sample uniformly.
        return np.random.choice(n, desired, replace=False)

    unique_core = np.unique(core)
    if len(unique_core) >= desired:
        return unique_core[:desired]

    # Duplicates left us short: top up with rows not already selected.
    candidates = np.setdiff1d(np.arange(n), unique_core)
    extra = np.random.choice(candidates, desired - len(unique_core), replace=False)
    return np.concatenate([unique_core, extra])


def run_experiment_mink(
    features, labels, val_dataset,
    percentages=[0.1,1,5,10], seeds=[42,43,45],
    k_nn=50,
    batch_size=16, lr=1e-4, device='cpu',
    data_dir=None
):
    """Train on per-class coreset subsets of several sizes and summarise the results.

    For every percentage ``p`` and every seed, a subset containing
    ``ceil(p% * class_size)`` samples (at least one) of each class is selected,
    a pretrained ``vgg16`` from ``timm`` is fine-tuned on it, and the metrics
    of the last training epoch are recorded. Selection per class:

    * one sample requested -> a single uniformly random sample;
    * at least the whole class requested -> the whole class;
    * otherwise -> coreset spectral clustering on a k-NN graph of the class
      features, with random sampling as fallback.

    The k-NN size shrinks with ``p``: ``max(5, int(k_nn * (1 - p/100)))``.

    Args:
        features: Array of per-sample feature vectors, indexable by an index
            array. Presumably row-aligned with ``train_images.npy`` -- confirm.
        labels: NumPy array of class labels, one per row of ``features``.
        val_dataset: Dataset used for validation in every run.
        percentages: Subset sizes in percent of each class.
        seeds: Random seeds; one full run is performed per seed.
        k_nn: Base number of neighbours for the k-NN graph.
        batch_size: Batch size of the training and validation loaders.
        lr: Adam learning rate.
        device: Device used for the class weights and for training.
        data_dir: Directory containing ``train_images.npy`` and
            ``train_labels.npy``. When ``None`` the notebook-level global
            ``path`` is used.

    Returns:
        dict mapping each percentage that produced at least one run to a dict
        with ``'acc_mean'``, ``'acc_std'``, ``'f1_mean'``, ``'haus_mean'`` and
        ``'time_mean'`` aggregated over seeds. ``haus`` is the mean over
        classes of the symmetric Hausdorff distance between a class's features
        and its selected subset (0 for the two trivial selection cases).
    """
    MIN_KNN = 5

    if data_dir is None:
        # Falls back to the notebook global; a NameError here means `path`
        # was never defined in the session.
        data_dir = path

    results = {}
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    counts = dict(zip(*np.unique(labels, return_counts=True)))
    total  = len(labels)
    classes = sorted(counts)
    # Inverse-frequency class weights, ordered by sorted class label.
    cw     = torch.tensor([total/counts[c] for c in classes],
                         device=device, dtype=torch.float32)

    # Gradient scaling is only meaningful for CUDA mixed precision; keep the
    # scaler disabled on other devices.
    use_amp = torch.device(device).type == 'cuda'

    # The training arrays are identical for every run, so load them once
    # instead of once per (percentage, seed).
    train_ds = NumpyDataset(
        os.path.join(data_dir,'train_images.npy'),
        os.path.join(data_dir,'train_labels.npy'),
        transform=train_transform
    )

    for p in percentages:
        k_curr = max(MIN_KNN, int(k_nn * (1 - p/100.0)))
        print(f"\n— ε={p}%: using k_nn={k_curr}")

        stats=[]
        for seed in seeds:
            random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
            sel_idx=[]; haus_vals=[]; t0=time.time()

            for c in classes:
                idx_c   = np.where(labels==c)[0]
                feats_c = features[idx_c]
                n       = len(idx_c)
                desired = max(1, math.ceil((p/100)*n))

                if desired == 1:
                    # One representative: pick it uniformly at random.
                    loc = np.random.choice(n, 1, replace=False)
                    sel_idx.append(idx_c[loc].item())
                    haus_vals.append(0.)
                    continue
                if desired >= n:
                    # Whole class requested: nothing to select.
                    sel_idx.extend(idx_c.tolist())
                    haus_vals.append(0.)
                    continue

                loc = _select_class_coreset(feats_c, desired, k_curr)

                sel_idx.extend(idx_c[loc].tolist())
                # Symmetric Hausdorff distance between the class and its subset.
                haus_vals.append(
                    max(
                        directed_hausdorff(feats_c, feats_c[loc])[0],
                        directed_hausdorff(feats_c[loc], feats_c)[0]
                    )
                )

            if not sel_idx:
                continue

            train_loader = DataLoader(Subset(train_ds, sel_idx),
                                      batch_size=batch_size, shuffle=True)
            model = timm.create_model('vgg16', pretrained=True,
                                      num_classes=len(counts)).to(device)
            opt   = optim.Adam(model.parameters(), lr=lr)
            sch   = optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min',
                                                        factor=0.1, patience=5)
            scaler= torch.cuda.amp.GradScaler(enabled=use_amp)

            logs = train_model_subset(
                model, train_loader, val_loader,
                device, opt, sch, scaler,
                num_epochs=20, accumulation_steps=4,
                patience=5, class_weights=cw
            )

            # Metrics are those of the last epoch run, not of the best epoch.
            stats.append({
                 'acc': logs['val_acc'][-1],
                 'f1':  logs['val_f1'][-1],
                 'haus': float(np.mean(haus_vals)),
                 'time': time.time()-t0,
                 'n':   len(sel_idx)
            })

        if not stats:
            print(f"  ε={p}% → no valid runs, skipping")
            continue

        summary = np.array([[s['acc'],s['f1'],s['haus'],s['time']] for s in stats])
        results[p] = {
            'acc_mean': summary[:,0].mean(),
            'acc_std':  summary[:,0].std(),
            'f1_mean':  summary[:,1].mean(),
            'haus_mean':summary[:,2].mean(),
            'time_mean':summary[:,3].mean()
        }
        print(f" ε={p}% → acc={results[p]['acc_mean']:.3f}±{results[p]['acc_std']:.3f}, "
              f"f1={results[p]['f1_mean']:.3f}, haus={results[p]['haus_mean']:.3f}, "
              f"time={results[p]['time_mean']:.1f}s")

    return results