In [None]:
import json
import os
import random
import time
from collections import Counter

import numpy as np
import pandas as pd
import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from scipy.spatial.distance import directed_hausdorff
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, Subset
from tqdm import tqdm

In [None]:
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value passed to ``random.seed``, ``np.random.seed`` and
            ``torch.manual_seed``. When ``torch.cuda.is_available()`` is true
            it is also passed to ``torch.cuda.manual_seed_all``.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def hausdorff(A: np.ndarray, B: np.ndarray) -> float:
    """Return the symmetric Hausdorff distance between two point sets.

    ``directed_hausdorff`` is evaluated in both directions (A to B, then
    B to A) and the larger of the two distances is returned.

    Args:
        A: 2-D array of points, one point per row.
        B: 2-D array of points with the same number of columns as ``A``.

    Returns:
        The larger of the two directed Hausdorff distances.
    """
    return max(directed_hausdorff(src, dst)[0] for src, dst in ((A, B), (B, A)))

class NumpyDataset(Dataset):
    """Dataset that serves images and labels loaded from two ``.npy`` files.

    Both arrays are read fully into memory on construction. The label array is
    flattened to one dimension.
    """

    def __init__(self, images_path: str, labels_path: str, transform=None):
        """Load the arrays and remember the optional per-sample transform.

        Args:
            images_path: Path of the ``.npy`` file holding the images.
            labels_path: Path of the ``.npy`` file holding the labels.
            transform: Optional callable applied to the image tensor in
                ``__getitem__``.
        """
        self.images = np.load(images_path)
        self.labels = np.load(labels_path).flatten()
        self.transform = transform

    def __len__(self) -> int:
        """Return the number of stored images."""
        return len(self.images)

    def __getitem__(self, idx: int):
        """Return ``(image, label)`` for sample ``idx``.

        The image is multiplied by 255, truncated to ``uint8``, arranged
        channels-first and returned as a ``float32`` tensor; the label is
        returned as a Python ``int``.
        """
        # NOTE(review): the * 255 rescale presumes the stored images are floats
        # in [0, 1]; if the .npy file already holds uint8 pixels this would
        # wrap around -- confirm against how the arrays were exported.
        pixels = (self.images[idx] * 255).astype(np.uint8)
        if pixels.ndim == 2:
            # Grayscale: replicate the single plane into three channels.
            channels_first = np.stack((pixels, pixels, pixels), axis=0)
        else:
            # Move the last axis (channels) to the front.
            channels_first = pixels.transpose(2, 0, 1)
        image = torch.tensor(channels_first, dtype=torch.float32)
        if self.transform:
            image = self.transform(image)
        return image, int(self.labels[idx])

In [None]:
def kmeans_indices(features, labels, percentages, seed):
    """Select per-class representative sample indices with k-means.

    For each percentage ``p`` and each class, ``k = max(1, int(n * (p / 100)))``
    representatives are chosen, where ``n`` is the class size. When ``k > 1``
    the class features are clustered with ``KMeans(n_clusters=k)`` and the
    member closest (Euclidean) to each cluster centre is kept. When ``k == 1``
    the member closest to the class mean is kept.

    Args:
        features: Tensor of per-sample features, one row per sample.
        labels: Tensor of class labels; flattened before use.
        percentages: Iterable of percentages (0-100 scale) to build subsets for.
        seed: ``random_state`` passed to ``KMeans``.

    Returns:
        Dict mapping each percentage to a sorted list of selected indices
        into ``features``.
    """
    # detach() so feature tensors that still track gradients can be converted.
    feats = features.detach().cpu().numpy()
    labs = labels.cpu().numpy().flatten()

    # Class membership does not depend on the percentage: compute it once.
    per_class = [(cls, np.where(labs == cls)[0]) for cls in np.unique(labs)]

    out = {}
    for p in percentages:
        inds = []
        for _, idxs in per_class:
            k = max(1, int(len(idxs) * (p / 100)))
            if k > 1:
                km = KMeans(n_clusters=k, random_state=seed, n_init=10).fit(feats[idxs])
                for c, center in enumerate(km.cluster_centers_):
                    mem = np.where(km.labels_ == c)[0]
                    if mem.size == 0:
                        # KMeans can leave a cluster without members when the
                        # class has fewer distinct points than k; argmin on an
                        # empty array would raise, so skip such clusters.
                        continue
                    dists = np.linalg.norm(feats[idxs[mem]] - center, axis=1)
                    inds.append(idxs[mem[np.argmin(dists)]])
            else:
                centroid = feats[idxs].mean(axis=0)
                rep = idxs[np.argmin(np.linalg.norm(feats[idxs] - centroid, axis=1))]
                inds.append(rep)

        # Defensive backfill: make sure every class has at least one pick.
        covered = set(labs[inds])
        for cls, idxs in per_class:
            if cls not in covered:
                inds.append(np.random.choice(idxs))
        out[p] = sorted(inds)
    return out

def propagation_indices(propagated_feats, labels, percentage):
    """Select, per class, the samples most cosine-similar to the class mean.

    For every class the mean feature vector is computed and the
    ``k = max(1, int(n * (percentage / 100)))`` members with the highest
    cosine similarity to that mean are kept, ``n`` being the class size.

    Args:
        propagated_feats: Tensor of per-sample features, one row per sample.
        labels: Tensor of class labels aligned with ``propagated_feats``.
        percentage: Share of each class to keep, on a 0-100 scale.

    Returns:
        Sorted list of selected sample indices (Python ints).
    """
    feats_cpu = propagated_feats.cpu()
    labels_cpu = labels.cpu()
    fraction = percentage / 100

    selected = []
    for cls in torch.unique(labels_cpu):
        members = torch.nonzero(labels_cpu == cls, as_tuple=True)[0]
        class_feats = feats_cpu[members]
        centroid = class_feats.mean(dim=0, keepdim=True)
        similarity = F.cosine_similarity(class_feats, centroid)
        budget = max(1, int(len(members) * fraction))
        _, best = torch.topk(similarity, budget)
        selected += members[best].cpu().tolist()
    selected.sort()
    return selected

def spectral_indices(spectral_emb, labels, percentage, seed):
    """Select per-class representatives from spectral embeddings.

    For each class, ``k = max(1, int(n * (percentage / 100)))`` samples are
    chosen, ``n`` being the class size. With ``k == 1`` the sample with the
    highest mean cosine similarity to the rest of the class is kept.
    Otherwise the class is clustered with ``KMeans(n_clusters=k)`` and, in
    every cluster, the member with the highest mean cosine similarity to its
    cluster mates is kept.

    Args:
        spectral_emb: Tensor of per-sample embeddings, one row per sample.
        labels: Tensor of class labels; flattened before use.
        percentage: Share of each class to keep, on a 0-100 scale.
        seed: ``random_state`` passed to ``KMeans``.

    Returns:
        Sorted list of selected sample indices.
    """
    # detach() so embeddings that still track gradients can be converted.
    X = spectral_emb.detach().cpu().numpy()
    labs = labels.cpu().numpy().flatten()
    inds = []
    for cls in np.unique(labs):
        idxs = np.where(labs == cls)[0]
        sub = X[idxs]
        k = max(1, int(len(idxs) * (percentage / 100)))
        if k == 1:
            sim = cosine_similarity(sub)
            inds.append(idxs[np.argmax(sim.mean(axis=1))])
            continue

        # n_init is set explicitly (same value as kmeans_indices) so the
        # result does not depend on the installed scikit-learn default.
        km = KMeans(n_clusters=k, random_state=seed, n_init=10).fit(sub)
        for c in range(k):
            mem = np.where(km.labels_ == c)[0]
            if mem.size == 0:
                # KMeans can leave a cluster without members when the class
                # has fewer distinct points than k; cosine_similarity rejects
                # an empty array, so skip such clusters.
                continue
            simc = cosine_similarity(sub[mem])
            inds.append(idxs[mem[np.argmax(simc.mean(axis=1))]])
    return sorted(inds)


def _apply_optimizer_step(optimizer, scaler) -> None:
    """Apply the accumulated gradients (through ``scaler`` if given) and clear them."""
    if scaler is not None:
        scaler.step(optimizer)
        scaler.update()
    else:
        optimizer.step()
    optimizer.zero_grad(set_to_none=True)


def train_model_subset(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    device: torch.device,
    optimizer,
    scheduler,
    scaler,
    num_epochs: int = 20,
    accumulation_steps: int = 4,
    patience: int = 5,
    class_weights: torch.Tensor = None
) -> dict:
    """Train ``model`` with gradient accumulation and early stopping.

    Each epoch runs one pass over ``train_loader`` (stepping the optimizer
    every ``accumulation_steps`` batches, plus once for a trailing partial
    group) followed by one pass over ``val_loader``. The scheduler is stepped
    with the epoch's validation loss. Training stops early after ``patience``
    consecutive epochs without a new best validation loss. The model is
    updated in place; no checkpoint is written.

    Args:
        model: Network to train; moved to ``device``.
        train_loader: Loader yielding ``(images, labels)`` training batches.
        val_loader: Loader yielding ``(images, labels)`` validation batches.
        device: Device the model and batches are moved to.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Scheduler whose ``step`` accepts the validation loss.
        scaler: Gradient scaler for mixed precision, or ``None`` to train in
            full precision (autocast is disabled in that case).
        num_epochs: Maximum number of epochs.
        accumulation_steps: Number of batches whose gradients are accumulated
            before an optimizer step.
        patience: Early-stopping patience, in epochs.
        class_weights: Optional per-class weights for the cross-entropy loss.

    Returns:
        Dict with per-epoch lists under ``train_loss``, ``train_accuracy``,
        ``train_f1``, ``val_loss``, ``val_accuracy`` and ``val_f1``.

    Raises:
        ValueError: If ``train_loader`` or ``val_loader`` has no batches.
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        # The per-batch averages below divide by the loader lengths.
        raise ValueError("train_loader and val_loader must each yield at least one batch")

    metrics_log = {
        "train_loss": [],
        "train_accuracy": [],
        "train_f1": [],
        "val_loss": [],
        "val_accuracy": [],
        "val_f1": []
    }
    model.to(device)
    best_val_loss = float('inf')
    patience_counter = 0
    use_amp = scaler is not None
    criterion = nn.CrossEntropyLoss(weight=class_weights.to(device)) if class_weights is not None else nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        all_labels, all_preds = [], []
        pending_grads = False
        optimizer.zero_grad(set_to_none=True)

        for step, (images, labels) in enumerate(tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs}")):
            images, labels = images.to(device), labels.to(device)
            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(images)
                # Scale down so the accumulated gradient averages the group.
                loss = criterion(outputs, labels) / accumulation_steps
            if use_amp:
                scaler.scale(loss).backward()
            else:
                loss.backward()
            pending_grads = True

            if (step + 1) % accumulation_steps == 0:
                _apply_optimizer_step(optimizer, scaler)
                pending_grads = False

            # Undo the accumulation scaling so the log holds the raw batch loss.
            running_loss += loss.item() * accumulation_steps
            _, predicted = outputs.max(1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predicted.cpu().numpy())

        # Flush gradients of a trailing, incomplete accumulation group.
        if pending_grads:
            _apply_optimizer_step(optimizer, scaler)

        torch.cuda.empty_cache()

        train_accuracy = accuracy_score(all_labels, all_preds)
        train_f1 = f1_score(all_labels, all_preds, average='macro')
        train_loss = running_loss / len(train_loader)

        model.eval()
        val_loss = 0.0
        val_labels, val_preds = [], []
        with torch.no_grad():
            for images, labels in tqdm(val_loader, desc="Validating"):
                images, labels = images.to(device), labels.to(device)
                with torch.cuda.amp.autocast(enabled=use_amp):
                    outputs = model(images)
                    loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = outputs.max(1)
                val_labels.extend(labels.cpu().numpy())
                val_preds.extend(predicted.cpu().numpy())
        val_loss /= len(val_loader)
        val_accuracy = accuracy_score(val_labels, val_preds)
        val_f1 = f1_score(val_labels, val_preds, average='macro')

        metrics_log["train_loss"].append(train_loss)
        metrics_log["train_accuracy"].append(train_accuracy)
        metrics_log["train_f1"].append(train_f1)
        metrics_log["val_loss"].append(val_loss)
        metrics_log["val_accuracy"].append(val_accuracy)
        metrics_log["val_f1"].append(val_f1)

        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            # Nothing is checkpointed here, so the message no longer says "saved".
            print("Validation loss improved.")
        else:
            patience_counter += 1
            print(f"No improvement in validation loss for {patience_counter} epoch(s).")

        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

    # With num_epochs <= 0 no metrics exist; skip the summary instead of failing.
    if metrics_log["train_loss"]:
        print(f"Final Train Loss:     {metrics_log['train_loss'][-1]:.4f}")
        print(f"Final Train Accuracy: {metrics_log['train_accuracy'][-1]:.4f}")
        print(f"Final Train F1:       {metrics_log['train_f1'][-1]:.4f}")
        print(f"Final Val Loss:       {metrics_log['val_loss'][-1]:.4f}")
        print(f"Final Val Accuracy:   {metrics_log['val_accuracy'][-1]:.4f}")
        print(f"Final Val F1:         {metrics_log['val_f1'][-1]:.4f}")

    return metrics_log

In [None]:
if __name__ == '__main__':
    # ---- Experiment configuration ----
    DATA_PATH = '/home/dime/Desktop/Thesis/dermamnist_224'
    percentages = [0.1, 1, 10]
    seeds = [42, 43, 45]
    batch_size = 16
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # ---- Inverse-frequency class weights ----
    # Indexing by range(num_classes) requires the labels to be exactly
    # 0..num_classes-1; a gap would divide by a zero count.
    train_labels = np.load(os.path.join(DATA_PATH, 'train_labels.npy')).flatten()
    class_counts = Counter(train_labels)
    total = sum(class_counts.values())
    num_classes = len(class_counts)
    weights = [total / class_counts[i] for i in range(num_classes)]
    class_weights = torch.tensor(weights, dtype=torch.float32)
    labels = torch.from_numpy(train_labels).long()

    # ---- Datasets: load the image arrays once and reuse them for every run ----
    train_ds = NumpyDataset(
        os.path.join(DATA_PATH, 'train_images.npy'),
        os.path.join(DATA_PATH, 'train_labels.npy')
    )
    val_ds = NumpyDataset(
        os.path.join(DATA_PATH, 'val_images.npy'),
        os.path.join(DATA_PATH, 'val_labels.npy')
    )
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    summary_records = []

    for emb in ['vit_derma', 'vgg_derma']:
        print(f"\n===== Experiments for {emb} embeddings =====")
        features = torch.load(f"{emb}_train_features.pth")
        propagated = torch.load(f"propagated_{emb}_train_features.pth")
        spectral = torch.load(f"spectral_{emb}_train_embeddings.pth")
        # Reference point cloud for the Hausdorff distance of every subset.
        full_emb = features.cpu().numpy()

        for method, fn in [
            ('kmeans', lambda p, s: kmeans_indices(features, labels, [p], s)[p]),
            ('propagation', lambda p, s: propagation_indices(propagated, labels, p)),
            ('spectral', lambda p, s: spectral_indices(spectral, labels, p, s)),
        ]:
            for p in percentages:
                print(f"\n======================= SUMMARY for {method.upper()} {p}% =======================")
                cpu_start = time.process_time()
                wall_start = time.time()
                accs, f1s, hds = [], [], []

                for s in seeds:
                    set_seed(s)
                    subs = fn(p, s)
                    train_loader = DataLoader(Subset(train_ds, subs), batch_size=batch_size, shuffle=True)

                    # The head size follows the label set so it always matches class_weights.
                    model = timm.create_model('vgg16', pretrained=True, num_classes=num_classes).to(device)
                    optimizer = optim.AdamW(model.parameters(), lr=1e-4)
                    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)
                    scaler = torch.cuda.amp.GradScaler() if torch.cuda.is_available() else None

                    metrics = train_model_subset(
                        model, train_loader, val_loader,
                        device, optimizer, scheduler, scaler,
                        num_epochs=20, accumulation_steps=4,
                        patience=5, class_weights=class_weights
                    )

                    # Record the metrics of the last epoch that ran.
                    accs.append(metrics['val_accuracy'][-1])
                    f1s.append(metrics['val_f1'][-1])
                    hds.append(hausdorff(full_emb, full_emb[subs]))

                cpu_time = time.process_time() - cpu_start
                wall_time = time.time() - wall_start

                mean_acc, std_acc = float(np.mean(accs)), float(np.std(accs))
                mean_f1, std_f1 = float(np.mean(f1s)), float(np.std(f1s))
                mean_hd = float(np.mean(hds))

                print(f"Validation Accuracy: mean={mean_acc:.4f}, std={std_acc:.4f}")
                print(f"Validation F1:       mean={mean_f1:.4f}, std={std_f1:.4f}")
                print(f"Mean Hausdorff:      {mean_hd:.4f}")
                print(f"CPU times:           {cpu_time:.2f}s")
                print(f"Wall time:           {wall_time:.2f}s")

                summary_records.append({
                    'embedding': emb,
                    'method': method,
                    'percentage': p,
                    'mean_accuracy': mean_acc,
                    'std_accuracy': std_acc,
                    'mean_f1': mean_f1,
                    'std_f1': std_f1,
                    'mean_hausdorff': mean_hd,
                    'cpu_time_s': cpu_time,
                    'wall_time_s': wall_time
                })

    # ---- Persist the summary as CSV and JSON next to the data ----
    df = pd.DataFrame(summary_records)
    csv_path = os.path.join(DATA_PATH, 'coreset_summary2.csv')
    json_path = os.path.join(DATA_PATH, 'coreset_summary2.json')
    df.to_csv(csv_path, index=False)
    with open(json_path, 'w') as f:
        json.dump(summary_records, f, indent=2)
    print(f"Saved summary to {csv_path} and {json_path}")