# Helper functions

In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet18
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# =======================
# Dataset Loading
# =======================
def load_cifar10(batch_size=128):
    """Build the CIFAR-10 train/test datasets and a data loader for each.

    Every image is converted to a tensor and normalized with mean 0.5 and
    std 0.5. Both datasets use ``./data`` as their root directory and are
    created with ``download=True``.

    Args:
        batch_size: Number of samples per batch for both loaders.

    Returns:
        A ``(trainloader, testloader, trainset, testset)`` tuple. The training
        loader shuffles its samples; the test loader keeps dataset order.
    """
    preprocessing = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ])

    def _make_split(is_train):
        # Both splits share the same root directory and preprocessing.
        return torchvision.datasets.CIFAR10(
            root='./data', train=is_train, download=True, transform=preprocessing
        )

    trainset = _make_split(True)
    testset = _make_split(False)

    loader_cls = torch.utils.data.DataLoader
    trainloader = loader_cls(trainset, batch_size=batch_size, shuffle=True)
    testloader = loader_cls(testset, batch_size=batch_size, shuffle=False)

    return trainloader, testloader, trainset, testset

# =======================
# Feature Extraction
# =======================
def extract_features(model, dataloader, device='cuda'):
    """Run ``model`` over every batch of ``dataloader`` and gather its outputs.

    The model is switched to evaluation mode and gradients are disabled for
    the whole pass. Inputs are moved to ``device`` before the forward pass and
    the outputs are moved back to the CPU; the model itself is not moved here.

    Args:
        model: Module called on each batch of inputs.
        dataloader: Iterable yielding ``(inputs, targets)`` pairs.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        ``(all_features, all_labels)``: the per-batch outputs and the per-batch
        targets, each concatenated along the first dimension in iteration order.
    """
    model.eval()
    collected = []
    with torch.no_grad():
        for batch_inputs, batch_targets in dataloader:
            batch_outputs = model(batch_inputs.to(device))
            collected.append((batch_outputs.cpu(), batch_targets))

    feature_chunks = [chunk for chunk, _ in collected]
    label_chunks = [chunk for _, chunk in collected]
    return torch.cat(feature_chunks), torch.cat(label_chunks)

# =======================
# KMeans Coreset Construction
# =======================
def kmeans_coreset(features, labels, k):
    """Pick one representative sample per k-means cluster.

    The features are clustered into ``k`` groups with scikit-learn's
    ``KMeans`` (``random_state=0``). For every non-empty cluster the sample
    with the smallest Euclidean distance to the cluster centroid is selected.

    Args:
        features: 2-D tensor with one row per sample. ``.numpy()`` is called on
            it, so it must be a CPU tensor that does not require grad.
        labels: Accepted for signature symmetry with the other coreset
            builders; not used by the selection.
        k: Number of clusters.

    Returns:
        A list of Python ints indexing rows of ``features``, at most one per
        cluster, in cluster-id order.
    """
    kmeans = KMeans(n_clusters=k, random_state=0)
    cluster_ids = kmeans.fit_predict(features.numpy())
    centroids = torch.tensor(kmeans.cluster_centers_)
    # Convert the assignment array once; rebuilding it inside the loop cost an
    # extra full copy of the assignments for every cluster.
    cluster_ids_t = torch.tensor(cluster_ids)

    coreset_indices = []
    for i in range(k):
        # Row indices of the samples assigned to cluster i (ascending order).
        member_indices = torch.where(cluster_ids_t == i)[0]
        if member_indices.numel() == 0:
            continue
        distances = torch.norm(features[member_indices] - centroids[i], dim=1)
        closest = torch.argmin(distances)
        coreset_indices.append(member_indices[closest].item())

    return coreset_indices

# =======================
# PCA + KMeans Coreset Construction
# =======================
def pca_kmeans_coreset(features, labels, k, pca_dim=100, random_state=0):
    """Project the features with PCA, then build a k-means coreset on them.

    Args:
        features: 2-D tensor with one row per sample. ``.numpy()`` is called on
            it, so it must be a CPU tensor that does not require grad.
        labels: Forwarded unchanged to ``kmeans_coreset``.
        k: Number of clusters passed to ``kmeans_coreset``.
        pca_dim: Number of principal components to keep.
        random_state: Seed handed to ``PCA`` so that any randomized SVD solver
            it chooses gives repeatable projections. The default matches the
            ``random_state=0`` used for k-means in this notebook.

    Returns:
        The list of selected row indices returned by ``kmeans_coreset``. The
        indices refer to rows of the original ``features`` because PCA keeps
        the row order.
    """
    pca = PCA(n_components=pca_dim, random_state=random_state)
    features_pca = torch.tensor(pca.fit_transform(features.numpy()))

    return kmeans_coreset(features_pca, labels, k)

# =======================
# Training Function
# =======================
def train_model(model, trainloader, optimizer, criterion, device='cuda', epochs=10):
    """Train ``model`` for a fixed number of epochs, printing the mean loss.

    The model is put in training mode once before the first epoch. After each
    epoch the summed batch losses divided by ``len(trainloader)`` are printed.
    The model is not moved to ``device`` here; only the batches are.

    Args:
        model: Module to optimise.
        trainloader: Sized iterable yielding ``(inputs, labels)`` batches.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device each batch is moved to.
        epochs: Number of passes over ``trainloader``.
    """
    def _optimise_batch(batch_inputs, batch_labels):
        # One gradient step on a single batch; returns the loss as a float.
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_inputs), batch_labels)
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item()

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for raw_inputs, raw_labels in trainloader:
            running_loss += _optimise_batch(raw_inputs.to(device), raw_labels.to(device))
        print(f'Epoch {epoch+1}, Loss: {running_loss/len(trainloader):.4f}')

# =======================
# Evaluation Function
# =======================
def evaluate_model(model, testloader, device='cuda'):
    """Measure top-1 accuracy of ``model`` on ``testloader``.

    The model is switched to evaluation mode and gradients are disabled. The
    prediction for a sample is the index of its largest output along
    dimension 1. The accuracy is printed and also returned, so that separate
    runs can be compared programmatically rather than only by reading logs.

    Args:
        model: Module producing one score per class for each input.
        testloader: Iterable yielding ``(inputs, labels)`` batches.
        device: Device each batch is moved to; the model is not moved here.

    Returns:
        The accuracy in percent (0-100) as a float.

    Raises:
        ZeroDivisionError: If ``testloader`` yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs, labels = inputs.to(device), labels.to(device)
            predicted = model(inputs).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Accuracy on test set: {accuracy:.2f}%')
    return accuracy


In [None]:
from dataclasses import dataclass

@dataclass
class Config:
    """Hyperparameters read by the ``run_*`` experiment functions in this notebook."""
    batch_size: int = 128  # batch size for every DataLoader
    learning_rate: float = 0.01  # SGD learning rate
    momentum: float = 0.9  # SGD momentum
    weight_decay: float = 5e-4  # SGD weight decay
    num_epochs: int = 50  # number of training epochs
    kmeans_k: int = 500  # number of k-means clusters used for coreset selection
    pca_dim: int = 100  # number of PCA components for the PCA + k-means variants
    random_seed: int = 42  # NOTE(review): not read by any code visible in this notebook
    # Evaluated once, when the class body executes, not per instance.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    n_classes: int = 10  # passed to resnet18 as num_classes

# Default settings, used by the baseline run.
config = Config()


# Increase the Coreset Size for kmeans


In [None]:

@dataclass
class Config_kmeans:
    """Settings for the coreset experiments.

    Field-for-field the same as ``Config`` except ``kmeans_k``, which is 2000
    here instead of 500.
    """
    batch_size: int = 128  # batch size for every DataLoader
    learning_rate: float = 0.01  # SGD learning rate
    momentum: float = 0.9  # SGD momentum
    weight_decay: float = 5e-4  # SGD weight decay
    num_epochs: int = 50  # number of training epochs
    kmeans_k: int = 2000  # number of k-means clusters (larger than Config's 500)
    pca_dim: int = 100  # number of PCA components for the PCA + k-means variants
    random_seed: int = 42  # NOTE(review): not read by any code visible in this notebook
    # Evaluated once, when the class body executes, not per instance.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    n_classes: int = 10  # passed to resnet18 as num_classes

# Settings shared by all coreset runs below.
config_kmeans = Config_kmeans()


# Instead of picking only the closest point to the centroid (hard selection), sample multiple points near the centroid or with importance weighting

In [None]:
def soft_kmeans_coreset(features, labels, k, points_per_cluster=5):
    """Pick the samples closest to each k-means centroid.

    The features are clustered into ``k`` groups with scikit-learn's
    ``KMeans`` (``random_state=0``). From every non-empty cluster, the
    ``points_per_cluster`` samples with the smallest Euclidean distance to the
    centroid are kept. A cluster with fewer members contributes all of them.

    Args:
        features: 2-D tensor with one row per sample. ``.numpy()`` is called on
            it, so it must be a CPU tensor that does not require grad.
        labels: Accepted for signature symmetry with the other coreset
            builders; not used by the selection.
        k: Number of clusters.
        points_per_cluster: Maximum number of samples taken from each cluster.

    Returns:
        A list of Python ints indexing rows of ``features``, grouped by
        cluster id and holding at most ``k * points_per_cluster`` entries.
    """
    kmeans = KMeans(n_clusters=k, random_state=0)
    cluster_ids = kmeans.fit_predict(features.numpy())
    centroids = torch.tensor(kmeans.cluster_centers_)
    # Convert the assignment array once; rebuilding it inside the loop cost an
    # extra full copy of the assignments for every cluster.
    cluster_ids_t = torch.tensor(cluster_ids)

    coreset_indices = []
    for i in range(k):
        # Row indices of the samples assigned to cluster i (ascending order).
        member_indices = torch.where(cluster_ids_t == i)[0]
        n_members = member_indices.numel()
        if n_members == 0:
            continue
        distances = torch.norm(features[member_indices] - centroids[i], dim=1)

        # Top-k of the negated distances gives the nearest members first.
        nearest = torch.topk(-distances, k=min(points_per_cluster, n_members)).indices
        coreset_indices.extend(member_indices[nearest].tolist())

    return coreset_indices


def pca_soft_kmeans_coreset(features, labels, k, pca_dim=100, points_per_cluster=5, random_state=0):
    """Project the features with PCA, then build a soft k-means coreset.

    Args:
        features: 2-D tensor with one row per sample. ``.numpy()`` is called on
            it, so it must be a CPU tensor that does not require grad.
        labels: Forwarded unchanged to ``soft_kmeans_coreset``.
        k: Number of clusters passed to ``soft_kmeans_coreset``.
        pca_dim: Number of principal components to keep.
        points_per_cluster: Maximum number of samples taken from each cluster.
            The default equals the default of ``soft_kmeans_coreset``.
        random_state: Seed handed to ``PCA`` so that any randomized SVD solver
            it chooses gives repeatable projections. The default matches the
            ``random_state=0`` used for k-means in this notebook.

    Returns:
        The list of selected row indices returned by ``soft_kmeans_coreset``.
        The indices refer to rows of the original ``features`` because PCA
        keeps the row order.
    """
    pca = PCA(n_components=pca_dim, random_state=random_state)
    features_pca = torch.tensor(pca.fit_transform(features.numpy()))

    return soft_kmeans_coreset(features_pca, labels, k, points_per_cluster)


In [None]:
def run_baseline_training(config):
    """Train ResNet-18 on the full CIFAR-10 training set and print test accuracy.

    Args:
        config: Object providing ``random_seed``, ``batch_size``,
            ``n_classes``, ``device``, ``learning_rate``, ``momentum``,
            ``weight_decay`` and ``num_epochs``.
    """
    # Apply the configured seed so that weight initialisation and batch
    # shuffling start from the same RNG state on every run.
    torch.manual_seed(config.random_seed)

    trainloader, testloader, _, _ = load_cifar10(config.batch_size)
    model = resnet18(num_classes=config.n_classes).to(config.device)
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.learning_rate,
        momentum=config.momentum,
        weight_decay=config.weight_decay,
    )
    criterion = nn.CrossEntropyLoss()

    train_model(model, trainloader, optimizer, criterion, config.device, config.num_epochs)
    evaluate_model(model, testloader, config.device)

def run_kmeans_coreset_training(config):
    """Train ResNet-18 on a k-means coreset of CIFAR-10 and print test accuracy.

    Features come from a freshly constructed ResNet-18 whose final ``fc``
    layer is replaced by an identity. ``kmeans_coreset`` then picks the
    training subset, and a second, new ResNet-18 is trained on it.

    Args:
        config: Object providing ``random_seed``, ``batch_size``,
            ``n_classes``, ``device``, ``kmeans_k``, ``learning_rate``,
            ``momentum``, ``weight_decay`` and ``num_epochs``.
    """
    # Apply the configured seed so that both networks' initial weights and the
    # batch shuffling start from the same RNG state on every run.
    torch.manual_seed(config.random_seed)

    # The shuffled full-data loader is not needed; features are extracted with
    # an unshuffled loader so row i of `features` is sample i of `trainset`.
    _, testloader, trainset, _ = load_cifar10(config.batch_size)
    model_feat = resnet18(num_classes=config.n_classes)
    model_feat.fc = nn.Identity()
    model_feat = model_feat.to(config.device)

    feature_loader = torch.utils.data.DataLoader(trainset, batch_size=config.batch_size, shuffle=False)
    features, labels = extract_features(model_feat, feature_loader, config.device)

    coreset_indices = kmeans_coreset(features, labels, config.kmeans_k)

    selected_data = torch.utils.data.Subset(trainset, coreset_indices)
    selected_loader = torch.utils.data.DataLoader(selected_data, batch_size=config.batch_size, shuffle=True)

    model = resnet18(num_classes=config.n_classes).to(config.device)
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.learning_rate,
        momentum=config.momentum,
        weight_decay=config.weight_decay,
    )
    criterion = nn.CrossEntropyLoss()

    train_model(model, selected_loader, optimizer, criterion, config.device, config.num_epochs)
    evaluate_model(model, testloader, config.device)

def run_pca_kmeans_coreset_training(config):
    """Train ResNet-18 on a PCA + k-means coreset of CIFAR-10 and print test accuracy.

    Features come from a freshly constructed ResNet-18 whose final ``fc``
    layer is replaced by an identity. ``pca_kmeans_coreset`` then picks the
    training subset, and a second, new ResNet-18 is trained on it.

    Args:
        config: Object providing ``random_seed``, ``batch_size``,
            ``n_classes``, ``device``, ``kmeans_k``, ``pca_dim``,
            ``learning_rate``, ``momentum``, ``weight_decay`` and
            ``num_epochs``.
    """
    # Apply the configured seed so that both networks' initial weights and the
    # batch shuffling start from the same RNG state on every run.
    torch.manual_seed(config.random_seed)

    # The shuffled full-data loader is not needed; features are extracted with
    # an unshuffled loader so row i of `features` is sample i of `trainset`.
    _, testloader, trainset, _ = load_cifar10(config.batch_size)
    model_feat = resnet18(num_classes=config.n_classes)
    model_feat.fc = nn.Identity()
    model_feat = model_feat.to(config.device)

    feature_loader = torch.utils.data.DataLoader(trainset, batch_size=config.batch_size, shuffle=False)
    features, labels = extract_features(model_feat, feature_loader, config.device)

    coreset_indices = pca_kmeans_coreset(features, labels, config.kmeans_k, config.pca_dim)

    selected_data = torch.utils.data.Subset(trainset, coreset_indices)
    selected_loader = torch.utils.data.DataLoader(selected_data, batch_size=config.batch_size, shuffle=True)

    model = resnet18(num_classes=config.n_classes).to(config.device)
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.learning_rate,
        momentum=config.momentum,
        weight_decay=config.weight_decay,
    )
    criterion = nn.CrossEntropyLoss()

    train_model(model, selected_loader, optimizer, criterion, config.device, config.num_epochs)
    evaluate_model(model, testloader, config.device)


In [None]:
# Baseline: train on the full CIFAR-10 training set with the default Config.
run_baseline_training(config)

Epoch 1, Loss: 1.5342
Epoch 2, Loss: 1.1040
Epoch 3, Loss: 0.8872
Epoch 4, Loss: 0.7298
Epoch 5, Loss: 0.6068
Epoch 6, Loss: 0.5072
Epoch 7, Loss: 0.4219
Epoch 8, Loss: 0.3499
Epoch 9, Loss: 0.2888
Epoch 10, Loss: 0.2372
Epoch 11, Loss: 0.2031
Epoch 12, Loss: 0.1793
Epoch 13, Loss: 0.1547
Epoch 14, Loss: 0.1342
Epoch 15, Loss: 0.1358
Epoch 16, Loss: 0.1046
Epoch 17, Loss: 0.1067
Epoch 18, Loss: 0.0928
Epoch 19, Loss: 0.0892
Epoch 20, Loss: 0.0848
Epoch 21, Loss: 0.0821
Epoch 22, Loss: 0.0700
Epoch 23, Loss: 0.0606
Epoch 24, Loss: 0.0808
Epoch 25, Loss: 0.0633
Epoch 26, Loss: 0.0618
Epoch 27, Loss: 0.0747
Epoch 28, Loss: 0.0655
Epoch 29, Loss: 0.0580
Epoch 30, Loss: 0.0656
Epoch 31, Loss: 0.0575
Epoch 32, Loss: 0.0535
Epoch 33, Loss: 0.0605
Epoch 34, Loss: 0.0509
Epoch 35, Loss: 0.0613
Epoch 36, Loss: 0.0550
Epoch 37, Loss: 0.0507
Epoch 38, Loss: 0.0537
Epoch 39, Loss: 0.0527
Epoch 40, Loss: 0.0575
Epoch 41, Loss: 0.0577
Epoch 42, Loss: 0.0526
Epoch 43, Loss: 0.0508
Epoch 44, Loss: 0.04

In [None]:
# Hard k-means coreset: one sample per cluster, kmeans_k = 2000 clusters.
run_kmeans_coreset_training(config_kmeans)

100%|██████████| 170M/170M [00:33<00:00, 5.11MB/s]


Epoch 1, Loss: 2.1909
Epoch 2, Loss: 1.6674
Epoch 3, Loss: 1.2655
Epoch 4, Loss: 0.8775
Epoch 5, Loss: 0.5848
Epoch 6, Loss: 0.4309
Epoch 7, Loss: 0.3058
Epoch 8, Loss: 0.2435
Epoch 9, Loss: 0.2173
Epoch 10, Loss: 0.1759
Epoch 11, Loss: 0.1237
Epoch 12, Loss: 0.1202
Epoch 13, Loss: 0.1026
Epoch 14, Loss: 0.0947
Epoch 15, Loss: 0.0624
Epoch 16, Loss: 0.0426
Epoch 17, Loss: 0.0253
Epoch 18, Loss: 0.0355
Epoch 19, Loss: 0.0298
Epoch 20, Loss: 0.0270
Epoch 21, Loss: 0.0156
Epoch 22, Loss: 0.0113
Epoch 23, Loss: 0.0045
Epoch 24, Loss: 0.0033
Epoch 25, Loss: 0.0054
Epoch 26, Loss: 0.0063
Epoch 27, Loss: 0.0025
Epoch 28, Loss: 0.0017
Epoch 29, Loss: 0.0007
Epoch 30, Loss: 0.0011
Epoch 31, Loss: 0.0006
Epoch 32, Loss: 0.0004
Epoch 33, Loss: 0.0004
Epoch 34, Loss: 0.0004
Epoch 35, Loss: 0.0004
Epoch 36, Loss: 0.0004
Epoch 37, Loss: 0.0003
Epoch 38, Loss: 0.0003
Epoch 39, Loss: 0.0003
Epoch 40, Loss: 0.0003
Epoch 41, Loss: 0.0003
Epoch 42, Loss: 0.0002
Epoch 43, Loss: 0.0003
Epoch 44, Loss: 0.00

In [None]:
# Hard k-means coreset computed on PCA-reduced features (pca_dim = 100).
run_pca_kmeans_coreset_training(config_kmeans)

Epoch 1, Loss: 2.2247
Epoch 2, Loss: 1.7510
Epoch 3, Loss: 1.2819
Epoch 4, Loss: 0.8985
Epoch 5, Loss: 0.5507
Epoch 6, Loss: 0.3583
Epoch 7, Loss: 0.3079
Epoch 8, Loss: 0.2779
Epoch 9, Loss: 0.2515
Epoch 10, Loss: 0.1854
Epoch 11, Loss: 0.1455
Epoch 12, Loss: 0.1510
Epoch 13, Loss: 0.0985
Epoch 14, Loss: 0.0893
Epoch 15, Loss: 0.0589
Epoch 16, Loss: 0.0444
Epoch 17, Loss: 0.0519
Epoch 18, Loss: 0.0490
Epoch 19, Loss: 0.0513
Epoch 20, Loss: 0.0403
Epoch 21, Loss: 0.0385
Epoch 22, Loss: 0.0401
Epoch 23, Loss: 0.0251
Epoch 24, Loss: 0.0445
Epoch 25, Loss: 0.0315
Epoch 26, Loss: 0.0341
Epoch 27, Loss: 0.0218
Epoch 28, Loss: 0.0202
Epoch 29, Loss: 0.0142
Epoch 30, Loss: 0.0175
Epoch 31, Loss: 0.0108
Epoch 32, Loss: 0.0053
Epoch 33, Loss: 0.0026
Epoch 34, Loss: 0.0055
Epoch 35, Loss: 0.0035
Epoch 36, Loss: 0.0017
Epoch 37, Loss: 0.0010
Epoch 38, Loss: 0.0006
Epoch 39, Loss: 0.0004
Epoch 40, Loss: 0.0003
Epoch 41, Loss: 0.0003
Epoch 42, Loss: 0.0003
Epoch 43, Loss: 0.0003
Epoch 44, Loss: 0.00

In [None]:
def run_soft_kmeans_coreset_training(config):
    """Train ResNet-18 on a soft k-means coreset of CIFAR-10 and print test accuracy.

    Features come from a freshly constructed ResNet-18 whose final ``fc``
    layer is replaced by an identity. ``soft_kmeans_coreset`` then picks
    several samples per cluster (its default ``points_per_cluster``), and a
    second, new ResNet-18 is trained on that subset.

    Args:
        config: Object providing ``random_seed``, ``batch_size``,
            ``n_classes``, ``device``, ``kmeans_k``, ``learning_rate``,
            ``momentum``, ``weight_decay`` and ``num_epochs``.
    """
    # Apply the configured seed so that both networks' initial weights and the
    # batch shuffling start from the same RNG state on every run.
    torch.manual_seed(config.random_seed)

    # The shuffled full-data loader is not needed; features are extracted with
    # an unshuffled loader so row i of `features` is sample i of `trainset`.
    _, testloader, trainset, _ = load_cifar10(config.batch_size)
    model_feat = resnet18(num_classes=config.n_classes)
    model_feat.fc = nn.Identity()
    model_feat = model_feat.to(config.device)

    feature_loader = torch.utils.data.DataLoader(trainset, batch_size=config.batch_size, shuffle=False)
    features, labels = extract_features(model_feat, feature_loader, config.device)

    coreset_indices = soft_kmeans_coreset(features, labels, config.kmeans_k)

    selected_data = torch.utils.data.Subset(trainset, coreset_indices)
    selected_loader = torch.utils.data.DataLoader(selected_data, batch_size=config.batch_size, shuffle=True)

    model = resnet18(num_classes=config.n_classes).to(config.device)
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.learning_rate,
        momentum=config.momentum,
        weight_decay=config.weight_decay,
    )
    criterion = nn.CrossEntropyLoss()

    train_model(model, selected_loader, optimizer, criterion, config.device, config.num_epochs)
    evaluate_model(model, testloader, config.device)

def run_pca_soft_kmeans_coreset_training(config):
    """Train ResNet-18 on a PCA + soft k-means coreset and print test accuracy.

    Features come from a freshly constructed ResNet-18 whose final ``fc``
    layer is replaced by an identity. ``pca_soft_kmeans_coreset`` then picks
    the training subset, and a second, new ResNet-18 is trained on it.

    Args:
        config: Object providing ``random_seed``, ``batch_size``,
            ``n_classes``, ``device``, ``kmeans_k``, ``pca_dim``,
            ``learning_rate``, ``momentum``, ``weight_decay`` and
            ``num_epochs``.
    """
    # Apply the configured seed so that both networks' initial weights and the
    # batch shuffling start from the same RNG state on every run.
    torch.manual_seed(config.random_seed)

    # The shuffled full-data loader is not needed; features are extracted with
    # an unshuffled loader so row i of `features` is sample i of `trainset`.
    _, testloader, trainset, _ = load_cifar10(config.batch_size)
    model_feat = resnet18(num_classes=config.n_classes)
    model_feat.fc = nn.Identity()
    model_feat = model_feat.to(config.device)

    feature_loader = torch.utils.data.DataLoader(trainset, batch_size=config.batch_size, shuffle=False)
    features, labels = extract_features(model_feat, feature_loader, config.device)

    coreset_indices = pca_soft_kmeans_coreset(features, labels, config.kmeans_k, config.pca_dim)

    selected_data = torch.utils.data.Subset(trainset, coreset_indices)
    selected_loader = torch.utils.data.DataLoader(selected_data, batch_size=config.batch_size, shuffle=True)

    model = resnet18(num_classes=config.n_classes).to(config.device)
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.learning_rate,
        momentum=config.momentum,
        weight_decay=config.weight_decay,
    )
    criterion = nn.CrossEntropyLoss()

    train_model(model, selected_loader, optimizer, criterion, config.device, config.num_epochs)
    evaluate_model(model, testloader, config.device)


In [None]:
# Soft k-means coreset: several nearest samples per cluster, kmeans_k = 2000 clusters.
run_soft_kmeans_coreset_training(config_kmeans)

Epoch 1, Loss: 1.8902
Epoch 2, Loss: 1.4678
Epoch 3, Loss: 1.2376
Epoch 4, Loss: 1.0480
Epoch 5, Loss: 0.8838
Epoch 6, Loss: 0.7357
Epoch 7, Loss: 0.5620
Epoch 8, Loss: 0.4868
Epoch 9, Loss: 0.4010
Epoch 10, Loss: 0.3099
Epoch 11, Loss: 0.2272
Epoch 12, Loss: 0.1637
Epoch 13, Loss: 0.2395
Epoch 14, Loss: 0.2504
Epoch 15, Loss: 0.1757
Epoch 16, Loss: 0.0511
Epoch 17, Loss: 0.0461
Epoch 18, Loss: 0.1111
Epoch 19, Loss: 0.1114
Epoch 20, Loss: 0.0761
Epoch 21, Loss: 0.0563
Epoch 22, Loss: 0.0819
Epoch 23, Loss: 0.0383
Epoch 24, Loss: 0.0419
Epoch 25, Loss: 0.0598
Epoch 26, Loss: 0.0332
Epoch 27, Loss: 0.0231
Epoch 28, Loss: 0.0505
Epoch 29, Loss: 0.0598
Epoch 30, Loss: 0.1392
Epoch 31, Loss: 0.0572
Epoch 32, Loss: 0.0401
Epoch 33, Loss: 0.0208
Epoch 34, Loss: 0.0127
Epoch 35, Loss: 0.0095
Epoch 36, Loss: 0.0031
Epoch 37, Loss: 0.0017
Epoch 38, Loss: 0.0008
Epoch 39, Loss: 0.0007
Epoch 40, Loss: 0.0004
Epoch 41, Loss: 0.0004
Epoch 42, Loss: 0.0004
Epoch 43, Loss: 0.0004
Epoch 44, Loss: 0.00

In [None]:
# Soft k-means coreset computed on PCA-reduced features (pca_dim = 100).
run_pca_soft_kmeans_coreset_training(config_kmeans)

Epoch 1, Loss: 1.8792
Epoch 2, Loss: 1.4444
Epoch 3, Loss: 1.1691
Epoch 4, Loss: 0.9885
Epoch 5, Loss: 0.8104
Epoch 6, Loss: 0.6702
Epoch 7, Loss: 0.5104
Epoch 8, Loss: 0.4331
Epoch 9, Loss: 0.3727
Epoch 10, Loss: 0.2420
Epoch 11, Loss: 0.1902
Epoch 12, Loss: 0.1502
Epoch 13, Loss: 0.1554
Epoch 14, Loss: 0.1462
Epoch 15, Loss: 0.1095
Epoch 16, Loss: 0.0803
Epoch 17, Loss: 0.0841
Epoch 18, Loss: 0.0637
Epoch 19, Loss: 0.0434
Epoch 20, Loss: 0.0389
Epoch 21, Loss: 0.0356
Epoch 22, Loss: 0.0318
Epoch 23, Loss: 0.0359
Epoch 24, Loss: 0.0432
Epoch 25, Loss: 0.0369
Epoch 26, Loss: 0.0427
Epoch 27, Loss: 0.0338
Epoch 28, Loss: 0.0356
Epoch 29, Loss: 0.0359
Epoch 30, Loss: 0.0376
Epoch 31, Loss: 0.0577
Epoch 32, Loss: 0.0538
Epoch 33, Loss: 0.0442
Epoch 34, Loss: 0.0184
Epoch 35, Loss: 0.0119
Epoch 36, Loss: 0.0056
Epoch 37, Loss: 0.0044
Epoch 38, Loss: 0.0056
Epoch 39, Loss: 0.0024
Epoch 40, Loss: 0.0015
Epoch 41, Loss: 0.0008
Epoch 42, Loss: 0.0007
Epoch 43, Loss: 0.0007
Epoch 44, Loss: 0.00