In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet18
import time

# --- 1. Load Dataset with Transforms ---
print("Loading CIFAR-10 dataset...")

transform = transforms.Compose([
    transforms.ToTensor(),  # Convert images to tensors
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize to [-1, 1]
])

# Training set
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)

# Test set
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False)

print(f"Training samples: {len(trainset)}, Test samples: {len(testset)}\n")


# --- 2. Define Model ---
print("Setting up the ResNet18 model...")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}\n")

model = resnet18(num_classes=10)
model = model.to(device)

# --- 3. Loss and Optimizer ---
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


# --- 4. Train the Model ---
epochs = 10
print(f"Starting training for {epochs} epochs...\n")
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    start_time = time.time()

    for i, (inputs, labels) in enumerate(trainloader):
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Print every 100 mini-batches
        if (i + 1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Step [{i+1}/{len(trainloader)}], Loss: {loss.item():.4f}")

    end_time = time.time()
    avg_loss = running_loss / len(trainloader)
    print(f"Epoch [{epoch+1}] finished. Avg Loss: {avg_loss:.4f}. Time: {(end_time - start_time):.2f} sec\n")


# --- 5. Evaluate the Model ---
print("Evaluating model on test set...\n")

model.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in testloader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%\n")


Loading CIFAR-10 dataset...
Files already downloaded and verified
Files already downloaded and verified
Training samples: 50000, Test samples: 10000

Setting up the ResNet18 model...
Using device: cuda

Starting training for 10 epochs...

Epoch [1/10], Step [100/782], Loss: 1.5706
Epoch [1/10], Step [200/782], Loss: 1.4243
Epoch [1/10], Step [300/782], Loss: 1.5141
Epoch [1/10], Step [400/782], Loss: 1.2019
Epoch [1/10], Step [500/782], Loss: 1.1761
Epoch [1/10], Step [600/782], Loss: 1.2501
Epoch [1/10], Step [700/782], Loss: 1.1338
Epoch [1] finished. Avg Loss: 1.3867. Time: 11.79 sec

Epoch [2/10], Step [100/782], Loss: 0.9666
Epoch [2/10], Step [200/782], Loss: 1.1940
Epoch [2/10], Step [300/782], Loss: 0.9068
Epoch [2/10], Step [400/782], Loss: 0.7468
Epoch [2/10], Step [500/782], Loss: 1.2252
Epoch [2/10], Step [600/782], Loss: 0.8584
Epoch [2/10], Step [700/782], Loss: 1.1497
Epoch [2] finished. Avg Loss: 0.9922. Time: 10.59 sec

Epoch [3/10], Step [100/782], Loss: 1.1069
Epoch 

# Construct coreset with K-Means

Extract features:

use pretrained features (e.g., ResNet18 up to avgpool layer) to get more meaningful features


In [None]:
from sklearn.cluster import KMeans
import numpy as np
from torch.utils.data import Subset


In [None]:
# Load pretrained resnet18 for feature extraction
feature_extractor = resnet18(pretrained=True)
feature_extractor = nn.Sequential(*list(feature_extractor.children())[:-1])  # Remove FC layer
feature_extractor = feature_extractor.to(device)
feature_extractor.eval()

# Extract features from trainset
all_features = []
all_labels = []

with torch.no_grad():
    for images, labels in trainloader:
        images = images.to(device)
        features = feature_extractor(images)
        features = features.view(features.size(0), -1)  # Flatten output
        all_features.append(features.cpu())
        all_labels.append(labels)

all_features = torch.cat(all_features, dim=0)
all_labels = torch.cat(all_labels, dim=0)

print(f"Extracted features shape: {all_features.shape}")


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 158MB/s]


Extracted features shape: torch.Size([50000, 512])


Apply KMeans:

Use sklearn.cluster.KMeans to cluster features into k groups.

In [None]:
k = 5000
print(f"Running KMeans clustering with {k} clusters...")

kmeans = KMeans(n_clusters=k, random_state=0)
cluster_ids = kmeans.fit_predict(all_features.numpy())

print(f"KMeans clustering done\n")


Running KMeans clustering with 5000 clusters...
KMeans clustering done



Pick nearest points to centroids:

Select the closest feature to each centroid — that becomes your coreset

In [None]:
# closest point in each cluster
cluster_ids = torch.tensor(cluster_ids)

coreset_indices = []
centroids = torch.tensor(kmeans.cluster_centers_)
for i in range(k):
    cluster_points = all_features[cluster_ids == i]
    cluster_labels = all_labels[cluster_ids == i]

    if len(cluster_points) == 0:
        continue

    centroid = centroids[i]
    distances = torch.norm(cluster_points - centroid, dim=1)
    closest_idx = torch.argmin(distances)

    original_indices = torch.where(cluster_ids == i)[0]
    coreset_indices.append(original_indices[closest_idx].item())

print(f"Coreset size: {len(coreset_indices)}")


Coreset size: 5000


The small Dataset and DataLoader:

The new dataset with only selected images

In [None]:
coreset_trainset = Subset(trainset, coreset_indices)
coreset_trainloader = torch.utils.data.DataLoader(coreset_trainset, batch_size=64, shuffle=True)


Train in new set

In [None]:
#New ResNet18 Model
print("Setting up a fresh ResNet18 model for coreset training...\n")

model = resnet18(num_classes=10)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

Setting up a fresh ResNet18 model for coreset training...



In [None]:
#Train the Model on Coreset
epochs = 10
print(f"Starting training on coreset for {epochs} epochs...\n")

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    start_time = time.time()

    for i, (inputs, labels) in enumerate(coreset_trainloader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if (i + 1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Step [{i+1}/{len(coreset_trainloader)}], Loss: {loss.item():.4f}")

    end_time = time.time()
    avg_loss = running_loss / len(coreset_trainloader)
    print(f"Epoch [{epoch+1}] finished. Avg Loss: {avg_loss:.4f}. Time: {(end_time - start_time):.2f} sec\n")

Starting training on coreset for 10 epochs...

Epoch [1] finished. Avg Loss: 1.8949. Time: 56.46 sec

Epoch [2] finished. Avg Loss: 1.5177. Time: 56.31 sec

Epoch [3] finished. Avg Loss: 1.3347. Time: 56.46 sec

Epoch [4] finished. Avg Loss: 1.1600. Time: 56.59 sec

Epoch [5] finished. Avg Loss: 1.0248. Time: 57.51 sec

Epoch [6] finished. Avg Loss: 0.8959. Time: 57.33 sec

Epoch [7] finished. Avg Loss: 0.8041. Time: 56.22 sec

Epoch [8] finished. Avg Loss: 0.5721. Time: 57.11 sec

Epoch [9] finished. Avg Loss: 0.5258. Time: 56.65 sec

Epoch [10] finished. Avg Loss: 0.4358. Time: 57.52 sec



In [None]:
#Evaluate Model
print("Evaluating model on test set...\n")

model.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in testloader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Test Accuracy after Coreset Training: {100 * correct / total:.2f}%\n")

Evaluating model on test set...

Test Accuracy after Coreset Training: 53.05%



In [None]:
from sklearn.decomposition import PCA
from torch.utils.data import DataLoader



In [None]:


#PCA to Features
print("Applying PCA to reduce feature dimensions...")

pca_dim = 100
pca = PCA(n_components=pca_dim)
features_pca = pca.fit_transform(all_features.numpy())

print(f"PCA-reduced feature shape: {features_pca.shape}\n")  # (50000, 100)


#KMeans on PCA-reduced Features
print("Clustering PCA features with KMeans...")

k = 5000  # Coreset size
kmeans = KMeans(n_clusters=k, random_state=0)
cluster_ids = kmeans.fit_predict(features_pca)

print(f"KMeans clustering done\n")


#Select Coreset (Closest Points)
print("Selecting representative images for coreset...")

coreset_indices = []
centroids = kmeans.cluster_centers_

for i in range(k):
    idxs_in_cluster = np.where(cluster_ids == i)[0]
    if len(idxs_in_cluster) == 0:
        continue
    cluster_features = features_pca[idxs_in_cluster]
    centroid = centroids[i]

    distances = np.linalg.norm(cluster_features - centroid, axis=1)
    closest_idx = idxs_in_cluster[np.argmin(distances)]
    coreset_indices.append(closest_idx)

print(f"Selected {len(coreset_indices)} samples for coreset.\n")


#Build Coreset DataLoader
coreset_trainset = Subset(trainset, coreset_indices)
coreset_trainloader = DataLoader(coreset_trainset, batch_size=64, shuffle=True)


#New ResNet18 Model
print("Setting up a fresh ResNet18 model for coreset training...\n")

model = resnet18(num_classes=10)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


#Train the Model on Coreset
epochs = 10
print(f"Starting training on PCA+KMeans coreset for {epochs} epochs...\n")

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    start_time = time.time()

    for i, (inputs, labels) in enumerate(coreset_trainloader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if (i + 1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Step [{i+1}/{len(coreset_trainloader)}], Loss: {loss.item():.4f}")

    end_time = time.time()
    avg_loss = running_loss / len(coreset_trainloader)
    print(f"Epoch [{epoch+1}] finished. Avg Loss: {avg_loss:.4f}. Time: {(end_time - start_time):.2f} sec\n")


# Evaluate Model
print("Evaluating model on test set...\n")

model.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in testloader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Test Accuracy after PCA+KMeans Coreset Training: {100 * correct / total:.2f}%\n")


Applying PCA to reduce feature dimensions...
PCA-reduced feature shape: (50000, 100)

Clustering PCA features with KMeans...
KMeans clustering done

Selecting representative images for coreset...
Selected 5000 samples for coreset.

Setting up a fresh ResNet18 model for coreset training...

Starting training on PCA+KMeans coreset for 10 epochs...

Epoch [1] finished. Avg Loss: 1.8885. Time: 56.22 sec

Epoch [2] finished. Avg Loss: 1.5213. Time: 57.17 sec

Epoch [3] finished. Avg Loss: 1.3267. Time: 56.52 sec

Epoch [4] finished. Avg Loss: 1.1977. Time: 57.28 sec

Epoch [5] finished. Avg Loss: 1.0020. Time: 56.59 sec

Epoch [6] finished. Avg Loss: 0.8184. Time: 57.05 sec

Epoch [7] finished. Avg Loss: 0.7337. Time: 56.14 sec

Epoch [8] finished. Avg Loss: 0.6048. Time: 57.21 sec

Epoch [9] finished. Avg Loss: 0.4409. Time: 56.24 sec

Epoch [10] finished. Avg Loss: 0.4675. Time: 57.10 sec

Evaluating model on test set...

Test Accuracy after PCA+KMeans Coreset Training: 50.88%

