In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [2]:
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.block1 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AvgPool2d(2)
        )
        self.block2 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AvgPool2d(2)
        )
        self.block3 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AvgPool2d(2)
        )
        self.final_conv = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.mean_pool = nn.AdaptiveAvgPool2d(1)

    def forward(self, x, extract_layer=None):
        x = self.conv1(x)
        if extract_layer == 1: return x
        x = self.block1(x)
        if extract_layer == 2: return x
        x = self.block2(x)
        if extract_layer == 3: return x
        x = self.block3(x)
        if extract_layer == 4: return x
        x = self.final_conv(x)
        if extract_layer == 5: return x
        x = self.mean_pool(x).view(x.size(0), -1)
        if extract_layer == 6: return x
        return x


In [3]:
cifar_mean = [0.4914, 0.4822, 0.4465]
cifar_std = [0.2470, 0.2435, 0.2616]

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(cifar_mean, cifar_std)])

dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
d_test = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

d_train, d_aux = torch.utils.data.random_split(dataset, [25000, 25000])

img, label = d_train[0]
print(img.shape)
print(label)

Files already downloaded and verified
Files already downloaded and verified
torch.Size([3, 32, 32])
0


In [4]:
def train_cnn(model, dataloader, epochs=10):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    model.to(device)
    optimiser = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            optimiser.zero_grad()
            output = model(images)
            loss = criterion(output, labels)
            loss.backward()
            optimiser.step()

In [5]:
def extract_features(model, dataloader, layer):
    model.eval()
    features = []
    with torch.no_grad():
        for images, _ in dataloader:
            feature = model(images, extract_layer=layer)
            features.append(feature.view(feature.size(0), -1).cpu().numpy())
    return np.vstack(features)

In [6]:
dataloader_aux = torch.utils.data.DataLoader(d_aux, batch_size=64, shuffle=True)
cnn_model = ConvNet()
train_cnn(cnn_model, dataloader_aux, epochs=10)

cpu


In [7]:
dataloader_train = torch.utils.data.DataLoader(d_train, batch_size=64, shuffle=False)
train_features = {layer: extract_features(cnn_model, dataloader_train, layer) for layer in range(1, 7)}

aux_features = {layer: extract_features(cnn_model, dataloader_aux, layer) for layer in range(1, 7)}

dataloader_test = torch.utils.data.DataLoader(d_test, batch_size=64, shuffle=False)
test_features = {layer: extract_features(cnn_model, dataloader_test, layer) for layer in range(1, 7)}

In [8]:
aux_pca_models = {layer: PCA(n_components=10).fit(aux_features[layer]) for layer in aux_features}
aux_pca_features = {layer: aux_pca_models[layer].transform(aux_features[layer]) for layer in aux_features}

In [9]:
kmeans_models = {layer: KMeans(n_clusters=100, random_state=42).fit(aux_pca_features[layer]) for layer in aux_features}
cluster_labels = {layer: kmeans_models[layer].labels_ for layer in aux_features}



In [10]:
print(type(kmeans_models[1]))

<class 'sklearn.cluster._kmeans.KMeans'>


In [11]:
layer_1 = kmeans_models[1]

cluster_indices, cluster_counts = np.unique(layer_1.labels_, return_counts=True)

print(f'There are {len(cluster_indices)} unique clusters in the auxiliary data')
print(f'Cluster counts: {cluster_counts}')
print(f'Cluster indices: {cluster_indices}')

There are 100 unique clusters in the auxiliary data
Cluster counts: [262 351 313 374 208 184 261 252 302 360 359 286 185 285 287 269 182 201
 324 133 278 134 290 291 184 404 268 233 222 197 104 187  91 373 472 317
 245 261  92 122 123 412 218 349 331 261 312 168 179 215 263 191 262 147
  88  90 367 442 474 207 234 301 227 258 226 211 211 194 178 374 244 210
 235 276 419 324 175 213 240 235 288 284 250 253 103 273 182 344 229 290
  81 282 217  84 228 177 268 532 162 246]
Cluster indices: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]


In [12]:
clusters = []
km_data = []

for layer, model in kmeans_models.items():
    test_km, train_km = model.predict(d_test), model.predict(d_train)
    km_data.append((test_km, train_km))
    indices, counts = np.unique(model.labels_, return_counts=True)
    clusters.append((indices, counts))

for i, (indices, counts) in enumerate(clusters):
    print(f'Model {i}:')
    print(f'There are {len(indices)} unique clusters in the auxiliary data')
    print(f'Cluster counts: {counts}')
    print(f'Cluster indices: {indices}')
    

  array = numpy.asarray(array, order=order, dtype=dtype)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (10000, 2) + inhomogeneous part.

In [None]:
poison_rates = [0.5, 1, 2]

# k_valid_subpopulations = [(subpop, count) for subpop, count in zip(cluster_indices, cluster_counts)]
# k_nn_data = np.zeros((len(k_valid_subpopulations), 6, len(poison_rates)))

for i in range(len(clusters)):
    valid_subpopulations = [(subpop, count) for subpop, count in zip(clusters[i][0], clusters[i][1])]
    
    print("\n")
    print(f'Model {i}:')

    for j, (index, count) in enumerate(valid_subpopulations):

        print("\n")
        print(f"Cluster index: {i}, Cluster Count: {count}, Test Samples: {np.where(km_data[i][0] == index)[0].shape[0]}")