In [1]:
# Maps the integer `method` code accepted by data_corruption to a corruption name.
methods_dict = {1:"Missing_Data", 2:"Outliers", 3:"Labeling_Errors", 4:"Feature_Noise"}


def _pick_corrupt_indices(n_items, corrupt_p):
    """Return a sorted random sample of int(n_items * corrupt_p) distinct indices from range(n_items)."""
    return sorted(random.sample(range(n_items), int(n_items * corrupt_p)))


def data_corruption(method, data, labels, corrupt_p):
    """Return a corrupted copy of `data` or `labels`; the inputs are never modified.

    Parameters
    ----------
    method : int
        Key into `methods_dict` (1: Missing_Data, 2: Outliers, 3: Labeling_Errors,
        4: Feature_Noise). An unknown key raises KeyError.
    data : iterable of array-like
        The images. Only read by the "Missing_Data" method.
    labels : 1-D sequence or array
        The labels. Only read by the "Labeling_Errors" method.
    corrupt_p : float
        Fraction of entries to corrupt; must be strictly between 0 and 1.
        The number of corrupted entries is int(n * corrupt_p).

    Returns
    -------
    list of numpy.ndarray
        For "Missing_Data": one array per input image, same shape and dtype as
        the input, with a random int(image.size * corrupt_p) elements set to 0.
    numpy.ndarray
        For "Labeling_Errors": an array with the dtype of np.asarray(labels) in
        which a random int(len(labels) * corrupt_p) entries were replaced by a
        different label drawn from the label values present in `labels`.
    None
        If `corrupt_p` is out of range, the method is not implemented yet,
        `labels` is empty, or labels must be flipped but fewer than two
        distinct label values exist. A message is printed in each case.

    Randomness comes from the standard `random` module, so `random.seed(...)`
    makes the result reproducible.
    """
    corruption_method = methods_dict[method]
    # Written as a chained comparison so that NaN is rejected as well.
    if not (0 < corrupt_p < 1):
        print("Please choose a valid value for the corruption parameter (positive, above 0 and less than 1.)")
        return None

    if corruption_method == "Missing_Data":
        # Zero out a random subset of the individual elements of every image.
        # The selection is made over the flattened image so that it works for
        # any shape: a mask of length image.shape[0] cannot be broadcast against
        # an (H, W, C) image and silently masks columns for a square 2-D image.
        data_corrupted = []
        for image in data:
            pixels = np.array(image, order='C')  # C-contiguous copy; input stays untouched
            flat = pixels.reshape(-1)            # view onto `pixels`, so writes land in it
            dropped = _pick_corrupt_indices(flat.size, corrupt_p)
            flat[np.asarray(dropped, dtype=np.intp)] = 0
            data_corrupted.append(pixels)

        return data_corrupted

    if corruption_method == "Outliers":
        print("not yet implemented")

    if corruption_method == "Labeling_Errors":
        if labels is None or len(labels) == 0:
            print("Please use an actual list for the labels")
            return None

        original_labels = np.asarray(labels)
        label_names = np.unique(original_labels)  # computed once, not once per flipped label
        flipped = _pick_corrupt_indices(len(original_labels), corrupt_p)

        if flipped and len(label_names) < 2:
            # Every alternative label set would be empty, so nothing can be flipped.
            print("Label corruption needs at least two distinct label values")
            return None

        # Work on a copy in the labels' own dtype instead of routing the values
        # through a string sentinel and casting back afterwards.
        corrupted_labels = original_labels.copy()
        for i in flipped:
            options = label_names[label_names != original_labels[i]]
            corrupted_labels[i] = random.choice(list(options))

        return corrupted_labels

    if corruption_method == "Feature_Noise":
        print("not yet implemented")

    return None
    

In [2]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import random
import torch
from torch import nn, optim
from torchvision import datasets, transforms
import torch.nn.functional as F
import pickle
from torch.utils.data import DataLoader, Subset


def FedNN(cp, num_clients=5, train_split=0.8, num_epochs=3, corrupt=True):
    """Train one KNN classifier per simulated client on CIFAR10 and score their majority vote.

    Each client receives its own uniformly random `train_split` fraction of the
    full CIFAR10 training set. The subsets are drawn independently, so they
    overlap between clients. In every "epoch" all client models are re-fitted
    from scratch (with freshly corrupted labels when `corrupt` is true) and the
    majority vote of the models is evaluated on the CIFAR10 test set.

    Parameters
    ----------
    cp : float
        Label-corruption fraction handed to `data_corruption` (method 3).
    num_clients : int, default 5
        Number of client datasets / models.
    train_split : float, default 0.8
        Fraction of the training set that goes into each client's training
        part; the remainder is stored as that client's validation part.
    num_epochs : int, default 3
        Number of independent fit-and-evaluate rounds.
    corrupt : bool, default True
        Whether to corrupt the client labels before fitting.

    Returns
    -------
    dict
        Maps the 1-based epoch number to the test accuracy in percent.

    Raises
    ------
    ValueError
        If `data_corruption` returns no labels (for example because `cp` is
        not strictly between 0 and 1).
    """
    accuracy_dict = dict()

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load the CIFAR10 dataset
    train_dataset = datasets.CIFAR10('../data', train=True, download=True, transform=transform)
    test_dataset = datasets.CIFAR10('../data', train=False, download=True, transform=transform)
    # NOTE(review): `.data` / `.targets` are read directly below, so the
    # transform pipeline above presumably never touches what the KNN models
    # see - confirm whether normalisation was intended.
    x_train, y_train = train_dataset.data, np.array(train_dataset.targets)
    x_test, y_test = test_dataset.data, np.array(test_dataset.targets)

    # Flatten the test images once instead of once per prediction.
    x_test_flat = x_test.reshape(len(x_test), -1)

    # Give every client an independent random subset of the training data.
    split = int(train_split * len(x_train))
    client_data = []
    for _ in range(num_clients):
        indices = np.arange(len(x_train))
        np.random.shuffle(indices)
        train_indices, val_indices = indices[:split], indices[split:]
        client_data.append({'x_train': x_train[train_indices], 'y_train': y_train[train_indices],
                            'x_val': x_train[val_indices], 'y_val': y_train[val_indices]})

    # Train the models on the client data for multiple epochs
    for epoch in range(num_epochs):
        models = []
        for client in client_data:
            # Separate names so the full training arrays above are not rebound.
            client_x, client_y = client['x_train'], client['y_train']
            if corrupt:
                client_y = data_corruption(3, client_x, client_y, cp)
                if client_y is None:
                    raise ValueError(
                        "data_corruption returned no labels for cp=%r; "
                        "cp must be strictly between 0 and 1" % (cp,)
                    )
            model = KNeighborsClassifier(n_neighbors=5)
            model.fit(client_x.reshape(len(client_x), -1), client_y)
            models.append(model)

        # Evaluate the models on the test data. Each model predicts the whole
        # test set in one call; the per-sample vote below is unchanged.
        per_model_predictions = [model.predict(x_test_flat) for model in models]
        total_correct = 0
        for i in range(len(x_test)):
            predictions = [model_predictions[i] for model_predictions in per_model_predictions]
            consensus = max(set(predictions), key=predictions.count)
            if consensus == y_test[i]:
                total_correct += 1
        accuracy = total_correct / len(x_test)
        print('Epoch %d Test accuracy: %.2f%%' % (epoch+1, accuracy * 100))
        accuracy_dict[epoch+1] = accuracy*100
    return accuracy_dict

In [3]:
# Label-corruption fractions to sweep, ordered from heaviest to lightest.
corrupt_list = [
    0.95, 0.9375, 0.925, 0.9125, 0.90, 0.875,
    0.85, 0.65, 0.50, 0.35, 0.20, 0.05,
]

for corrupt_par in corrupt_list:
    # One complete FedNN run per corruption level; the returned dict maps the
    # epoch number to the test accuracy in percent.
    results = FedNN(corrupt_par)

    # Persist each run right away so finished levels survive an interrupted sweep.
    # NOTE(review): the file name says MNIST although FedNN loads CIFAR10; the
    # name is left unchanged here - confirm before renaming.
    with open(f'KNN_MNIST_3_C5_{corrupt_par}_Labels', mode='wb') as handle:
        pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

Files already downloaded and verified
Files already downloaded and verified


KeyboardInterrupt: 