In [1]:
import argparse
import copy
import time

import numpy as np
import torch
from matplotlib import pyplot as plt
from torch.utils import data

from utils.utils import *
from utils import load_config
from utils.validate import *
from fedlearning.model import *
from fedlearning.dataset import *
from fedlearning.evolve import *
from fedlearning.optimizer import GlobalUpdater, LocalUpdater, get_omegas
from random_graph import average_neighbor_weights

In [2]:
 # load the configuration file
config_file = "config.yaml"
config = load_config(config_file)

logger = init_logger(config)

model = init_model(config, logger)

record = init_record(config, model)

if config.device == "cuda":
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

--------------------------------------------------------------------------------


In [3]:
user_ids = np.arange(0, config.users)
num_participators = int(config.part_rate*config.users) 

# load the dataset
# dataset object is a dictionary with keys: train_data, test_data, user_with_data
# user_with_data is a dictionary with keys: userID:sampleID
# For example, in the IID setting ID's are just assigned like 0, 1, 2, 3, ...
dataset = assign_user_data(config, logger)
test_images = torch.from_numpy(dataset["test_data"]["images"]).to(config.device)
test_labels = torch.from_numpy(dataset["test_data"]["labels"]).to(config.device)

# tau candidates 
taus = np.array(config.taus)

# before optimization, report the result first

# Get model outputs
output_on_test_set = model(test_images)

# Get losses/accs, and append to list
loss = loss_with_output(output_on_test_set, test_labels, config.loss)
acc = accuracy_with_output(output_on_test_set, test_labels)
print("Before optimization, loss: {:.4f}, acc: {:.4f}".format(loss, acc))

Non-IID data distribution


Before optimization, loss: 3.4897, acc: 0.1166


Next, we perform weight evolution on the global model.

In [4]:
# Sample random subset of users
np.random.shuffle(user_ids)
participator_ids = user_ids[:num_participators-1]

# schedule some values to pick up
acc = []
losses = []
params_list = []

global_kernel = None
global_xs = None
global_ys = None
local_packages = []
local_kernels = []

# Added for memory management
global_jac = None
for user_id in participator_ids:
    # print("user {:d} updating".format(user_id))

    # assign_user_resource specifies some parameters for the user given their user_id
    # user_resource is a dictionary with keys: lr, device, batch_size, images, labels
    user_resource = assign_user_resource(config, user_id, 
                        dataset["train_data"], dataset["user_with_data"])
    local_updater = LocalUpdater(config, user_resource)

    # Gets the local jacobians for a given client specified in local_updater


    local_updater.local_step(model)
    # Simulate uplink transmission
    local_package = local_updater.uplink_transmit()
    # Append this clients jacobians to the list
    local_packages.append(local_package)

    # Send local x and y
    if global_xs is None:
        global_xs = local_updater.xs
        global_ys = local_updater.ys
    else:
        global_xs = torch.vstack((global_xs, local_updater.xs))
        global_ys = torch.vstack((global_ys, local_updater.ys))            

    # del local_updater
    torch.cuda.empty_cache()

start_time = time.time()
global_jac = combine_local_jacobians(local_packages)

del local_packages
# Added these two lines to free up memory
del local_package
del local_updater

global_kernel = empirical_kernel(global_jac)

print("kernel computation time {:3f}".format(time.time() - start_time))

# Returns a function that, given t and f_0, solves for f_t
predictor = gradient_descent_ce(global_kernel.cpu(), global_ys.cpu(), config.lr)

# This is f^(0) (X)
with torch.no_grad():
    fx_0 = model(global_xs)

# Configure maximum t as one more than the largest tau value
t = torch.arange(config.taus[-1]+1)

# Create f_x using the time values and the initial f_x
fx_train = predictor(t, fx_0.cpu())
# fx_train = fx_train.to(fx_0)

# Use current weights to pass to the optimizer
init_state_dict = copy.deepcopy(model.state_dict())

losses = np.zeros_like(taus, dtype=float)
acc = np.zeros_like(taus, dtype=float)

print("loss \tacc")

for i, tau in enumerate(config.taus):
    # initialize the weight aggregator with current weights
    weight_aggregator = WeightMod(init_state_dict)
    global_omegas = get_omegas(t[:tau+1], config.lr, global_jac, 
            global_ys.cpu(), fx_train[:tau+1], config.loss, 
            model.state_dict())
    # global_omegas = get_omegas(t[:tau+1], config.lr, global_jac, 
    #         global_ys, fx_train[:tau+1], config.loss, 
    #         model.state_dict())        
    
    # Complete the sum in 9b
    weight_aggregator.add(global_omegas)
    aggregated_weight = weight_aggregator.state_dict()
    model.load_state_dict(aggregated_weight)

    output = model(global_xs)

    loss = loss_with_output(output, global_ys, config.loss)
    # loss_fx = loss_with_output(fx_train[tau].to(global_ys), global_ys, config.loss)
    losses[i] = loss

    output = model(test_images)

    test_acc = accuracy_with_output(output, test_labels)
    acc[i] = test_acc

    print("{:.3f}\t{:.3f}".format(loss, test_acc))

    params_list.append(copy.deepcopy(aggregated_weight))

# Get index of tau with lowest loss
idx = np.argmin(losses)
# Select weight parameters with lowest loss
params = params_list[idx]

# Select tau with lowest loss
current_tau = taus[idx]
current_acc = acc[idx]
current_loss = losses[idx]


logger.info("current tau {:d}".format(current_tau))
logger.info("acc {:4f}".format(current_acc))
logger.info("loss {:.4f}".format(current_loss))
# Load weights into model
model.load_state_dict(params)

# del params_list

record["loss"].append(current_loss)
record["testing_accuracy"].append(current_acc)
record["taus"].append(current_tau)

logger.info("-"*80)
torch.cuda.empty_cache()


kernel computation time 5.368758
loss 	acc
1.218	0.585
1.161	0.605
1.143	0.610
1.133	0.613
1.127	0.611
1.124	0.610
1.122	0.608
1.121	0.604
1.119	0.601
1.118	0.600
1.115	0.585


current tau 1500
acc 0.585300
loss 1.1152
--------------------------------------------------------------------------------


1.129	0.560


In [5]:
# Get model outputs
output_on_test_set = model(test_images)

# Get losses/accs, and append to list
loss = loss_with_output(output_on_test_set, test_labels, config.loss)
acc = accuracy_with_output(output_on_test_set, test_labels)
print("After global weight evolution, loss: {:.4f}, acc: {:.4f}".format(loss, acc))

After global weight evolution, loss: 1.1387, acc: 0.5853


Now, each client performs local SGD on their own data

In [6]:
import copy
temp_models = {}
for id in participator_ids:
    temp_models[id] = copy.deepcopy(model)

In [7]:
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms

class NumpyDataset(Dataset):
    def __init__(self, data, targets, transform=None):
        """
        Args:
            data (numpy array): Array of data samples.
            targets (numpy array): Array of labels corresponding to the data samples.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.data = data
        self.targets = targets
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        target = self.targets[idx]
        
        if self.transform:
            sample = self.transform(sample)
        
        return sample, target

PyTorch version 2.2.1 available.
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [8]:
import torch.optim as optim

def numpy_to_tensor_transform(data):
    return torch.from_numpy(data)


def self_train(user_model, user_id, dataset, config, logger, loss_fn, batch_size=32, epochs=1, lr = 0.001): 
    # Get data corresponding to a certain user
    user_resource = assign_user_resource(config, user_id, 
                        dataset["train_data"], dataset["user_with_data"])
    
    # Define the optimizer
    optimizer = optim.SGD(user_model.parameters(), lr=lr)
    dataset = NumpyDataset(user_resource["images"], user_resource["labels"], transform=numpy_to_tensor_transform)

    user_data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
   
    for epoch in range(epochs):
        # Iterate over the user's data
        for batch_idx, (data, target) in enumerate(user_data_loader):
            data, target = data.to(config.device), target.to(config.device)
            # Clear the gradients
            optimizer.zero_grad()
            
            # Forward pass
            output = user_model(data)
            
            # Compute the loss
            loss = loss_fn(output, target)
            
            # Backward pass
            loss.backward()
            
            # Update the model parameters
            optimizer.step()

            if batch_idx % 100 == 0:
                print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(user_data_loader.dataset)} ({100. * batch_idx / len(user_data_loader):.0f}%)]\tLoss: {loss.item():.6f}')
    print()


In [9]:
loss_fn_pytorch = nn.CrossEntropyLoss()
epochs = 5
lr = 0.001
batch_size = 32
for user_id in participator_ids:
    self_train(user_model=temp_models[user_id], user_id=user_id, dataset=dataset, config=config, logger=logger, 
               loss_fn=loss_fn_pytorch, batch_size=batch_size, epochs=epochs, lr=lr)























In [10]:
losses = []
accs = []
for user_id in participator_ids:
    # Get model outputs
    output_on_test_set = temp_models[user_id](test_images)

    # Get losses/accs, and append to list
    loss = loss_with_output(output_on_test_set, test_labels, config.loss)
    acc = accuracy_with_output(output_on_test_set, test_labels)
    print("After global weight evolution, loss: {:.4f}, acc: {:.4f}".format(loss, acc))

    losses.append(loss)
    accs.append(acc)

print("Average loss: {:.4f}, Average acc: {:.4f}".format(np.mean(losses), np.mean(accs)))

After global weight evolution, loss: 0.8727, acc: 0.6949
After global weight evolution, loss: 1.1493, acc: 0.5974
After global weight evolution, loss: 0.8816, acc: 0.6945
After global weight evolution, loss: 0.9672, acc: 0.6270
After global weight evolution, loss: 0.9917, acc: 0.6392
After global weight evolution, loss: 0.8753, acc: 0.6883
After global weight evolution, loss: 0.8641, acc: 0.6885
After global weight evolution, loss: 0.9194, acc: 0.6687
After global weight evolution, loss: 1.0822, acc: 0.6147
After global weight evolution, loss: 0.9721, acc: 0.6453
After global weight evolution, loss: 0.9163, acc: 0.6562
After global weight evolution, loss: 1.0813, acc: 0.6292
After global weight evolution, loss: 1.2199, acc: 0.5395
After global weight evolution, loss: 1.1496, acc: 0.6144
After global weight evolution, loss: 0.8884, acc: 0.6772
After global weight evolution, loss: 0.8555, acc: 0.7083
After global weight evolution, loss: 0.9000, acc: 0.6718
After global weight evolution, 

In [11]:
avged_dict = average_neighbor_weights(participator_ids[0], participator_ids[1:], temp_models)

In [12]:
avged_dict_model = init_model(config, logger)
avged_dict_model.load_state_dict(avged_dict)
# Get model outputs
output_on_test_set = avged_dict_model(test_images)

# Get losses/accs, and append to list
loss = loss_with_output(output_on_test_set, test_labels, config.loss)
acc = accuracy_with_output(output_on_test_set, test_labels)
print("After global weight evolution, loss: {:.4f}, acc: {:.4f}".format(loss, acc))


After global weight evolution, loss: 0.8913, acc: 0.6740
