In [33]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms

import time
import numpy as np

In [2]:
# Report the visible CUDA devices and pin the default device.
# All CUDA calls are guarded so this cell also runs on a CPU-only machine,
# where torch.cuda.current_device() / set_device() would otherwise raise.
if torch.cuda.is_available():
    print(f'Current GPU: {torch.cuda.current_device()}')
    print(f'GPU Name: {torch.cuda.get_device_name()}')
    print(f'Number of GPUs: {torch.cuda.device_count()}')
    # Index 0 exists on every CUDA machine (including single-GPU hosts) and
    # matches the default `gpu = 0` in the Args configuration.
    torch.cuda.set_device(0) ## Setting cuda on GPU:0
else:
    print('CUDA is not available; running on CPU')

Current GPU: 0
GPU Name: NVIDIA GeForce RTX 3090
Number of GPUs: 2


In [37]:
class Args:
    """Experiment configuration, read by the rest of the notebook as `args.<name>`."""

    # --- Federated schedule ---
    num_users = 20   # number of clients that are built
    rounds = 30      # number of communication rounds
    frac = 0.1       # fraction of clients sampled per round (at least one client is always used)
    local_bs = 10    # batch size of each client's training DataLoader
    local_ep = 10    # handed to the client; presumably local epochs per round -- confirm in src.client
    lr = 0.01        # handed to the client (learning rate)
    momentum = 0.9   # handed to the client (optimizer momentum)

    # --- Model / data ---
    model = 'lenet5' ## options: lenet5
    dataset = 'cifar10'  ## options: mnist, cifar10, cifar100
    datadir = '../data/'

    # --- Partitioning (forwarded to get_dataset_global / partition_data) ---
    p_train = 1.0
    p_test = 1.0
    partition = 'niid-labeldir'
    niid_beta = 0.1
    iid_beta = 1.0

    # --- Misc ---
    print_freq = 10    # not read by the cells visible in this notebook
    load_initial = ''  # not read by the cells visible in this notebook
    seed = 0           # RNG seed applied right after instantiation
    gpu = 0            # CUDA device index used when CUDA is available

args = Args()

# Seed numpy and torch so client sampling (np.random.choice) and torch-level
# randomness are repeatable across "Restart & Run All".
torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Only touch the CUDA runtime when it exists; otherwise the CPU fallback below
# could never be reached because set_device would raise first.
if torch.cuda.is_available():
    torch.cuda.set_device(args.gpu) ## Setting cuda on GPU

args.device = torch.device('cuda:{}'.format(args.gpu) if torch.cuda.is_available() else 'cpu')

## Loading Dataset

In [38]:
from src.data import *
from src.models import *
from src.client import *
from src.clustering import *
from src.utils import *

In [39]:
print('Getting Clients Data')

# Global datasets/loaders requested with p_train = p_test = 1.0; the client
# cell slices its local subsets out of train_ds_global / test_ds_global.
(train_ds_global, test_ds_global,
 train_dl_global, test_dl_global) = get_dataset_global(
    args.dataset, args.datadir, batch_size=128, p_train=1.0, p_test=1.0,
)

# Second set honouring the configured p_train / p_test; test_dl_global1 is the
# loader the federation loop evaluates the global model on.
(train_ds_global1, test_ds_global1,
 train_dl_global1, test_dl_global1) = get_dataset_global(
    args.dataset, args.datadir, batch_size=128,
    p_train=args.p_train, p_test=args.p_test,
)

# Per-client index lists plus their label statistics (printed in the client cell).
(partitions_train, partitions_test,
 partitions_train_stat, partitions_test_stat) = partition_data(
    args.dataset, args.datadir, args.partition, args.num_users,
    niid_beta=args.niid_beta, iid_beta=args.iid_beta,
    p_train=args.p_train, p_test=args.p_test,
)


Getting Clients Data
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


## Building Model

In [40]:
print('Building models for clients')
print(f'MODEL: {args.model}, Dataset: {args.dataset}')

# get_models returns the per-client models, the global model and the initial
# state dict that the federation loop starts from.
users_model, net_glob, initial_state_dict = get_models(args, dropout_p=0.5)
print('-'*40)
print(net_glob)
print('')

# Count parameters with the tensor-native numel(): it always yields a Python
# int, whereas np.prod over an empty torch.Size (0-dim parameter) gives 1.0.
total = sum(param.numel() for param in net_glob.parameters())
print(f'total params {total}')

Building models for clients
MODEL: lenet5, Dataset: cifar10
----------------------------------------
LeNet5(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

total params 62006


## Building Clients

In [32]:
print('Initializing Clients')
clients = []
for idx in range(args.num_users):
    sys.stdout.flush()
    print(f'-- Client {idx}, Train Stat {partitions_train_stat[idx]} Test Stat {partitions_test_stat[idx]}')

    # Indices of the samples assigned to this client by the partitioner.
    noise_level = 0
    dataidxs = partitions_train[idx]
    dataidxs_test = partitions_test[idx]

    # Local views over the global train / test datasets.
    train_ds_local = get_subset(train_ds_global, dataidxs)
    test_ds_local = get_subset(test_ds_global, dataidxs_test)

    # NOTE(review): the returned transforms are not used further down in this cell.
    transform_train, transform_test = get_transforms(
        args.dataset, noise_level=noise_level, net_id=None, total=0,
    )

    train_dl_local = DataLoader(
        dataset=train_ds_local,
        batch_size=args.local_bs,
        shuffle=True,
        drop_last=False,
        num_workers=4,
        pin_memory=False,
    )
    test_dl_local = DataLoader(
        dataset=test_ds_local,
        batch_size=64,
        shuffle=False,
        drop_last=False,
        num_workers=4,
        pin_memory=False,
    )

    # Each client owns an independent copy of its model.
    local_model = copy.deepcopy(users_model[idx])
    clients.append(
        Client_FedAvg(
            idx, local_model, args.local_bs, args.local_ep,
            args.lr, args.momentum, args.device,
            train_dl_local, test_dl_local,
        )
    )


Initializing Clients
-- Client 0, Train Stat {0: 123, 8: 25, 9: 10, 10: 6, 14: 1, 16: 359, 18: 42, 20: 1, 21: 114, 22: 7, 23: 51, 24: 94, 28: 5, 30: 1, 41: 121, 42: 94, 43: 5, 44: 2, 45: 5, 50: 23, 51: 3, 52: 6, 56: 17, 59: 53, 62: 228, 63: 116, 65: 104, 69: 1, 70: 104, 72: 1, 79: 4, 80: 323, 82: 48, 83: 2, 84: 13, 85: 221, 86: 3, 87: 314} Test Stat {0: 100, 8: 100, 9: 100, 10: 100, 14: 100, 16: 100, 18: 100, 20: 100, 21: 100, 22: 100, 23: 100, 24: 100, 28: 100, 30: 100, 41: 100, 42: 100, 43: 100, 44: 100, 45: 100, 50: 100, 51: 100, 52: 100, 56: 100, 59: 100, 62: 100, 63: 100, 65: 100, 69: 100, 70: 100, 72: 100, 79: 100, 80: 100, 82: 100, 83: 100, 84: 100, 85: 100, 86: 100, 87: 100}
-- Client 1, Train Stat {0: 1, 8: 1, 9: 1, 11: 63, 12: 2, 13: 71, 15: 9, 16: 3, 18: 1, 20: 25, 21: 1, 24: 26, 26: 27, 32: 54, 33: 57, 34: 334, 37: 17, 39: 8, 40: 1, 44: 1, 46: 2, 47: 3, 48: 63, 52: 79, 55: 186, 56: 1, 59: 8, 60: 119, 61: 36, 63: 1, 67: 1, 68: 212, 70: 66, 72: 16, 74: 7, 76: 1, 78: 27, 79: 8

-- Client 12, Train Stat {0: 218, 1: 25, 6: 1, 7: 137, 9: 1, 10: 181, 12: 465, 15: 27, 16: 14, 17: 14, 19: 148, 21: 142, 22: 90, 23: 201, 24: 98, 25: 2, 26: 9, 28: 1, 30: 1, 31: 297, 33: 1, 34: 74, 35: 80, 37: 12, 38: 4, 46: 1, 49: 24, 50: 49, 51: 254} Test Stat {0: 100, 1: 100, 6: 100, 7: 100, 9: 100, 10: 100, 12: 100, 15: 100, 16: 100, 17: 100, 19: 100, 21: 100, 22: 100, 23: 100, 24: 100, 25: 100, 26: 100, 28: 100, 30: 100, 31: 100, 33: 100, 34: 100, 35: 100, 37: 100, 38: 100, 46: 100, 49: 100, 50: 100, 51: 100}
-- Client 13, Train Stat {1: 37, 2: 257, 4: 346, 7: 5, 10: 9, 11: 83, 13: 11, 14: 70, 17: 1, 22: 34, 23: 125, 24: 53, 26: 7, 29: 3, 32: 115, 39: 5, 40: 5, 47: 10, 51: 97, 53: 29, 58: 29, 59: 42, 63: 39, 64: 4, 65: 251, 68: 24, 69: 82, 75: 67, 76: 105, 77: 1, 78: 1, 80: 1, 82: 6, 84: 83, 87: 30, 91: 53, 93: 38, 94: 395} Test Stat {1: 100, 2: 100, 4: 100, 7: 100, 10: 100, 11: 100, 13: 100, 14: 100, 17: 100, 22: 100, 23: 100, 24: 100, 26: 100, 29: 100, 32: 100, 39: 100, 40: 100,

## Federation

In [34]:
print('Starting FL')
print('-'*40)
start = time.time()

num_users_FL = args.num_users

# Bookkeeping containers.
loss_train = []                                          # average local train loss, one entry per round
clients_local_acc = {i:[] for i in range(num_users_FL)}  # per-client accuracy history (filled by the testing cell)
w_locals, loss_locals = [], []
glob_acc = []                                            # global-model test accuracy, one entry per round

# The global weights start from the initial state dict.
w_glob = copy.deepcopy(initial_state_dict)

# Number of clients sampled per round; never fewer than one.
m = max(int(args.frac * num_users_FL), 1)

for iteration in range(args.rounds):

    # Sample m distinct clients for this round.
    idxs_users = np.random.choice(range(num_users_FL), m, replace=False)
    #idxs_users = comm_users[iteration]

    print(f'----- ROUND {iteration+1} -----')
    # Synchronise only when a CUDA runtime exists, so the loop also works with
    # the CPU fallback of args.device.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    sys.stdout.flush()
    for idx in idxs_users:
        # Every selected client starts the round from the current global weights.
        clients[idx].set_state_dict(copy.deepcopy(w_glob))

        loss = clients[idx].train(is_print=False)
        loss_locals.append(copy.deepcopy(loss))

    # print loss
    loss_avg = sum(loss_locals) / len(loss_locals)
    template = '-- Average Train loss {:.3f}'
    print(template.format(loss_avg))

    ####### FedAvg ####### START
    # Aggregation weights are proportional to each selected client's number of
    # training samples.
    total_data_points = sum([len(partitions_train[r]) for r in idxs_users])
    fed_avg_freqs = [len(partitions_train[r]) / total_data_points for r in idxs_users]
    w_locals = []
    for idx in idxs_users:
        w_locals.append(copy.deepcopy(clients[idx].get_state_dict()))

    # AvgWeights combines the local state dicts using fed_avg_freqs
    # (presumably a weighted average -- see src.utils).
    ww = AvgWeights(w_locals, weight_avg=fed_avg_freqs)
    w_glob = copy.deepcopy(ww)
    net_glob.load_state_dict(copy.deepcopy(ww))

    ####### FedAvg ####### END
    # Evaluate the freshly aggregated global model on the global test loader.
    _, acc = eval_test(net_glob, args, test_dl_global1)

    glob_acc.append(acc)
    template = "-- Global Acc: {:.3f}, Global Best Acc: {:.3f}"
    print(template.format(glob_acc[-1], np.max(glob_acc)))

    loss_train.append(loss_avg)

    ## clear the placeholders for the next round
    loss_locals.clear()

    ## calling garbage collector
    gc.collect()

end = time.time()
duration = end-start
print('-'*40)

Starting FL
----------------------------------------
----- ROUND 1 -----
-- Average Train loss 3.510
-- Global Acc: 1.000, Global Best Acc: 1.000
----- ROUND 2 -----
-- Average Train loss 2.449
-- Global Acc: 1.240, Global Best Acc: 1.240
----- ROUND 3 -----
-- Average Train loss 2.167
-- Global Acc: 2.780, Global Best Acc: 2.780
----- ROUND 4 -----
-- Average Train loss 1.723
-- Global Acc: 4.150, Global Best Acc: 4.150
----- ROUND 5 -----
-- Average Train loss 1.819
-- Global Acc: 4.250, Global Best Acc: 4.250
----- ROUND 6 -----
-- Average Train loss 2.090
-- Global Acc: 5.690, Global Best Acc: 5.690
----- ROUND 7 -----
-- Average Train loss 2.124
-- Global Acc: 7.430, Global Best Acc: 7.430
----- ROUND 8 -----
-- Average Train loss 1.340
-- Global Acc: 7.250, Global Best Acc: 7.430
----- ROUND 9 -----
-- Average Train loss 1.329
-- Global Acc: 9.370, Global Best Acc: 9.370
----- ROUND 10 -----
-- Average Train loss 1.774
-- Global Acc: 7.540, Global Best Acc: 9.370
----- ROUND 11 -

## Testing

In [35]:
print('*'*25)
print('---- Testing Final Local Results ----')
temp_acc = []
temp_best_acc = []

# Evaluate every client's current local model on its own test loader and
# record the result in that client's accuracy history.
for k in range(num_users_FL):
    sys.stdout.flush()
    loss, acc = clients[k].eval_test()

    history = clients_local_acc[k]
    history.append(acc)
    final_acc = history[-1]
    best_acc = np.max(history)

    temp_acc.append(final_acc)
    temp_best_acc.append(best_acc)

    print(f"Client {k:3d}, Final_acc {final_acc:3.2f}, best_acc {best_acc:3.2f} \n")

print(f"-- Avg Local Acc: {np.mean(temp_acc):3.2f}")
print(f"-- Avg Best Local Acc: {np.mean(temp_best_acc):3.2f}")
print('*'*25)
############################### FedAvg Final Results
print('-'*40)
print('FINAL RESULTS')
print(f"-- Global Acc Final: {glob_acc[-1]:.2f}")

# Mean over the last m rounds, where m is the number of clients sampled per round.
print(f"-- Global Acc Avg Final [N*C] Rounds: {np.mean(glob_acc[-m:]):.2f}")

print(f"-- Global Best Acc: {np.max(glob_acc):.2f}")

print(f"-- Avg Local Acc: {np.mean(temp_acc):3.2f}")

print(f"-- Avg Best Local Acc: {np.mean(temp_best_acc):3.2f}")

print(f'-- FL Time: {duration/60:.2f} minutes')
print('-'*40)

*************************
---- Testing Final Local Results ----
Client   0, Final_acc 30.84, best_acc 30.84 

Client   1, Final_acc 22.35, best_acc 22.35 

Client   2, Final_acc 23.07, best_acc 23.07 

Client   3, Final_acc 22.61, best_acc 22.61 

Client   4, Final_acc 22.88, best_acc 22.88 

Client   5, Final_acc 32.35, best_acc 32.35 

Client   6, Final_acc 26.34, best_acc 26.34 

Client   7, Final_acc 26.61, best_acc 26.61 

Client   8, Final_acc 22.88, best_acc 22.88 

Client   9, Final_acc 15.32, best_acc 15.32 

Client  10, Final_acc 21.77, best_acc 21.77 

Client  11, Final_acc 28.73, best_acc 28.73 

Client  12, Final_acc 30.41, best_acc 30.41 

Client  13, Final_acc 24.26, best_acc 24.26 

Client  14, Final_acc 25.30, best_acc 25.30 

Client  15, Final_acc 31.19, best_acc 31.19 

Client  16, Final_acc 28.72, best_acc 28.72 

Client  17, Final_acc 5.85, best_acc 5.85 

Client  18, Final_acc 21.33, best_acc 21.33 

Client  19, Final_acc 9.46, best_acc 9.46 

-- Avg Local Acc: 23