In [1]:
%load_ext autoreload
%autoreload 2

import copy, os, socket, sys, time
from pathlib import Path
from tqdm import tqdm

import torch
from torch import optim
from torch.utils.tensorboard import SummaryWriter

sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../")))
from libs import data, fl, nn, wandb
from libs.distributed import *

In [2]:
class FedArgs:
    """Configuration bundle for one federated-learning client run.

    Every setting is a keyword argument whose default is the value this
    notebook used to hard-code, so ``FedArgs()`` yields exactly the same
    attributes as before, while e.g. ``FedArgs(name="client-1", num_clients=1)``
    configures a specific client without patching attributes afterwards.

    Args:
        name: Client identifier; combined with the host name to label this client.
        num_clients: Number of participating clients.
        epochs: Number of iterations of the federated training loop.
        local_rounds: Forwarded to ``train_func`` on every epoch.
        client_batch_size: Batch size handed to ``data.load_dataset``.
        test_batch_size: Evaluation batch size (not referenced by the visible cells).
        learning_rate: Forwarded to ``train_func``.
        weight_decay: Forwarded to ``train_func``.
        cuda: Request GPU execution; only honoured when CUDA is actually available.
        seed: Seed passed to ``torch.manual_seed``.
        topic: Topic name used when consuming and producing model updates.
        broker_ip: Broker address handed to ``Distributed``.
        schema_ip: Schema endpoint URL handed to ``Distributed``.
        wait_to_consume: Handed to ``Distributed`` (presumably a consume wait
            time; unit not visible here -- TODO confirm).
        dataset: Dataset name handed to ``data.load_dataset``.
        model: Model instance to train. ``None`` builds a fresh ``nn.ModelMNIST()``.
        train_func: Training callable. ``None`` selects ``fl.train_model``.
        eval_func: Evaluation callable. ``None`` selects ``fl.evaluate``.
    """

    def __init__(self, name="client-x", num_clients=50, epochs=51, local_rounds=1,
                 client_batch_size=32, test_batch_size=128,
                 learning_rate=1e-4, weight_decay=1e-5,
                 cuda=False, seed=1,
                 topic="VJH_020_1",
                 broker_ip='172.16.26.40:9092',
                 schema_ip='http://172.16.26.40:8081',
                 wait_to_consume=10,
                 dataset="mnist", model=None, train_func=None, eval_func=None):
        self.name = name
        self.num_clients = num_clients
        self.epochs = epochs
        self.local_rounds = local_rounds
        self.client_batch_size = client_batch_size
        self.test_batch_size = test_batch_size
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.cuda = cuda
        self.seed = seed
        self.topic = topic
        self.broker_ip = broker_ip
        self.schema_ip = schema_ip
        self.wait_to_consume = wait_to_consume
        self.dataset = dataset
        # None sentinels keep the project defaults lazy: a new model is built per
        # instance, and nn / fl are only touched when no override is supplied.
        self.model = nn.ModelMNIST() if model is None else model
        self.train_func = fl.train_model if train_func is None else train_func
        self.eval_func = fl.evaluate if eval_func is None else eval_func
        
# Single configuration instance read by the later cells of this notebook.
fedargs = FedArgs()

In [3]:
# Uncomment to give this client its own identity before the run is named.
#fedargs.name = "client-1"
project = 'fl-kafka-client'
# Build the run name from the configured topic instead of repeating the literal,
# so the tracking run cannot drift out of sync with fedargs.topic.
name = fedargs.topic + '-' + fedargs.name
# Tracking handle; process() logs per-epoch metrics through it.
wb = wandb.init(name, project)
# This notebook drives exactly one client.
fedargs.num_clients = 1

[34m[1mwandb[0m: Currently logged in as: [33mkasyah[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [4]:
# Seed torch's RNG from the run configuration.
torch.manual_seed(fedargs.seed)

# GPU is used only when it is both requested in the config and present on this machine.
use_cuda = fedargs.cuda and torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
    kwargs = {"num_workers": 1, "pin_memory": True}
else:
    device = torch.device("cpu")
    kwargs = {}

In [5]:
# Label this client as "<hostname>: <client name>".
host = socket.gethostname()
clients = [": ".join((host, fedargs.name))]

# Messaging helper built from the broker / schema settings in fedargs.
dt = Distributed(
    clients,
    fedargs.broker_ip,
    fedargs.schema_ip,
    fedargs.wait_to_consume,
)

172.16.26.40:9092 http://172.16.26.40:8081


In [6]:
# Work on a private copy of the configured model so fedargs.model itself is left untouched.
global_model = copy.deepcopy(fedargs.model)

# Fetch the train / test loaders for the configured dataset.
train_loader, test_loader = data.load_dataset(fedargs.dataset, fedargs.client_batch_size)

# Everything the training loop needs for this (single) client, kept in one dict.
client_details = dict(
    name=clients[0],
    train_loader=train_loader,
    test_loader=test_loader,
    model=copy.deepcopy(global_model),
    model_update=None,
)

In [7]:
def process(client, epoch, dt, model, train_loader, test_loader, fedargs, device, logger=None):
    """Run one federated round for a single client: consume, train, publish, evaluate.

    Args:
        client: Identifier of this client; also the key under which metrics are logged.
        epoch: Index of the current round. Updates are consumed for ``epoch`` and
            the freshly trained update is published / reported as ``epoch + 1``.
        dt: Messaging helper exposing ``consume_model`` and ``produce_model``.
        model: Current local model; handed to ``fedargs.train_func``.
        train_loader: Training data handed to ``fedargs.train_func``.
        test_loader: Evaluation data handed to ``fedargs.eval_func``.
        fedargs: Run configuration (reads ``topic``, ``learning_rate``,
            ``weight_decay``, ``local_rounds``, ``train_func``, ``eval_func``).
        device: Device handed to the train and eval functions.
        logger: Optional object with a ``log(dict)`` method. When omitted, the
            notebook-level ``wb`` handle is used, as before.

    Returns:
        The model returned by ``fedargs.train_func`` for this round.
    """
    # Resolve the metrics sink up front so a missing global `wb` fails before the
    # (slow) training round instead of after it.
    tracker = wb if logger is None else logger

    # Consume Models
    client_model_updates = dt.consume_model(client, fedargs.topic, model, epoch)
    # Ignore this client's own message if it was echoed back.
    client_model_updates.pop(client, None)
    print("Epoch: {}, Processing Client {}, Received {} Updates From {}".format(epoch, client,
                                                                                len(client_model_updates),
                                                                                list(client_model_updates.keys())))
    # NOTE(review): the received updates are only counted and listed here; nothing
    # in this function merges them into `model`. Confirm whether aggregation is
    # expected at this point or is handled inside consume_model.

    # Train (the returned training loss is not used by this function)
    model_update, model, _ = fedargs.train_func(model, train_loader,
                                                fedargs.learning_rate,
                                                fedargs.weight_decay,
                                                fedargs.local_rounds, device)

    # Publish Model under the next epoch index, without rebinding the `epoch` argument
    next_epoch = epoch + 1
    dt.produce_model(client, fedargs.topic, model_update, next_epoch)

    # Test, Plot and Log
    test_output = fedargs.eval_func(model, test_loader, device)
    print("Epoch: {}, Accuracy: {}, Test Loss: {}".format(next_epoch, test_output["accuracy"], test_output["test_loss"]))
    tracker.log({client: {"epoch": next_epoch, "time": time.time(), "acc": test_output["accuracy"], "loss": test_output["test_loss"]}})

    return model

In [8]:
# Federated Training: one consume -> train -> publish -> evaluate round per epoch.
for epoch in tqdm(range(fedargs.epochs)):
    print(f"Federated Training Epoch {epoch} of {fedargs.epochs}")

    # Carry the freshly trained model forward into the next round.
    client_details["model"] = process(
        client_details["name"],
        epoch,
        dt,
        client_details["model"],
        client_details["train_loader"],
        client_details["test_loader"],
        fedargs,
        device,
    )

  0%|          | 0/51 [00:00<?, ?it/s]

Federated Training Epoch 0 of 51
Epoch: 0, Processing Client bladecluster.iitp.org: client-x, Received 0 Updates From []
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 0


  2%|▏         | 1/51 [01:21<1:07:51, 81.44s/it]

Epoch: 1, Accuracy: 97.39, Test Loss: 0.0876417950630188
Federated Training Epoch 1 of 51
Epoch: 1, Processing Client bladecluster.iitp.org: client-x, Received 1 Updates From ['bladecluster.iitp.org: client-1']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 2


  4%|▍         | 2/51 [02:49<1:09:40, 85.32s/it]

Epoch: 2, Accuracy: 96.13000000000001, Test Loss: 0.12607619415670634
Federated Training Epoch 2 of 51
Epoch: 2, Processing Client bladecluster.iitp.org: client-x, Received 0 Updates From []
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 4


  6%|▌         | 3/51 [03:54<1:00:49, 76.03s/it]

Epoch: 3, Accuracy: 96.6, Test Loss: 0.10608393063098193
Federated Training Epoch 3 of 51
Epoch: 3, Processing Client bladecluster.iitp.org: client-x, Received 1 Updates From ['bladecluster.iitp.org: client-1']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 6


  8%|▊         | 4/51 [04:59<56:03, 71.57s/it]  

Epoch: 4, Accuracy: 96.00999999999999, Test Loss: 0.1358279367685318
Federated Training Epoch 4 of 51
Epoch: 4, Processing Client bladecluster.iitp.org: client-x, Received 1 Updates From ['bladecluster.iitp.org: client-1']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 8


 10%|▉         | 5/51 [06:48<1:05:24, 85.32s/it]

Epoch: 5, Accuracy: 96.56, Test Loss: 0.10612761734724045
Federated Training Epoch 5 of 51
Epoch: 5, Processing Client bladecluster.iitp.org: client-x, Received 1 Updates From ['bladecluster.iitp.org: client-1']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 10


 12%|█▏        | 6/51 [07:54<58:56, 78.59s/it]  

Epoch: 6, Accuracy: 96.31, Test Loss: 0.11806788367927075
Federated Training Epoch 6 of 51
Epoch: 6, Processing Client bladecluster.iitp.org: client-x, Received 1 Updates From ['bladecluster.iitp.org: client-1']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 11


 14%|█▎        | 7/51 [08:56<53:40, 73.18s/it]

Epoch: 7, Accuracy: 96.52, Test Loss: 0.10825521415621042
Federated Training Epoch 7 of 51
Epoch: 7, Processing Client bladecluster.iitp.org: client-x, Received 1 Updates From ['bladecluster.iitp.org: client-1']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 13


 16%|█▌        | 8/51 [10:05<51:27, 71.81s/it]

Epoch: 8, Accuracy: 96.53, Test Loss: 0.11792041621357202
Federated Training Epoch 8 of 51
Epoch: 8, Processing Client bladecluster.iitp.org: client-x, Received 1 Updates From ['bladecluster.iitp.org: client-1']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 15


 18%|█▊        | 9/51 [11:14<49:44, 71.06s/it]

Epoch: 9, Accuracy: 96.2, Test Loss: 0.12377694285511971
Federated Training Epoch 9 of 51
Epoch: 9, Processing Client bladecluster.iitp.org: client-x, Received 1 Updates From ['bladecluster.iitp.org: client-1']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 17


 20%|█▉        | 10/51 [15:33<1:28:03, 128.87s/it]

Epoch: 10, Accuracy: 96.33, Test Loss: 0.12914839190244676
Federated Training Epoch 10 of 51
Epoch: 10, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 20


 22%|██▏       | 11/51 [19:08<1:43:40, 155.52s/it]

Epoch: 11, Accuracy: 95.96000000000001, Test Loss: 0.13341416057646274
Federated Training Epoch 11 of 51
Epoch: 11, Processing Client bladecluster.iitp.org: client-x, Received 0 Updates From []
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 23


 24%|██▎       | 12/51 [22:24<1:48:55, 167.58s/it]

Epoch: 12, Accuracy: 95.89, Test Loss: 0.1358468236923218
Federated Training Epoch 12 of 51
Epoch: 12, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 26


 25%|██▌       | 13/51 [25:19<1:47:37, 169.94s/it]

Epoch: 13, Accuracy: 95.59, Test Loss: 0.14891094936132432
Federated Training Epoch 13 of 51
Epoch: 13, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 29


 27%|██▋       | 14/51 [28:12<1:45:16, 170.73s/it]

Epoch: 14, Accuracy: 95.92, Test Loss: 0.13974969655871392
Federated Training Epoch 14 of 51
Epoch: 14, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 32


[34m[1mwandb[0m: Network error resolved after 0:00:57.566614, resuming normal operation.
 29%|██▉       | 15/51 [31:20<1:45:38, 176.07s/it]

Epoch: 15, Accuracy: 95.85000000000001, Test Loss: 0.1375527284026146
Federated Training Epoch 15 of 51
Epoch: 15, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 35


 31%|███▏      | 16/51 [34:34<1:45:48, 181.38s/it]

Epoch: 16, Accuracy: 95.57, Test Loss: 0.14623372916579247
Federated Training Epoch 16 of 51
Epoch: 16, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 38


 33%|███▎      | 17/51 [37:31<1:42:01, 180.05s/it]

Epoch: 17, Accuracy: 95.89999999999999, Test Loss: 0.13492545241415502
Federated Training Epoch 17 of 51
Epoch: 17, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 41


 35%|███▌      | 18/51 [40:27<1:38:21, 178.83s/it]

Epoch: 18, Accuracy: 95.8, Test Loss: 0.14709530736505985
Federated Training Epoch 18 of 51
Epoch: 18, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 44


 37%|███▋      | 19/51 [43:19<1:34:18, 176.83s/it]

Epoch: 19, Accuracy: 95.66, Test Loss: 0.14568397090137006
Federated Training Epoch 19 of 51
Epoch: 19, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 47


 39%|███▉      | 20/51 [46:25<1:32:46, 179.56s/it]

Epoch: 20, Accuracy: 95.6, Test Loss: 0.1513850736349821
Federated Training Epoch 20 of 51
Epoch: 20, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 50


 41%|████      | 21/51 [50:02<1:35:25, 190.84s/it]

Epoch: 21, Accuracy: 95.77, Test Loss: 0.14140083599090575
Federated Training Epoch 21 of 51
Epoch: 21, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 53


 43%|████▎     | 22/51 [53:25<1:33:58, 194.43s/it]

Epoch: 22, Accuracy: 95.36, Test Loss: 0.147338456594944
Federated Training Epoch 22 of 51
Epoch: 22, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 56


 45%|████▌     | 23/51 [57:43<1:39:37, 213.50s/it]

Epoch: 23, Accuracy: 95.78999999999999, Test Loss: 0.1432215460419655
Federated Training Epoch 23 of 51
Epoch: 23, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 59


 47%|████▋     | 24/51 [1:01:10<1:35:18, 211.79s/it]

Epoch: 24, Accuracy: 95.71, Test Loss: 0.13664434888958932
Federated Training Epoch 24 of 51
Epoch: 24, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-2', 'bladecluster.iitp.org: client-1']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 65


 51%|█████     | 26/51 [1:07:43<1:25:12, 204.50s/it]

Epoch: 26, Accuracy: 95.78, Test Loss: 0.14747873598635197
Federated Training Epoch 26 of 51
Epoch: 26, Processing Client bladecluster.iitp.org: client-x, Received 1 Updates From ['bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 68


 53%|█████▎    | 27/51 [1:11:13<1:22:32, 206.37s/it]

Epoch: 27, Accuracy: 95.88, Test Loss: 0.13692360990047456
Federated Training Epoch 27 of 51
Epoch: 27, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 71


 55%|█████▍    | 28/51 [1:15:09<1:22:26, 215.04s/it]

Epoch: 28, Accuracy: 95.63000000000001, Test Loss: 0.14902660186290742
Federated Training Epoch 28 of 51
Epoch: 28, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 74


 57%|█████▋    | 29/51 [1:18:35<1:17:50, 212.31s/it]

Epoch: 29, Accuracy: 95.21, Test Loss: 0.1551487323641777
Federated Training Epoch 29 of 51
Epoch: 29, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-2', 'bladecluster.iitp.org: client-1']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 77


 59%|█████▉    | 30/51 [1:26:29<1:41:48, 290.86s/it]

Epoch: 30, Accuracy: 95.39, Test Loss: 0.15046275275945664
Federated Training Epoch 30 of 51
Epoch: 30, Processing Client bladecluster.iitp.org: client-x, Received 1 Updates From ['bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 80


 61%|██████    | 31/51 [1:36:04<2:05:23, 376.19s/it]

Epoch: 31, Accuracy: 95.61, Test Loss: 0.14653559498786925
Federated Training Epoch 31 of 51
Epoch: 31, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']
Producing user records to topic VJH_020_1. ^C to exit.
Flushing records...
User record b'bladecluster.iitp.org: client-x' successfully produced to VJH_020_1 [0] at offset 83


 63%|██████▎   | 32/51 [1:39:41<1:43:57, 328.27s/it]

Epoch: 32, Accuracy: 95.73, Test Loss: 0.1416103817373514
Federated Training Epoch 32 of 51
Epoch: 32, Processing Client bladecluster.iitp.org: client-x, Received 2 Updates From ['bladecluster.iitp.org: client-1', 'bladecluster.iitp.org: client-2']


 63%|██████▎   | 32/51 [1:40:24<59:37, 188.27s/it]  


KeyboardInterrupt: 