# Federated Learning with Secure Aggregation and Differential Privacy using PySyft

This is an example of using our new Secure Multi-Party Computation tensor (SPDZTensor) to perform an encrypted average of gradients across multiple data owners.

Before starting this notebook, we recommend looking at `Boston_Housing_Federated_Training.ipynb`, which is located in the same folder.


# Setting Up

In [1]:
# Fetch the differential-privacy helper repository into ./differential_privacy:
# clone it when the folder does not exist yet, otherwise pull the latest changes.
! URL="https://github.com/LaRiffle/differential-privacy.git" && FOLDER="differential_privacy" && if [ ! -d $FOLDER ]; then git clone $URL $FOLDER; else (cd $FOLDER && git pull $URL && cd ..); fi;
# Force a reinstall of the newest `websockets` release (note: the version is not pinned).
! pip install --upgrade --force-reinstall websockets

In [2]:
from __future__ import print_function
import os
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader


print(torch.__version__)

# Training settings, declared as one table: (flag, type, default, metavar, help).
_SETTINGS = (
    ('--batch-size', int, 8, 'N',
     'input batch size for training (default: 8)'),
    ('--test-batch-size', int, 8, 'N',
     'input batch size for testing (default: 8)'),
    ('--epochs', int, 10, 'N',
     'number of epochs to train (default: 10)'),
    ('--lr', float, 0.001, 'LR',
     'learning rate (default: 0.001)'),
    ('--momentum', float, 0.0, 'M',
     'SGD momentum (default: 0.0)'),
    ('--seed', int, 1, 'S',
     'random seed (default: 1)'),
    ('--log-interval', int, 10, 'N',
     'how many batches to wait before logging training status'),
)

parser = argparse.ArgumentParser(description='PyTorch Example')
for _flag, _type, _default, _metavar, _help in _SETTINGS:
    parser.add_argument(_flag, type=_type, default=_default,
                        metavar=_metavar, help=_help)

# An explicit empty argv makes the parser return the defaults instead of
# reading the notebook kernel's own command line.
args = parser.parse_args([])

# Seed torch's RNG so weight initialisation and shuffling are repeatable.
torch.manual_seed(args.seed)
# Extra keyword arguments forwarded to the DataLoaders (none here).
kwargs = {}

0.3.1.post2


### Loading the dataset

In [3]:
import pickle

# Load the pre-split Boston Housing data: ((X, y), (X_test, y_test)).
# The context manager guarantees the file is closed even if unpickling fails.
# NOTE(security): pickle.load can execute arbitrary code; only use it on a
# trusted data file.
with open('../other/data/boston_housing.pickle', 'rb') as f:
    ((X, y), (X_test, y_test)) = pickle.load(f)

# torch.from_numpy requires numpy arrays; everything is converted to float32 tensors.
X = torch.from_numpy(X).type(torch.FloatTensor)
y = torch.from_numpy(y).type(torch.FloatTensor)
X_test = torch.from_numpy(X_test).type(torch.FloatTensor)
y_test = torch.from_numpy(y_test).type(torch.FloatTensor)

# Preprocessing: standardise each feature column with statistics computed on
# the training set only, and reuse those same statistics for the test set.
mean = X.mean(0, keepdim=True)
dev = X.std(0, keepdim=True)
mean[:, 3] = 0. # the feature at column 3 is binary,
dev[:, 3] = 1.  # so I'd rather not standardize it
X = (X - mean) / dev
X_test = (X_test - mean) / dev

# Wrap the tensors in datasets and batch them with the configured batch sizes.
train = TensorDataset(X, y)
test = TensorDataset(X_test, y_test)
train_loader = DataLoader(train, batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = DataLoader(test, batch_size=args.test_batch_size, shuffle=True, **kwargs)


#  Neural Network Structure

In [4]:
class Net(nn.Module):
    """Fully connected regressor: 13 input features -> 32 -> 24 -> 1 output.

    Besides the forward pass, the class offers two helpers that post-process
    the accumulated gradients: ``divide_clip_grads`` and ``add_noise_to_grads``.
    Both rely on module-level names defined later in the notebook
    (``n_batch``, ``LOT_SIZE``, ``gradient_clip``, ``gaussian_noise``).
    """

    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(13, 32)
        self.fc2 = nn.Linear(32, 24)
        self.fc3 = nn.Linear(24, 1)

    def forward(self, x):
        """Return the network output for ``x`` after reshaping it to (-1, 13)."""
        flat = x.view(-1, 13)
        # Debug trace: report where the first layer's weight and bias live.
        print("WEIGHT LOCATION:" + str(self.fc1.weight.location))
        print("BIAS LOCATION:" + str(self.fc1.bias.location))
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def divide_clip_grads(self):
        """Divide every parameter gradient by ``n_batch``, then clip it."""
        for _, param in self.named_parameters():
            param.grad /= n_batch
            gradient_clip(param)

    def add_noise_to_grads(self):
        """Add Gaussian noise, scaled by ``1/LOT_SIZE``, to every parameter gradient."""
        for _, param in self.named_parameters():
            param.grad += 1/LOT_SIZE * gaussian_noise(param.grad)
# A stand-alone network together with a list of its parameters.
model = Net()
model_params = [p for p in model.parameters()]

# One separately initialised network per data owner (bob first, then alice).
bobs_model, alices_model = Net(), Net()



# Hooking into PyTorch

In [5]:
import syft
import syft as sy
from syft.core import utils
import torch
import torch.nn.functional as F
import json
import random
from syft.core.frameworks.torch import utils as torch_utils
from torch.autograd import Variable
# Install the PySyft hook on torch (quietly) and keep a handle on it.
hook = sy.TorchHook(verbose=False)
# The hook's local worker represents this notebook process.
me = hook.local_worker
# Two virtual workers, created on the same hook, play the remote data owners.
bob = sy.VirtualWorker(id="bob",hook=hook, is_client_worker=False)
alice = sy.VirtualWorker(id="alice",hook=hook, is_client_worker=False)
# NOTE(review): presumably makes the local worker keep track of its own
# objects like the virtual workers do — confirm against the PySyft docs.
me.is_client_worker = False

# Workers that hold data and train; the per-worker lists below (remote_dataset,
# models, optimizers) are indexed in this same order.
compute_nodes = [bob, alice]

# Register each worker with the two others.
me.add_workers([bob, alice])
bob.add_workers([me, alice])
alice.add_workers([me, bob])




**Send data to the worker** <br>
Usually the workers would already hold their own data; sending it here is just for the demo.

In [6]:
# One list of (data, target) batches per compute node, in compute_nodes order.
remote_dataset = ([], [])
n_nodes = len(compute_nodes)

# Deal the training batches out to the compute nodes in round-robin fashion.
for batch_idx, (data, target) in enumerate(train_loader):
    node_idx = batch_idx % n_nodes
    owner = compute_nodes[node_idx]
    data = Variable(data)
    target = Variable(target.float())
    # Ship both halves of the batch to the node that owns this slot.
    data.send(owner)
    target.send(owner)
    remote_dataset[node_idx].append((data, target))

## Differential Privacy

In [7]:
"""
    Inspired from Abadi et al., Deep Learning with Differential Privacy, 
    Proceedings of the 2016 ACM SIGSAC Conference on Computer and Communications
    Security, 2016
"""
from differential_privacy.privacy_accountant.pytorch import accountant
import numpy as np

# Number of batches that make up one lot; also the divisor used by
# Net.divide_clip_grads.
n_batch = 3
# Number of training rows (first dimension of X). Despite the name, X holds the
# housing features loaded earlier in this notebook.
NUM_TRAINING_IMAGES = X.size()[0]
# Lot size expressed as n_batch batches of args.batch_size.
LOT_SIZE = n_batch * args.batch_size
N_LOTS = 100
T = N_LOTS # number of samplings

# Clipping bound passed to clip_grad_norm in gradient_clip; it also scales the
# noise standard deviation (bound * sigma) in gaussian_noise.
bound = 10
# Per-step privacy parameters used to derive sigma.
epsilon = 0.5
delta = 10**(-5)
# sigma = sqrt(2 * ln(1.25 / delta)) / epsilon
# NOTE(review): this looks like the usual Gaussian-mechanism calibration — confirm.
sigma = np.sqrt(2 * np.log(1.25/delta))/epsilon 

def sum_batch(grads):
    """Return one total per item along the first dimension of ``grads``.

    Every dimension after the first is flattened, then each row is summed,
    giving a 1-D result with ``len(grads)`` entries.
    """
    flat = grads.view(len(grads), -1)
    return flat.sum(dim=1)

def gradient_clip(param):
    """Clip the gradient of ``param`` so that its norm does not exceed ``bound``.

    Delegates to ``nn.utils.clip_grad_norm`` with the module-level ``bound``
    as the maximum norm; the value returned by that call is discarded.
    """
    to_clip = [param]
    nn.utils.clip_grad_norm(to_clip, bound)

def gaussian_noise(grads):
    """Return a fresh Gaussian noise Variable with the same shape as ``grads``.

    The entries are drawn from a zero-mean normal distribution whose standard
    deviation is ``bound * sigma``. ``grads`` is only inspected for its shape;
    adding the noise is left to the caller.
    """
    noise = Variable(torch.zeros(grads.shape))
    noise.data.normal_(0.0, std=bound * sigma)
    return noise

# Sampling ratio: fraction of the training rows covered by one lot.
q = LOT_SIZE / NUM_TRAINING_IMAGES
# Rough order-of-magnitude estimate of the privacy budget after T samplings,
# computed as q * epsilon * sqrt(T); delta is reported unchanged.
spent_epsilon = q * epsilon * np.sqrt(T)
spent_delta = delta
print('sigma =', sigma)
print('The mechanism is (O(%f), %f)-differentially private' % (spent_epsilon, spent_delta))

sigma = 9.689610525210778
The mechanism is (O(0.297030), 0.000010)-differentially private


In [8]:
# Moments accountant, constructed with the number of training rows. update()
# reports its privacy spent and accumulates spending after every lot.
priv_accountant = accountant.GaussianMomentsAccountant(NUM_TRAINING_IMAGES)

In [9]:
from random import randint

def select_lot(worker_dataset):
    """
    Draw one lot from a worker's dataset.

    LOT_SIZE distinct positions are sampled from ``worker_dataset`` (its final
    entry is never eligible), grouped into rows of ``args.batch_size`` ids, and
    the (data, target) entries of each row are stacked together.

    Returns a list of ``(stacked_data, stacked_target)`` tuples, one per row.
    """
    # Eligible positions: every entry except the last one.
    # NOTE(review): presumably the last entry is skipped because it can be a
    # smaller trailing batch that would not stack — confirm.
    candidate_ids = np.arange(len(worker_dataset) - 1)
    # Sample without replacement, then fold the ids into rows of batch_size.
    sampled_ids = np.random.choice(candidate_ids, size=LOT_SIZE, replace=False)
    id_rows = sampled_ids.reshape(-1, args.batch_size)

    lot = []
    for id_row in id_rows:
        entries = [worker_dataset[entry_id] for entry_id in id_row]
        stacked_data = torch.stack([entry[0] for entry in entries])
        stacked_target = torch.stack([entry[1] for entry in entries])
        lot.append((stacked_data, stacked_target))
    return lot

In [10]:
def update(worker_idx, model, optimizer, lot_idx):
    """Run one lot of training for a single worker and return the model.

    Args:
        worker_idx: position of the worker in ``remote_dataset``.
        model: the worker's network; it is sent to the data's location
            before every batch.
        optimizer: optimizer bound to ``model``'s parameters.
        lot_idx: lot number, used only in the progress message.

    Returns:
        The same ``model`` object that was passed in.

    Raises:
        AssertionError: if, after sending, a parameter's ``id_at_location`` is
            not among the ids stored on the worker.

    Gradients are zeroed once, accumulated over every batch of the lot, and
    applied with a single ``optimizer.step()``; the privacy accountant is then
    charged for the lot.
    """
    # Build the lot by sampling over the dataset
    worker_dataset = remote_dataset[worker_idx]
    lot = select_lot(worker_dataset)
    optimizer.zero_grad()
    
    # Iterate on the lot batch per batch
    for batch_idx, (data,target) in enumerate(lot):
        # update the model
        
        #send the model to the worker
        worker = data.location
        print("DATA LOCATION: " + str(data.location))
        # NOTE(review): train() already sends the model before calling
        # update(), and this send is repeated for every batch. The recorded
        # notebook output ends with the AssertionError raised below, so the
        # repeated send is a likely suspect — confirm.
        model.send(worker)
        #debug
        # Sanity check: every parameter must be registered on the worker;
        # dump diagnostic details before failing.
        for param in model.parameters():
            if not (param.id_at_location in worker._objects.keys()):
                print(param)
                print("Param Id: " + str(param.id_at_location))
                print(worker._objects.keys())
            assert(param.id_at_location in worker._objects.keys())
        pred = model(data)

        loss = F.mse_loss(pred, target.float())
        # Note that because we apply backward() several times without resetting 
        # the grads (optimizer.zero_grad()), we sum the gradients 
        loss.backward() 
        
        # Only the first batch of the lot is reported: fetch the loss and
        # print it together with the accountant's current privacy spending.
        if batch_idx == 0:
            loss.get()
            print('Train Lot: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                lot_idx, batch_idx * args.batch_size, LOT_SIZE,
                100. * batch_idx * args.batch_size / LOT_SIZE, loss.data[0]))
            print(priv_accountant.get_privacy_spent(target_deltas=[spent_delta]))
       

    
    # Single optimisation step using the gradients summed over the whole lot.
    optimizer.step()
        
    # Record this lot's privacy cost (noise scale bound * sigma, LOT_SIZE samples).
    priv_accountant.accumulate_privacy_spending(bound * sigma, LOT_SIZE)


    return model

In [11]:
# Per-owner containers, all ordered like compute_nodes (bob first, then alice).
models = [bobs_model, alices_model]

# One plain SGD optimizer per model, sharing the configured hyper-parameters.
optimizers = [
    optim.SGD(owner_model.parameters(), lr=args.lr, momentum=args.momentum)
    for owner_model in models
]
bobs_optimizer, alices_optimizer = optimizers

# Parameter lists captured once; train() reads and overwrites them by position.
params = [list(owner_model.parameters()) for owner_model in models]

# Training Function 

In [12]:
## Federated Learning training
def train(lot_idx):        
    """Run one federated round: local updates, then parameter averaging.

    Steps, in order:
      1. For every compute node, put its model in training mode, send it to
         the node and run ``update()`` for one lot.
      2. For every parameter position, copy each model's values, convert them
         with ``fix_precision()``, share them between bob and alice, add the
         two shared tensors, fetch and decode the sum, and halve it.
      3. Zero the data of every tracked parameter.
      4. Fetch each model, divide/clip its gradients, add noise, and send it
         back to its node.
      5. Overwrite every parameter of every model with the averaged values.

    Args:
        lot_idx: lot number, forwarded to ``update()`` and printed.

    The averaging is hard-wired to exactly two workers (``spdz_params[0]`` and
    ``spdz_params[1]``, division by 2, sharing between ``bob`` and ``alice``).
    """
    # update remote models
    for remote_index in range(len(compute_nodes)):
        models[remote_index].train()
        models[remote_index].send(compute_nodes[remote_index])
        print("LOT number: " + str (lot_idx))
        print("Remote_index: "+ str(remote_index))
        models[remote_index] = update(remote_index, models[remote_index], optimizers[remote_index], lot_idx)

    new_params = list()

    for param_i in range(len(params[0])):

        spdz_params = list()
        for remote_index in range(len(compute_nodes)):
            # ``.data+0`` builds a new tensor, so the parameter itself is not
            # the object being converted and shared.
            spdz_params.append((params[remote_index][param_i].data+0).fix_precision().share(bob, alice).get())

        # Sum the two shared tensors, fetch and decode the result, then halve
        # it to obtain the average of the two models' parameters.
        new_param = (spdz_params[0] + spdz_params[1]).get().decode()/2
        new_params.append(new_param)

    # Reset every tracked parameter to zero before the averaged values are written.
    for model in params:
        for param in model:
            param.data *= 0

    # NOTE(review): clipping and noise are applied to the gradients only after
    # update() has already called optimizer.step(), and update() zeroes the
    # gradients at the start of the next lot, so nothing visible in this
    # notebook consumes the noised gradients — confirm the intended ordering.
    for remote_index, model in enumerate(models):
        model.get()
        model.divide_clip_grads()
        model.add_noise_to_grads()
        model.send(compute_nodes[remote_index])

    # Install the averaged parameters in every model.
    for remote_index in range(len(compute_nodes)):
        for param_index in range(len(params[remote_index])):
            params[remote_index][param_index].data.set_(new_params[param_index])



# Testing Function

In [13]:
def test():
    """Evaluate ``models[0]`` on the test set and print its average loss.

    The squared errors of every batch are summed (``size_average=False``) and
    the total is divided by the number of test samples, so the printed value
    is the mean squared error per sample. Returns nothing.
    """
    models[0].eval()
    test_loss = 0
    for data, target in test_loader:
        # volatile=True marks the input as inference-only (torch 0.3 API).
        data, target = Variable(data, volatile=True), Variable(target)
        output = models[0](data)
        test_loss += F.mse_loss(output, target.float(), size_average=False).data[0] # sum up batch loss

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}\n'.format(test_loss))


# Training the Model

In [14]:
%%time

# Train for args.epochs epochs; every epoch runs N_LOTS lots and then evaluates.
for epoch in range(1, args.epochs + 1):
    print(epoch)
    # Lots are numbered 1..N_LOTS inclusive, so that exactly N_LOTS lots are
    # processed per epoch, matching T = N_LOTS in the privacy estimate and the
    # inclusive epoch range above (range(1, N_LOTS) stopped one lot short).
    # NOTE(review): across all epochs the number of samplings is
    # args.epochs * N_LOTS, while the printed estimate uses T = N_LOTS — confirm.
    for lot_idx in range(1, N_LOTS + 1):
        train(lot_idx)
    test()

1
LOT number: 1
Remote_index: 0
DATA LOCATION: <syft.core.workers.virtual.VirtualWorker id:bob>
Parameter containing:FloatTensor[_PointerTensor - id:9514100049 owner:0 loc:bob id@loc:99345430526]
Param Id: 4215937605
dict_keys([38055227604, 2502488806, 31299956160, 95867909540, 70132257572, 43151358781, 22970661714, 4600909910, 8977475118, 14558501983, 98953060477, 96585027247, 7803534624, 53531350996, 94744651793, 50743002523, 19140874110, 99213740109, 77187647414, 57940585027, 46460525656, 69983098331, 41162072773, 53064050138, 42531165268, 89932534062, 10597746684, 52352559364, 88947841680, 60854081447, 15311174941, 93442574190, 24525684463, 4063087290, 56561787164, 33778733302, 25515241214, 85320580068, 94133156376, 25312430229, 88642945681, 89832632899, 53735569920, 78690606291, 92728683149, 1262397789, 62468171334, 43375830962, 72560534195, 21904711029, 72241638295, 43972176126, 71461438361, 20769329852, 78498432502, 28800700527, 89234844981, 30856094039, 42211554756, 67378393685

AssertionError: 