In [1]:
import torch
import torchvision
import numpy as np
from torch import nn, optim
from torchvision import datasets, transforms

In [2]:
# use cuda if available
DEVICE = torch.device("cuda" if torch.cuda.is_available()
                      else "cpu")
print(f"Using {DEVICE} backend")

# Seed the RNGs this notebook draws from so that Restart & Run All gives the
# same partitions, initial weights and Laplace noise on every run:
# torch drives random_split / weight init / dropout / shuffling,
# numpy drives the Laplace noise used in the DP aggregation.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# number of teacher models.
# our student model accuracy will depend on this parameter
num_teachers = 100 #@param {type:"integer"}

Using cpu backend


  return torch._C._cuda_getDeviceCount() > 0


The pipeline for this problem is:

- Train N teacher classifier models on N (here, 100) disjoint private datasets.
- Use the teacher models to label your unlabelled dataset.
- You now have N labels for each of your images.
- Use the argmax query in a differentially private manner to find the label chosen by the majority of the teacher models.

For a real-world implementation of the above pipeline, we use the MNIST data.

- The training data will be divided into N parts to train the teacher models.
- The test data will be used as the student's (unlabelled) data.

In [3]:
# preprocessing for the teacher data: turn each image into a tensor, then
# normalize it with mean 0.5 and std 0.5
train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([.5], [.5]),
])
# MNIST training split, stored under ./mnsit (downloaded if not already there)
mnsit_dataset = datasets.MNIST(
    './mnsit',
    train=True,
    transform=train_transform,
    download=True,
)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./mnsit\MNIST\raw\train-images-idx3-ubyte.gz


100.1%

Extracting ./mnsit\MNIST\raw\train-images-idx3-ubyte.gz to ./mnsit\MNIST\raw
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./mnsit\MNIST\raw\train-labels-idx1-ubyte.gz


113.5%

Extracting ./mnsit\MNIST\raw\train-labels-idx1-ubyte.gz to ./mnsit\MNIST\raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./mnsit\MNIST\raw\t10k-images-idx3-ubyte.gz


100.4%

Extracting ./mnsit\MNIST\raw\t10k-images-idx3-ubyte.gz to ./mnsit\MNIST\raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./mnsit\MNIST\raw\t10k-labels-idx1-ubyte.gz


180.4%

Extracting ./mnsit\MNIST\raw\t10k-labels-idx1-ubyte.gz to ./mnsit\MNIST\raw
Processing...


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


Done!


In [4]:
# divide mnist train data into num_teachers disjoint partitions
total_size = len(mnsit_dataset)
# length of each teacher dataset.
# random_split requires the lengths to sum to exactly total_size, so when
# total_size is not a multiple of num_teachers the leftover samples are handed
# out one each to the first partitions instead of being dropped from the sum.
base_size, remainder = divmod(total_size, num_teachers)
lengths = [base_size + 1 if part < remainder else base_size
           for part in range(num_teachers)]
# list of all teacher datasets
teacher_datasets = torch.utils.data.random_split(mnsit_dataset, lengths)

In [5]:
# One basic architecture is shared by every teacher and by the student.
# Nothing requires that: teachers and student could each use a different model.
class Network(nn.Module):
  """Fully connected MNIST classifier producing per-class log-probabilities.

  Two hidden blocks (Linear -> BatchNorm1d -> ReLU -> Dropout) of width 256
  and 64 are followed by a 10-way Linear layer and LogSoftmax, so the output
  is meant to be paired with nn.NLLLoss.
  """

  def __init__(self):
    super().__init__()
    stages = [
        # hidden block 1: (batch_size, 784) -> (batch_size, 256)
        nn.Linear(28*28, 256),
        nn.BatchNorm1d(256),
        nn.ReLU(),
        nn.Dropout(0.5),
        # hidden block 2: (batch_size, 256) -> (batch_size, 64)
        nn.Linear(256, 64),
        nn.BatchNorm1d(64),
        nn.ReLU(),
        nn.Dropout(0.5),
        # classifier head: (batch_size, 64) -> (batch_size, 10)
        nn.Linear(64, 10),
        # log-softmax rather than softmax to stay clear of exponential overflow
        nn.LogSoftmax(dim=1),
    ]
    # keep the attribute name `layer`: checkpoint keys are derived from it
    self.layer = nn.Sequential(*stages)

  def forward(self, x):
    # collapse everything after the batch dimension,
    # e.g. (batch_size, 1, 28, 28) -> (batch_size, 784)
    flat = x.view(x.size(0), -1)
    # (batch_size, 784) -> (batch_size, 10) log-probabilities
    return self.layer(flat)

In [6]:
def train_model(dataset, checkpoint_file, num_epochs=10, do_validation=False):
  """ 
  Train a freshly initialised Network on `dataset` and checkpoint its weights.

  The model state_dict is written to `checkpoint_file` at the end of every
  epoch, overwriting the previous one, so once training finishes the file
  holds the last-epoch weights.
  
  Parameters: 
    dataset (torch.dataset): training data yielding (images, labels) pairs
    checkpoint_file (str): filename for saving model
    num_epochs (int): number of training epoch
    do_validation (bool): hold out 10% of `dataset` and report loss and
      accuracy on it after every epoch
          
  Returns:
    Network: the trained model, in training mode and on DEVICE. Earlier
      versions returned None; callers that ignore the result are unaffected.
  
  """
  # if validation divide dataset to train and test set 90:10 ratio
  if do_validation:
    dataset_size = len(dataset)
    # the hold-out size is derived by subtraction: int(0.9*n) + int(0.1*n) can
    # be smaller than n (e.g. n=15 gives 13 + 1), which makes random_split raise
    train_size = int(0.9*dataset_size)
    test_size = dataset_size - train_size
    train_set, test_set = torch.utils.data.random_split(dataset, [train_size, test_size])
    # create train and test dataloader
    trainloader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)
    # evaluation metrics do not depend on sample order, so no shuffling here
    testloader = torch.utils.data.DataLoader(test_set, batch_size=32, shuffle=False)
  else:
    # create train dataloader using full dataset
    trainloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

  # create model and send to gpu
  model = Network().to(DEVICE)
  # we have used logsoftmax, so now NLLLoss
  criterion = nn.NLLLoss()
  # adam optimizer for training
  optimizer = optim.Adam(model.parameters(), lr=0.005)

  # train for num_epochs
  for epoch in range(num_epochs):
    # running totals for logging: summed batch losses and correct predictions
    train_accuracy = 0
    train_loss = 0
    for images, labels in trainloader:
      # zero accumulated grads
      optimizer.zero_grad()
      # send images, labels to gpu
      images, labels = images.to(DEVICE), labels.to(DEVICE)
      # call the module itself (not .forward) so registered hooks also run
      output = model(images)
      # calculate loss
      loss = criterion(output, labels)
      train_loss += loss.item()
      # count correct predictions in this batch
      train_accuracy += (output.argmax(dim=1) == labels).sum().item()
      # do backward propagation
      loss.backward()
      optimizer.step()

    if do_validation:
      # evaluation mode: dropout off, batch norm uses running statistics
      model.eval()
      test_accuracy = 0
      test_loss = 0
      # do forward pass and calculate loss and accuracy
      with torch.no_grad():
        for images, labels in testloader:
          images, labels = images.to(DEVICE), labels.to(DEVICE)
          output = model(images)
          loss = criterion(output, labels)
          test_loss += loss.item()
          test_accuracy += (output.argmax(dim=1) == labels).sum().item()
      # log train and test metrics
      print("Epoch: {}".format(epoch+1),
            "Train Loss: {:.3f}".format(train_loss/len(trainloader)),
            "Train Accuracy: {:.3f}".format(train_accuracy/len(train_set)),
            "Test Loss: {:.3f}".format(test_loss/len(testloader)),
            "Test Accuracy: {:.3f}".format(test_accuracy/len(test_set))
           )
      # back to training mode for the next epoch
      model.train()
    else:
      # log only training metrics if no validation
      print("Epoch: {}".format(epoch+1),
            "Train Loss: {:.3f}".format(train_loss/len(trainloader)),
            "Train Accuracy: {:.3f}".format(train_accuracy/len(dataset))
           )
    # save the model after every epoch; the file always holds the latest weights
    torch.save(model.state_dict(), checkpoint_file)

  return model

In [None]:
# fit one teacher per MNIST partition; each teacher writes its own checkpoint
for teacher in range(num_teachers):
  teacher_id = teacher + 1
  print(f"############################### Teacher {teacher_id} Model Training #############################")
  train_model(teacher_datasets[teacher], "checkpoint_teacher_{}.pth".format(teacher_id))

### Student Dataset Training
Having trained the teacher models above, we now use them to predict labels for the student dataset (the MNIST test set).

In [None]:
# the student data gets the same preprocessing as the teacher data:
# to tensor, then normalize with mean 0.5 and std 0.5
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([.5], [.5]),
])
# the MNIST test split plays the role of the private student dataset
private_dataset = datasets.MNIST(
    './mnsit',
    train=False,
    transform=test_transform,
    download=True,
)

# number of student examples (10000 for the MNIST test split)
private_data_size = len(private_dataset)

# unshuffled batches over the student data, so predictions stay aligned with
# dataset order
private_dataloader = torch.utils.data.DataLoader(private_dataset, batch_size=32)


In [None]:
def predict_model(model_checkpoint, dataloader):
  """ 
  Load a trained Network checkpoint and predict a class for every sample.
  
  Parameters: 
    model_checkpoint (str): filename for trained model checkpoint (a
      state_dict saved with torch.save)
    dataloader (DataLoader): yields (images, labels) batches; the labels
      are ignored
          
  Returns: 
    preds_list (torch.Tensor): 1-D CPU tensor with the predicted class index
      of every sample, in dataloader order. An empty int64 tensor is returned
      when the dataloader yields no batches.
  
  """
  # create model 
  model = Network()
  # load model from checkpoint; map_location lets a checkpoint saved on a GPU
  # machine be loaded when only the CPU is available.
  # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
  state_dict = torch.load(model_checkpoint, map_location=DEVICE)
  model.load_state_dict(state_dict)
  # send model to gpu
  model = model.to(DEVICE)
  # list for batch predictions
  preds_list = []
  # eval mode: dropout off, batch norm uses running statistics
  model.eval()
  # no gradients calculation needed
  with torch.no_grad():
    # iterate over dataset
    for images, _ in dataloader:
      images = images.to(DEVICE)
      # log-probabilities, shape (batch_size, 10)
      preds = model(images)
      # most likely class per sample, shape (batch_size,)
      preds_list.append(preds.argmax(dim=1))
  # torch.cat cannot concatenate an empty list, so handle that case explicitly
  if not preds_list:
    return torch.empty(0, dtype=torch.long)
  # concat all batch predictions and move them to the CPU
  return torch.cat(preds_list).cpu()

In [None]:
# run every teacher checkpoint over the student data and stack the per-teacher
# prediction vectors along a new first dimension (one row per teacher)
teacher_preds = torch.stack([
    predict_model(f'checkpoint_teacher_{teacher_id}.pth', private_dataloader)
    for teacher_id in range(1, num_teachers + 1)
])
print(teacher_preds.shape)

## Aggregating Teacher Predictions

We have N predictions for each datapoint in our private dataset. We can aggregate the N predictions using a max query on the bin counts of the different labels.

Can we train a model on those aggregated labels directly? Yes, we can, but to increase differential privacy and stay within a privacy budget, we will convert our aggregate query into a DP query. In the DP query, we add some amount of Laplacian noise.

In [None]:
# number of distinct labels
num_classes = 10
# privacy budget (epsilon) spent by one aggregate dp query
epsilon = 0.1 #@param {type:"number"}

We have assumed that the student data is unlabelled. For analysis purposes only, we will use the real labels.

In [None]:
# Real labels of the student data. They would not be available for a private
# dataset in a real scenario; here they are only used to measure accuracy.
# NOTE: a later cell rebinds private_dataset.targets to the noisy labels, so
# this cell must run before that one to capture the real labels.
real_targets = private_dataset.targets

### Teacher Argmax Aggregation

Aggregate N teacher predictions using max query on bin counts for different labels

In [None]:
# plain (non-private) aggregation: for every student sample, count the teacher
# votes per label and keep the label with the highest count
teachers_argmax = torch.tensor([
    torch.bincount(teacher_preds[:, sample_idx], minlength=num_classes).argmax().item()
    for sample_idx in range(private_data_size)
])
# number of samples where the majority vote matches the real label
argmax_correct = (real_targets == teachers_argmax).sum()
print("Teachers argmax labels accuracy", argmax_correct.item()/private_data_size)

### Teacher Noisy Aggregation ( DP query)

We use Laplacian noise whose scale beta is equal to **(sensitivity / epsilon)**.

The sensitivity of the argmax query is one.

In [None]:
# dp query results
noisy_labels = list()
# scale of the Laplace noise: sensitivity (1) / epsilon.
# It is the same for every query, so compute it once outside the loop.
beta = 1 / epsilon
for image_i in range(private_data_size):
  # calculate bin count (integer votes per label)
  label_counts = torch.bincount(teacher_preds[:, image_i], minlength=num_classes)
  # one independent Laplace draw per label
  noise = torch.from_numpy(np.random.laplace(0, beta, len(label_counts)))
  # bincount returns an integer tensor; the noise must be added to a float copy.
  # Adding real-valued noise in place to the integer counts is rejected by
  # PyTorch's type promotion rules (or, on old versions, truncates the noise).
  noisy_counts = label_counts.double() + noise
  # calculate dp label
  noisy_labels.append(torch.argmax(noisy_counts).item())

noisy_labels = torch.tensor(noisy_labels)
# accuracy for noisy or dp query results
noisy_accuracy = torch.sum(real_targets == noisy_labels)

print("Noisy label accuracy", noisy_accuracy.item()/private_data_size)

## PATE Analysis

**How much of the epsilon budget have we used?** We can perform a PATE analysis to find out.

In [None]:
from syft.frameworks.torch.differential_privacy import pate

In [None]:
# Running the PATE analysis on all predictions uses a lot of memory, so only a
# subset of the predictions (a subset of the mnist test dataset) is analysed.
# Varying the subset size also shows how much the private data size matters.
num_student_train = 2000 #@param {type:"integer"}
# teacher predictions are sliced along the sample dimension (second axis)
teacher_preds1 = teacher_preds[:, :num_student_train].to(DEVICE)
# the three label vectors are truncated to the same leading samples
noisy_labels1, teachers_argmax1, real_targets1 = (
    label_vector[:num_student_train].to(DEVICE)
    for label_vector in (noisy_labels, teachers_argmax, real_targets)
)

### Noisy Labels PATE Analysis

In [None]:
# PATE analysis of the noisy (dp) labels; the two results are reported as the
# data dependent and the data independent epsilon
data_dep_eps, data_ind_eps = pate.perform_analysis_torch(
    preds=teacher_preds1,
    indices=noisy_labels1,
    noise_eps=epsilon,
    delta=1e-5,
    moments=10,
)
print(f"Data dependant epsilon {data_dep_eps.item()} data independent epsilon {data_ind_eps.item()}")

### Teacher Argmax PATE Analysis

In [None]:
# PATE analysis of the teacher argmax labels; the two results are reported as
# the data dependent and the data independent epsilon
data_dep_eps, data_ind_eps = pate.perform_analysis_torch(
    preds=teacher_preds1,
    indices=teachers_argmax1,
    noise_eps=epsilon,
    delta=1e-5,
    moments=10,
)
print(f"Data dependant epsilon {data_dep_eps.item()} data independent epsilon {data_ind_eps.item()}")

### Real Labels PATE Analysis

In [None]:
# PATE analysis of the real labels; the two results are reported as the data
# dependent and the data independent epsilon
data_dep_eps, data_ind_eps = pate.perform_analysis_torch(
    preds=teacher_preds1,
    indices=real_targets1,
    noise_eps=epsilon,
    delta=1e-5,
    moments=10,
)
print(f"Data dependant epsilon {data_dep_eps.item()} data independent epsilon {data_ind_eps.item()}")

## Student Model Training

Differential privacy guarantees that no amount of post-processing can increase the epsilon value for a given dataset, which means that after training deep learning models on the noisy labels the epsilon value will still be less than or equal to the values from the PATE analysis.

In [None]:
# keep a reference to the real labels before they are swapped out
private_real_labels = private_dataset.targets
# from here on the private dataset serves the noisy (dp) labels as its targets
private_dataset.targets = noisy_labels

# the first num_student_train samples train the student,
# the remaining samples are held out for testing
train_private_set = torch.utils.data.Subset(
    private_dataset, range(num_student_train))
test_private_set = torch.utils.data.Subset(
    private_dataset, range(num_student_train, len(private_dataset)))

In [None]:
# fit the student on the noisy-labelled training subset for 20 epochs;
# its weights are written to checkpoint_student.pth
student_model = train_model(
    train_private_set,
    'checkpoint_student.pth',
    num_epochs=20,
)

In [None]:
# batches over the held-out part of the student data
private_testloader = torch.utils.data.DataLoader(test_private_set, batch_size=32)
# student predictions for the held-out samples
test_preds = predict_model('checkpoint_student.pth', private_testloader)
# count the predictions that agree with the real labels of those samples
held_out_labels = private_real_labels[num_student_train:]
correct = (held_out_labels == test_preds).sum()
# accuracy
print(f"student model test accuracy {correct.item()/(len(private_dataset)-num_student_train)}")