In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
os.chdir('/content/drive/MyDrive/Differential Privacy/Shoulder/')
!ls

 main.py	      'Screenshot 2021-10-29 at 4.43.30 PM.png'   Test	  Val
 PATE_SHOULDER.ipynb   split.py					  Train


In [3]:
!pip install syft==0.2.9



In [4]:
import syft
import numpy as np
import pandas as pd
import torch
from torchvision import datasets, transforms, models
from torch.utils.data import Dataset, Subset, DataLoader
from torch import nn , optim
import torch.nn.functional as F
from PIL import Image
import time, os , random
from syft.frameworks.torch.dp import pate
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
dirs = {
    'train':'./Train',
    'val':'./Val',
    'test':'./Test'
}
batchsize=16

data_transforms = transforms.Compose([
    transforms.Resize(224),
    transforms.RandomResizedCrop((224),scale=(0.5,1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

trainset = datasets.ImageFolder(root=dirs['train'],transform=data_transforms) #Teacher Train
valset  = datasets.ImageFolder(root=dirs['val'],transform=data_transforms) #Student Test
testset = datasets.ImageFolder(root=dirs['test'],transform=data_transforms) #Student Train
 

In [6]:
trainset.classes

['Cofield', 'Depuy', 'Tornier', 'Zimmer']

In [7]:
trainset.class_to_idx

{'Cofield': 0, 'Depuy': 1, 'Tornier': 2, 'Zimmer': 3}

In [8]:
print(f'Number of Images in Train Set: {len(trainset)}')
print(f'Number of Images in Validation Set: {len(valset)}')
print(f'Number of Images in Test Set: {len(testset)}')

Number of Images in Train Set: 358
Number of Images in Validation Set: 120
Number of Images in Test Set: 119


# Partitioning Dataset between 2 Teachers

In [9]:
num_teachers = 2
valid_per =0.1
batch_size =32

def teacher_dataloaders(trainset=trainset, num_teachers=num_teachers, batch_size=batch_size,valid_per =0.2):

  train_dl = []
  valid_dl = []
  teacher_data_len = len(trainset) // num_teachers

  #list of shuffled indices
  my_list = random.sample(range(1,len(trainset)),len(trainset)-1)
  random.shuffle(my_list)
  print("Shuffled indices are -:",my_list[:10])

  for i in range(num_teachers):
    #get any particular subset of data
    indice = my_list[i*teacher_data_len:(i+1)*teacher_data_len]
    print(f"Chosen indices for teacher {i} are",indice[:10])
    data_subset = Subset(trainset,indice)

    #split into train and validation set
    valid_size = int(len(data_subset)*valid_per)
    train_size = len(data_subset) - valid_size
    train_subset, valid_subset = torch.utils.data.random_split(data_subset, [train_size,valid_size])
  
    #create data loaders
    trainloader = DataLoader(train_subset, batch_size=batch_size, shuffle=True,num_workers=1)
    validloader = DataLoader(valid_subset, batch_size=batch_size, shuffle= False, num_workers=1)

    train_dl.append(trainloader)
    valid_dl.append(validloader)
  
  return train_dl,valid_dl

trainloaders , validloaders = teacher_dataloaders()
print(len(trainloaders), len(validloaders))

Shuffled indices are -: [348, 85, 32, 336, 36, 294, 143, 353, 277, 331]
Chosen indices for teacher 0 are [348, 85, 32, 336, 36, 294, 143, 353, 277, 331]
Chosen indices for teacher 1 are [233, 23, 113, 40, 148, 69, 14, 288, 126, 146]
2 2


In [10]:
#student dataset
valid_size = int(len(testset)*0.2)
train_size = len(testset) - valid_size
student_train_subset , student_valid_subset = torch.utils.data.random_split(testset,[train_size,valid_size])

#create data loaders
student_train_loader = DataLoader(student_train_subset , batch_size=batch_size, shuffle = False, num_workers=1)
student_valid_loader = DataLoader(student_valid_subset, batch_size = batch_size, shuffle = False, num_workers=1)

print(len(student_train_loader.dataset), len(student_valid_loader.dataset))

96 23


#Train Teachers


In [11]:
class SimpleCNN(torch.nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__() # b, 3, 32, 32
        layer1 = torch.nn.Sequential()
        layer1.add_module('conv1', torch.nn.Conv2d(3, 32, 3, 1, padding=1))

        #b, 32, 32, 32
        layer1.add_module('relu1', torch.nn.ReLU(True))
        layer1.add_module('pool1', torch.nn.MaxPool2d(2, 2))
        self.layer1 = layer1
        layer4 = torch.nn.Sequential()
        layer4.add_module('fc1', torch.nn.Linear(401408, 4))       
        self.layer4 = layer4

    def forward(self, x):
        conv1 = self.layer1(x)
        fc_input = conv1.view(conv1.size(0), -1)
        fc_out = self.layer4(fc_input)

        return fc_out

In [12]:
def train(n_epochs, trainloader, validloader, model, optimizer, criterion, use_cuda, save_path= None, is_not_teacher=False):
    """returns trained model"""
    # # initialize tracker for minimum validation loss
    valid_loss_min = np.Inf

    for epoch in range(1, n_epochs+1):
        # initialize variables to monitor training and validation loss
        train_loss = 0.0
        valid_loss = 0.0
        train_correct = 0.0
        train_total = 0.0
        valid_correct =0.0
        valid_total = 0.0
        # train the model #
        model.train()
        for batch_idx, (data, target) in enumerate(trainloader):
            # move to GPU
            if use_cuda:
                data, target = data.to(device), target.to(device)
            # initialize weights to zero
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()    
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.data - train_loss))

            # convert output probabilities to predicted class
            pred = output.data.max(1, keepdim=True)[1]
            # compare predictions to true label
            train_correct += np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy())
            train_total += data.size(0)
            train_acc = 100. * train_correct / train_total

        # validate the model
        model.eval()
        for batch_idx, (data, target) in enumerate(validloader):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            loss = criterion(output, target)
            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.data - valid_loss))

            pred = output.data.max(1, keepdim=True)[1]
            # compare predictions to true label
            valid_correct += np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy())
            valid_total += data.size(0)
            valid_acc = 100. * valid_correct / valid_total

        # print training/validation statistics
        print('Epoch: {} \n\tTrain Loss: {:.6f} \tTrain Acc: {:.6f} \n\tValid Loss: {:.6f} \tValid Acc: {:.6f}'.format(
            epoch,train_loss,train_acc,valid_loss,valid_acc ))

        ## save the student model if validation loss has decreased
        if is_not_teacher:
          if valid_loss < valid_loss_min:
              torch.save(model.state_dict(), save_path)
              print('\tValidation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
              valid_loss_min,
              valid_loss))
              valid_loss_min = valid_loss

    return model


In [13]:
model = SimpleCNN()
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters() , lr=0.001)
epochs = 50

In [14]:
#training teachers 
teacher_models = []
i =1
for trainloader, validloader in zip(trainloaders,validloaders):
  print(" Training Teacher {}".format(i))
  teacher_model = train(epochs, trainloader, validloader, model, optimizer, criterion, True)
  teacher_models.append(teacher_model)
  i+=1
  print("="*40)


 Training Teacher 1
Epoch: 1 
	Train Loss: 70.095665 	Train Acc: 33.333333 
	Valid Loss: 32.463234 	Valid Acc: 34.285714
Epoch: 2 
	Train Loss: 48.465950 	Train Acc: 29.166667 
	Valid Loss: 13.294279 	Valid Acc: 40.000000
Epoch: 3 
	Train Loss: 24.109482 	Train Acc: 26.388889 
	Valid Loss: 13.519389 	Valid Acc: 34.285714
Epoch: 4 
	Train Loss: 22.017601 	Train Acc: 40.972222 
	Valid Loss: 15.778822 	Valid Acc: 37.142857
Epoch: 5 
	Train Loss: 12.844816 	Train Acc: 41.666667 
	Valid Loss: 7.571434 	Valid Acc: 37.142857
Epoch: 6 
	Train Loss: 10.005713 	Train Acc: 45.833333 
	Valid Loss: 6.733661 	Valid Acc: 22.857143
Epoch: 7 
	Train Loss: 7.706568 	Train Acc: 32.638889 
	Valid Loss: 2.138982 	Valid Acc: 54.285714
Epoch: 8 
	Train Loss: 5.214390 	Train Acc: 42.361111 
	Valid Loss: 3.003340 	Valid Acc: 37.142857
Epoch: 9 
	Train Loss: 4.012853 	Train Acc: 38.194444 
	Valid Loss: 2.438698 	Valid Acc: 48.571429
Epoch: 10 
	Train Loss: 4.968351 	Train Acc: 34.722222 
	Valid Loss: 2.082583 	

In [15]:
#get private labels
def student_train_labels(teacher_models , dataloader):
  student_labels = []

  #get label from each teacher
  for model in teacher_models:
    student_label =[]
    for images , _ in dataloader:
      with torch.no_grad():
        images = images.to(device)
        outputs = model(images)
        preds = torch.argmax(torch.exp(outputs), dim =1)
      student_label.append(preds.tolist())

    #add all teacher predictions to student labels
    print("Student label",student_label)
    student_label = sum(student_label,[])
    student_labels.append(student_label)
  return student_labels

predicted_labels = student_train_labels(teacher_models, student_train_loader)
predicted_labels = np.array([np.array(p) for p in predicted_labels]).transpose(1,0)

Student label [[1, 1, 1, 1, 1, 3, 3, 2, 1, 1, 1, 1, 0, 1, 1, 2, 0, 1, 3, 1, 1, 1, 1, 1, 1, 0, 0, 2, 1, 0, 3, 1], [3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 3, 0, 1, 2, 1, 2, 1, 0, 3, 1, 1, 1, 1, 3, 1, 1, 3, 3, 1, 3], [1, 3, 1, 1, 0, 1, 0, 1, 1, 1, 1, 3, 1, 1, 1, 1, 3, 1, 3, 1, 1, 2, 1, 0, 1, 2, 1, 1, 2, 1, 2, 1]]
Student label [[1, 0, 1, 3, 0, 3, 2, 0, 1, 2, 2, 1, 0, 1, 1, 2, 1, 1, 3, 1, 1, 1, 1, 2, 1, 3, 3, 2, 1, 0, 1, 2], [3, 1, 0, 1, 2, 2, 1, 1, 1, 3, 1, 1, 2, 1, 3, 3, 1, 1, 3, 1, 3, 1, 3, 1, 3, 3, 1, 1, 0, 2, 3, 1], [1, 3, 1, 1, 3, 1, 0, 3, 0, 3, 1, 0, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 0, 3, 1, 3, 2, 1, 1, 0, 1]]


In [16]:
print("Predicted Labels", predicted_labels.shape)

Predicted Labels (96, 2)


In [17]:
print(predicted_labels)

[[1 1]
 [1 0]
 [1 1]
 [1 3]
 [1 0]
 [3 3]
 [3 2]
 [2 0]
 [1 1]
 [1 2]
 [1 2]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [2 2]
 [0 1]
 [1 1]
 [3 3]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 2]
 [1 1]
 [0 3]
 [0 3]
 [2 2]
 [1 1]
 [0 0]
 [3 1]
 [1 2]
 [3 3]
 [1 1]
 [3 0]
 [1 1]
 [1 2]
 [1 2]
 [1 1]
 [1 1]
 [1 1]
 [1 3]
 [2 1]
 [1 1]
 [3 2]
 [0 1]
 [1 3]
 [2 3]
 [1 1]
 [2 1]
 [1 3]
 [0 1]
 [3 3]
 [1 1]
 [1 3]
 [1 1]
 [1 3]
 [3 3]
 [1 1]
 [1 1]
 [3 0]
 [3 2]
 [1 3]
 [3 1]
 [1 1]
 [3 3]
 [1 1]
 [1 1]
 [0 3]
 [1 1]
 [0 0]
 [1 3]
 [1 0]
 [1 3]
 [1 1]
 [3 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [3 1]
 [1 1]
 [3 3]
 [1 2]
 [1 1]
 [2 1]
 [1 1]
 [0 0]
 [1 3]
 [2 1]
 [1 3]
 [1 2]
 [2 1]
 [1 1]
 [2 0]
 [1 1]]


In [18]:
def add_noise(predicted_labels, epsilon = 0.1):
  noisy_labels = []
  for preds in predicted_labels:

    # get labels with max votes
    label_counts = np.bincount(preds, minlength=2)

    # add laplacian noise to label
    epsilon = epsilon
    beta = 1/epsilon
    for i in range(len(label_counts)):
      label_counts[i] += np.random.laplace(0, beta, 1)

    # after adding noise we get labels with max counts
    new_label = np.argmax(label_counts)
    noisy_labels.append(new_label)

  #return noisy_labels
  return np.array(noisy_labels)

labels_with_noise = add_noise(predicted_labels, epsilon=0.1)  
print(labels_with_noise)
print(labels_with_noise.shape)

[1 1 0 1 1 3 2 2 1 1 0 0 0 0 0 0 1 1 3 1 0 0 1 1 1 1 1 0 1 1 2 2 0 0 1 0 1
 2 0 0 0 1 1 0 1 1 0 3 1 2 0 0 3 1 3 1 3 0 0 0 3 3 2 3 1 2 0 0 3 1 0 1 1 2
 0 0 1 0 1 0 3 1 0 0 1 1 1 1 0 0 2 2 0 1 2 1]
(96,)


In [19]:
#write to csv file
import csv 
def write_csv(data):
  with open('labels.csv','a') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(data)
write_csv(labels_with_noise)

In [20]:
data_dep_eps, data_ind_eps = pate.perform_analysis(teacher_preds=predicted_labels.T, indices=labels_with_noise, noise_eps=0., delta=1e-5)
print('Data dependent epsilon:', data_dep_eps)
print('Data independent epsilon:', data_ind_eps)

Data dependent epsilon: 11.516462732485106
Data independent epsilon: 11.516462732485117


In [21]:
#Train Student
# create a new training dataloader for the student with the newly created 
# labels with noise. We have to replace the old labels with the new labels
def new_student_data_loader(dataloader,noisy_labels,batch_size=32):
  image_list = []
  for image , _ in dataloader:
    image_list.append(image)

  data = np.vstack(image_list)
  new_dataset = list(zip(data,noisy_labels))
  new_dataloader = DataLoader(new_dataset, batch_size = batch_size, shuffle = False)

  return new_dataloader
labeled_student_trainloader = new_student_data_loader(student_train_loader,labels_with_noise)
print(len(labeled_student_trainloader.dataset),len(student_valid_loader.dataset))

96 23


In [22]:
images , _ = next(iter(labeled_student_trainloader))
print(images.shape)

torch.Size([32, 3, 224, 224])


In [23]:
student_model = train(epochs, labeled_student_trainloader, student_valid_loader, model, optimizer, criterion, True, save_path='./student.pth.tar', is_not_teacher=True)

Epoch: 1 
	Train Loss: 1.809261 	Train Acc: 32.291667 
	Valid Loss: 2.118299 	Valid Acc: 26.086957
	Validation loss decreased (inf --> 2.118299).  Saving model ...
Epoch: 2 
	Train Loss: 1.551353 	Train Acc: 38.541667 
	Valid Loss: 1.862094 	Valid Acc: 34.782609
	Validation loss decreased (2.118299 --> 1.862094).  Saving model ...
Epoch: 3 
	Train Loss: 1.141400 	Train Acc: 46.875000 
	Valid Loss: 1.813405 	Valid Acc: 47.826087
	Validation loss decreased (1.862094 --> 1.813405).  Saving model ...
Epoch: 4 
	Train Loss: 0.748471 	Train Acc: 70.833333 
	Valid Loss: 1.906013 	Valid Acc: 30.434783
Epoch: 5 
	Train Loss: 0.526091 	Train Acc: 84.375000 
	Valid Loss: 1.544770 	Valid Acc: 30.434783
	Validation loss decreased (1.813405 --> 1.544770).  Saving model ...
Epoch: 6 
	Train Loss: 0.347925 	Train Acc: 91.666667 
	Valid Loss: 2.109434 	Valid Acc: 34.782609
Epoch: 7 
	Train Loss: 0.278493 	Train Acc: 92.708333 
	Valid Loss: 1.859922 	Valid Acc: 39.130435
Epoch: 8 
	Train Loss: 0.209462 

In [24]:
normal_model = train(epochs, student_train_loader, student_valid_loader, model, optimizer, criterion, True, save_path='./normal.pth.tar', is_not_teacher=True)


Epoch: 1 
	Train Loss: 2.727236 	Train Acc: 33.333333 
	Valid Loss: 2.026905 	Valid Acc: 43.478261
	Validation loss decreased (inf --> 2.026905).  Saving model ...
Epoch: 2 
	Train Loss: 2.203727 	Train Acc: 33.333333 
	Valid Loss: 1.919653 	Valid Acc: 43.478261
	Validation loss decreased (2.026905 --> 1.919653).  Saving model ...
Epoch: 3 
	Train Loss: 1.908479 	Train Acc: 47.916667 
	Valid Loss: 1.890024 	Valid Acc: 52.173913
	Validation loss decreased (1.919653 --> 1.890024).  Saving model ...
Epoch: 4 
	Train Loss: 1.860706 	Train Acc: 36.458333 
	Valid Loss: 1.324349 	Valid Acc: 60.869565
	Validation loss decreased (1.890024 --> 1.324349).  Saving model ...
Epoch: 5 
	Train Loss: 1.726115 	Train Acc: 44.791667 
	Valid Loss: 1.364874 	Valid Acc: 56.521739
Epoch: 6 
	Train Loss: 1.507848 	Train Acc: 45.833333 
	Valid Loss: 1.751006 	Valid Acc: 43.478261
Epoch: 7 
	Train Loss: 1.276222 	Train Acc: 45.833333 
	Valid Loss: 1.834223 	Valid Acc: 47.826087
Epoch: 8 
	Train Loss: 1.371038 

In [27]:
# Create Dataloader for the test dataset
batch_size=16
print(len(valset))
dataloader = DataLoader(valset, batch_size=batchsize, shuffle=False)

120


In [28]:
# We set a seed for the dataset to prevent it from producing different values every time it is run
seed = 3
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


def test(dataloader, model, criterion, use_cuda):

    # monitor test loss and accuracy
    test_loss = 0.
    correct = 0.
    total = 0.

    model.eval()
    for batch_idx, (data, target) in enumerate(dataloader):
        # move to GPU
        if use_cuda:
            data, target = data.cuda(), target.cuda()

        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)

        # calculate the loss
        loss = criterion(output, target)

        # update average test loss
        test_loss = test_loss + ((1 / (batch_idx + 1)) * (loss.data - test_loss))

        # convert output probabilities to predicted class
        pred = output.data.max(1, keepdim=True)[1]

        # compare predictions to true label
        correct += np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy())
        total += data.size(0)

    print('\tTest Loss: {:.6f}'.format(test_loss))
    print('\tTest Accuracy: %2d%% (%2d/%2d)' % (
        100. * correct / total, correct, total))

# call test function
print("Student Model")     
test(dataloader, student_model, criterion, True)

print("\n=======================\nNormal Model")
test(dataloader, normal_model, criterion, True)

Student Model
	Test Loss: 1.258994
	Test Accuracy: 49% (59/120)

Normal Model
	Test Loss: 1.370598
	Test Accuracy: 50% (61/120)
