In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
os.chdir('/content/drive/MyDrive/Differential Privacy/Shoulder/')
!ls

 labels.csv	      'Screenshot 2021-10-29 at 4.43.30 PM.png'   Train
 main.py	       split.py					  Val
 normal.pth.tar        student.pth.tar
 PATE_SHOULDER.ipynb   Test


In [3]:
!pip install syft==0.2.9



In [14]:
import syft
import numpy as np
import pandas as pd
import torch
import torchvision
from torchvision import datasets, transforms, models
from torch.utils.data import Dataset, Subset, DataLoader
from torch import nn , optim
import torch.nn.functional as F
from PIL import Image
import time, os , random
from syft.frameworks.torch.dp import pate
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
dirs = {
    'train':'./Train',
    'val':'./Val',
    'test':'./Test'
}
batchsize=16

data_transforms = transforms.Compose([
    transforms.Resize(224),
    transforms.RandomResizedCrop((224),scale=(0.5,1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

trainset = datasets.ImageFolder(root=dirs['train'],transform=data_transforms) #Teacher Train
valset  = datasets.ImageFolder(root=dirs['val'],transform=data_transforms) #Student Test
testset = datasets.ImageFolder(root=dirs['test'],transform=data_transforms) #Student Train
 

In [6]:
trainset.classes

['Cofield', 'Depuy', 'Tornier', 'Zimmer']

In [7]:
trainset.class_to_idx

{'Cofield': 0, 'Depuy': 1, 'Tornier': 2, 'Zimmer': 3}

In [8]:
print(f'Number of Images in Train Set: {len(trainset)}')
print(f'Number of Images in Validation Set: {len(valset)}')
print(f'Number of Images in Test Set: {len(testset)}')

Number of Images in Train Set: 358
Number of Images in Validation Set: 120
Number of Images in Test Set: 119


# Partitioning Dataset between 2 Teachers

In [9]:
num_teachers = 2
valid_per =0.1
batch_size =32

def teacher_dataloaders(trainset=trainset, num_teachers=num_teachers, batch_size=batch_size,valid_per =0.2):

  train_dl = []
  valid_dl = []
  teacher_data_len = len(trainset) // num_teachers

  #list of shuffled indices
  my_list = random.sample(range(1,len(trainset)),len(trainset)-1)
  random.shuffle(my_list)
  print("Shuffled indices are -:",my_list[:10])

  for i in range(num_teachers):
    #get any particular subset of data
    indice = my_list[i*teacher_data_len:(i+1)*teacher_data_len]
    print(f"Chosen indices for teacher {i} are",indice[:10])
    data_subset = Subset(trainset,indice)

    #split into train and validation set
    valid_size = int(len(data_subset)*valid_per)
    train_size = len(data_subset) - valid_size
    train_subset, valid_subset = torch.utils.data.random_split(data_subset, [train_size,valid_size])
  
    #create data loaders
    trainloader = DataLoader(train_subset, batch_size=batch_size, shuffle=True,num_workers=1)
    validloader = DataLoader(valid_subset, batch_size=batch_size, shuffle= False, num_workers=1)

    train_dl.append(trainloader)
    valid_dl.append(validloader)
  
  return train_dl,valid_dl

trainloaders , validloaders = teacher_dataloaders()
print(len(trainloaders), len(validloaders))

Shuffled indices are -: [88, 139, 87, 227, 99, 182, 75, 77, 215, 149]
Chosen indices for teacher 0 are [88, 139, 87, 227, 99, 182, 75, 77, 215, 149]
Chosen indices for teacher 1 are [165, 4, 144, 29, 180, 209, 58, 112, 309, 61]
2 2


In [10]:
#student dataset
valid_size = int(len(testset)*0.2)
train_size = len(testset) - valid_size
student_train_subset , student_valid_subset = torch.utils.data.random_split(testset,[train_size,valid_size])

#create data loaders
student_train_loader = DataLoader(student_train_subset , batch_size=batch_size, shuffle = False, num_workers=1)
student_valid_loader = DataLoader(student_valid_subset, batch_size = batch_size, shuffle = False, num_workers=1)

print(len(student_train_loader.dataset), len(student_valid_loader.dataset))

96 23


#Train Teachers


In [15]:
def get_resnet18(pretrained=False, out_features=None, path=None):
    model = torchvision.models.resnet18(pretrained=pretrained)
    if out_features is not None:
        model.fc = torch.nn.Linear(in_features=512, out_features=out_features)
    if path is not None:
        model.load_state_dict(torch.load(path, map_location=device))

    return model.to(device)

In [16]:
def train(n_epochs, trainloader, validloader, model, optimizer, criterion, use_cuda, save_path= None, is_not_teacher=False):
    """returns trained model"""
    # # initialize tracker for minimum validation loss
    valid_loss_min = np.Inf

    for epoch in range(1, n_epochs+1):
        # initialize variables to monitor training and validation loss
        train_loss = 0.0
        valid_loss = 0.0
        train_correct = 0.0
        train_total = 0.0
        valid_correct =0.0
        valid_total = 0.0
        # train the model #
        model.train()
        for batch_idx, (data, target) in enumerate(trainloader):
            # move to GPU
            if use_cuda:
                data, target = data.to(device), target.to(device)
            # initialize weights to zero
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()    
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.data - train_loss))

            # convert output probabilities to predicted class
            pred = output.data.max(1, keepdim=True)[1]
            # compare predictions to true label
            train_correct += np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy())
            train_total += data.size(0)
            train_acc = 100. * train_correct / train_total

        # validate the model
        model.eval()
        for batch_idx, (data, target) in enumerate(validloader):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            loss = criterion(output, target)
            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.data - valid_loss))

            pred = output.data.max(1, keepdim=True)[1]
            # compare predictions to true label
            valid_correct += np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy())
            valid_total += data.size(0)
            valid_acc = 100. * valid_correct / valid_total

        # print training/validation statistics
        print('Epoch: {} \n\tTrain Loss: {:.6f} \tTrain Acc: {:.6f} \n\tValid Loss: {:.6f} \tValid Acc: {:.6f}'.format(
            epoch,train_loss,train_acc,valid_loss,valid_acc ))

        ## save the student model if validation loss has decreased
        if is_not_teacher:
          if valid_loss < valid_loss_min:
              torch.save(model.state_dict(), save_path)
              print('\tValidation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
              valid_loss_min,
              valid_loss))
              valid_loss_min = valid_loss

    return model


In [17]:
model = get_resnet18(True,4)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters() , lr=0.001)
epochs = 50

Downloading: "https://download.pytorch.org/models/resnet18-5c106cde.pth" to /root/.cache/torch/checkpoints/resnet18-5c106cde.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

In [18]:
#training teachers 
teacher_models = []
i =1
for trainloader, validloader in zip(trainloaders,validloaders):
  print(" Training Teacher {}".format(i))
  teacher_model = train(epochs, trainloader, validloader, model, optimizer, criterion, True)
  teacher_models.append(teacher_model)
  i+=1
  print("="*40)


 Training Teacher 1
Epoch: 1 
	Train Loss: 1.639324 	Train Acc: 33.333333 
	Valid Loss: 3.148337 	Valid Acc: 54.285714
Epoch: 2 
	Train Loss: 1.473159 	Train Acc: 51.388889 
	Valid Loss: 2.833504 	Valid Acc: 22.857143
Epoch: 3 
	Train Loss: 1.141459 	Train Acc: 54.166667 
	Valid Loss: 4.582045 	Valid Acc: 45.714286
Epoch: 4 
	Train Loss: 1.136349 	Train Acc: 57.638889 
	Valid Loss: 3.397373 	Valid Acc: 37.142857
Epoch: 5 
	Train Loss: 0.937246 	Train Acc: 59.722222 
	Valid Loss: 1.298684 	Valid Acc: 48.571429
Epoch: 6 
	Train Loss: 1.021478 	Train Acc: 59.722222 
	Valid Loss: 1.226264 	Valid Acc: 57.142857
Epoch: 7 
	Train Loss: 0.944533 	Train Acc: 63.194444 
	Valid Loss: 1.059671 	Valid Acc: 51.428571
Epoch: 8 
	Train Loss: 0.857107 	Train Acc: 63.888889 
	Valid Loss: 1.768938 	Valid Acc: 45.714286
Epoch: 9 
	Train Loss: 0.672480 	Train Acc: 70.833333 
	Valid Loss: 1.637118 	Valid Acc: 37.142857
Epoch: 10 
	Train Loss: 0.663204 	Train Acc: 73.611111 
	Valid Loss: 0.817656 	Valid Acc:

In [19]:
#get private labels
def student_train_labels(teacher_models , dataloader):
  student_labels = []

  #get label from each teacher
  for model in teacher_models:
    student_label =[]
    for images , _ in dataloader:
      with torch.no_grad():
        images = images.to(device)
        outputs = model(images)
        preds = torch.argmax(torch.exp(outputs), dim =1)
      student_label.append(preds.tolist())

    #add all teacher predictions to student labels
    print("Student label",student_label)
    student_label = sum(student_label,[])
    student_labels.append(student_label)
  return student_labels

predicted_labels = student_train_labels(teacher_models, student_train_loader)
predicted_labels = np.array([np.array(p) for p in predicted_labels]).transpose(1,0)

Student label [[1, 2, 2, 1, 1, 1, 1, 0, 1, 1, 1, 3, 1, 2, 3, 3, 3, 1, 3, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, 3, 1, 2], [0, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 2, 1, 0, 1, 3, 2, 1, 2, 1, 1, 1, 0], [1, 2, 1, 1, 1, 1, 1, 3, 1, 3, 3, 1, 3, 1, 1, 1, 1, 1, 1, 0, 2, 1, 2, 0, 3, 3, 0, 1, 0, 1, 1, 1]]
Student label [[1, 2, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 2, 2, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 0, 1, 2, 3, 1, 1, 2], [0, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 2, 3, 1, 0, 1, 1, 2, 1, 0, 1, 3, 2, 1, 2, 1, 1, 1, 3], [1, 2, 1, 1, 1, 1, 1, 3, 1, 3, 3, 1, 2, 1, 1, 1, 1, 1, 1, 0, 2, 1, 2, 0, 3, 1, 0, 2, 0, 1, 1, 1]]


In [20]:
print("Predicted Labels", predicted_labels.shape)

Predicted Labels (96, 2)


In [21]:
print(predicted_labels)

[[1 1]
 [2 2]
 [2 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 3]
 [1 1]
 [1 1]
 [1 1]
 [3 1]
 [1 1]
 [2 2]
 [3 2]
 [3 1]
 [3 3]
 [1 1]
 [3 3]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [3 1]
 [1 0]
 [1 1]
 [1 2]
 [3 3]
 [3 1]
 [1 1]
 [2 2]
 [0 0]
 [2 2]
 [1 3]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [3 1]
 [1 1]
 [1 1]
 [1 3]
 [1 1]
 [1 3]
 [1 2]
 [3 3]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [2 2]
 [1 1]
 [0 0]
 [1 1]
 [3 3]
 [2 2]
 [1 1]
 [2 2]
 [1 1]
 [1 1]
 [1 1]
 [0 3]
 [1 1]
 [2 2]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [3 3]
 [1 1]
 [3 3]
 [3 3]
 [1 1]
 [3 2]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [2 2]
 [1 1]
 [2 2]
 [0 0]
 [3 3]
 [3 1]
 [0 0]
 [1 2]
 [0 0]
 [1 1]
 [1 1]
 [1 1]]


In [23]:
def add_noise(predicted_labels, epsilon = 0.1):
  noisy_labels = []
  for preds in predicted_labels:

    # get labels with max votes
    label_counts = np.bincount(preds, minlength=2)

    # add laplacian noise to label
    epsilon = epsilon
    beta = 1/epsilon
    for i in range(len(label_counts)):
      label_counts[i] += np.random.laplace(0, beta, 1)

    # after adding noise we get labels with max counts
    new_label = np.argmax(label_counts)
    noisy_labels.append(new_label)

  #return noisy_labels
  return np.array(noisy_labels)

labels_with_noise = add_noise(predicted_labels, epsilon=0.1)  
print(labels_with_noise)
print(labels_with_noise.shape)

[1 2 1 1 0 0 1 3 0 1 1 1 1 1 2 3 3 0 3 1 1 0 0 0 1 1 0 0 3 2 1 1 1 2 3 1 1
 1 1 0 2 0 0 3 0 1 1 3 0 0 1 1 2 0 0 1 3 0 1 0 1 1 0 1 1 2 0 0 0 1 1 2 0 0
 3 1 0 0 0 1 1 1 0 0 2 1 2 0 2 1 1 0 1 0 0 0]
(96,)


In [24]:
#write to csv file
import csv 
def write_csv(data):
  with open('labels.csv','a') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(data)
write_csv(labels_with_noise)

In [26]:
data_dep_eps, data_ind_eps = pate.perform_analysis(teacher_preds=predicted_labels.T, indices=labels_with_noise, noise_eps=0.1, delta=1e-5)
print('Data dependent epsilon:', data_dep_eps)
print('Data independent epsilon:', data_ind_eps)

Data dependent epsilon: 11.516462732485106
Data independent epsilon: 11.516462732485117


In [27]:
#Train Student
# create a new training dataloader for the student with the newly created 
# labels with noise. We have to replace the old labels with the new labels
def new_student_data_loader(dataloader,noisy_labels,batch_size=32):
  image_list = []
  for image , _ in dataloader:
    image_list.append(image)

  data = np.vstack(image_list)
  new_dataset = list(zip(data,noisy_labels))
  new_dataloader = DataLoader(new_dataset, batch_size = batch_size, shuffle = False)

  return new_dataloader
labeled_student_trainloader = new_student_data_loader(student_train_loader,labels_with_noise)
print(len(labeled_student_trainloader.dataset),len(student_valid_loader.dataset))

96 23


In [28]:
images , _ = next(iter(labeled_student_trainloader))
print(images.shape)

torch.Size([32, 3, 224, 224])


In [29]:
student_model = train(epochs, labeled_student_trainloader, student_valid_loader, model, optimizer, criterion, True, save_path='./student.pth.tar', is_not_teacher=True)

Epoch: 1 
	Train Loss: 3.779096 	Train Acc: 39.583333 
	Valid Loss: 2.276589 	Valid Acc: 30.434783
	Validation loss decreased (inf --> 2.276589).  Saving model ...
Epoch: 2 
	Train Loss: 1.106501 	Train Acc: 63.541667 
	Valid Loss: 2.000017 	Valid Acc: 21.739130
	Validation loss decreased (2.276589 --> 2.000017).  Saving model ...
Epoch: 3 
	Train Loss: 0.422015 	Train Acc: 83.333333 
	Valid Loss: 2.434938 	Valid Acc: 21.739130
Epoch: 4 
	Train Loss: 0.504070 	Train Acc: 79.166667 
	Valid Loss: 2.757726 	Valid Acc: 17.391304
Epoch: 5 
	Train Loss: 0.241707 	Train Acc: 91.666667 
	Valid Loss: 2.445907 	Valid Acc: 17.391304
Epoch: 6 
	Train Loss: 0.102152 	Train Acc: 98.958333 
	Valid Loss: 1.843676 	Valid Acc: 30.434783
	Validation loss decreased (2.000017 --> 1.843676).  Saving model ...
Epoch: 7 
	Train Loss: 0.054745 	Train Acc: 100.000000 
	Valid Loss: 1.956822 	Valid Acc: 39.130435
Epoch: 8 
	Train Loss: 0.039954 	Train Acc: 100.000000 
	Valid Loss: 2.307222 	Valid Acc: 17.391304
E

In [30]:
normal_model = train(epochs, student_train_loader, student_valid_loader, model, optimizer, criterion, True, save_path='./normal.pth.tar', is_not_teacher=True)


Epoch: 1 
	Train Loss: 3.940892 	Train Acc: 40.625000 
	Valid Loss: 1.738976 	Valid Acc: 52.173913
	Validation loss decreased (inf --> 1.738976).  Saving model ...
Epoch: 2 
	Train Loss: 1.558405 	Train Acc: 58.333333 
	Valid Loss: 2.219937 	Valid Acc: 34.782609
Epoch: 3 
	Train Loss: 1.061338 	Train Acc: 63.541667 
	Valid Loss: 3.024473 	Valid Acc: 56.521739
Epoch: 4 
	Train Loss: 0.906221 	Train Acc: 66.666667 
	Valid Loss: 1.960213 	Valid Acc: 52.173913
Epoch: 5 
	Train Loss: 0.792524 	Train Acc: 75.000000 
	Valid Loss: 1.476328 	Valid Acc: 56.521739
	Validation loss decreased (1.738976 --> 1.476328).  Saving model ...
Epoch: 6 
	Train Loss: 0.709731 	Train Acc: 69.791667 
	Valid Loss: 1.551160 	Valid Acc: 65.217391
Epoch: 7 
	Train Loss: 0.620240 	Train Acc: 77.083333 
	Valid Loss: 1.377408 	Valid Acc: 56.521739
	Validation loss decreased (1.476328 --> 1.377408).  Saving model ...
Epoch: 8 
	Train Loss: 0.575706 	Train Acc: 75.000000 
	Valid Loss: 1.561604 	Valid Acc: 52.173913
Epo

In [31]:
# Create Dataloader for the test dataset
batch_size=16
print(len(valset))
dataloader = DataLoader(valset, batch_size=batchsize, shuffle=False)

120


In [32]:
# We set a seed for the dataset to prevent it from producing different values every time it is run
seed = 3
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


def test(dataloader, model, criterion, use_cuda):

    # monitor test loss and accuracy
    test_loss = 0.
    correct = 0.
    total = 0.

    model.eval()
    for batch_idx, (data, target) in enumerate(dataloader):
        # move to GPU
        if use_cuda:
            data, target = data.cuda(), target.cuda()

        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)

        # calculate the loss
        loss = criterion(output, target)

        # update average test loss
        test_loss = test_loss + ((1 / (batch_idx + 1)) * (loss.data - test_loss))

        # convert output probabilities to predicted class
        pred = output.data.max(1, keepdim=True)[1]

        # compare predictions to true label
        correct += np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy())
        total += data.size(0)

    print('\tTest Loss: {:.6f}'.format(test_loss))
    print('\tTest Accuracy: %2d%% (%2d/%2d)' % (
        100. * correct / total, correct, total))

# call test function
print("Student Model")     
test(dataloader, student_model, criterion, True)

print("\n=======================\nNormal Model")
test(dataloader, normal_model, criterion, True)

Student Model
	Test Loss: 1.886878
	Test Accuracy: 55% (67/120)

Normal Model
	Test Loss: 1.697974
	Test Accuracy: 55% (66/120)
