# Evaluation of models trained on CheXpert dataset

In [1]:
!pip install torcheval

Collecting torcheval
  Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m174.1/179.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torcheval
Successfully installed torcheval-0.0.7


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import shutil

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np
import math
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torchvision.datasets as datasets

import torchvision.transforms as transforms

from utils import *
from parameters import *
from train_or_test import *
from push_prot_chex import *
import cv2 as cv

import scipy.stats as st

seed = 12
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
prototype_shape = (20, 128, 1, 1)
num_classes = 2

normalize = transforms.Normalize(mean=mean,
                                 std=std)

# Evaluation

## Pleural effusion

In [21]:
data_path = 'effusion/'
train_dir = data_path + 'train/'
test_dir = data_path + 'test/'
train_push_dir = data_path + 'push/'

**Centralized Model**

Prepare the data

In [None]:
# train set
train_dataset = datasets.ImageFolder(
    train_dir,
    transforms.Compose([
        transforms.Resize(size=(img_size, img_size)),
        transforms.ToTensor(),
        normalize,
    ]))
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=train_batch_size, shuffle=True,
    num_workers=2, pin_memory=False)
# push set
train_push_dataset = datasets.ImageFolder(
    train_push_dir,
    transforms.Compose([
        transforms.Resize(size=(img_size, img_size)),
        transforms.ToTensor(),
    ]))
train_push_loader = torch.utils.data.DataLoader(
    train_push_dataset, batch_size=train_push_batch_size, shuffle=False,
    num_workers=2, pin_memory=False)
# test set
test_dataset = datasets.ImageFolder(
    test_dir,
    transforms.Compose([
        transforms.Resize(size=(img_size, img_size)),
        transforms.ToTensor(),
        normalize,
    ]))
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=test_batch_size, shuffle=False,
    num_workers=2, pin_memory=False)

Load the models

In [None]:
model_path = 'Pleural_effusion/' # a path to a folder with the trained models
model = torch.load(model_path + 'ppnet_chest/21nopush0.7591.pth')
model_1 = torch.load(model_path + 'ppnet_chest_1/21nopush0.7582.pth')
model_2 = torch.load(model_path + 'ppnet_chest_2/21nopush0.7164.pth')

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model, model_1, model_2):
  acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loader, class_specific=True)
  fin_sens.append(sens)
  fin_spec.append(spec)
  fin_score.append(score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

  num_correct = mask.new_zeros(num_classes).scatter_(0, target, mask, reduce="add")


Average sensitivity:  0.81393469123828 +- 0.02176471836194566 
Average specificity:  0.7050388771079471 +- 0.03519407249986154 
Average balanced accuracy:  0.7594867841731135 +- 0.006761590097009268


**Unbiased Local Models**

Distribute the data

In [22]:
dir_names = os.listdir(train_dir)
for client in range(4): # 4 clients
  os.mkdir(f'client_{client}')
  os.mkdir(f'client_{client}/' + 'train/')
  os.mkdir(f'client_{client}/' + 'push/')
  os.mkdir(f'client_{client}/' + 'test/')
  for class_name in dir_names:
    os.mkdir(f'client_{client}/'+ 'train/' + class_name)
    os.mkdir(f'client_{client}/'+ 'push/' + class_name)
    os.mkdir(f'client_{client}/'+ 'test/' + class_name)

In [23]:
distribute_data(train_dir, seed, 4)

In [24]:
distribute_data(train_push_dir, seed, 4)

In [25]:
distribute_data(test_dir, seed, 4)

In [26]:
num_clients = 4
train_datasets, train_loaders = [],[]
train_push_datasets, train_push_loaders = [],[]
test_datasets, test_loaders = [],[]

for client in range(num_clients):
  # train set
  train_dir = f'client_{client}/' + 'train/'
  train_push_dir = f'client_{client}/' + 'push/'
  test_dir = f'client_{client}/' + 'test/'

  train_dataset = datasets.ImageFolder(
      train_dir,
      transforms.Compose([
          transforms.Resize(size=(img_size, img_size)),
          transforms.ToTensor(),
          normalize,
      ]))
  train_datasets.append(train_dataset)

  train_loader = torch.utils.data.DataLoader(
      train_dataset, batch_size=train_batch_size, shuffle=True,
      num_workers=2, pin_memory=False)
  train_loaders.append(train_loader)

  # push set
  train_push_dataset = datasets.ImageFolder(
      train_push_dir,
      transforms.Compose([
          transforms.Resize(size=(img_size, img_size)),
          transforms.ToTensor(),
      ]))
  train_push_datasets.append(train_push_dataset)

  train_push_loader = torch.utils.data.DataLoader(
      train_push_dataset, batch_size=train_push_batch_size, shuffle=False,
      num_workers=2, pin_memory=False)
  train_push_loaders.append(train_push_loader)

  # test set
  test_dataset = datasets.ImageFolder(
      test_dir,
      transforms.Compose([
          transforms.Resize(size=(img_size, img_size)),
          transforms.ToTensor(),
          normalize,
      ]))
  test_datasets.append(test_dataset)

  test_loader = torch.utils.data.DataLoader(
      test_dataset, batch_size=test_batch_size, shuffle=False,
      num_workers=2, pin_memory=False)
  test_loaders.append(test_loader)

Load models

In [27]:
model_1 = torch.load('Pleural_effusion/Local_1/20_11push0.7983.pth')
model_2 = torch.load('Pleural_effusion/Local_2/20_11push0.7440.pth')
model_3 = torch.load('Pleural_effusion/Local_3/20_11push0.7386.pth')
model_4 = torch.load('Pleural_effusion/Local_4/20_11push0.6809.pth')

Evaluate on each unbiased test set and average

In [28]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_1, model_2, model_3, model_4):
  all_sens = []
  all_spec = []
  all_score = []
  for i in range(4):
    acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[i], class_specific=True)
    all_sens.append(sens)
    all_spec.append(spec)
    all_score.append(score)

  mean_sens = np.array(all_sens).mean()
  mean_spec = np.array(all_spec).mean()
  mean_score = np.array(all_score).mean()
  fin_sens.append(mean_sens)
  fin_spec.append(mean_spec)
  fin_score.append(mean_score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.6862864077669903 +- 0.06219353335235532 
Average specificity:  0.7269054878048781 +- 0.03606364431089226 
Average balanced accuracy:  0.7065959477859343 +- 0.02401147217814353


**Unbiased Personalized Models**

Load models

In [None]:
model_path = 'Pleural_effusion/Global_good/ppnet_chest/'
model_1 = torch.load(model_path + 'client_0_last_round_2_push0.6000.pth').module.to(device)
model_2 = torch.load(model_path + 'client_1_last_round_2_push0.6090.pth').module.to(device)
model_3 = torch.load(model_path + 'client_2_last_round_2_push0.6552.pth').module.to(device)
model_4 = torch.load(model_path + 'client_3_last_round_2_push0.6149.pth').module.to(device)

Evaluate on each unbiased test set and average

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_1, model_2, model_3, model_4):
  all_sens = []
  all_spec = []
  all_score = []
  for i in range(4):
    acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[i], class_specific=True)
    all_sens.append(sens)
    all_spec.append(spec)
    all_score.append(score)

  mean_sens = np.array(all_sens).mean()
  mean_spec = np.array(all_spec).mean()
  mean_score = np.array(all_score).mean()
  fin_sens.append(mean_sens)
  fin_spec.append(mean_spec)
  fin_score.append(mean_score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.6915048543689319 +- 0.061672937582765264 
Average specificity:  0.5837878787878787 +- 0.024722983139516243 
Average balanced accuracy:  0.6376463665784055 +- 0.020076489232963117


**Unbiased Global Model**

Load the models

In [None]:
model_path = 'Pleural_effusion/Global_good/Fully_global/'
model_ser = torch.load(model_path + 'ppnet_chest/server_final_round_2_0.7381.pth').module.to(device)
model_ser1 = torch.load(model_path + 'ppnet_chest_1/server_final_round_2_0.6321.pth').module.to(device)
model_ser2 = torch.load(model_path + 'ppnet_chest_2/server_final_round_2_0.7448.pth').module.to(device)

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_ser, model_ser1, model_ser2):
  acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loader, class_specific=True)
  fin_sens.append(sens)
  fin_spec.append(spec)
  fin_score.append(score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.8375404530744337 +- 0.08181752062309355 
Average specificity:  0.6440404040404041 +- 0.11739929136454207 
Average balanced accuracy:  0.7407904285574188 +- 0.022415569104705057


**Biased Local model**

Prepare the data

In [None]:
# train set
train_dir = f'drains/' + 'train/'
train_push_dir = f'drains/' + 'push/'
test_dir = f'drains/' + 'test/'

train_dataset = datasets.ImageFolder(
    train_dir,
    transforms.Compose([
        transforms.Resize(size=(img_size, img_size)),
        transforms.ToTensor(),
        normalize,
    ]))
train_datasets[3] = train_dataset

train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=train_batch_size, shuffle=True,
    num_workers=2, pin_memory=False)
train_loaders[3] = train_loader

# push set
train_push_dataset = datasets.ImageFolder(
    train_push_dir,
    transforms.Compose([
        transforms.Resize(size=(img_size, img_size)),
        transforms.ToTensor(),
    ]))
train_push_datasets[3] = train_push_dataset

train_push_loader = torch.utils.data.DataLoader(
    train_push_dataset, batch_size=train_push_batch_size, shuffle=False,
    num_workers=2, pin_memory=False)
train_push_loaders[3] = train_push_loader

# test set
test_dataset = datasets.ImageFolder(
    test_dir,
    transforms.Compose([
        transforms.Resize(size=(img_size, img_size)),
        transforms.ToTensor(),
        normalize,
    ]))
test_datasets[3] = test_dataset

test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=test_batch_size, shuffle=False,
    num_workers=2, pin_memory=False)
test_loaders[3] = test_loader

Load the models

In [None]:
model_path = 'Pleural_effusion/Local_drain/ppnet_chest/'
model_b = torch.load(model_path + '21nopush0.7959.pth')
model_b1 = torch.load(model_path + '211nopush0.7959.pth')
model_b2 = torch.load(model_path + '21nopush0.7347.pth')

Evaluate on a biased test set

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_b, model_b1, model_b2):
  acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[3], class_specific=True)
  fin_sens.append(sens)
  fin_spec.append(spec)
  fin_score.append(score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.5087719298245613 +- 0.008771929824561394 
Average specificity:  0.9555555555555554 +- 0.014698618394803265 
Average balanced accuracy:  0.7321637426900586 +- 0.011582970829109494


Evaluate on unbiased test sets and average

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_b, model_b1, model_b2):
  all_sens = []
  all_spec = []
  all_score = []
  for i in range(3):
    acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[i], class_specific=True)
    all_sens.append(sens)
    all_spec.append(spec)
    all_score.append(score)

  mean_sens = np.array(all_sens).mean()
  mean_spec = np.array(all_spec).mean()
  mean_score = np.array(all_score).mean()
  fin_sens.append(mean_sens)
  fin_spec.append(mean_spec)
  fin_score.append(mean_score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.05221143473570658 +- 0.0158205027208791 
Average specificity:  0.9551515151515152 +- 0.009001430201485248 
Average balanced accuracy:  0.5036814749436108 +- 0.003759958209356777


**Biased Personalized Models**

Load models

In [None]:
# unbiased clients
model_path = 'Pleural_effusion/Global_biased/ppnet_chest/'
model_1 = torch.load(model_path + 'client_0_last_round_2_push0.5642.pth')
model_2 = torch.load(model_path + 'client_1 last_round_2_push0.6276.pth')
model_3 = torch.load(model_path + 'client_2 last_round_2_push0.6269.pth')

In [None]:
# biased client, results for three runs
model_b = torch.load('Pleural_effusion/Global_biased/ppnet_chest/client_3_last_round_2_push0.8063.pth').module.to(device)
model_b1 = torch.load('Pleural_effusion/Global_biased/ppnet_chest_1/client_3 last_round_2_push0.8429.pth').module.to(device)
model_b2 = torch.load('Pleural_effusion/Global_biased/ppnet_chest_2/client_3_last_round_2_push0.8325.pth').module.to(device)

Evaluate biased model on a biased test set

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_b, model_b1, model_b2):
  acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[3], class_specific=True)
  fin_sens.append(sens)
  fin_spec.append(spec)
  fin_score.append(score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.3684210526315789 +- 0.0303868562731382 
Average specificity:  0.9277777777777777 +- 0.0388888888888889 
Average balanced accuracy:  0.6480994152046784 +- 0.009860566508138725


Evaluate biased model on unbiased test sets and average

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_b, model_b1, model_b2):
  all_sens = []
  all_spec = []
  all_score = []
  for i in range(3):
    acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[i], class_specific=True)
    all_sens.append(sens)
    all_spec.append(spec)
    all_score.append(score)

  mean_sens = np.array(all_sens).mean()
  mean_spec = np.array(all_spec).mean()
  mean_score = np.array(all_score).mean()
  fin_sens.append(mean_sens)
  fin_spec.append(mean_spec)
  fin_score.append(mean_score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.07292340884573895 +- 0.023127525785966423 
Average specificity:  0.9244444444444445 +- 0.021180351389222064 
Average balanced accuracy:  0.4986839266450917 +- 0.000981733278298091


Evaluate unbiased models on unbiased test sets and average

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_1, model_2, model_3):
  all_sens = []
  all_spec = []
  all_score = []
  for i in range(3):
    acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[i], class_specific=True)
    all_sens.append(sens)
    all_spec.append(spec)
    all_score.append(score)

  mean_sens = np.array(all_sens).mean()
  mean_spec = np.array(all_spec).mean()
  mean_score = np.array(all_score).mean()
  fin_sens.append(mean_sens)
  fin_spec.append(mean_spec)
  fin_score.append(mean_score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.6763754045307443 +- 0.1269043141505637 
Average specificity:  0.5703703703703703 +- 0.1221121096402602 
Average balanced accuracy:  0.6233728874505574 +- 0.004773879838228415


Evaluate unbiased models on a biased test set

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_1, model_2, model_3):
    acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[3], class_specific=True)
    all_sens.append(sens)
    all_spec.append(spec)
    all_score.append(score)

mean_sens = np.array(all_sens).mean()
mean_spec = np.array(all_spec).mean()
mean_score = np.array(all_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(all_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(all_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(all_score))

Average sensitivity:  0.5200391756089252 +- 0.08141336966739121 
Average specificity:  0.6351515151515151 +- 0.0774237383604537 
Average balanced accuracy:  0.5775953453802202 +- 0.020757880314016253


**Biased Global Model**

In [None]:
model_path = 'Pleural_effusion/Global_biased/Fully_global/'
model_ser = torch.load(model_path + 'ppnet_chest/server_final_round_2_0.6157.pth').module.to(device)
model_ser1 = torch.load(model_path + 'ppnet_chest_1/server_final_round_2_0.6164.pth').module.to(device)
model_ser2 = torch.load(model_path + 'ppnet_chest_2/server_final_round_2_0.6157.pth').module.to(device)

Evaluate on a biased test set

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_ser, model_ser1, model_ser2):
  acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[3], class_specific=True)
  fin_sens.append(sens)
  fin_spec.append(spec)
  fin_score.append(score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.0 +- 0.0 
Average specificity:  0.9944444444444445 +- 0.005555555555555574 
Average balanced accuracy:  0.49722222222222223 +- 0.002777777777777787


Evaluate on unbiased test sets and average

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_ser, model_ser1, model_ser2):
  all_sens = []
  all_spec = []
  all_score = []
  for i in range(3):
    acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[i], class_specific=True)
    all_sens.append(sens)
    all_spec.append(spec)
    all_score.append(score)

  mean_sens = np.array(all_sens).mean()
  mean_spec = np.array(all_spec).mean()
  mean_score = np.array(all_score).mean()
  fin_sens.append(mean_sens)
  fin_spec.append(mean_spec)
  fin_score.append(mean_score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.0015102481121898597 +- 0.00151024811218986 
Average specificity:  0.9987878787878789 +- 0.0012121212121211828 
Average balanced accuracy:  0.5001490634500344 +- 0.00014906345003432947


## Cardiomegaly

In [8]:
data_path = 'cardiomegaly/'
train_dir = data_path + 'train/'
test_dir = data_path + 'test/'
train_push_dir = data_path + 'push/'

**Centralized Model**

Prepare the data

In [None]:
# train set
train_dataset = datasets.ImageFolder(
    train_dir,
    transforms.Compose([
        transforms.Resize(size=(img_size, img_size)),
        transforms.ToTensor(),
        normalize,
    ]))
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=train_batch_size, shuffle=True,
    num_workers=2, pin_memory=False)
# push set
train_push_dataset = datasets.ImageFolder(
    train_push_dir,
    transforms.Compose([
        transforms.Resize(size=(img_size, img_size)),
        transforms.ToTensor(),
    ]))
train_push_loader = torch.utils.data.DataLoader(
    train_push_dataset, batch_size=train_push_batch_size, shuffle=False,
    num_workers=2, pin_memory=False)
# test set
test_dataset = datasets.ImageFolder(
    test_dir,
    transforms.Compose([
        transforms.Resize(size=(img_size, img_size)),
        transforms.ToTensor(),
        normalize,
    ]))
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=test_batch_size, shuffle=False,
    num_workers=2, pin_memory=False)

Load models

In [None]:
model = torch.load('Cardiomegaly/ppnet_chest/27nopush0.8263.pth')
model_1 = torch.load('Cardiomegaly/ppnet_chest_1/26nopush0.8298.pth')
model_2 = torch.load('Cardiomegaly/ppnet_chest_2/27nopush0.7810.pth')

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model, model_1, model_2):
  acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loader, class_specific=True)
  fin_sens.append(sens)
  fin_spec.append(spec)
  fin_score.append(score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

  num_correct = mask.new_zeros(num_classes).scatter_(0, target, mask, reduce="add")


Average sensitivity:  0.6576543209876543 +- 0.03811777127693675 
Average specificity:  0.8313271604938272 +- 0.02369769487238998 
Average balanced accuracy:  0.7444907407407407 +- 0.00734329572811735


**Unbiased Local Models**

Prepare the data

Please, first remove or rename the `client_0`, `client_1`, `client_2`, and `client_3` folders with pleural effusion data.

In [9]:
dir_names = os.listdir(train_dir)
for client in range(4): # 4 clients
  os.mkdir(f'client_{client}')
  os.mkdir(f'client_{client}/' + 'train/')
  os.mkdir(f'client_{client}/' + 'push/')
  os.mkdir(f'client_{client}/' + 'test/')
  for class_name in dir_names:
    os.mkdir(f'client_{client}/'+ 'train/' + class_name)
    os.mkdir(f'client_{client}/'+ 'push/' + class_name)
    os.mkdir(f'client_{client}/'+ 'test/' + class_name)

In [10]:
distribute_data(train_dir, seed, 4)

In [11]:
distribute_data(train_push_dir, seed, 4)

In [12]:
distribute_data(test_dir, seed, 4)

In [13]:
num_clients = 4
train_datasets, train_loaders = [],[]
train_push_datasets, train_push_loaders = [],[]
test_datasets, test_loaders = [],[]

for client in range(num_clients):
  # train set
  train_dir = f'client_{client}/' + 'train/'
  train_push_dir = f'client_{client}/' + 'push/'
  test_dir = f'client_{client}/' + 'test/'

  train_dataset = datasets.ImageFolder(
      train_dir,
      transforms.Compose([
          transforms.Resize(size=(img_size, img_size)),
          transforms.ToTensor(),
          normalize,
      ]))
  train_datasets.append(train_dataset)

  train_loader = torch.utils.data.DataLoader(
      train_dataset, batch_size=train_batch_size, shuffle=True,
      num_workers=2, pin_memory=False)
  train_loaders.append(train_loader)

  # push set
  train_push_dataset = datasets.ImageFolder(
      train_push_dir,
      transforms.Compose([
          transforms.Resize(size=(img_size, img_size)),
          transforms.ToTensor(),
      ]))
  train_push_datasets.append(train_push_dataset)

  train_push_loader = torch.utils.data.DataLoader(
      train_push_dataset, batch_size=train_push_batch_size, shuffle=False,
      num_workers=2, pin_memory=False)
  train_push_loaders.append(train_push_loader)

  # test set
  test_dataset = datasets.ImageFolder(
      test_dir,
      transforms.Compose([
          transforms.Resize(size=(img_size, img_size)),
          transforms.ToTensor(),
          normalize,
      ]))
  test_datasets.append(test_dataset)

  test_loader = torch.utils.data.DataLoader(
      test_dataset, batch_size=test_batch_size, shuffle=False,
      num_workers=2, pin_memory=False)
  test_loaders.append(test_loader)

Load models

In [14]:
model_1 = torch.load('Local_1/29nopush0.7720.pth')
model_2 = torch.load('Local_2/ppnet_chest/30nopush0.7716.pth')
model_3 = torch.load('Local_3/ppnet_chest/29nopush0.7250.pth')
model_4 = torch.load('Local_4/30nopush0.7554.pth')

In [15]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_1, model_2, model_3, model_4):
  all_sens = []
  all_spec = []
  all_score = []
  for i in range(4):
    acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[i], class_specific=True)
    all_sens.append(sens)
    all_spec.append(spec)
    all_score.append(score)

  mean_sens = np.array(all_sens).mean()
  mean_spec = np.array(all_spec).mean()
  mean_score = np.array(all_score).mean()
  fin_sens.append(mean_sens)
  fin_spec.append(mean_spec)
  fin_score.append(mean_score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

  num_correct = mask.new_zeros(num_classes).scatter_(0, target, mask, reduce="add")


Average sensitivity:  0.6577777777777778 +- 0.03594596615551212 
Average specificity:  0.7749742798353909 +- 0.015068965723075266 
Average balanced accuracy:  0.7163760288065844 +- 0.010467782487058213


**Unbiased Personalized Models**

Load models

In [None]:
model_path = 'Global_good/ppnet_chest/'
model_1 = torch.load(model_path + 'client_0_last_round_3.pth')
model_2 = torch.load(model_path + 'client_1_last_round_3.pth')
model_3 = torch.load(model_path + 'client_2_last_round_3.pth')
model_4 = torch.load(model_path + 'client_3_last_round_3.pth')

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_1, model_2, model_3, model_4):
  all_sens = []
  all_spec = []
  all_score = []
  for i in range(4):
    acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[i], class_specific=True)
    all_sens.append(sens)
    all_spec.append(spec)
    all_score.append(score)

  mean_sens = np.array(all_sens).mean()
  mean_spec = np.array(all_spec).mean()
  mean_score = np.array(all_score).mean()
  fin_sens.append(mean_sens)
  fin_spec.append(mean_spec)
  fin_score.append(mean_score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.6025 +- 0.08433409714082342 
Average specificity:  0.6722350823045268 +- 0.05247839498021155 
Average balanced accuracy:  0.6373675411522635 +- 0.0445382666534138


**Unbiased Global Model**

In [None]:
model_path = 'Global_good/Fully_global/'
model_ser = torch.load(model_path + 'ppnet_chest/server_final_round_3_0.8621.pth').module.to(device)
model_ser1 = torch.load(model_path + 'ppnet_chest_1/server_final_round_3_0.7501.pth').module.to(device)
model_ser2 = torch.load(model_path + 'ppnet_chest_2/server_final_round_3_0.7957.pth').module.to(device)

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_ser, model_ser1, model_ser2):
  acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loader, class_specific=True)
  fin_sens.append(sens)
  fin_spec.append(spec)
  fin_score.append(score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.6840740740740742 +- 0.07806326207141816 
Average specificity:  0.7988168724279836 +- 0.0629793581337403 
Average balanced accuracy:  0.7414454732510288 +- 0.007693561766996538


**Biased Local Model**

Prepare the data

In [None]:
# add synthetic bias to the fourth client dataset

num_client = 3
unicode = '\U0001F42D'
bias_folder = 'positive'
size = 35
percent = 100
adding_emoji(num_client, unicode, bias_folder, size, percent)

6075
750
675


In [None]:
num_clients = 4
train_datasets, train_loaders = [],[]
train_push_datasets, train_push_loaders = [],[]
test_datasets, test_loaders = [],[]

for client in range(num_clients):
  # train set
  train_dir = f'client_{client}/' + 'train/'
  train_push_dir = f'client_{client}/' + 'push/'
  test_dir = f'client_{client}/' + 'test/'

  train_dataset = datasets.ImageFolder(
      train_dir,
      transforms.Compose([
          transforms.Resize(size=(img_size, img_size)),
          transforms.ToTensor(),
          normalize,
      ]))
  train_datasets.append(train_dataset)

  train_loader = torch.utils.data.DataLoader(
      train_dataset, batch_size=train_batch_size, shuffle=True,
      num_workers=2, pin_memory=False)
  train_loaders.append(train_loader)

  # push set
  train_push_dataset = datasets.ImageFolder(
      train_push_dir,
      transforms.Compose([
          transforms.Resize(size=(img_size, img_size)),
          transforms.ToTensor(),
      ]))
  train_push_datasets.append(train_push_dataset)

  train_push_loader = torch.utils.data.DataLoader(
      train_push_dataset, batch_size=train_push_batch_size, shuffle=False,
      num_workers=2, pin_memory=False)
  train_push_loaders.append(train_push_loader)

  # test set
  test_dataset = datasets.ImageFolder(
      test_dir,
      transforms.Compose([
          transforms.Resize(size=(img_size, img_size)),
          transforms.ToTensor(),
          normalize,
      ]))
  test_datasets.append(test_dataset)

  test_loader = torch.utils.data.DataLoader(
      test_dataset, batch_size=test_batch_size, shuffle=False,
      num_workers=2, pin_memory=False)
  test_loaders.append(test_loader)

Load models

In [None]:
model_b = torch.load('Local_4_biased/ppnet_chest/20_11push1.0000.pth')
model_b1 = torch.load('Local_4_biased/ppnet_chest_1/20_11push1.0000.pth')
model_b2 = torch.load('Local_4_biased/ppnet_chest_2/20_11push1.0000.pth')

Evaluate on a biased test set

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_b, model_b1, model_b2):
  acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[3], class_specific=True)
  fin_sens.append(sens)
  fin_spec.append(spec)
  fin_score.append(score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  1.0 +- 0.0 
Average specificity:  1.0 +- 0.0 
Average balanced accuracy:  1.0 +- 0.0


Evaluate on unbiased test sets and average

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_b, model_b1, model_b2):
  all_sens = []
  all_spec = []
  all_score = []
  for i in range(3):
    acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[i], class_specific=True)
    all_sens.append(sens)
    all_spec.append(spec)
    all_score.append(score)

  mean_sens = np.array(all_sens).mean()
  mean_spec = np.array(all_spec).mean()
  mean_score = np.array(all_score).mean()
  fin_sens.append(mean_sens)
  fin_spec.append(mean_spec)
  fin_score.append(mean_score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.0 +- 0.0 
Average specificity:  1.0 +- 0.0 
Average balanced accuracy:  0.5 +- 0.0


**Biased Personalized Models**

In [None]:
# unbiased clients
model_path = 'Global_biased/ppnet_chest/'
model_1 = torch.load(model_path + 'client_0_last_round_3_push0.8547.pth').module.to(device)
model_2 = torch.load(model_path + 'client_1_last_round_3_push0.6343.pth')
model_3 = torch.load(model_path + 'client_2_last_round_3_push0.7491.pth')

In [None]:
# biased client
model = torch.load('Global_biased/ppnet_chest/client_3_last_round_3_push1.0000.pth').module.to(device)
model1 = torch.load('Global_biased/ppnet_chest_1/client_3_last_round_3_push0.9221.pth').module.to(device)
model2 = torch.load('Global_biased/ppnet_chest_2/client_3_last_round_3_push1.0000.pth').module.to(device)

Evaluate biased model on biased test set

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model, model1, model2):
  acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[3], class_specific=True)
  fin_sens.append(sens)
  fin_spec.append(spec)
  fin_score.append(score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.7960493827160494 +- 0.2039506172839506 
Average specificity  1.0 +- 0.0 
Average balanced accuracy:  0.8980246913580247 +- 0.1019753086419753


Evaluate biased model on unbiased test sets and average

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model, model1, model2):
  all_sens = []
  all_spec = []
  all_score = []
  for i in range(3):
    acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[i], class_specific=True)
    all_sens.append(sens)
    all_spec.append(spec)
    all_score.append(score)

  mean_sens = np.array(all_sens).mean()
  mean_spec = np.array(all_spec).mean()
  mean_score = np.array(all_score).mean()
  fin_sens.append(mean_sens)
  fin_spec.append(mean_spec)
  fin_score.append(mean_score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.0 +- 0.0 
Average specificity:  1.0 +- 0.0 
Average balanced accuracy:  0.5 +- 0.0


Evaluate unbiased models on ubiased test sets and average

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_1, model_2, model_3):
  all_sens = []
  all_spec = []
  all_score = []
  for i in range(3):
    acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[i], class_specific=True)
    all_sens.append(sens)
    all_spec.append(spec)
    all_score.append(score)

  mean_sens = np.array(all_sens).mean()
  mean_spec = np.array(all_spec).mean()
  mean_score = np.array(all_score).mean()
  fin_sens.append(mean_sens)
  fin_spec.append(mean_spec)
  fin_score.append(mean_score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.5644444444444444 +- 0.10207101996103893 
Average specificity:  0.7725880201188843 +- 0.08187770685561606 
Average balanced accuracy:  0.6685162322816645 +- 0.02294206399914663


Evaluate unbiased models on biased test set

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_1, model_2, model_3):
    acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[3], class_specific=True)
    all_sens.append(sens)
    all_spec.append(spec)
    all_score.append(score)

mean_sens = np.array(all_sens).mean()
mean_spec = np.array(all_spec).mean()
mean_score = np.array(all_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(all_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(all_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(all_score))

Average sensitivity:  0.6059259259259259 +- 0.050296241749820426 
Average specificity:  0.7721536351165982 +- 0.03692314704862767 
Average balanced accuracy:  0.689039780521262 +- 0.01584169342006656


**Biased Global Model**

In [None]:
model_path = 'Global_biased/Fully_global/'
model_ser = torch.load(model_path + 'ppnet_chest/server_final_round_3_0.4148.pth').module.to(device)
model_ser1 = torch.load(model_path + 'ppnet_chest_1/server_final_round_3_0.8793.pth').module.to(device)
model_ser2 = torch.load(model_path + 'ppnet_chest_2/server_final_round_3_0.8829.pth').module.to(device)

Evaluate on biased test set

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_ser, model_ser1, model_ser2):
  acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[3], class_specific=True)
  fin_sens.append(sens)
  fin_spec.append(spec)
  fin_score.append(score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.45629629629629626 +- 0.2750652747072852 
Average specificity:  0.7743484224965705 +- 0.2224664718934485 
Average balanced accuracy:  0.6153223593964334 +- 0.04268366692388824


Evaluate on unbiased test sets and average

In [None]:
fin_sens = []
fin_spec = []
fin_score = []
for j in (model_ser, model_ser1, model_ser2):
  all_sens = []
  all_spec = []
  all_score = []
  for i in range(3):
    acc, f1, acc_multi, sens, spec, score = evaluate(j, test_loaders[i], class_specific=True)
    all_sens.append(sens)
    all_spec.append(spec)
    all_score.append(score)

  mean_sens = np.array(all_sens).mean()
  mean_spec = np.array(all_spec).mean()
  mean_score = np.array(all_score).mean()
  fin_sens.append(mean_sens)
  fin_spec.append(mean_spec)
  fin_score.append(mean_score)

mean_sens = np.array(fin_sens).mean()
mean_spec = np.array(fin_spec).mean()
mean_score = np.array(fin_score).mean()
print('Average sensitivity: ', mean_sens, '+-', st.sem(fin_sens),
      '\nAverage specificity: ', mean_spec, '+-', st.sem(fin_spec),
      '\nAverage balanced accuracy: ', mean_score, '+-', st.sem(fin_score))

Average sensitivity:  0.3422222222222222 +- 0.29566640865944127 
Average specificity:  0.7748742569730224 +- 0.22286439490541643 
Average balanced accuracy:  0.5585482395976223 +- 0.03685498222803084
