In [9]:
import copy
import torch
import numpy as np
from torchvision import datasets, transforms
from random import randint

### Part 1 mnist sampling functions

In [10]:
def mnist_iid(dataset, num_users):    
    num_items = int(len(dataset)/num_users)
    dict_users, all_idxs = {}, [i for i in range(len(dataset))]
    for i in range(num_users):
        dict_users[i] = set(np.random.choice(all_idxs, num_items,
                                             replace=False))
        all_idxs = list(set(all_idxs) - dict_users[i])
    return dict_users

def mnist_noniid(dataset, num_users):
    """
    Sample non-I.I.D client data from MNIST dataset
    :param dataset:
    :param num_users:
    :return:
    """
    # 60,000 training imgs -->  200 imgs/shard X 300 shards
    num_shards, num_imgs = 200, 300
    idx_shard = [i for i in range(num_shards)]
    dict_users = {i: np.array([]) for i in range(num_users)}
    idxs = np.arange(num_shards*num_imgs)
    labels = dataset.targets.numpy()

    # sort labels
    idxs_labels = np.vstack((idxs, labels))
    idxs_labels = idxs_labels[:, idxs_labels[1, :].argsort()]
    idxs = idxs_labels[0, :]

    # divide and assign 2 shards/client
    for i in range(num_users):
        rand_set = set(np.random.choice(idx_shard, 2, replace=False))
        idx_shard = list(set(idx_shard) - rand_set)
        for rand in rand_set:
            dict_users[i] = np.concatenate(
                (dict_users[i], idxs[rand*num_imgs:(rand+1)*num_imgs]), axis=0)
    return dict_users

def mnist_noniid_unequal(dataset, num_users, cifar=False):
    """
    Sample non-I.I.D client data from MNIST dataset s.t clients
    have unequal amount of data
    :param dataset:
    :param num_users:
    :returns a dict of clients with each clients assigned certain
    number of training imgs
    """

    # 60,000 training imgs --> 50 imgs/shard X 1200 shards
    num_shards, num_imgs = 300, 200# origin 1200, 50
    # add condition to be reused by cifar
    if cifar:
        num_shards, num_imgs = 250, 200# origin 1000, 50
    idx_shard = [i for i in range(num_shards)]
    dict_users = {i: np.array([]) for i in range(num_users)}
    idxs = np.arange(num_shards*num_imgs)
    labels = np.array(dataset.targets)

    # sort labels
    idxs_labels = np.vstack((idxs, labels))
    idxs_labels = idxs_labels[:, idxs_labels[1, :].argsort()]
    idxs = idxs_labels[0, :]

    # Minimum and maximum shards assigned per client:
    min_shard = 1 
    max_shard = 3 # original is 30

    # Divide the shards into random chunks for every client
    # s.t the sum of these chunks = num_shards
    random_shard_size = np.random.randint(min_shard, max_shard+1,
                                          size=num_users)
    random_shard_size = np.around(random_shard_size /
                                  sum(random_shard_size) * num_shards)
    random_shard_size = random_shard_size.astype(int)

    # Assign the shards randomly to each client
    if sum(random_shard_size) > num_shards:

        for i in range(num_users):
            # First assign each client 1 shard to ensure every client has
            # atleast one shard of data
            rand_set = set(np.random.choice(idx_shard, 1, replace=False))
            idx_shard = list(set(idx_shard) - rand_set)
            for rand in rand_set:
                dict_users[i] = np.concatenate(
                    (dict_users[i], idxs[rand*num_imgs:(rand+1)*num_imgs]),
                    axis=0)

        random_shard_size = random_shard_size-1

        # Next, randomly assign the remaining shards
        for i in range(num_users):
            if len(idx_shard) == 0:
                continue
            shard_size = random_shard_size[i]
            if shard_size > len(idx_shard):
                shard_size = len(idx_shard)
            rand_set = set(np.random.choice(idx_shard, shard_size,
                                            replace=False))
            idx_shard = list(set(idx_shard) - rand_set)
            for rand in rand_set:
                dict_users[i] = np.concatenate(
                    (dict_users[i], idxs[rand*num_imgs:(rand+1)*num_imgs]),
                    axis=0)
    else:

        for i in range(num_users):
            shard_size = random_shard_size[i]
            rand_set = set(np.random.choice(idx_shard, shard_size,
                                            replace=False))
            idx_shard = list(set(idx_shard) - rand_set)
            for rand in rand_set:
                dict_users[i] = np.concatenate(
                    (dict_users[i], idxs[rand*num_imgs:(rand+1)*num_imgs]),
                    axis=0)

        if len(idx_shard) > 0:
            # Add the leftover shards to the client with minimum images:
            shard_size = len(idx_shard)
            # Add the remaining shard to the client with lowest data
            k = min(dict_users, key=lambda x: len(dict_users.get(x)))
            rand_set = set(np.random.choice(idx_shard, shard_size,
                                            replace=False))
            idx_shard = list(set(idx_shard) - rand_set)
            for rand in rand_set:
                dict_users[k] = np.concatenate(
                    (dict_users[k], idxs[rand*num_imgs:(rand+1)*num_imgs]),
                    axis=0)

    return dict_users

### get_mnist_dataset() function

In [11]:

%cd /Users/clairegong/Federated-Learning-PyTorch/

def get_mnist_dataset(args):
    """ Returns train and test datasets and a user group which is a dict where
    the keys are the user index and the values are the corresponding data for
    each of those users.
    """

    apply_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))])

    data_dir = './data/mnist/'

    train_dataset = datasets.MNIST(data_dir, train=True, download=True,
                                   transform=apply_transform)

    test_dataset = datasets.MNIST(data_dir, train=False, download=True,
                                  transform=apply_transform)

    # sample training data amongst users
    if args['iid']:
        # Sample IID user data from Mnist
        user_groups = mnist_iid(train_dataset, args['num_users'])
    else:
        # Sample Non-IID user data from Mnist
        if args['unequal']:
            # Chose uneuqal splits for every user
            user_groups = mnist_noniid_unequal(train_dataset, args['num_users'])
        else:
            # Chose euqal splits for every user
            user_groups = mnist_noniid(train_dataset, args['num_users'])

    return train_dataset, test_dataset, user_groups

/Users/clairegong/Federated-Learning-PyTorch


### test MNIST index sampling
- user_groups is a set

#### MNIST iid: COCO iid can directly copy

#### MNIST niid equal sampler:
1. divide data into num_shards
2. sort data idx according img labels/classes
3. return each user 2 shards of imag idxs; thus 1 dict of num_user idx lists

#### MNIST niid unequal sampler(more realistic to real scenario and performance good):
1. same
2. same
3. !set shard_per_client range
4. assign dict of num_user idx lists to users


In [12]:
args = {}
args['iid'] = 1
args['unequal'] = 1
args['num_users'] = 100

_, _, user_groups = get_mnist_dataset(args)
print(len(user_groups))
idx = randint(0, len(user_groups)-1)
print(len(user_groups[idx]))
# print(user_groups[idx])


100
600


In [13]:
args['iid'] = 0
args['unequal'] = 0
_, _, user_groups = get_mnist_dataset(args)
print(len(user_groups))
idx = randint(0, len(user_groups)-1)
print(len(user_groups[idx]))

100
600


In [14]:
args['iid'] = 0
args['unequal'] = 1
_, _, user_groups = get_mnist_dataset(args)
print(len(user_groups))
idx = randint(0, len(user_groups)-1)
print(len(user_groups[idx]))

100
600


### Part 2 Make coco datasets

In [17]:
%cd '/Users/clairegong/Federated-Learning-PyTorch/cocotools'
path2data = '../data/coco/val2017'
path2ann = '../data/coco/annotations/instances_val2017.json'

from train import train_one_epoch, evaluate, criterion, get_transform
from coco_utils import ConvertCocoPolysToMask, FilterAndRemapCocoCategories
from transforms import Compose
import utils
import torch.utils.data as data
import torch

catIds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, \
          21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,\
          42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, \
          61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]

train_data = datasets.CocoDetection(path2data, path2ann, transforms=Compose([FilterAndRemapCocoCategories(catIds, remap=True), 
                                                                              ConvertCocoPolysToMask(),
                                                                              get_transform(train=True)]))
test_data = datasets.CocoDetection(path2data, path2ann, transforms=Compose([FilterAndRemapCocoCategories(catIds, remap=True), 
                                                                    ConvertCocoPolysToMask(),
                                                                    get_transform(train=False)]))

# # split train and test indice
torch.manual_seed(1)
idxs = torch.randperm(len(train_data)).tolist()
train_data = torch.utils.data.Subset(train_data, idxs[:4000])
test_data = torch.utils.data.Subset(test_data, idxs[4000:])

# following references.sementation.train.main setting
train_loader = torch.utils.data.DataLoader(train_data, batch_size=4, shuffle=True, num_workers=4,\
                                        collate_fn=utils.collate_fn, drop_last=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=4, shuffle=False, num_workers=4,\
                                          collate_fn=utils.collate_fn)
print(len(train_data))
print(len(test_data))
print(len(train_loader))
print(len(test_loader)) 

# check IF loader works
img, target = iter(train_loader).next()
print(img.size())
print(target.size())

/Users/clairegong/Federated-Learning-PyTorch/cocotools
loading annotations into memory...
Done (t=0.54s)
creating index...
index created!
loading annotations into memory...
Done (t=0.57s)
creating index...
index created!
4000
1000
1000
250
torch.Size([4, 3, 480, 480])
torch.Size([4, 480, 480])


In [30]:
idx= randint(0, len(train_data)-1)
target = train_data[idx][1]
target_class = np.unique(target.numpy())
pixels_per_class = []
for c in target_class:
    pixels = np.where(target == c)[0].size
    pixels_per_class.append(pixels)
print(target_class)
print(pixels_per_class)
sum(pixels_per_class) == 480**2

[  0  16  60  64  74  78 255]
[121820, 4022, 74017, 2182, 17452, 928, 9979]


True

### Make coco sampler

In [47]:
# helper functon, spend 40mins on this, too long!!!

def convert_coco_mask_to_top_class(dataset):
    # return the numpy array of top class of each img
    targets = [] 
    for (_, target) in dataset:
        classes = np.unique(target.numpy()) # a sorted array
         # remove background class 0, 255
        if len(classes) and classes[0] == 0:
            classes = classes[1:]
        if len(classes) and classes[-1] == 255:
            classes = classes[:-1]

        if len(classes) == 0:
            targets.append(0)
        elif len(classes) == 1:
            targets.append(classes[0])
        else:
            pixels_per_class = []
            for c in classes:
                pixels = len(np.where(target==c)[0])
                pixels_per_class.append(pixels)
            # get the top class with most pixels
            top_class = classes[np.argmax(pixels_per_class)]
            targets.append(top_class) 
    return np.array(targets)
        

In [60]:
def coco_iid(dataset, num_users):    
    num_items = int(len(dataset)/num_users)
    dict_users, all_idxs = {}, [i for i in range(len(dataset))]
    for i in range(num_users):
        dict_users[i] = set(np.random.choice(all_idxs, num_items,
                                             replace=False))
        all_idxs = list(set(all_idxs) - dict_users[i])
    return dict_users

def coco_noniid(dataset, num_users, labels):
    """
    Sample non-I.I.D client data from MNIST dataset
    :param dataset:
    :param num_users:
    :return:
    """
    # 60,000 training imgs -->  200 imgs/shard X 300 shards
    num_shards = 200
    num_imgs = len(dataset) // num_shards 
    idx_shard = [i for i in range(num_shards)]
    dict_users = {i: np.array([]) for i in range(num_users)}
    idxs = np.arange(num_shards*num_imgs)

    # sort labels
    idxs_labels = np.vstack((idxs, labels))
    idxs_labels = idxs_labels[:, idxs_labels[1, :].argsort()]
    idxs = idxs_labels[0, :]

    # divide and assign 2 shards/client
    for i in range(num_users):
        rand_set = set(np.random.choice(idx_shard, 2, replace=False))
        idx_shard = list(set(idx_shard) - rand_set)
        for rand in rand_set:
            dict_users[i] = np.concatenate(
                (dict_users[i], idxs[rand*num_imgs:(rand+1)*num_imgs]), axis=0)
    return dict_users

def coco_noniid_unequal(dataset, num_users, labels):
    """
    Sample non-I.I.D client data from MNIST dataset s.t clients
    have unequal amount of data
    :param dataset:
    :param num_users:
    :returns a dict of clients with each clients assigned certain
    number of training imgs
    """

    # 60,000 training imgs --> 50 imgs/shard X 1200 shards
    num_shards = 1000
    num_imgs = len(dataset) // num_shards
    idx_shard = [i for i in range(num_shards)]
    dict_users = {i: np.array([]) for i in range(num_users)}
    idxs = np.arange(num_shards*num_imgs)

    # sort labels
    idxs_labels = np.vstack((idxs, labels))
    idxs_labels = idxs_labels[:, idxs_labels[1, :].argsort()]
    idxs = idxs_labels[0, :]

    # Minimum and maximum shards assigned per client:
    min_shard = 1 
    max_shard = 30 # original is 30

    # Divide the shards into random chunks for every client
    # s.t the sum of these chunks = num_shards
    random_shard_size = np.random.randint(min_shard, max_shard+1,
                                          size=num_users)
    random_shard_size = np.around(random_shard_size /
                                  sum(random_shard_size) * num_shards)
    random_shard_size = random_shard_size.astype(int)

    # Assign the shards randomly to each client
    if sum(random_shard_size) > num_shards:

        for i in range(num_users):
            # First assign each client 1 shard to ensure every client has
            # atleast one shard of data
            rand_set = set(np.random.choice(idx_shard, 1, replace=False))
            idx_shard = list(set(idx_shard) - rand_set)
            for rand in rand_set:
                dict_users[i] = np.concatenate(
                    (dict_users[i], idxs[rand*num_imgs:(rand+1)*num_imgs]),
                    axis=0)

        random_shard_size = random_shard_size-1

        # Next, randomly assign the remaining shards
        for i in range(num_users):
            if len(idx_shard) == 0:
                continue
            shard_size = random_shard_size[i]
            if shard_size > len(idx_shard):
                shard_size = len(idx_shard)
            rand_set = set(np.random.choice(idx_shard, shard_size,
                                            replace=False))
            idx_shard = list(set(idx_shard) - rand_set)
            for rand in rand_set:
                dict_users[i] = np.concatenate(
                    (dict_users[i], idxs[rand*num_imgs:(rand+1)*num_imgs]),
                    axis=0)
    else:

        for i in range(num_users):
            shard_size = random_shard_size[i]
            rand_set = set(np.random.choice(idx_shard, shard_size,
                                            replace=False))
            idx_shard = list(set(idx_shard) - rand_set)
            for rand in rand_set:
                dict_users[i] = np.concatenate(
                    (dict_users[i], idxs[rand*num_imgs:(rand+1)*num_imgs]),
                    axis=0)

        if len(idx_shard) > 0:
            # Add the leftover shards to the client with minimum images:
            shard_size = len(idx_shard)
            # Add the remaining shard to the client with lowest data
            k = min(dict_users, key=lambda x: len(dict_users.get(x)))
            rand_set = set(np.random.choice(idx_shard, shard_size,
                                            replace=False))
            idx_shard = list(set(idx_shard) - rand_set)
            for rand in rand_set:
                dict_users[k] = np.concatenate(
                    (dict_users[k], idxs[rand*num_imgs:(rand+1)*num_imgs]),
                    axis=0)

    return dict_users

### test COCO index sampling
- user_groups is a set

#### MNIST iid: COCO iid can directly copy

#### MNIST niid equal sampler:
1. divide data into num_shards
2. sort data idx according img labels/classes
3. return each user 2 shards of imag idxs; thus 1 dict of num_user idx lists

#### MNIST niid unequal sampler(more realistic to real scenario and performance good):
1. same
2. same
3. !set shard_per_client range
4. assign dict of num_user idx lists to users

#### COCO niid unequal sampler (sort by top class)
1. same
2. make helper function to sort idx based on top class in a image
3. same
4. same

#### COCO niid unequal sampler (one shard one top class)
1. skip
2. make helper function to sort idx based on top class in a image
3. make each class a shard
4. same


In [49]:
def get_coco_user_groups(args, train_dataset, labels):
    """ Returns train and test datasets and a user group which is a dict where
    the keys are the user index and the values are the corresponding data for
    each of those users.
    """
    

    # sample training data amongst users
    if args['iid']:
        # Sample IID user data from coco
        user_groups = coco_iid(train_dataset, args['num_users'])
    else:
        # Sample Non-IID user data from coco
        if args['unequal']:
            # Chose uneuqal splits for every user
            user_groups = coco_noniid_unequal(train_dataset, args['num_users'], labels)
        else:
            # Chose euqal splits for every user
            user_groups = coco_noniid(train_dataset, args['num_users'], labels)

    return user_groups

labels = convert_coco_mask_to_top_class(train_data)

In [62]:
# labels takes too long to compute, should save for reuse
args['iid'] = 0
args['unequal'] = 0
user_groups = get_coco_user_groups(args, train_data, labels)
print(len(user_groups))
idx = randint(0, len(user_groups)-1)
print(len(user_groups[idx]))
print(user_groups[idx])

100
40
[1547. 3176. 2946. 2218. 3424. 1367. 2071. 3938. 3861. 1092. 3339. 3559.
 2552. 3342. 3876. 3083. 1742. 1118. 1721.  489.  404. 2178. 2174. 3790.
  290. 3883.  304. 2759. 3272.  593. 2832.  315.  620.  338.  626. 2839.
 3165.  353. 3746. 1902.]


In [76]:
args['iid'] = 0
args['unequal'] = 1
user_groups = get_coco_user_groups(args, train_data, labels)
print(len(user_groups))
idx = randint(0, len(user_groups)-1)
print(len(user_groups[idx]))
print(user_groups[idx])

100
20
[ 260.  940.  564. 1684. 1382.  958. 2526. 3635.   35. 3768. 2774. 2868.
 2102. 2104. 2219. 2216. 1233. 1232. 1231. 2573.]


In [81]:
args['iid'] = 1
args['unequal'] = 0
user_groups = get_coco_user_groups(args, train_data, labels)
print(len(user_groups))
idx = randint(0, len(user_groups)-1)
print(len(user_groups[idx]))
print(user_groups[idx])

100
40
{1152, 389, 3975, 1804, 3983, 784, 657, 402, 2454, 2712, 537, 155, 3742, 1825, 2342, 295, 1194, 3120, 2682, 2488, 2489, 3904, 3139, 1091, 3659, 1484, 3663, 2898, 87, 2521, 3545, 3547, 1505, 1890, 106, 1651, 3445, 1655, 3834, 1147}
