In [1]:
# Set up
import numpy as np
import pandas as pd
import random

import sys
sys.path.append('..')

SEED = 20
random.seed(SEED)
np.random.seed(SEED)

# Load all image paths
dataset_path = '../Datasets/CIFAR10'
data_df = pd.read_csv(dataset_path + '/data.csv')

# Dataset Partition

In [2]:
# Split test dataset and private (data owner) dataset
test_dataset = data_df.groupby('label').sample(frac=.2) 
private_dataset = data_df.iloc[[i for i in data_df.index if i not in test_dataset.index]]

In [3]:
# get label map
id2label = data_df[['label', 'label_name']].drop_duplicates().sort_values('label').set_index('label').to_dict()['label_name']
label2id = {v:k for k,v in id2label.items()}
label2id

{'plane': 0,
 'car': 1,
 'bird': 2,
 'cat': 3,
 'deer': 4,
 'dog': 5,
 'frog': 6,
 'horse': 7,
 'ship': 8,
 'truck': 9}

In [4]:
# Generate a random hundreds, default: between 500 ~ 3000
def get_random_hundreds(low=500, high=3000):
    return round(random.randint(low//100, high//100)) * 100

In [5]:
# Generate data owner datasets by labels
def generate_data_owner_datasets(labels):
    label_count_info = {label:get_random_hundreds() for label in labels}
    return label_count_info

In [6]:
# Set data owner label information
data_owner_label_info = {
    'A': ('trainer', 'plane, car, ship, horse, truck, other'),
    'B': ('trainer', 'truck, car, other'),
    'C': ('trainer', 'plane, truck, ship, other'),
    'D': ('trainer', 'ship, plane'),
    'E': ('trainer', 'plane, car, ship, other'),  # (overlapping dataset)
    'F': ('trainer', 'cat, dog, other'),  # (non existing labels)
    'G': ('trainer', 'truck, other'),  # (drop below baseline models)
    'T1': ('trainer', 'truck, car'),  # (all labels below baseline)
    'T2': ('trainer', 'plane, ship'),  # (one label below baseline)
    'T3': ('trainer', 'cat, dog'),  # (large model)
    'X': ('client', 'truck'),
    'Y': ('client', 'truck'), # (select models above baseline)
    'Z': ('client', 'plane, ship'), # (multiple labels)
    'T4': ('client', 'horse') # (non existing)
}
data_owner_label_info = {k:(v[0], v[1].split(', ')) for k,v in data_owner_label_info.items()}
data_owner_label_count_info = {k:{label:get_random_hundreds() for label in v[1]} for k,v in data_owner_label_info.items()}
all_labels = set()
for k,v in data_owner_label_count_info.items():
    all_labels.update(list(v.keys()))
    print(k, '\t', '\t'.join([f'{i[0]}({i[1]})' for i in v.items()]))

all_labels.remove('other')
print('All Labels:', ', '.join(list(all_labels)))

A 	 plane(2800)	car(2600)	ship(3000)	horse(2900)	truck(900)	other(1300)
B 	 truck(2600)	car(2500)	other(800)
C 	 plane(1500)	truck(2300)	ship(1000)	other(500)
D 	 ship(1800)	plane(1800)
E 	 plane(700)	car(800)	ship(900)	other(1500)
F 	 cat(2000)	dog(2300)	other(1900)
G 	 truck(1800)	other(1100)
T1 	 truck(1100)	car(1500)
T2 	 plane(2500)	ship(2600)
T3 	 cat(1500)	dog(1500)
X 	 truck(1800)
Y 	 truck(700)
Z 	 plane(2500)	ship(2100)
T4 	 horse(3000)
All Labels: car, plane, dog, ship, truck, horse, cat


In [7]:
# Generate a list with the given length
def generate_list_by_sum(m, n):
    arr = [0] * m;
    for i in range(n):
        arr[random.randint(0, m-1)] += 1
    return arr

# update the count for other labels
data_owner_label_count_full_info = {}
for k, v in data_owner_label_count_info.items():
    other_labels = all_labels.difference(set(v.keys()))
    data_owner_label_count_full_info[k] = v
    if 'other' in v:
        total_other_labels = v['other']
        del data_owner_label_count_full_info[k]['other']
        for label, count in zip(other_labels, generate_list_by_sum(len(other_labels), total_other_labels)):
            data_owner_label_count_full_info[k][label] = f'{count}*'
    
    # Set count of extra labels to 0
    extra_labels = all_labels.difference(set(data_owner_label_count_full_info[k].keys()))
    for label in extra_labels:
        data_owner_label_count_full_info[k][label] = ''
    for label in v.keys():
        data_owner_label_count_full_info[k][label] = f'{data_owner_label_count_full_info[k][label]}'

In [8]:
# Convert the count information to pandas data frame
data_owner_info_df = pd.DataFrame(data_owner_label_count_full_info).T
data_owner_info_df

Unnamed: 0,plane,car,ship,horse,truck,dog,cat
A,2800,2600,3000,2900,900,683*,617*
B,183*,2500,161*,153*,2600,168*,135*
C,1500,130*,1000,111*,2300,143*,116*
D,1800,,1800,,,,
E,700,800,900,381*,369*,377*,373*
F,380*,407*,353*,415*,345*,2300,2000
G,181*,182*,198*,188*,1800,159*,192*
T1,,1500,,,1100,,
T2,2500,,2600,,,,
T3,,,,,,1500,1500


In [9]:
from core.ai.dataset import get_dataset

# Get data owner dataset
data_owner_dict = {}
for data_owner_id in data_owner_info_df.index:
    data_owner_info = data_owner_info_df.T[data_owner_id].to_dict()
    data_owner_labels = [k for k, v in data_owner_info.items() if v.isnumeric()]
    data_owner_dataset = get_dataset(
        private_dataset,
        **{k: int(v.replace('*', '')) if len(v) > 0 else 0
           for k, v in data_owner_info.items()})
    
    other_idx = data_owner_dataset.query('label_name not in @data_owner_labels').index
    data_owner_dataset.loc[other_idx, 'label'] = -1
    data_owner_dataset.loc[other_idx, 'label_name'] = 'other'
    data_owner_dict[data_owner_id] = data_owner_dataset
    
    # Sanity check
    ## Check all labels are selected as expected
    all_labels = data_owner_dict[data_owner_id].label_name.unique()
    true_labels = data_owner_label_info[data_owner_id][1]
    assert not len(set(all_labels).difference(true_labels))
    

In [10]:
# Save data owner datasets as excel
with pd.ExcelWriter(dataset_path + 'dataOwnerInfo.xlsx') as writer:
    for data_owner_id in data_owner_dict:
        data_owner_dict[data_owner_id].to_excel(writer, data_owner_id, index=False)
