In [38]:
# Set up
import pandas as pd
import random
SEED = 20
random.seed(SEED)

# Load all image paths
dataset_path = '../Datasets/CIFAR10'
data_df = pd.read_csv(dataset_path + '/data.csv')

# Dataset Partition

In [39]:
# Split test dataset and private (data owner) dataset
test_dataset = data_df.groupby('label').sample(frac=.2) 
private_dataset = data_df.iloc[[i for i in data_df.index if i not in test_dataset.index]]

In [40]:
# get label map
id2label = data_df[['label', 'label_name']].drop_duplicates().sort_values('label').set_index('label').to_dict()['label_name']
label2id = {v:k for k,v in id2label.items()}
label2id

{'plane': 0,
 'car': 1,
 'bird': 2,
 'cat': 3,
 'deer': 4,
 'dog': 5,
 'frog': 6,
 'horse': 7,
 'ship': 8,
 'truck': 9}

In [41]:
# Generate a random hundreds, default: between 500 ~ 3000
def get_random_hundreds(low=500, high=3000):
    return round(random.randint(low//100, high//100)) * 100

In [42]:
# Generate data owner datasets by labels
def generate_data_owner_datasets(labels):
    label_count_info = {label:get_random_hundreds() for label in labels}
    return label_count_info

In [43]:
# Set data owner label information
data_owner_label_info = {
    'A': ('trainer', 'plane, car, ship, boat, truck, other'),
    'B': ('trainer', 'truck, car, other'),
    'C': ('trainer', 'plane, truck, ship, other'),
    'D': ('trainer', 'ship, plane'),
    'E': ('trainer', 'plane, car, ship, other'),  # (overlapping dataset)
    'F': ('trainer', 'cat, dog, other'),  # (non existing labels)
    'G': ('trainer', 'truck, other'),  # (drop below baseline models)
    'T1': ('trainer', 'truck, car'),  # (all labels below baseline)
    'T2': ('trainer', 'plane, boat'),  # (one label below baseline)
    'T3': ('trainer', 'cat, dog'),  # (large model)
    'X': ('client', 'truck'),
    'Y': ('client', 'truck'), # (select models above baseline)
    'Z': ('client', 'plane, ship'), # (multiple labels)
    'T4': ('client', 'horse') # (non existing)
}
data_owner_label_info = {k:(v[0], v[1].split(', ')) for k,v in data_owner_label_info.items()}
data_owner_label_count_info = {k:{label:get_random_hundreds() for label in v[1]} for k,v in data_owner_label_info.items()}
all_labels = set()
for k,v in data_owner_label_count_info.items():
    all_labels.update(list(v.keys()))
    print(k, '\t', '\t'.join([f'{i[0]}({i[1]})' for i in v.items()]))

all_labels.remove('other')
print('All Labels:', ', '.join(list(all_labels)))

A 	 plane(2800)	car(2600)	ship(3000)	boat(2900)	truck(900)	other(1300)
B 	 truck(2600)	car(2500)	other(800)
C 	 plane(1500)	truck(2300)	ship(1000)	other(500)
D 	 ship(1800)	plane(1800)
E 	 plane(700)	car(800)	ship(900)	other(1500)
F 	 cat(2000)	dog(2300)	other(1900)
G 	 truck(1800)	other(1100)
T1 	 truck(1100)	car(1500)
T2 	 plane(2500)	boat(2600)
T3 	 cat(1500)	dog(1500)
X 	 truck(1800)
Y 	 truck(700)
Z 	 plane(2500)	ship(2100)
T4 	 horse(3000)
All Labels: car, plane, cat, horse, truck, dog, ship, boat


In [44]:
# Generate a list with the given length
def generate_list_by_sum(m, n):
    arr = [0] * m;
    for i in range(n):
        arr[random.randint(0, m-1)] += 1
    return arr

# update the count for other labels
data_owner_label_count_full_info = {}
for k, v in data_owner_label_count_info.items():
    other_labels = all_labels.difference(set(v.keys()))
    data_owner_label_count_full_info[k] = v
    if 'other' in v:
        total_other_labels = v['other']
        del data_owner_label_count_full_info[k]['other']
        for label, count in zip(other_labels, generate_list_by_sum(len(other_labels), total_other_labels)):
            data_owner_label_count_full_info[k][label] = f'{count}*'
    
    # Set count of extra labels to 0
    extra_labels = all_labels.difference(set(data_owner_label_count_full_info[k].keys()))
    for label in extra_labels:
        data_owner_label_count_full_info[k][label] = ''
    for label in v.keys():
        data_owner_label_count_full_info[k][label] = f'{data_owner_label_count_full_info[k][label]}'

In [45]:
# Convert the count information to pandas data frame
data_owner_info_df = pd.DataFrame(data_owner_label_count_full_info).T
data_owner_info_df

Unnamed: 0,plane,car,ship,boat,truck,dog,horse,cat
A,2800,2600,3000,2900,900,466*,398*,436*
B,139*,2500,127*,138*,2600,132*,127*,137*
C,1500,110*,1000,84*,2300,96*,107*,103*
D,1800,,1800,,,,,
E,700,800,900,287*,291*,318*,274*,330*
F,341*,307*,327*,315*,296*,2300,314*,2000
G,136*,149*,184*,157*,1800,165*,149*,160*
T1,,1500,,,1100,,,
T2,2500,,,2600,,,,
T3,,,,,,1500,,1500


In [46]:
private_dataset


Unnamed: 0,image,label,label_name
1,00002.png,9,truck
2,00003.png,9,truck
3,00004.png,4,deer
4,00005.png,1,car
5,00006.png,1,car
...,...,...,...
59995,59996.png,8,ship
59996,59997.png,3,cat
59997,59998.png,5,dog
59998,59999.png,1,car


In [47]:
# Get image paths by label and count



In [63]:
import numpy as np

def get_dataset(data_df: pd.DataFrame, label2id, drop=True, **kwargs):
    """Get a custom dataset by defining label and the associated count value or fraction

    Args:
        data_df (pd.DataFrame): _description_
        drop (bool, optional): whether the data selected will be removed from data_df. Defaults to True.
        kwargs: label'

    Returns:
        _type_: _description_
    """

    label_count_df = pd.DataFrame({
        'label': list(kwargs.keys()),
        'count': list(kwargs.values())
    })
    
    label_count_dict = dict(
        zip(label_count_df['label'], label_count_df['count']))
    
    print(label_count_dict)
    
    labels = list(label_count_dict.keys())
    # print(data_df.query(f'label_name == {labels[0]}'))
    # return_data_idx = [i for label in labels
    #                    for i in data_df.query(f'label_name == {label}')
    #                    .sample(label_count_dict[label]).index
    #                    ]
    
    return f'label_name == {labels[0]}'


    
# from Dataset import get_dataset
get_dataset(private_dataset, label2id=label2id, 
            **{k:int(v.replace('*', '')) 
               for k, v in data_owner_info_df.T['A'].to_dict().items()})

{'plane': 2800, 'car': 2600, 'ship': 3000, 'boat': 2900, 'truck': 900, 'dog': 466, 'horse': 398, 'cat': 436}


'label_name == plane'

In [68]:
# data_owner_info_df.T['A'].to_dict()
private_dataset.query('label_name == plane')

UndefinedVariableError: name 'plane' is not defined

In [None]:
label2id