# Description for generating data folders

- In this notebook, we provide the source code for splitting the dataset into the non-IID distributions used in the paper.

- In this scenario, we generate the split client data as `csv` files.

- Each new data folder is named `[distribution]_[client_num]/[parameter_num]`, e.g. `quantity_noniid_20/alpha_1`.

In [None]:
import os
import pandas as pd
import numpy as np

# Load the original dataset used in the paper and make sure labels are integers.
data = pd.read_csv('../data/all.csv', lineterminator='\n')
data['label'] = data['label'].astype(int)

# d2 holds every spam tweet (label 1); d1 is a random sample of 26346 non-spam
# tweets (label 0) from the twitter archive, which equals the number of spam tweets.
d2 = data[data['label'] == 1]
d1 = data[data['label'] == 0].sample(26346, random_state=21)

# 9:1 train/test split, done per class: 90% of each class is sampled for
# training and the rows that were not sampled become the test set.
d1_train, d2_train = (part.sample(frac=0.9, random_state=21) for part in (d1, d2))
d1_test = d1.loc[~d1.index.isin(d1_train.index)]
d2_test = d2.loc[~d2.index.isin(d2_train.index)]

# d is the train dataset and d_test is the test dataset (non-spam rows first,
# then spam rows, with a fresh 0..n-1 index).
d = pd.concat([d1_train, d2_train], ignore_index=True)
d_test = pd.concat([d1_test, d2_test], ignore_index=True)


## Non-IID distributions for each client.


Quantity-based non-IID: a single Dirichlet distribution decides the client sizes (the same proportions are used for both labels).

Label-based non-IID: two Dirichlet distributions (one per label) decide the client sizes.

In [None]:
def quantity_imbalance_new(distribution, save_dir):
    """Split the global training frame ``d`` into one CSV file per client.

    Row ``i`` of ``distribution`` is applied to the rows of ``d`` whose
    ``label`` equals ``i``; entry ``distribution[i][j]`` is the fraction of
    those rows given to client ``j``.  The result for client ``j`` is written
    to ``<save_dir>/client<j>.csv``.

    Args:
        distribution: 2-D sequence of fractions, indexed ``[label][client]``.
        save_dir: Output directory.  It is created together with any missing
            parent directories, so nested targets such as
            ``quantity_noniid_20/alpha_1`` work on their own.

    Notes:
        * The last client of every row receives all rows of that label that
          are still unassigned, so rounding never drops data.
        * Client files are reset while the first row (``i == 0``) is processed
          and appended to for later rows, so stale files from an earlier run
          are overwritten.
    """
    # os.mkdir would fail when the parent folder does not exist yet.
    os.makedirs(save_dir, exist_ok=True)

    for i in range(len(distribution)):
        d_label = d[d['label'] == i]
        d_all = len(d_label)
        last_client = len(distribution[i]) - 1

        for j in range(len(distribution[i])):
            client_path = os.path.join(save_dir, 'client{}.csv'.format(j))
            if i == 0:
                # Start every client from an empty file with the expected header.
                empty = pd.DataFrame(columns=['label', 'text', 'lang'])
                empty.to_csv(client_path, index=False)
            client_before = pd.read_csv(client_path, header=0)

            # The share is always computed from the full size of this label.
            num_of_allocate = round(d_all * distribution[i][j])

            if num_of_allocate <= len(d_label) and j < last_client:
                client_after = d_label.sample(num_of_allocate, random_state=1)
            else:
                # Last client, or not enough rows left: hand over the remainder.
                client_after = d_label
            # Remove the rows just assigned so no row goes to two clients.
            d_label = d_label[~d_label.index.isin(client_after.index)]

            client = pd.concat([client_before, client_after], ignore_index=True)
            client.to_csv(client_path, index=False)

## Consistent label imbalance (CLI)

In [None]:
# spam and non-spam imbalance
def _take_share(pool, count, is_last):
    """Return ``(taken, remaining)`` after drawing ``count`` rows from ``pool``.

    When ``is_last`` is true, or ``count`` exceeds the rows left, the whole
    pool is taken.
    """
    if count <= len(pool) and not is_last:
        taken = pool.sample(count, random_state=1)
    else:
        taken = pool
    return taken, pool[~pool.index.isin(taken.index)]


def spam_non_spam_imbalance(distribution, save_dir, non_spam_ratio, spam_ratio):
    """Write per-client CSV files with a fixed non-spam : spam ratio.

    The global frames ``d1_train`` (non-spam) and ``d2_train`` (spam) are
    used.  The class with the larger ratio is kept in full and the other class
    is down-sampled so that
    ``len(non_spam) : len(spam) == non_spam_ratio : spam_ratio``.
    Each row of ``distribution`` then gives the fraction of both classes that
    every client receives; client ``j`` is written to
    ``<save_dir>/client<j>.csv``.

    Args:
        distribution: 2-D sequence of fractions, one column per client.
        save_dir: Output directory; created with all missing parents.
        non_spam_ratio: Relative amount of non-spam rows.
        spam_ratio: Relative amount of spam rows.

    Notes:
        The last client receives everything still unassigned.  Client files
        are reset while the first row of ``distribution`` is processed.
    """
    # The previous code tested ``save_dir`` but created its parent, which
    # raised FileExistsError whenever the parent existed and the leaf did not.
    os.makedirs(save_dir, exist_ok=True)

    for i in range(len(distribution)):
        if non_spam_ratio >= spam_ratio:
            non_spam_train = d1_train
            # Both ratios are used; with spam_ratio == 1 this equals the
            # former len(non_spam_train) / non_spam_ratio.
            spam_train = d2_train.sample(
                round(len(non_spam_train) * spam_ratio / non_spam_ratio), random_state=42)
        else:
            spam_train = d2_train
            non_spam_train = d1_train.sample(
                round(len(spam_train) * non_spam_ratio / spam_ratio), random_state=42)

        # Shares are always computed from the full class sizes.
        st = len(spam_train)
        nt = len(non_spam_train)
        last_client = len(distribution[i]) - 1

        for j in range(len(distribution[i])):
            client_path = os.path.join(save_dir, 'client{}.csv'.format(j))
            if i == 0:
                # Start every client from an empty file with the expected header.
                empty = pd.DataFrame(columns=['label', 'text', 'lang'])
                empty.to_csv(client_path, index=False)
            client_before = pd.read_csv(client_path, header=0)

            spam_num_of_allocate = round(st * distribution[i][j])
            non_spam_num_of_allocate = round(nt * distribution[i][j])
            is_last = j >= last_client

            client_after_spam, spam_train = _take_share(
                spam_train, spam_num_of_allocate, is_last)
            client_after_non_spam, non_spam_train = _take_share(
                non_spam_train, non_spam_num_of_allocate, is_last)

            client_after = pd.concat([client_after_spam, client_after_non_spam], ignore_index=True)
            client = pd.concat([client_before, client_after], ignore_index=True)
            client.to_csv(client_path, index=False)

## Initialization of non-IID distribution.

In [None]:
def get_distribution_quantity_noniid(alpha, client_num, class_num, dataset):
    """Draw client proportions for the quantity-based non-IID split.

    A single Dirichlet(``alpha``) vector of length ``client_num`` is drawn,
    up to 10000 times, until every entry rounds to at least one sample of
    ``len(dataset)``; ``'ok'`` is printed when such a draw is found.  If no
    draw qualifies, the last one is used as is.

    Args:
        alpha: Dirichlet concentration parameter shared by all clients.
        client_num: Number of clients.
        class_num: Accepted for a uniform signature; not used here.
        dataset: Sized object whose length is used for the zero-share check.

    Returns:
        Array of shape ``(2, client_num)`` holding the same proportions twice,
        so that both labels are split identically and only the quantity
        differs between clients.
    """
    total = len(dataset)
    for _attempt in range(10000):
        distribution_quantity = np.random.dirichlet([alpha] * client_num, 1)
        # Accept the draw only when no client would end up with zero samples.
        if all(round(total * share) != 0 for share in distribution_quantity[0]):
            print('ok')
            break

    # Duplicate the row so the spam / non-spam ratio stays the same per client.
    return np.concatenate([distribution_quantity, distribution_quantity], axis=0)

def get_distribution_label_noniid(alpha, client_num, class_num, dataset):
    """Draw client proportions for the label-based non-IID split.

    One independent Dirichlet(``alpha``) vector of length ``client_num`` is
    drawn per class.  Drawing is repeated, up to 10000 times, until every
    entry of every class row rounds to at least one sample of
    ``len(dataset)``; ``'ok'`` is printed when such a draw is found.  If no
    draw qualifies, the last one is returned as is.

    Args:
        alpha: Dirichlet concentration parameter shared by all clients.
        client_num: Number of clients.
        class_num: Number of classes (rows of the result).
        dataset: Sized object whose length is used for the zero-share check.
            NOTE(review): the check uses the total length, not the per-class
            size, so it is only an approximation of the real allocation.

    Returns:
        Array of shape ``(class_num, client_num)``; each row sums to 1.
    """
    total = len(dataset)
    for _attempt in range(10000):
        distribution_quantity = np.random.dirichlet([alpha] * client_num, class_num)
        # Check every class row, not just the first one: otherwise a client
        # could still receive zero samples of one of the other labels.
        if all(round(total * share) != 0 for share in distribution_quantity.ravel()):
            print('ok')
            break

    return distribution_quantity

def get_distribution_spam_non_spam_imbalance(alpha, client_num, class_num, dataset):
    """Draw client proportions for the spam / non-spam imbalance split.

    A single Dirichlet(``alpha``) vector of length ``client_num`` is drawn,
    up to 10000 times, until no entry rounds to zero samples of
    ``len(dataset)``; ``'ok'`` is printed when such a draw is found.  If no
    draw qualifies, the last one is returned as is.

    Args:
        alpha: Dirichlet concentration parameter shared by all clients.
        client_num: Number of clients.
        class_num: Accepted for a uniform signature; not used here.
        dataset: Sized object whose length is used for the zero-share check.

    Returns:
        Array of shape ``(1, client_num)`` whose single row sums to 1.
    """
    attempts_left = 10000
    while attempts_left > 0:
        attempts_left -= 1
        distribution_quantity = np.random.dirichlet([alpha] * client_num, 1)
        # Clients whose share would round down to zero samples.
        starved = [share for share in distribution_quantity[0]
                   if round(len(dataset) * share) == 0]
        if not starved:
            print('ok')
            break

    return distribution_quantity

## Generate all the datasets in the experiments.

In [None]:
# Configuration of the experiment grid.
train_dataset = pd.read_csv("../data/exampleTrain.csv")
alpha_list = [0.5, 1, 5, 10]
client_num_list = [200,20]
class_num = 2
data_path = '../data'
distribution_discription_list = ['quantity_noniid', 'label_noniid', 'imbalance']

for alpha in alpha_list:
    for client_num in client_num_list:
        for distribution in distribution_discription_list:
            # Parent folder "<distribution>_<client_num>" is created on demand.
            folder_dir = os.path.join(data_path, '{}_{}'.format(distribution, client_num))
            # Avoid a "." in the folder name: alpha 0.5 is stored as "alpha_05".
            alpha_tag = '05' if alpha == 0.5 else alpha
            distribution_path = '{}_{}/alpha_{}'.format(distribution, client_num, alpha_tag)
            save_dir = os.path.join(data_path, distribution_path)
            if not os.path.exists(folder_dir):
                os.mkdir(folder_dir)

            if distribution == 'quantity_noniid':
                quantity_imbalance_new(
                    get_distribution_quantity_noniid(alpha, client_num, class_num, train_dataset),
                    save_dir=save_dir,
                )
            elif distribution == 'label_noniid':
                quantity_imbalance_new(
                    get_distribution_label_noniid(alpha, client_num, class_num, train_dataset),
                    save_dir=save_dir,
                )
            elif distribution == 'imbalance' and alpha == 1 and client_num in (20, 200):
                # The imbalance data is only generated for alpha == 1, once per
                # ratio setting: (folder tag, non_spam_ratio, spam_ratio).
                for ratio_tag, non_spam_ratio, spam_ratio in (('b4', 4, 1), ('b1', 1, 1), ('b0_25', 1, 4)):
                    spam_non_spam_imbalance(
                        get_distribution_spam_non_spam_imbalance(alpha, client_num, class_num, train_dataset),
                        '../data/imbalance_{}_{}/alpha1/'.format(client_num, ratio_tag),
                        non_spam_ratio,
                        spam_ratio,
                    )
                    