In [1]:
import os
import sys
import logging
import shutil
import random,os
import numpy as np
# Root logger: emit timestamp, level and message for INFO and above.
format_str = '%(asctime)s %(levelname)s %(message)s'
logging.basicConfig(
    format=format_str,
    level=logging.INFO,
)
logging.info('started')


# Description for generating data folders

- This notebook provides the source code for splitting the dataset according to the non-IID distributions described in the paper.

- In this scenario, the split data is copied into a new data folder that contains one subfolder per client plus one test subfolder for the server.

- Each new data folder is named `[client_num][distribution][parameter_num]`, e.g. `3quantity_noniid10`.

### Define all the file paths.

In [2]:
# Every dataset location is derived from the shared data root.
root = '../data/'
benign_path, malware_path, test_path = (
    root + subfolder + '/' for subfolder in ('benign', 'malware', 'test_dataset')
)

### Define the functions.

In [3]:
# Send the test dataset to the target data folder for server side testing.
def get_test(file):
    """Copy the shared test dataset into ``<file>/test/``.

    The ``test`` subfolder is created when it does not exist yet (``file``
    itself must already exist). Every entry listed in ``root + 'test_dataset/'``
    is then copied into it with ``shutil.copy``.

    Args:
        file: Path of the target data folder (without trailing slash).
    """
    target_dir = file+'/test/'
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    source_dir = root+'test_dataset/'
    for entry in os.listdir(source_dir):
        shutil.copy(source_dir+entry, target_dir+entry)
    return

In [4]:
# Copy files into each client dataset folder.
# Sampling without replacement.
def splitIntoClient(folder, curr_files, curr_sample, curr_path, client_num):
    """Distribute files among per-client folders, sampling without replacement.

    For client ``i`` (0-based) the folder ``<folder>/trainer<i+1>/`` is created
    if missing, and randomly chosen files are copied into it from
    ``curr_path``. The last client always receives every file that is still
    unassigned, regardless of its entry in ``curr_sample``.

    Args:
        folder: Target data folder that holds the ``trainer<N>`` subfolders.
        curr_files: List of file names available in ``curr_path``. It is
            consumed in place: assigned names are removed, so the list is
            empty on return. Names are expected to be unique (as produced by
            a directory listing).
        curr_sample: Requested number of files per client, indexed by client.
        curr_path: Source directory, including the trailing separator.
        client_num: Number of clients to split across.
    """
    for client in range(client_num):
        ### Get the sample num for each client.
        tarDir = folder+'/trainer'+str(client+1)+'/'
        os.makedirs(tarDir, exist_ok=True)
        if client == client_num-1:
            # The last client get the rest samples.
            sample_num = len(curr_files)
        else:
            # Rounded quotas can add up to more than the files that are left;
            # clamp so random sampling never asks for more than is available.
            sample_num = max(0, min(curr_sample[client], len(curr_files)))
        ### Randomly choose samples.
        # A dedicated generator seeded with the client index makes the split
        # reproducible for a given file order without touching the global
        # random state (assigning to `random.seed` would clobber the function).
        rng = random.Random(client)
        resultFileName = rng.sample(curr_files, sample_num)
        logging.info("Sample num for client %d: %d", client+1, sample_num)
        ### Copy the chosen samples to tarDir
        for name in resultFileName:
            shutil.copy(curr_path+name, tarDir+name)
        # Remove everything just assigned in a single pass (in place, so the
        # caller's list is updated exactly as before).
        assigned = set(resultFileName)
        curr_files[:] = [name for name in curr_files if name not in assigned]
    return

In [5]:
# Copy files into target folder.
def copyFile(tarDir, fileDir, filenumber, type):
    """Copy a random subset of ``fileDir`` into two target folders.

    ``filenumber`` entries of ``fileDir`` are drawn with ``random.sample`` and
    each one is copied twice: into ``tarDir`` and into the sibling folder
    ``tarDir + '_' + type``. The sibling folder is created here; if it already
    exists the function logs a message and terminates via ``sys.exit()``.

    Args:
        tarDir: Existing folder receiving the combined copy.
        fileDir: Source folder to sample from.
        filenumber: Number of files to draw.
        type: Suffix naming the per-class folder.
    """
    candidates = os.listdir(fileDir)
    typed_dir = tarDir+'_'+type
    if os.path.exists(typed_dir):
        logging.info("Folder not empty!")
        sys.exit()
    os.mkdir(typed_dir)
    ### Randomly choose 'filenumber' samples.
    for name in random.sample(candidates, filenumber):
        source = fileDir+'/'+name
        shutil.copy(source, tarDir+'/'+name)
        shutil.copy(source, typed_dir+'/'+name)
    return

## Quantity based non-IID distribution for each client.

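A single Dirichlet distribution determines each client's share of the data; the same shares are applied to both the benign and the malware samples.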

In [6]:
def quantity_based_noniid(client_num, p, token_path = None):
    """Split benign and malware files across clients using one Dirichlet draw.

    A new folder ``root + str(client_num) + suffix`` is created, where the
    suffix is ``token_path`` when given and ``'quantity_noniid' + str(p)``
    otherwise. If the folder already exists the function logs a message and
    terminates via ``sys.exit()``. One Dirichlet sample with parameter
    ``[p] * client_num`` provides the per-client shares, which are rounded to
    file counts for both classes and handed to ``splitIntoClient``. Finally
    the test dataset is copied with ``get_test``.

    Args:
        client_num: Number of clients.
        p: Dirichlet parameter shared by all clients.
        token_path: Optional folder-name suffix replacing the default one.
    """
    ### The new folder name for all the client datasets.
    suffix = 'quantity_noniid'+str(p) if token_path is None else token_path
    folder = root+str(client_num)+suffix
    if os.path.exists(folder):
        logging.info("Folder %s not empty!", folder)
        sys.exit()
    os.mkdir(folder)
    ### One dirichlet draw gives every client's share of both classes.
    shares = np.random.dirichlet([p]*client_num, 1)[0]
    benign_files = os.listdir(benign_path)
    malware_files = os.listdir(malware_path)
    # Round each share to the nearest whole number of files.
    benign_sample = [int(share * len(benign_files)+0.5) for share in shares]
    malware_sample = [int(share * len(malware_files)+0.5) for share in shares]
    ### Split each class, then copy the test dataset.
    logging.info("Start splitting benign samples.")
    splitIntoClient(folder, benign_files, benign_sample, benign_path, client_num)
    logging.info("Start splitting malware samples.")
    splitIntoClient(folder, malware_files, malware_sample, malware_path, client_num)
    get_test(folder)
    logging.info("Splitting finished.")


# Example run: 3 clients, Dirichlet parameter 10.
client_num, p = 3, 10
quantity_based_noniid(client_num, p)

## Label based non-IID distribution for each client.

Two independent Dirichlet distributions, one for the benign samples and one for the malware samples, determine each client's per-class share.

In [None]:
def label_based_noniid(client_num, p_benign, p_malware):
    """Split benign and malware files across clients using two Dirichlet draws.

    A new folder ``root + str(client_num) + 'label_noniid' + str(p_benign) +
    '_' + str(p_malware)`` is created; if it already exists the function logs
    a message and terminates via ``sys.exit()``. The benign shares are drawn
    first with parameter ``[p_benign] * client_num``, then the malware shares
    with ``[p_malware] * client_num``. Shares are rounded to file counts and
    handed to ``splitIntoClient``; finally the test dataset is copied with
    ``get_test``.

    Args:
        client_num: Number of clients.
        p_benign: Dirichlet parameter for the benign class.
        p_malware: Dirichlet parameter for the malware class.
    """
    ### The new folder name for all the client datasets.
    folder = root+str(client_num)+'label_noniid'+str(p_benign)+'_'+str(p_malware)
    if os.path.exists(folder):
        logging.info("Folder %s not empty!", folder)
        sys.exit()
    os.mkdir(folder)
    ### One dirichlet draw per class (benign first, then malware).
    benign_shares = np.random.dirichlet([p_benign]*client_num, 1)[0]
    malware_shares = np.random.dirichlet([p_malware]*client_num, 1)[0]
    benign_files = os.listdir(benign_path)
    malware_files = os.listdir(malware_path)
    # Round each share to the nearest whole number of files.
    benign_sample = [int(share * len(benign_files)+0.5) for share in benign_shares]
    malware_sample = [int(share * len(malware_files)+0.5) for share in malware_shares]
    ### Split each class, then copy the test dataset.
    logging.info("Start splitting benign samples.")
    splitIntoClient(folder, benign_files, benign_sample, benign_path, client_num)
    logging.info("Start splitting malware samples.")
    splitIntoClient(folder, malware_files, malware_sample, malware_path, client_num)
    get_test(folder)
    logging.info("Splitting finished.")


# Example run: 3 clients, Dirichlet parameter 10 for both classes.
client_num = 3
p_benign = p_malware = 10
label_based_noniid(client_num, p_benign, p_malware)

## Consistent label imbalance (CLI)

Firstly, enforce CLI on the central dataset.

In [None]:
# Convert the central dataset into a consistently label-imbalanced central dataset.
def convertCentral():
    """Build a label-imbalanced copy of the central dataset.

    Reads the module-level ratio variables ``b`` and ``m`` and creates the
    folder ``root + 'CLI' + str(b) + '_' + str(m)``; if it already exists the
    function logs a message and terminates via ``sys.exit()``. It then asks
    ``copyFile`` for ``int(len(malware) / b)`` malware files and
    ``int(len(benign) / m)`` benign files, and logs how many entries the new
    folder contains.
    """
    target = root+'CLI'+str(b)+'_'+str(m)
    if os.path.exists(target):
        logging.info("Folder not empty!")
        sys.exit()
    os.mkdir(target)

    benign_total = len(os.listdir(benign_path))
    malware_total = len(os.listdir(malware_path))
    # The ratio of benign:malware = b:m
    quotas = (
        (malware_path, int(malware_total/b), 'malware'),
        (benign_path, int(benign_total/m), 'benign'),
    )
    for source_dir, quota, label in quotas:
        copyFile(target, source_dir, quota, label)
    logging.info(len(os.listdir(target)))


# convertCentral() reads the benign:malware ratio from the globals b and m.
b, m = 2, 1
client_num = 3
convertCentral()

Secondly, split the unbalanced central dataset into client datasets. 

By default, the client sizes still follow the quantity-based non-IID distribution.

In [None]:
### Store the default path in tmp variables.
tmp_benign_path = benign_path
tmp_malware_path = malware_path
###
m = 1
b = 2
p = 10
client_num = 3
token_path = 'CLI'+str(b)+'_'+str(m)
try:
    # Redefine the path as the unbalanced central datasets
    # (quantity_based_noniid reads benign_path / malware_path as globals).
    benign_path = root+token_path+'_benign/'
    malware_path = root+token_path+'_malware/'
    quantity_based_noniid(client_num, p, token_path=token_path)
finally:
    ### Restore the default path.
    # Done in `finally` so the defaults come back even when the split aborts
    # (e.g. quantity_based_noniid calls sys.exit() if the folder exists);
    # otherwise later cells would silently keep using the CLI folders.
    benign_path = tmp_benign_path
    malware_path = tmp_malware_path