# DataSynthesizer Usage (correlated attribute mode)

> This is a quick demo of how to use DataSynthesizer in correlated attribute mode.

### Step 1 import packages

In [1]:
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network

import pandas as pd
import os
import timeit

### Step 2 user-defined parameters

In [2]:
# All artifacts of this notebook are written below ./out/<mode>/.
mode = 'correlated_attribute_mode'

# CSV file that is described and then synthesized.
input_data = './data/adult.csv'

# Dataset descriptions (JSON): one for the plain model, one for the fair model.
description_file = f'./out/{mode}/description.json'
description_file_fair = f'./out/{mode}/description_fair.json'

# Default locations for the synthetic CSVs of the plain and the fair model.
synthetic_data = f'./out/{mode}/sythetic_data.csv'
sythetic_data_fair = f'./out/{mode}/sythetic_data_fair.csv'

# No random seed is fixed here: the cells below draw a fresh one from
# os.urandom for every run.

In [4]:
# An attribute counts as categorical when its domain size is below this threshold.
# The value was picked with the domain size of "education" (14 in the input
# dataset) in mind.
threshold_value = 30

# Explicit per-attribute declarations; both are left empty here.
categorical_attributes = dict()   # attributes declared categorical
candidate_keys = dict()           # attributes that are candidate keys of the input dataset

# Differential-privacy parameter. Roughly: removing one row from the input
# dataset changes the probability of any given output by at most a
# multiplicative factor of exp(epsilon). Larger values inject less noise;
# epsilon=0 turns differential privacy off.
epsilon = 10

# Maximum number of parents (incoming edges) per node of the Bayesian network.
# NOTE(review): with 0 the printed networks still contain nodes with several
# parents, so 0 presumably lets the library choose the degree itself - confirm.
degree_of_bayesian_network = 0

# Number of tuples to generate per synthetic dataset. It does not have to
# equal the size of the input dataset.
num_tuples_to_generate = 48843

# Attribute roles handed to the fair describer via `attribute_to_sensitive`.
sensitive_attributes = {
    "admissible": [
        'workclass',
        'fnlwgt',
        'education-num',
        'occupation',
        'capital-gain',
        'capital-loss',
        'hours-per-week',
    ],
    "inadmissible": ['race', 'sex', 'native-country'],
    "outcome": ['income>50K'],
}

### Step 3 DataDescriber

1. Instantiate a DataDescriber.
2. Compute the statistics of the dataset.
3. Save dataset description to a file on local machine.

In [None]:
# Draw a fresh 32-bit seed; the same value is passed to both describers.
randseed = int.from_bytes(os.urandom(4), 'big')

# Keyword arguments common to the plain and the fair describe call.
shared_describe_args = dict(
    dataset_file=input_data,
    epsilon=epsilon,
    k=degree_of_bayesian_network,
    attribute_to_is_categorical=categorical_attributes,
    attribute_to_is_candidate_key=candidate_keys,
    seed=randseed,
)

describer = DataDescriber(category_threshold=threshold_value)
describer_fair = DataDescriber(category_threshold=threshold_value)

# Plain correlated-attribute description, saved to description_file.
describer.describe_dataset_in_correlated_attribute_mode(**shared_describe_args)
describer.save_dataset_description_to_file(description_file)

# Fair variant: same arguments plus the sensitive-attribute roles,
# saved to description_file_fair.
describer_fair.describe_dataset_in_fair_correlated_attribute_mode(
    attribute_to_sensitive=sensitive_attributes,
    **shared_describe_args,
)
describer_fair.save_dataset_description_to_file(description_file_fair)



In [None]:
# Print a label followed by the learned Bayesian network, first for the plain
# describer and then for the fair one.
for label, fitted_describer in (("Privbayes", describer),
                                ("FairPrivbayes", describer_fair)):
    print(label)
    display_bayesian_network(fitted_describer.bayesian_network)

### Step 4 generate synthetic dataset

1. Instantiate a DataGenerator.
2. Generate a synthetic dataset.
3. Save it to local machine.

In [None]:
# Destinations of the two synthetic datasets (plain model / fair model).
synthpath = f'./out/{mode}/Bayes_original/sythetic_data.csv'
fairsynthpath = f'./out/{mode}/Bayes_fair/sythetic_data.csv'

# Generate one dataset per saved description: plain first, then fair.
for description_path, output_path in ((description_file, synthpath),
                                      (description_file_fair, fairsynthpath)):
    # The Bayes_original/ and Bayes_fair/ sub-directories are not created
    # anywhere else in this notebook; make sure they exist so that saving does
    # not fail on a fresh checkout.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_path)
    generator.save_synthetic_data(output_path)

## Run experiments

In [None]:
# Repeat the whole pipeline 10 times with the globally configured epsilon.
# Each repetition draws a fresh seed, fits the plain and the fair model with
# that same seed, prints both networks and writes one synthetic dataset per
# model to .../sythetic_data_<i>.csv.
for i in range(10):
    randseed = int.from_bytes(os.urandom(4), 'big')
    describer = DataDescriber(category_threshold=threshold_value)
    describer_fair = DataDescriber(category_threshold=threshold_value)

    describer.describe_dataset_in_correlated_attribute_mode(dataset_file=input_data,
                                                            epsilon=epsilon,
                                                            k=degree_of_bayesian_network,
                                                            attribute_to_is_categorical=categorical_attributes,
                                                            attribute_to_is_candidate_key=candidate_keys,
                                                            seed=randseed)
    # The description files are overwritten on every repetition; only the
    # synthetic datasets are kept per run.
    describer.save_dataset_description_to_file(description_file)

    describer_fair.describe_dataset_in_fair_correlated_attribute_mode(dataset_file=input_data,
                                                            epsilon=epsilon,
                                                            k=degree_of_bayesian_network,
                                                            attribute_to_is_categorical=categorical_attributes,
                                                            attribute_to_is_candidate_key=candidate_keys,
                                                            attribute_to_sensitive=sensitive_attributes,
                                                            seed=randseed)
    describer_fair.save_dataset_description_to_file(description_file_fair)

    print("Privbayes")
    display_bayesian_network(describer.bayesian_network)
    print("FairPrivbayes")
    display_bayesian_network(describer_fair.bayesian_network)

    synthpath = f'./out/{mode}/Bayes_original/sythetic_data_{i}.csv'
    fairsynthpath = f'./out/{mode}/Bayes_fair/sythetic_data_{i}.csv'

    # Generate and save exactly once per model. The earlier version of this
    # cell repeated this step a second time, regenerating both datasets and
    # overwriting the files it had just written.
    for description_path, output_path in ((description_file, synthpath),
                                          (description_file_fair, fairsynthpath)):
        # Make sure the target directory exists before saving.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        generator = DataGenerator()
        generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_path)
        generator.save_synthetic_data(output_path)

In [4]:
# Privacy budgets to sweep over, from least to most noise.
eps = [1000, 100, 10, 1, 0.1]

# For every epsilon, run 10 repetitions. Each repetition draws a fresh seed,
# fits the plain and the fair model with that same seed, prints both networks
# and writes one synthetic dataset per model to .../eps=<e>/sythetic_data_<i>.csv.
for e in eps:
    for i in range(10):
        randseed = int.from_bytes(os.urandom(4), 'big')
        describer = DataDescriber(category_threshold=threshold_value)
        describer_fair = DataDescriber(category_threshold=threshold_value)

        describer.describe_dataset_in_correlated_attribute_mode(dataset_file=input_data,
                                                                epsilon=e,
                                                                k=degree_of_bayesian_network,
                                                                attribute_to_is_categorical=categorical_attributes,
                                                                attribute_to_is_candidate_key=candidate_keys,
                                                                seed=randseed)
        # The description files are overwritten on every repetition.
        describer.save_dataset_description_to_file(description_file)

        describer_fair.describe_dataset_in_fair_correlated_attribute_mode(dataset_file=input_data,
                                                                epsilon=e,
                                                                k=degree_of_bayesian_network,
                                                                attribute_to_is_categorical=categorical_attributes,
                                                                attribute_to_is_candidate_key=candidate_keys,
                                                                attribute_to_sensitive=sensitive_attributes,
                                                                seed=randseed)
        describer_fair.save_dataset_description_to_file(description_file_fair)

        print("Privbayes")
        display_bayesian_network(describer.bayesian_network)
        print("FairPrivbayes")
        display_bayesian_network(describer_fair.bayesian_network)

        synthpath = f'./out/{mode}/Bayes_original/eps={e}/sythetic_data_{i}.csv'
        fairsynthpath = f'./out/{mode}/Bayes_fair/eps={e}/sythetic_data_{i}.csv'

        for description_path, output_path in ((description_file, synthpath),
                                              (description_file_fair, fairsynthpath)):
            # Every epsilon gets its own eps=<e>/ directory, which nothing else
            # in this notebook creates; create it so that saving cannot fail
            # on a missing directory.
            os.makedirs(os.path.dirname(output_path), exist_ok=True)

            generator = DataGenerator()
            generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_path)
            generator.save_synthetic_data(output_path)

Adding ROOT race
Adding attribute education-num
Adding attribute marital-status
Adding attribute occupation
Adding attribute workclass
Adding attribute fnlwgt
Adding attribute capital-gain
Adding attribute hours-per-week
Adding attribute relationship
Adding attribute sex
Adding attribute capital-loss
Adding attribute native-country
Adding attribute age
Adding attribute income>50K
Adding ROOT race
Adding attribute education-num
Adding attribute marital-status
Adding attribute workclass
Adding attribute capital-gain
Adding attribute fnlwgt
Adding attribute hours-per-week
Adding attribute relationship
Adding attribute sex
Adding attribute native-country
Adding attribute age
Adding attribute capital-loss
Adding attribute occupation
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    education-num  has parents ['race'].
    marital-status has parents ['education-num', 'race'].
    occupation     has parents ['education-num', 'marital-status', 'race'].
    workclass      

Adding ROOT relationship
Adding attribute income>50K
Adding attribute education-num
Adding attribute capital-loss
Adding attribute marital-status
Adding attribute sex
Adding attribute fnlwgt
Adding attribute capital-gain
Adding attribute native-country
Adding attribute workclass
Adding attribute occupation
Adding attribute race
Adding attribute hours-per-week
Adding attribute age
Adding ROOT relationship
Adding attribute native-country
Adding attribute education-num
Adding attribute capital-loss
Adding attribute marital-status
Adding attribute hours-per-week
Adding attribute fnlwgt
Adding attribute workclass
Adding attribute occupation
Adding attribute sex
Adding attribute race
Adding attribute capital-gain
Adding attribute age
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    income>50K     has parents ['relationship'].
    education-num  has parents ['income>50K', 'relationship'].
    capital-loss   has parents ['income>50K', 'education-num', 'relationship'].
  

Adding ROOT income>50K
Adding attribute sex
Adding attribute capital-loss
Adding attribute hours-per-week
Adding attribute marital-status
Adding attribute occupation
Adding attribute workclass
Adding attribute relationship
Adding attribute age
Adding attribute native-country
Adding attribute education-num
Adding attribute fnlwgt
Adding attribute race
Adding attribute capital-gain
Adding ROOT native-country
Adding attribute sex
Adding attribute occupation
Adding attribute race
Adding attribute fnlwgt
Adding attribute workclass
Adding attribute age
Adding attribute marital-status
Adding attribute capital-gain
Adding attribute education-num
Adding attribute capital-loss
Adding attribute relationship
Adding attribute hours-per-week
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    sex            has parents ['income>50K'].
    capital-loss   has parents ['sex', 'income>50K'].
    hours-per-week has parents ['sex', 'capital-loss', 'income>50K'].
    marital-status has 

Adding ROOT income>50K
Adding attribute native-country
Adding attribute workclass
Adding attribute sex
Adding attribute capital-loss
Adding attribute education-num
Adding attribute capital-gain
Adding attribute hours-per-week
Adding attribute race
Adding attribute fnlwgt
Adding attribute marital-status
Adding attribute relationship
Adding attribute age
Adding attribute occupation
Adding ROOT capital-gain
Adding attribute capital-loss
Adding attribute marital-status
Adding attribute hours-per-week
Adding attribute fnlwgt
Adding attribute workclass
Adding attribute race
Adding attribute relationship
Adding attribute occupation
Adding attribute native-country
Adding attribute education-num
Adding attribute sex
Adding attribute age
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    native-country has parents ['income>50K'].
    workclass      has parents ['native-country', 'income>50K'].
    sex            has parents ['native-country', 'workclass', 'income>50K'].
    

Adding ROOT relationship
Adding attribute sex
Adding attribute workclass
Adding attribute race
Adding attribute capital-gain
Adding attribute capital-loss
Adding attribute native-country
Adding attribute education-num
Adding attribute fnlwgt
Adding attribute hours-per-week
Adding attribute occupation
Adding attribute income>50K
Adding attribute marital-status
Adding attribute age
Adding ROOT relationship
Adding attribute sex
Adding attribute capital-gain
Adding attribute race
Adding attribute marital-status
Adding attribute native-country
Adding attribute occupation
Adding attribute fnlwgt
Adding attribute workclass
Adding attribute education-num
Adding attribute age
Adding attribute capital-loss
Adding attribute hours-per-week
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    sex            has parents ['relationship'].
    workclass      has parents ['sex', 'relationship'].
    race           has parents ['sex', 'workclass', 'relationship'].
    capital-gain   h

Adding ROOT native-country
Adding attribute income>50K
Adding attribute marital-status
Adding attribute relationship
Adding attribute hours-per-week
Adding attribute capital-gain
Adding attribute occupation
Adding attribute race
Adding attribute education-num
Adding attribute age
Adding attribute fnlwgt
Adding attribute sex
Adding attribute workclass
Adding attribute capital-loss
Adding ROOT native-country
Adding attribute sex
Adding attribute workclass
Adding attribute marital-status
Adding attribute hours-per-week
Adding attribute race
Adding attribute fnlwgt
Adding attribute age
Adding attribute education-num
Adding attribute capital-gain
Adding attribute occupation
Adding attribute capital-loss
Adding attribute relationship
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    income>50K     has parents ['native-country'].
    marital-status has parents ['income>50K', 'native-country'].
    relationship   has parents ['income>50K', 'marital-status', 'native-countr

Adding ROOT native-country
Adding attribute hours-per-week
Adding attribute race
Adding attribute marital-status
Adding attribute education-num
Adding attribute income>50K
Adding attribute capital-gain
Adding attribute relationship
Adding attribute fnlwgt
Adding attribute age
Adding attribute workclass
Adding attribute capital-loss
Adding attribute occupation
Adding attribute sex
Adding ROOT native-country
Adding attribute relationship
Adding attribute education-num
Adding attribute capital-loss
Adding attribute workclass
Adding attribute race
Adding attribute occupation
Adding attribute capital-gain
Adding attribute age
Adding attribute hours-per-week
Adding attribute marital-status
Adding attribute fnlwgt
Adding attribute sex
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    hours-per-week has parents ['native-country'].
    race           has parents ['hours-per-week', 'native-country'].
    marital-status has parents ['hours-per-week', 'race', 'native-country'

Adding ROOT age
Adding attribute native-country
Adding attribute education-num
Adding attribute income>50K
Adding attribute capital-loss
Adding attribute sex
Adding attribute marital-status
Adding attribute occupation
Adding attribute fnlwgt
Adding attribute capital-gain
Adding attribute workclass
Adding attribute race
Adding attribute hours-per-week
Adding attribute relationship
Adding ROOT age
Adding attribute native-country
Adding attribute hours-per-week
Adding attribute sex
Adding attribute race
Adding attribute occupation
Adding attribute capital-gain
Adding attribute fnlwgt
Adding attribute relationship
Adding attribute education-num
Adding attribute marital-status
Adding attribute capital-loss
Adding attribute workclass
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    native-country has parents ['age'].
    education-num  has parents ['native-country', 'age'].
    income>50K     has parents ['native-country', 'education-num', 'age'].
    capital-loss   ha

Adding ROOT education-num
Adding attribute relationship
Adding attribute workclass
Adding attribute capital-gain
Adding attribute native-country
Adding attribute fnlwgt
Adding attribute sex
Adding attribute marital-status
Adding attribute hours-per-week
Adding attribute age
Adding attribute income>50K
Adding attribute capital-loss
Adding attribute race
Adding attribute occupation
Adding ROOT education-num
Adding attribute relationship
Adding attribute age
Adding attribute capital-gain
Adding attribute sex
Adding attribute workclass
Adding attribute capital-loss
Adding attribute fnlwgt
Adding attribute hours-per-week
Adding attribute marital-status
Adding attribute native-country
Adding attribute race
Adding attribute occupation
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    relationship   has parents ['education-num'].
    workclass      has parents ['relationship', 'education-num'].
    capital-gain   has parents ['relationship', 'workclass', 'education-num'].

Adding ROOT occupation
Adding attribute fnlwgt
Adding attribute age
Adding attribute income>50K
Adding attribute native-country
Adding attribute race
Adding attribute education-num
Adding attribute workclass
Adding attribute sex
Adding attribute hours-per-week
Adding attribute capital-gain
Adding attribute marital-status
Adding attribute relationship
Adding attribute capital-loss
Adding ROOT occupation
Adding attribute fnlwgt
Adding attribute age
Adding attribute native-country
Adding attribute hours-per-week
Adding attribute capital-loss
Adding attribute marital-status
Adding attribute race
Adding attribute relationship
Adding attribute sex
Adding attribute workclass
Adding attribute education-num
Adding attribute capital-gain
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    fnlwgt         has parents ['occupation'].
    age            has parents ['fnlwgt', 'occupation'].
    income>50K     has parents ['fnlwgt', 'age', 'occupation'].
    native-country has par

Adding ROOT workclass
Adding attribute age
Adding attribute education-num
Adding attribute native-country
Adding attribute relationship
Adding attribute sex
Adding attribute race
Adding attribute marital-status
Adding attribute fnlwgt
Adding attribute capital-gain
Adding attribute capital-loss
Adding attribute occupation
Adding attribute hours-per-week
Adding attribute income>50K
Adding ROOT workclass
Adding attribute age
Adding attribute education-num
Adding attribute capital-loss
Adding attribute hours-per-week
Adding attribute marital-status
Adding attribute native-country
Adding attribute occupation
Adding attribute relationship
Adding attribute race
Adding attribute fnlwgt
Adding attribute sex
Adding attribute capital-gain
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    age            has parents ['workclass'].
    education-num  has parents ['age', 'workclass'].
    native-country has parents ['age', 'education-num', 'workclass'].
    relationship   has pa

Adding ROOT education-num
Adding attribute occupation
Adding attribute hours-per-week
Adding attribute fnlwgt
Adding attribute workclass
Adding attribute native-country
Adding attribute sex
Adding attribute capital-gain
Adding attribute capital-loss
Adding attribute income>50K
Adding attribute marital-status
Adding attribute relationship
Adding attribute age
Adding attribute race
Adding ROOT education-num
Adding attribute age
Adding attribute race
Adding attribute workclass
Adding attribute capital-gain
Adding attribute sex
Adding attribute capital-loss
Adding attribute relationship
Adding attribute native-country
Adding attribute marital-status
Adding attribute hours-per-week
Adding attribute occupation
Adding attribute fnlwgt
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    occupation     has parents ['education-num'].
    hours-per-week has parents ['occupation', 'education-num'].
    fnlwgt         has parents ['occupation', 'hours-per-week', 'education-num']

Adding ROOT age
Adding attribute native-country
Adding attribute relationship
Adding attribute marital-status
Adding attribute sex
Adding attribute income>50K
Adding attribute hours-per-week
Adding attribute capital-gain
Adding attribute occupation
Adding attribute workclass
Adding attribute fnlwgt
Adding attribute race
Adding attribute education-num
Adding attribute capital-loss
Adding ROOT age
Adding attribute hours-per-week
Adding attribute relationship
Adding attribute education-num
Adding attribute workclass
Adding attribute capital-gain
Adding attribute native-country
Adding attribute marital-status
Adding attribute fnlwgt
Adding attribute race
Adding attribute capital-loss
Adding attribute sex
Adding attribute occupation
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    native-country has parents ['age'].
    relationship   has parents ['native-country', 'age'].
    marital-status has parents ['native-country', 'relationship', 'age'].
    sex            has

Adding ROOT capital-gain
Adding attribute occupation
Adding attribute relationship
Adding attribute age
Adding attribute hours-per-week
Adding attribute native-country
Adding attribute education-num
Adding attribute workclass
Adding attribute capital-loss
Adding attribute race
Adding attribute income>50K
Adding attribute marital-status
Adding attribute sex
Adding attribute fnlwgt
Adding ROOT capital-gain
Adding attribute marital-status
Adding attribute relationship
Adding attribute age
Adding attribute fnlwgt
Adding attribute hours-per-week
Adding attribute capital-loss
Adding attribute sex
Adding attribute occupation
Adding attribute race
Adding attribute native-country
Adding attribute workclass
Adding attribute education-num
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    occupation     has parents ['capital-gain'].
    relationship   has parents ['occupation', 'capital-gain'].
    age            has parents ['occupation', 'relationship', 'capital-gain'].
   

Adding ROOT native-country
Adding attribute marital-status
Adding attribute age
Adding attribute hours-per-week
Adding attribute sex
Adding attribute income>50K
Adding attribute capital-loss
Adding attribute workclass
Adding attribute relationship
Adding attribute occupation
Adding attribute fnlwgt
Adding attribute capital-gain
Adding attribute race
Adding attribute education-num
Adding ROOT native-country
Adding attribute marital-status
Adding attribute relationship
Adding attribute occupation
Adding attribute workclass
Adding attribute race
Adding attribute capital-loss
Adding attribute fnlwgt
Adding attribute capital-gain
Adding attribute sex
Adding attribute education-num
Adding attribute age
Adding attribute hours-per-week
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    marital-status has parents ['native-country'].
    age            has parents ['marital-status', 'native-country'].
    hours-per-week has parents ['marital-status', 'age', 'native-country']

Adding ROOT education-num
Adding attribute native-country
Adding attribute age
Adding attribute fnlwgt
Adding attribute occupation
Adding attribute relationship
Adding attribute capital-gain
Adding attribute capital-loss
Adding attribute race
Adding attribute hours-per-week
Adding attribute marital-status
Adding attribute sex
Adding attribute workclass
Adding attribute income>50K
Adding ROOT education-num
Adding attribute hours-per-week
Adding attribute age
Adding attribute race
Adding attribute capital-loss
Adding attribute native-country
Adding attribute workclass
Adding attribute fnlwgt
Adding attribute relationship
Adding attribute capital-gain
Adding attribute marital-status
Adding attribute sex
Adding attribute occupation
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    native-country has parents ['education-num'].
    age            has parents ['native-country', 'education-num'].
    fnlwgt         has parents ['native-country', 'age', 'education-num'].
 

Adding ROOT income>50K
Adding attribute fnlwgt
Adding attribute workclass
Adding attribute native-country
Adding attribute sex
Adding attribute race
Adding attribute education-num
Adding attribute hours-per-week
Adding attribute occupation
Adding attribute marital-status
Adding attribute relationship
Adding attribute capital-gain
Adding attribute age
Adding attribute capital-loss
Adding ROOT sex
Adding attribute fnlwgt
Adding attribute workclass
Adding attribute education-num
Adding attribute native-country
Adding attribute capital-loss
Adding attribute capital-gain
Adding attribute marital-status
Adding attribute age
Adding attribute race
Adding attribute occupation
Adding attribute hours-per-week
Adding attribute relationship
Adding attribute income>50K
Privbayes
Constructed Bayesian network:
    fnlwgt         has parents ['income>50K'].
    workclass      has parents ['fnlwgt', 'income>50K'].
    native-country has parents ['fnlwgt', 'workclass', 'income>50K'].
    sex            h

# Compas

In [None]:
# input dataset
input_data = './data/cleaned_Compas.csv'
# location of two output files
mode = 'correlated_attribute_mode'
description_file = f'./out/{mode}/description.json'
description_file_fair = f'./out/{mode}/description_fair.json'
synthetic_data = f'./out/{mode}/sythetic_data.csv'
sythetic_data_fair= f'./out/{mode}/sythetic_data_fair.csv'

In [None]:
# An attribute counts as categorical when its domain size is below this threshold.
# NOTE(review): 30 is the same value used for the Adult dataset earlier in this
# notebook - confirm it also suits this dataset.
threshold_value = 30

# Explicit per-attribute declarations; both are left empty here.
categorical_attributes = dict()   # attributes declared categorical
candidate_keys = dict()           # attributes that are candidate keys of the input dataset

# Differential-privacy parameter epsilon. Roughly: removing one row from the
# input dataset changes the probability of any given output by at most a
# multiplicative factor of exp(epsilon). Larger values inject less noise;
# epsilon=0 turns differential privacy off.
# epsilon is not assigned in this cell: the experiment loop that follows
# passes each value of its own list to the describers.

# Maximum number of parents (incoming edges) per node of the Bayesian network.
degree_of_bayesian_network = 0

# Number of tuples to generate per synthetic dataset. It does not have to
# equal the size of the input dataset.
num_tuples_to_generate = 6173

# Attribute roles handed to the fair describer via `attribute_to_sensitive`.
sensitive_attributes = {
    "admissible": ['Misdemeanor', 'Number_of_Priors'],
    "inadmissible": ['Sex', 'Race'],
    "outcome": ['Two_yr_Recidivism'],
}

In [None]:
# Privacy budgets to sweep over, from least to most noise.
eps = [1000, 100, 10, 1, 0.1]

# For every epsilon, run 10 repetitions. Each repetition draws a fresh seed,
# fits the plain and the fair model with that same seed, prints both networks
# and writes one synthetic dataset per model to .../eps=<e>/sythetic_data_<i>.csv.
for e in eps:
    for i in range(10):
        randseed = int.from_bytes(os.urandom(4), 'big')
        describer = DataDescriber(category_threshold=threshold_value)
        describer_fair = DataDescriber(category_threshold=threshold_value)

        describer.describe_dataset_in_correlated_attribute_mode(dataset_file=input_data,
                                                                epsilon=e,
                                                                k=degree_of_bayesian_network,
                                                                attribute_to_is_categorical=categorical_attributes,
                                                                attribute_to_is_candidate_key=candidate_keys,
                                                                seed=randseed)
        # The description files are overwritten on every repetition.
        describer.save_dataset_description_to_file(description_file)

        describer_fair.describe_dataset_in_fair_correlated_attribute_mode(dataset_file=input_data,
                                                                epsilon=e,
                                                                k=degree_of_bayesian_network,
                                                                attribute_to_is_categorical=categorical_attributes,
                                                                attribute_to_is_candidate_key=candidate_keys,
                                                                attribute_to_sensitive=sensitive_attributes,
                                                                seed=randseed)
        describer_fair.save_dataset_description_to_file(description_file_fair)

        print("Privbayes")
        display_bayesian_network(describer.bayesian_network)
        print("FairPrivbayes")
        display_bayesian_network(describer_fair.bayesian_network)

        synthpath = f'./out/{mode}/Bayes_original/eps={e}/sythetic_data_{i}.csv'
        fairsynthpath = f'./out/{mode}/Bayes_fair/eps={e}/sythetic_data_{i}.csv'

        for description_path, output_path in ((description_file, synthpath),
                                              (description_file_fair, fairsynthpath)):
            # Every epsilon gets its own eps=<e>/ directory, which nothing else
            # in this notebook creates; create it so that saving cannot fail
            # on a missing directory.
            os.makedirs(os.path.dirname(output_path), exist_ok=True)

            generator = DataGenerator()
            generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_path)
            generator.save_synthetic_data(output_path)

## Timing

In [7]:
# Wall-clock timing of the plain pipeline at epsilon = 10.
# Each of the 10 runs measures: drawing the seed, describing the dataset,
# saving the description and generating the synthetic tuples. The generated
# data is not written to disk here.
eps = 10
privbayes_times_10 = []

for i in range(10):
    print(i)
    started_at = timeit.default_timer()

    randseed = int.from_bytes(os.urandom(4), 'big')
    describer = DataDescriber(category_threshold=threshold_value)
    describer.describe_dataset_in_correlated_attribute_mode(
        dataset_file=input_data,
        epsilon=eps,
        k=degree_of_bayesian_network,
        attribute_to_is_categorical=categorical_attributes,
        attribute_to_is_candidate_key=candidate_keys,
        seed=randseed,
    )
    describer.save_dataset_description_to_file(description_file)

    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_file)

    # Elapsed seconds for this run.
    privbayes_times_10.append(timeit.default_timer() - started_at)

print(privbayes_times_10)

0
Adding ROOT sex
Adding attribute education-num
Adding attribute occupation
Adding attribute workclass
Adding attribute income>50K
Adding attribute native-country
Adding attribute relationship
Adding attribute capital-loss
Adding attribute hours-per-week
Adding attribute race
Adding attribute marital-status
Adding attribute fnlwgt
Adding attribute capital-gain
Adding attribute age
1
Adding ROOT capital-loss
Adding attribute sex
Adding attribute income>50K
Adding attribute native-country
Adding attribute race
Adding attribute marital-status
Adding attribute capital-gain
Adding attribute hours-per-week
Adding attribute age
Adding attribute occupation
Adding attribute education-num
Adding attribute fnlwgt
Adding attribute relationship
Adding attribute workclass
2
Adding ROOT relationship
Adding attribute capital-gain
Adding attribute fnlwgt
Adding attribute marital-status
Adding attribute workclass
Adding attribute sex
Adding attribute education-num
Adding attribute age
Adding attribute 

In [6]:
# Time 10 independent PrivBayes runs at epsilon = 1.
# Each measured interval covers: drawing the seed, describing the input
# dataset, saving the description file, and generating (but not saving)
# the synthetic dataset.
eps = 1
privbayes_times_1 = []
for i in range(10):
    print(f"{i}")
    start_time = timeit.default_timer()

    # Fresh 4-byte seed from the OS for every run.
    randseed = int.from_bytes(os.urandom(4), byteorder='big')

    describer = DataDescriber(category_threshold=threshold_value)
    describer.describe_dataset_in_correlated_attribute_mode(
        dataset_file=input_data,
        epsilon=eps,
        k=degree_of_bayesian_network,
        attribute_to_is_categorical=categorical_attributes,
        attribute_to_is_candidate_key=candidate_keys,
        seed=randseed,
    )
    describer.save_dataset_description_to_file(description_file)

    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(
        num_tuples_to_generate,
        description_file,
    )

    elapsed = timeit.default_timer() - start_time
    privbayes_times_1.append(elapsed)
print(privbayes_times_1)

0
Adding ROOT occupation
Adding attribute capital-loss
Adding attribute income>50K
Adding attribute marital-status
Adding attribute relationship
Adding attribute workclass
Adding attribute sex
Adding attribute age
Adding attribute fnlwgt
Adding attribute race
Adding attribute native-country
Adding attribute education-num
Adding attribute hours-per-week
Adding attribute capital-gain
1
Adding ROOT education-num
Adding attribute age
Adding attribute relationship
Adding attribute capital-loss
Adding attribute sex
Adding attribute workclass
Adding attribute race
Adding attribute marital-status
Adding attribute income>50K
Adding attribute hours-per-week
Adding attribute fnlwgt
Adding attribute native-country
Adding attribute occupation
Adding attribute capital-gain
2
Adding ROOT workclass
Adding attribute fnlwgt
Adding attribute relationship
Adding attribute capital-loss
Adding attribute occupation
Adding attribute hours-per-week
Adding attribute sex
Adding attribute race
Adding attribute ed

In [7]:
# Time 10 independent PrivBayes runs at epsilon = 0.1.
# Each measured interval covers: drawing the seed, describing the input
# dataset, saving the description file, and generating (but not saving)
# the synthetic dataset.
eps = 0.1
privbayes_times_01 = []
for i in range(10):
    print(f"{i}")
    start_time = timeit.default_timer()

    # Fresh 4-byte seed from the OS for every run.
    randseed = int.from_bytes(os.urandom(4), byteorder='big')

    describer = DataDescriber(category_threshold=threshold_value)
    describer.describe_dataset_in_correlated_attribute_mode(
        dataset_file=input_data,
        epsilon=eps,
        k=degree_of_bayesian_network,
        attribute_to_is_categorical=categorical_attributes,
        attribute_to_is_candidate_key=candidate_keys,
        seed=randseed,
    )
    describer.save_dataset_description_to_file(description_file)

    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(
        num_tuples_to_generate,
        description_file,
    )

    elapsed = timeit.default_timer() - start_time
    privbayes_times_01.append(elapsed)
print(privbayes_times_01)

0
Adding ROOT sex
Adding attribute workclass
Adding attribute capital-gain
Adding attribute relationship
Adding attribute marital-status
Adding attribute income>50K
Adding attribute age
Adding attribute occupation
Adding attribute education-num
Adding attribute native-country
Adding attribute race
Adding attribute fnlwgt
Adding attribute hours-per-week
Adding attribute capital-loss
1
Adding ROOT relationship
Adding attribute income>50K
Adding attribute capital-gain
Adding attribute fnlwgt
Adding attribute native-country
Adding attribute occupation
Adding attribute race
Adding attribute education-num
Adding attribute age
Adding attribute workclass
Adding attribute hours-per-week
Adding attribute sex
Adding attribute capital-loss
Adding attribute marital-status
2
Adding ROOT native-country
Adding attribute age
Adding attribute marital-status
Adding attribute fnlwgt
Adding attribute education-num
Adding attribute capital-gain
Adding attribute hours-per-week
Adding attribute occupation
Add

In [8]:
# Time 10 independent PrivBayes runs at epsilon = 100.
# Each measured interval covers: drawing the seed, describing the input
# dataset, saving the description file, and generating (but not saving)
# the synthetic dataset.
eps = 100
privbayes_times_100 = []
for i in range(10):
    print(f"{i}")
    start_time = timeit.default_timer()

    # Fresh 4-byte seed from the OS for every run.
    randseed = int.from_bytes(os.urandom(4), byteorder='big')

    describer = DataDescriber(category_threshold=threshold_value)
    describer.describe_dataset_in_correlated_attribute_mode(
        dataset_file=input_data,
        epsilon=eps,
        k=degree_of_bayesian_network,
        attribute_to_is_categorical=categorical_attributes,
        attribute_to_is_candidate_key=candidate_keys,
        seed=randseed,
    )
    describer.save_dataset_description_to_file(description_file)

    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(
        num_tuples_to_generate,
        description_file,
    )

    elapsed = timeit.default_timer() - start_time
    privbayes_times_100.append(elapsed)
print(privbayes_times_100)

0
Adding ROOT marital-status
Adding attribute education-num
Adding attribute capital-gain
Adding attribute sex
Adding attribute workclass
Adding attribute occupation
Adding attribute race
Adding attribute fnlwgt
Adding attribute capital-loss
Adding attribute native-country
Adding attribute age
Adding attribute relationship
Adding attribute income>50K
Adding attribute hours-per-week
1
Adding ROOT marital-status
Adding attribute income>50K
Adding attribute sex
Adding attribute occupation
Adding attribute race
Adding attribute hours-per-week
Adding attribute capital-gain
Adding attribute native-country
Adding attribute capital-loss
Adding attribute fnlwgt
Adding attribute relationship
Adding attribute education-num
Adding attribute age
Adding attribute workclass
2
Adding ROOT capital-loss
Adding attribute income>50K
Adding attribute education-num
Adding attribute native-country
Adding attribute workclass
Adding attribute relationship
Adding attribute capital-gain
Adding attribute race
Add

In [9]:
# Time 10 independent PrivBayes runs at epsilon = 1000.
# Each measured interval covers: drawing the seed, describing the input
# dataset, saving the description file, and generating (but not saving)
# the synthetic dataset.
eps = 1000
privbayes_times_1000 = []
for i in range(10):
    print(f"{i}")
    start_time = timeit.default_timer()

    # Fresh 4-byte seed from the OS for every run.
    randseed = int.from_bytes(os.urandom(4), byteorder='big')

    describer = DataDescriber(category_threshold=threshold_value)
    describer.describe_dataset_in_correlated_attribute_mode(
        dataset_file=input_data,
        epsilon=eps,
        k=degree_of_bayesian_network,
        attribute_to_is_categorical=categorical_attributes,
        attribute_to_is_candidate_key=candidate_keys,
        seed=randseed,
    )
    describer.save_dataset_description_to_file(description_file)

    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(
        num_tuples_to_generate,
        description_file,
    )

    elapsed = timeit.default_timer() - start_time
    privbayes_times_1000.append(elapsed)
print(privbayes_times_1000)

0
Adding ROOT education-num
Adding attribute race
Adding attribute age
Adding attribute native-country
Adding attribute income>50K
Adding attribute relationship
Adding attribute marital-status
Adding attribute sex
Adding attribute hours-per-week
Adding attribute fnlwgt
Adding attribute workclass
Adding attribute capital-gain
Adding attribute capital-loss
Adding attribute occupation
1
Adding ROOT occupation
Adding attribute capital-loss
Adding attribute fnlwgt
Adding attribute native-country
Adding attribute hours-per-week
Adding attribute workclass
Adding attribute income>50K
Adding attribute age
Adding attribute sex
Adding attribute capital-gain
Adding attribute race
Adding attribute relationship
Adding attribute education-num
Adding attribute marital-status
2
Adding ROOT education-num
Adding attribute hours-per-week
Adding attribute fnlwgt
Adding attribute marital-status
Adding attribute native-country
Adding attribute relationship
Adding attribute income>50K
Adding attribute workcla

In [5]:
# Time 10 independent fair-PrivBayes runs at epsilon = 10.
# Each measured interval covers: drawing the seed, describing the input
# dataset in fair correlated-attribute mode, saving the fair description
# file, and generating (but not saving) the synthetic dataset from it.
eps = 10
fair_privbayes_times_10 = []
for i in range(10):
    print(str(i))
    start_time = timeit.default_timer()

    # Fresh 4-byte seed from the OS for every run.
    randseed = int.from_bytes(os.urandom(4), 'big')

    describer_fair = DataDescriber(category_threshold=threshold_value)
    describer_fair.describe_dataset_in_fair_correlated_attribute_mode(
        dataset_file=input_data,
        epsilon=eps,
        k=degree_of_bayesian_network,
        attribute_to_is_categorical=categorical_attributes,
        attribute_to_is_candidate_key=candidate_keys,
        attribute_to_sensitive=sensitive_attributes,
        seed=randseed,
    )
    describer_fair.save_dataset_description_to_file(description_file_fair)

    # Generate from the fair description written just above; reading
    # `description_file` here would time generation from whatever non-fair
    # description an earlier cell left on disk.
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(
        num_tuples_to_generate,
        description_file_fair,
    )

    elapsed = timeit.default_timer() - start_time
    fair_privbayes_times_10.append(elapsed)
print(fair_privbayes_times_10)

0
Adding ROOT age
Adding attribute hours-per-week
Adding attribute capital-gain
Adding attribute sex
Adding attribute relationship
Adding attribute education-num
Adding attribute race
Adding attribute marital-status
Adding attribute fnlwgt
Adding attribute native-country
Adding attribute capital-loss
Adding attribute occupation
Adding attribute workclass
Adding attribute income>50K
1
Adding ROOT race
Adding attribute age
Adding attribute capital-gain
Adding attribute capital-loss
Adding attribute relationship
Adding attribute workclass
Adding attribute hours-per-week
Adding attribute marital-status
Adding attribute occupation
Adding attribute native-country
Adding attribute sex
Adding attribute fnlwgt
Adding attribute education-num
Adding attribute income>50K
2
Adding ROOT marital-status
Adding attribute capital-loss
Adding attribute hours-per-week
Adding attribute race
Adding attribute relationship
Adding attribute education-num
Adding attribute sex
Adding attribute fnlwgt
Adding attr

In [6]:
# Time 10 independent fair-PrivBayes runs at epsilon = 1.
# Each measured interval covers: drawing the seed, describing the input
# dataset in fair correlated-attribute mode, saving the fair description
# file, and generating (but not saving) the synthetic dataset from it.
eps = 1
fair_privbayes_times_1 = []
for i in range(10):
    print(str(i))
    start_time = timeit.default_timer()

    # Fresh 4-byte seed from the OS for every run.
    randseed = int.from_bytes(os.urandom(4), 'big')

    describer_fair = DataDescriber(category_threshold=threshold_value)
    describer_fair.describe_dataset_in_fair_correlated_attribute_mode(
        dataset_file=input_data,
        epsilon=eps,
        k=degree_of_bayesian_network,
        attribute_to_is_categorical=categorical_attributes,
        attribute_to_is_candidate_key=candidate_keys,
        attribute_to_sensitive=sensitive_attributes,
        seed=randseed,
    )
    describer_fair.save_dataset_description_to_file(description_file_fair)

    # Generate from the fair description written just above; reading
    # `description_file` here would time generation from whatever non-fair
    # description an earlier cell left on disk.
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(
        num_tuples_to_generate,
        description_file_fair,
    )

    elapsed = timeit.default_timer() - start_time
    fair_privbayes_times_1.append(elapsed)
print(fair_privbayes_times_1)

0
Adding ROOT relationship
Adding attribute marital-status
Adding attribute capital-loss
Adding attribute workclass
Adding attribute age
Adding attribute race
Adding attribute education-num
Adding attribute capital-gain
Adding attribute native-country
Adding attribute hours-per-week
Adding attribute sex
Adding attribute occupation
Adding attribute fnlwgt
Adding attribute income>50K
1
Adding ROOT sex
Adding attribute race
Adding attribute fnlwgt
Adding attribute capital-gain
Adding attribute capital-loss
Adding attribute hours-per-week
Adding attribute relationship
Adding attribute workclass
Adding attribute occupation
Adding attribute age
Adding attribute marital-status
Adding attribute education-num
Adding attribute native-country
Adding attribute income>50K
2
Adding ROOT workclass
Adding attribute relationship
Adding attribute education-num
Adding attribute occupation
Adding attribute capital-loss
Adding attribute fnlwgt
Adding attribute native-country
Adding attribute sex
Adding att

In [7]:
# Time 10 independent fair-PrivBayes runs at epsilon = 0.1.
# Each measured interval covers: drawing the seed, describing the input
# dataset in fair correlated-attribute mode, saving the fair description
# file, and generating (but not saving) the synthetic dataset from it.
eps = 0.1
fair_privbayes_times_01 = []
for i in range(10):
    print(str(i))
    start_time = timeit.default_timer()

    # Fresh 4-byte seed from the OS for every run.
    randseed = int.from_bytes(os.urandom(4), 'big')

    describer_fair = DataDescriber(category_threshold=threshold_value)
    describer_fair.describe_dataset_in_fair_correlated_attribute_mode(
        dataset_file=input_data,
        epsilon=eps,
        k=degree_of_bayesian_network,
        attribute_to_is_categorical=categorical_attributes,
        attribute_to_is_candidate_key=candidate_keys,
        attribute_to_sensitive=sensitive_attributes,
        seed=randseed,
    )
    describer_fair.save_dataset_description_to_file(description_file_fair)

    # Generate from the fair description written just above; reading
    # `description_file` here would time generation from whatever non-fair
    # description an earlier cell left on disk.
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(
        num_tuples_to_generate,
        description_file_fair,
    )

    elapsed = timeit.default_timer() - start_time
    fair_privbayes_times_01.append(elapsed)
print(fair_privbayes_times_01)

0
Adding ROOT sex
Adding attribute relationship
Adding attribute workclass
Adding attribute capital-gain
Adding attribute capital-loss
Adding attribute hours-per-week
Adding attribute education-num
Adding attribute age
Adding attribute marital-status
Adding attribute fnlwgt
Adding attribute native-country
Adding attribute race
Adding attribute occupation
Adding attribute income>50K
1
Adding ROOT occupation
Adding attribute hours-per-week
Adding attribute fnlwgt
Adding attribute native-country
Adding attribute education-num
Adding attribute relationship
Adding attribute capital-gain
Adding attribute workclass
Adding attribute marital-status
Adding attribute race
Adding attribute sex
Adding attribute age
Adding attribute capital-loss
Adding attribute income>50K
2
Adding ROOT sex
Adding attribute education-num
Adding attribute fnlwgt
Adding attribute occupation
Adding attribute capital-gain
Adding attribute hours-per-week
Adding attribute relationship
Adding attribute marital-status
Addin

In [8]:
# Time 10 independent fair-PrivBayes runs at epsilon = 100.
# Each measured interval covers: drawing the seed, describing the input
# dataset in fair correlated-attribute mode, saving the fair description
# file, and generating (but not saving) the synthetic dataset from it.
eps = 100
fair_privbayes_times_100 = []
for i in range(10):
    print(str(i))
    start_time = timeit.default_timer()

    # Fresh 4-byte seed from the OS for every run.
    randseed = int.from_bytes(os.urandom(4), 'big')

    describer_fair = DataDescriber(category_threshold=threshold_value)
    describer_fair.describe_dataset_in_fair_correlated_attribute_mode(
        dataset_file=input_data,
        epsilon=eps,
        k=degree_of_bayesian_network,
        attribute_to_is_categorical=categorical_attributes,
        attribute_to_is_candidate_key=candidate_keys,
        attribute_to_sensitive=sensitive_attributes,
        seed=randseed,
    )
    describer_fair.save_dataset_description_to_file(description_file_fair)

    # Generate from the fair description written just above; reading
    # `description_file` here would time generation from whatever non-fair
    # description an earlier cell left on disk.
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(
        num_tuples_to_generate,
        description_file_fair,
    )

    elapsed = timeit.default_timer() - start_time
    fair_privbayes_times_100.append(elapsed)
print(fair_privbayes_times_100)

0
Adding ROOT native-country
Adding attribute capital-gain
Adding attribute relationship
Adding attribute marital-status
Adding attribute race
Adding attribute occupation
Adding attribute hours-per-week
Adding attribute sex
Adding attribute fnlwgt
Adding attribute education-num
Adding attribute age
Adding attribute workclass
Adding attribute capital-loss
Adding attribute income>50K
1
Adding ROOT fnlwgt
Adding attribute sex
Adding attribute education-num
Adding attribute occupation
Adding attribute age
Adding attribute relationship
Adding attribute capital-gain
Adding attribute marital-status
Adding attribute hours-per-week
Adding attribute capital-loss
Adding attribute race
Adding attribute workclass
Adding attribute native-country
Adding attribute income>50K
2
Adding ROOT hours-per-week
Adding attribute workclass
Adding attribute age
Adding attribute marital-status
Adding attribute race
Adding attribute capital-gain
Adding attribute native-country
Adding attribute education-num
Adding

In [9]:
# Time 10 independent fair-PrivBayes runs at epsilon = 1000.
# Each measured interval covers: drawing the seed, describing the input
# dataset in fair correlated-attribute mode, saving the fair description
# file, and generating (but not saving) the synthetic dataset from it.
eps = 1000
fair_privbayes_times_1000 = []
for i in range(10):
    print(str(i))
    start_time = timeit.default_timer()

    # Fresh 4-byte seed from the OS for every run.
    randseed = int.from_bytes(os.urandom(4), 'big')

    describer_fair = DataDescriber(category_threshold=threshold_value)
    describer_fair.describe_dataset_in_fair_correlated_attribute_mode(
        dataset_file=input_data,
        epsilon=eps,
        k=degree_of_bayesian_network,
        attribute_to_is_categorical=categorical_attributes,
        attribute_to_is_candidate_key=candidate_keys,
        attribute_to_sensitive=sensitive_attributes,
        seed=randseed,
    )
    describer_fair.save_dataset_description_to_file(description_file_fair)

    # Generate from the fair description written just above; reading
    # `description_file` here would time generation from whatever non-fair
    # description an earlier cell left on disk.
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(
        num_tuples_to_generate,
        description_file_fair,
    )

    elapsed = timeit.default_timer() - start_time
    fair_privbayes_times_1000.append(elapsed)
print(fair_privbayes_times_1000)

0
Adding ROOT capital-gain
Adding attribute marital-status
Adding attribute native-country
Adding attribute age
Adding attribute occupation
Adding attribute relationship
Adding attribute hours-per-week
Adding attribute fnlwgt
Adding attribute race
Adding attribute education-num
Adding attribute capital-loss
Adding attribute workclass
Adding attribute sex
Adding attribute income>50K
1
Adding ROOT marital-status
Adding attribute native-country
Adding attribute race
Adding attribute occupation
Adding attribute fnlwgt
Adding attribute education-num
Adding attribute age
Adding attribute relationship
Adding attribute sex
Adding attribute capital-gain
Adding attribute capital-loss
Adding attribute hours-per-week
Adding attribute workclass
Adding attribute income>50K
2
Adding ROOT age
Adding attribute sex
Adding attribute occupation
Adding attribute capital-loss
Adding attribute marital-status
Adding attribute race
Adding attribute fnlwgt
Adding attribute hours-per-week
Adding attribute educat

### Step 5 compare the statistics of input and synthetic data (optional)

The synthetic data is already saved in a file by step 4. The ModelInspector is for a quick test on the similarity between input and synthetic datasets.

#### 5.1 instantiate a ModelInspector.

It needs input dataset, synthetic dataset, and attribute description.

In [None]:
# Load the two generated CSV files that the inspector will compare:
# the fair synthetic dataset and the regular synthetic dataset.
# To compare against the original input instead, load it with
#   pd.read_csv(input_data, skipinitialspace=True)
# and pass that frame as the first ModelInspector argument.
fair_df = pd.read_csv(filepath_or_buffer=sythetic_data_fair)
synthetic_df = pd.read_csv(filepath_or_buffer=synthetic_data)

# Per-attribute metadata comes from the saved dataset description file.
attribute_description = read_json_file(description_file)['attribute_description']

inspector = ModelInspector(
    fair_df,
    synthetic_df,
    attribute_description,
)

#### 5.2 compare histograms between input and synthetic datasets.

In [None]:
# Call the inspector's histogram comparison once for every column of
# synthetic_df.
for attribute in synthetic_df.columns:
    inspector.compare_histograms(attribute)

#### 5.3 compare pairwise mutual information

In [None]:
# Ask the inspector for its pairwise mutual-information heatmap of the two
# datasets it was constructed with.
inspector.mutual_information_heatmap()