In [None]:
import sklearn
import numpy as np
import pandas as pd

In [None]:
from pathlib import Path
import os
import sys
# Put the directory two levels above the working directory on the import path (only once)
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Discretize the data before using it as input for synthetic data generation

In [None]:
# The original and validation datasets are discretized before synthetic data is generated:
# PB does not support numerical features very well, and MS gets the same input data so that
# the output quality of both can be compared fairly.
# Here the 5 original and 5 validation dataframes are read and bound to the notebook-level
# names df_ori_1..df_ori_5 and df_val_1..df_val_5.
g = globals()

ori_csv = 'C:\\Users\\bde2002.53381\\Desktop\\master-thesis\\experiments\\data\\df_ori_{}.csv'
val_csv = 'C:\\Users\\bde2002.53381\\Desktop\\master-thesis\\experiments\\data\\df_val_{}.csv'

# Original dataframes first ...
for number in range(1, 6):
    g['df_ori_{}'.format(number)] = pd.read_csv(ori_csv.format(number))

# ... then the validation dataframes
for number in range(1, 6):
    g['df_val_{}'.format(number)] = pd.read_csv(val_csv.format(number))

In [None]:
# Gather the 5 original dataframes, in index order, so they can be processed in a loop
all_ori_dfs = [
    df_ori_1,
    df_ori_2,
    df_ori_3,
    df_ori_4,
    df_ori_5,
]

# Same for the 5 validation dataframes
all_val_dfs = [
    df_val_1,
    df_val_2,
    df_val_3,
    df_val_4,
    df_val_5,
]

In [None]:
from pandas import cut

# Bin edges and labels for the continuous numerical features

# Age: the edges are chosen in years and converted to days (365 days per year)
bins_years = [0, 20, 50, 70, 90, 120]
bins_days = [365 * years for years in bins_years]
labels_age = ['0-20', '21-50', '51-70', '71-90', '91-120']

# Lymph nodes: up to 0.5 means no lymph nodes, 0.5-89 some lymph nodes, above 89 unknown
bins_lymphs = [-1, 0.5, 89, 1000]
labels_lymphs = ['0', '1-89', 'unknown']

In [None]:
def _bin_and_save(frames, csv_template):
    """Discretize the age and lymph-node columns of each dataframe in place and save it.

    For every dataframe in `frames` (numbered from 1) the columns 'diagnosis_age' and
    'tum_lymfklieren_positief_atl' are binned with `pd.cut` using the notebook-level
    bin edges and labels, the raw columns are dropped, and the result is written to
    `csv_template.format(number)` without the index.

    Note: the dataframes are modified in place, so running this a second time on the
    same objects fails because the raw columns are no longer present.
    """
    for number, frame in enumerate(frames, start=1):
        age_groups = pd.cut(frame['diagnosis_age'], bins=bins_days, labels=labels_age)
        lymph_groups = pd.cut(frame['tum_lymfklieren_positief_atl'], bins_lymphs, labels=labels_lymphs)

        # Append the binned columns (age first, then lymph nodes)
        frame['binned_diagnosis_age'] = age_groups
        frame['binned_tum_lymphnodes_pos'] = lymph_groups

        # The raw numerical columns are replaced by their binned versions
        frame.drop(columns=['tum_lymfklieren_positief_atl', 'diagnosis_age'], inplace=True)

        # Persist the discretized dataframe
        frame.to_csv(csv_template.format(number), index=False)


# Discretize all original dataframes, then all validation dataframes
_bin_and_save(all_ori_dfs, 'C:\\Users\\bde2002.53381\\Desktop\\master-thesis\\experiments\\data\\df_ori_{}_binned.csv')
_bin_and_save(all_val_dfs, 'C:\\Users\\bde2002.53381\\Desktop\\master-thesis\\experiments\\data\\df_val_{}_binned.csv')

# Generate synthetic data

In [None]:
# Read the 5 discretized original dataframes back from disk; they replace the
# notebook-level names df_ori_1..df_ori_5 and serve as input for the synthesizers
g = globals()

binned_csv = 'C:\\Users\\bde2002.53381\\Desktop\\master-thesis\\experiments\\data\\df_ori_{}_binned.csv'
for number in range(1, 6):
    g['df_ori_{}'.format(number)] = pd.read_csv(binned_csv.format(number))

In [None]:
# Gather the 5 discretized original dataframes, in index order
all_ori_dfs = [
    df_ori_1,
    df_ori_2,
    df_ori_3,
    df_ori_4,
    df_ori_5,
]

# Epsilon values to run the synthesizers with
epsilon_list = [10, 1, 0.1, 0.01, 0.001]
# File-name tag per epsilon: its text form with the decimal point replaced by '_'
epsilon_names = [str(value).replace('.', '_') for value in epsilon_list]

# Marginal Synthesizer

In [None]:
# Cannot import synthesis functions (so, now in new_example_1)
from synthesis.hist_synthesis import MarginalSynthesizer

In [None]:
# For each original dataframe and each epsilon, fit a new MarginalSynthesizer five times
# and save every synthetic dataframe under its own dataset / epsilon / version file name
ms_csv = 'C:\\Users\\bde2002.53381\\Desktop\\master-thesis\\experiments\\data\\ms_df_syn_{}_e{}_v{}.csv'

for dataset_no, source_df in enumerate(all_ori_dfs, start=1):
    for epsilon, epsilon_tag in zip(epsilon_list, epsilon_names):
        for version in range(1, 6):
            # verbose=0 reduces the print statements
            ms = MarginalSynthesizer(epsilon=epsilon, verbose=0)
            ms.fit(source_df)
            df_syn = ms.transform(source_df)
            df_syn.to_csv(ms_csv.format(dataset_no, epsilon_tag, version), index=False)

# PrivBayes

In [None]:
# Cannot import synthesis functions  (so, now in example_2)
from synthesis.bayes_synthesis import PrivBayes

In [None]:
# For each original dataframe and each epsilon, fit a new PrivBayes model five times
# and save every synthetic dataframe under its own dataset / epsilon / version file name
pb_csv = 'C:\\Users\\bde2002.53381\\Desktop\\master-thesis\\experiments\\data\\pb_df_syn_{}_e{}_v{}.csv'

for dataset_no, source_df in enumerate(all_ori_dfs, start=1):
    for epsilon, epsilon_tag in zip(epsilon_list, epsilon_names):
        for version in range(1, 6):
            # verbose=0 reduces the print statements
            pb = PrivBayes(epsilon=epsilon, verbose=0)
            pb.fit(source_df)
            df_syn = pb.transform(source_df)
            df_syn.to_csv(pb_csv.format(dataset_no, epsilon_tag, version), index=False)