In [1]:
# DP-STOA

# Importing the necessary modules
from DataLoader import DataLoader
from Anonymisation import Anonymisation
from Consistenter import Consistenter
from PostProcessor import RecordPostprocessor
from GUM import GraduallyUpdateMethod
from sklearn import svm
import pandas as pd

# --- Load -------------------------------------------------------------------
# Build the loader from the dataset CSV plus its three side-car config files,
# read the data, then run all_indifs over the loaded private data.
dl = DataLoader(
    '../datasets/UCIMLAdult/uciml_adult.csv',
    '../datasets/UCIMLAdult/data.yaml',
    '../datasets/UCIMLAdult/column_info.json',
    '../datasets/UCIMLAdult/loading_data.json',
)
dl.data_loader()
dl.all_indifs(dl.private_data)

# --- Anonymise --------------------------------------------------------------
# epsilon / delta are the parameters handed to the anonymiser.
epsilon = 10
delta = 3e-11
anon = Anonymisation(epsilon=epsilon, delta=delta)
anon.anonymiser(dl)

# --- Make consistent --------------------------------------------------------
# Run the consistency step on the anonymised result for a fixed number of passes.
cons = Consistenter(anon, dl.all_attrs)
cons.make_consistent(iterations=5)

# --- Synthesise -------------------------------------------------------------
# Initialise the gradual-update synthesiser, then generate as many records as
# the consistency step reports (cast to int before passing it on).
gum = GraduallyUpdateMethod(dl, cons)
gum.initialiser(view_iterations=100)
num_records = int(cons.num_synthesize_records)
syn_data = gum.synthesize(iterations=100, num_records=num_records)

# --- Post-process -----------------------------------------------------------
# Push both the original (private) and the synthetic data through the same
# post-processor configuration so the two outputs can be compared downstream.
processor_private = RecordPostprocessor(
    dl.private_data,
    dl.configpath,
    dl.datainfopath,
    dl.decode_mapping,
)
processor_public = RecordPostprocessor(
    syn_data,
    dl.configpath,
    dl.datainfopath,
    dl.decode_mapping,
)
private_data = processor_private.post_process()
public_data = processor_public.post_process()


  self.private_data.fillna(self.fillna, inplace=True, downcast = self.datatypes)


Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,3,1,0,2,1,4,0,1,1,0,4,1,1
1,4,2,0,0,2,3,0,1,0,0,2,1,1
2,3,3,1,1,3,4,0,1,0,0,4,1,1
3,4,3,2,0,3,3,4,1,0,0,4,1,1
4,2,3,0,0,4,1,4,0,0,0,4,2,1


epsilon: 10, delta: 3e-11, rho: 0.8616407135665874
one-way sigma: 8.685474957955632
indif sigma: 2.3399999999999998e-09
multi-way sigma: 5.838830013637393
Iteration 1 of 5 to make the marginals consistent and normalise them
Iteration 2 of 5 to make the marginals consistent and normalise them
Iteration 3 of 5 to make the marginals consistent and normalise them
Iteration 4 of 5 to make the marginals consistent and normalise them
Iteration 5 of 5 to make the marginals consistent and normalise them
Iteration 1 of 100 completed to consist the marginal views
Iteration 2 of 100 completed to consist the marginal views
Iteration 3 of 100 completed to consist the marginal views
Iteration 4 of 100 completed to consist the marginal views
Iteration 5 of 100 completed to consist the marginal views
Iteration 6 of 100 completed to consist the marginal views
Iteration 7 of 100 completed to consist the marginal views
Iteration 8 of 100 completed to consist the marginal views
Iteration 9 of 100 completed

In [2]:
# Evaluation
from EvalutionMetricFunctions import evaluate_synthetic_data  

# Categorical feature columns shared by both evaluations below. Kept as an
# immutable tuple; each call receives its own freshly built list.
categorical_features = (
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
)

# Classification example: 'income' is the (categorical) target.
evaluate_synthetic_data(
    private_data=private_data,
    synthetic_data=public_data,
    target_col='income',
    categorical_cols=list(categorical_features),
    is_categorical_target=True,
)

# Regression example: 'age' is the target, so 'income' joins the categorical
# feature columns (placed first, as before).
evaluate_synthetic_data(
    private_data=private_data,
    synthetic_data=public_data,
    target_col='age',
    categorical_cols=['income'] + list(categorical_features),
    is_categorical_target=False,
)

F1 Score - Private: 0.6413, Synthetic: 0.4743
DM Score: 0.4413 (closer to 0 is better)
R² Score - Private: 0.3980, Synthetic: 0.0004
MSE - Private: 111.8429, Synthetic: 185.7132
DM Score: 0.4323 (closer to 0 is better)
