In [5]:
# DP-STOA
import importlib
import os
import time
# Importing the necessary modules
from DataLoader import DataLoader
from Anonymisation import Anonymisation
from Consistenter import Consistenter
import PostProcessor
importlib.reload(PostProcessor)
from PostProcessor import RecordPostprocessor
from GUM import GraduallyUpdateMethod
import pandas as pd

datasetTypeList = ['adult', 'diabetes', 'california_housing']

# Select the dataset type to be used from the list above
datasetType = datasetTypeList[0]

# Paths derived from the selected dataset (defined once so they cannot drift apart)
config_dir = f'../data_config/{datasetType}'
train_csv_path = f'{config_dir}/{datasetType}_train.csv'
output_dir = f'synthesized/dp-stoa/{datasetType}'

# Load the training data; its row count is the number of synthetic records to generate
train_df = pd.read_csv(train_csv_path)
num_samples = len(train_df)

# Start timing the training process (data loading is counted as training time)
train_start_time = time.time()

# Initialize PrivSyn components with the selected dataset's train data and config files
dl = DataLoader(train_csv_path,
                f'{config_dir}/data.yaml',
                f'{config_dir}/column_info.json',
                f'{config_dir}/loading_data.json')
dl.data_loader()
dl.all_indifs(dl.private_data)

# Time spent on the epsilon-independent setup above. It is added to every
# epsilon's training time so each run is reported as if it had been run alone,
# instead of accumulating the time of all previous epsilons.
setup_time = time.time() - train_start_time

# Privacy budgets to run
#epsilons = [0.5, 1, 3, 5, 10]
epsilons = [0.5]
delta = 3e-11

# Make sure the output directory exists before the (expensive) run starts
os.makedirs(output_dir, exist_ok=True)

for epsilon in epsilons:
    epsilon_start_time = time.time()

    # Anonymising the data
    anon = Anonymisation(epsilon=epsilon, delta=delta)
    anon.anonymiser(dl)

    # Making the marginals consistent
    cons = Consistenter(anon, dl.all_attrs)
    cons.make_consistent(iterations=5)

    # Initialising the Gradually Update Method
    gum = GraduallyUpdateMethod(dl, cons)
    gum.initialiser(view_iterations=100)

    # Training time completed: shared setup + this epsilon's own work
    train_time = setup_time + (time.time() - epsilon_start_time)
    print(f"\nTraining completed in {train_time:.2f} seconds")

    # Start timing the sampling process
    sample_start_time = time.time()

    # Generate synthetic data
    syn_data = gum.synthesize(iterations=100, num_records=num_samples)

    # Post-processing the private and the synthetic data with the same configuration
    # NOTE(review): variable names still say "adult" although datasetType is
    # selectable; kept unchanged because later cells may reference them.
    processor_private = RecordPostprocessor(dl.private_data, dl.configpath, dl.datainfopath, dl.decode_mapping)
    processor_public = RecordPostprocessor(syn_data, dl.configpath, dl.datainfopath, dl.decode_mapping)
    original_adult_data = processor_private.post_process()
    synthesised_dp_stoa_adult_data = processor_public.post_process()

    # Sampling time completed
    sample_time = time.time() - sample_start_time
    print(f"Sampling completed in {sample_time:.2f} seconds")

    # Save the synthesized data. With a single epsilon the established file name
    # is kept; with several, the epsilon is appended so that later runs do not
    # silently overwrite earlier ones.
    epsilon_tag = f'_epsilon={epsilon}' if len(epsilons) > 1 else ''
    output_path = f'{output_dir}/samples_num_samples={num_samples}{epsilon_tag}.csv'
    synthesised_dp_stoa_adult_data.to_csv(output_path, index=False)
    print(f"Generated {num_samples} synthetic samples and saved to {output_path}")

  self.private_data.fillna(self.fillna, inplace=True, downcast = self.datatypes)


Data loaded successfully. Shape: (29305, 13)
epsilon: 0.5, delta: 3e-11, rho: 0.002553189531618518
one-way sigma: 159.55673866797062
indif sigma: 2.3399999999999998e-09
multi-way sigma: 75.0347429560684


  if np.prod([len(dataloader.priv_one_way[frozenset([i])]) for i in marginal]) > np.sqrt(np.prod([len(dataloader.priv_one_way[frozenset([i])]) for i in dataloader.all_attrs])):
  if np.prod([len(dataloader.priv_one_way[frozenset([i])]) for i in marginal]) > np.sqrt(np.prod([len(dataloader.priv_one_way[frozenset([i])]) for i in dataloader.all_attrs])):
  if np.prod([len(dataloader.priv_one_way[frozenset([i])]) for i in marginal]) > np.sqrt(np.prod([len(dataloader.priv_one_way[frozenset([i])]) for i in dataloader.all_attrs])):
  if np.prod([len(dataloader.priv_one_way[frozenset([i])]) for i in marginal]) > np.sqrt(np.prod([len(dataloader.priv_one_way[frozenset([i])]) for i in dataloader.all_attrs])):


Iteration 1 of 5 to make the marginals consistent and normalise them
Iteration 2 of 5 to make the marginals consistent and normalise them
Iteration 3 of 5 to make the marginals consistent and normalise them
Iteration 4 of 5 to make the marginals consistent and normalise them
Iteration 5 of 5 to make the marginals consistent and normalise them
Iteration 1 of 100 completed to consist the marginal views
Iteration 2 of 100 completed to consist the marginal views
Iteration 3 of 100 completed to consist the marginal views
Iteration 4 of 100 completed to consist the marginal views
Iteration 5 of 100 completed to consist the marginal views
Iteration 6 of 100 completed to consist the marginal views
Iteration 7 of 100 completed to consist the marginal views
Iteration 8 of 100 completed to consist the marginal views
Iteration 9 of 100 completed to consist the marginal views
Iteration 10 of 100 completed to consist the marginal views
Iteration 11 of 100 completed to consist the marginal views
Iter

In [4]:
import importlib
import time
# Importing the necessary modules
from DataLoader import DataLoader
from Anonymisation import Anonymisation
from Consistenter import Consistenter
import PostProcessor
importlib.reload(PostProcessor)
from PostProcessor import RecordPostprocessor
from GUM import GraduallyUpdateMethod
import pandas as pd
import os
# Separate run sequence for the Rossmann dataset, which consists of two tables
datasetType = 'rossmann'

# Process both the train (child) and the parent table of the Rossmann dataset
table_types = ['train', 'parent']

# Privacy budgets to run; identical for every table, so defined once up front
epsilons = [0.5]
delta = 3e-11

config_dir = f'../data_config/{datasetType}'
output_dir = f'synthesized/dp-stoa/{datasetType}'

# Create the output directory if needed (no separate existence check required)
os.makedirs(output_dir, exist_ok=True)

for table_type in table_types:
    print(f"\nProcessing {table_type} table...")

    table_csv_path = f'{config_dir}/{datasetType}_{table_type}.csv'

    # Load the data; its row count is the number of synthetic records to generate
    train_df = pd.read_csv(table_csv_path)
    num_samples = len(train_df)

    # Start timing the training process (data loading is counted as training time)
    train_start_time = time.time()

    # Initialize PrivSyn components with the table-specific data and config files
    dl = DataLoader(table_csv_path,
                    f'{config_dir}/data_{table_type}.yaml',
                    f'{config_dir}/column_info_{table_type}.json',
                    f'{config_dir}/loading_data_{table_type}.json')
    dl.data_loader()
    dl.all_indifs(dl.private_data)

    # Time spent on the epsilon-independent setup above. It is added to every
    # epsilon's training time so each run is reported as if it had been run
    # alone, instead of accumulating the time of all previous epsilons.
    setup_time = time.time() - train_start_time

    for epsilon in epsilons:
        epsilon_start_time = time.time()

        # Anonymising the data
        anon = Anonymisation(epsilon=epsilon, delta=delta)
        anon.anonymiser(dl)

        # Making the marginals consistent
        cons = Consistenter(anon, dl.all_attrs)
        cons.make_consistent(iterations=5)

        # Initialising the Gradually Update Method
        gum = GraduallyUpdateMethod(dl, cons)
        gum.initialiser(view_iterations=100)

        # Training time completed: shared setup + this epsilon's own work
        train_time = setup_time + (time.time() - epsilon_start_time)
        print(f"\nTraining completed in {train_time:.2f} seconds")

        # Start timing the sampling process
        sample_start_time = time.time()

        # Generate synthetic data
        syn_data = gum.synthesize(iterations=100, num_records=num_samples)

        # Post-processing the private and the synthetic data with the same configuration
        processor_private = RecordPostprocessor(dl.private_data, dl.configpath, dl.datainfopath, dl.decode_mapping)
        processor_public = RecordPostprocessor(syn_data, dl.configpath, dl.datainfopath, dl.decode_mapping)
        original_data = processor_private.post_process()
        synthesised_data = processor_public.post_process()

        # Sampling time completed
        sample_time = time.time() - sample_start_time
        print(f"Sampling completed in {sample_time:.2f} seconds")

        # Save the synthesized data: the 'train' table is stored as the child
        # table, anything else as the parent table. With a single epsilon the
        # established file name is kept; with several, the epsilon is appended
        # so that later runs do not silently overwrite earlier ones.
        table_prefix = 'child' if table_type == 'train' else 'parent'
        epsilon_tag = f'_epsilon={epsilon}' if len(epsilons) > 1 else ''
        output_path = f'{output_dir}/{table_prefix}_samples_num_samples={num_samples}{epsilon_tag}.csv'
        synthesised_data.to_csv(output_path, index=False)
        print(f"Generated {num_samples} synthetic samples and saved to {output_path}")


Processing train table...
Data loaded successfully. Shape: (68015, 8)


  train_df = pd.read_csv(f'../data_config/{datasetType}/{datasetType}_{table_type}.csv')
  self.private_data.fillna(self.fillna, inplace=True, downcast = self.datatypes)
  indif += np.abs(marginalrow['n'][i]*marginalcol['n'][j]/n - marginalrowcol.xs((indexrow[i], indexcol[j]))['n'])
  indif += np.abs(marginalrow['n'][i]*marginalcol['n'][j]/n - marginalrowcol.xs((indexrow[i], indexcol[j]))['n'])
  indif += np.abs(marginalrow['n'][i]*marginalcol['n'][j]/n - marginalrowcol.xs((indexrow[i], indexcol[j]))['n'])
  indif += np.abs(marginalrow['n'][i]*marginalcol['n'][j]/n - marginalrowcol.xs((indexrow[i], indexcol[j]))['n'])
  indif += np.abs(marginalrow['n'][i]*marginalcol['n'][j]/n - marginalrowcol.xs((indexrow[i], indexcol[j]))['n'])


epsilon: 0.5, delta: 3e-11, rho: 0.002553189531618518
one-way sigma: 125.16660369126257
indif sigma: 8.4e-10
multi-way sigma: 49.47644433150358
Iteration 1 of 5 to make the marginals consistent and normalise them
Iteration 2 of 5 to make the marginals consistent and normalise them
Iteration 3 of 5 to make the marginals consistent and normalise them
Iteration 4 of 5 to make the marginals consistent and normalise them
Iteration 5 of 5 to make the marginals consistent and normalise them
Iteration 1 of 100 completed to consist the marginal views
Iteration 2 of 100 completed to consist the marginal views
Iteration 3 of 100 completed to consist the marginal views
Iteration 4 of 100 completed to consist the marginal views
Iteration 5 of 100 completed to consist the marginal views
Iteration 6 of 100 completed to consist the marginal views
Iteration 7 of 100 completed to consist the marginal views
Iteration 8 of 100 completed to consist the marginal views
Iteration 9 of 100 completed to consist

  self.private_data.fillna(self.fillna, inplace=True, downcast = self.datatypes)


Data loaded successfully. Shape: (1115, 10)
epsilon: 0.5, delta: 3e-11, rho: 0.002553189531618518
one-way sigma: 139.9405171832196
indif sigma: 1.35e-09
multi-way sigma: 22.126538562045226
Iteration 1 of 5 to make the marginals consistent and normalise them
Iteration 2 of 5 to make the marginals consistent and normalise them


  if np.prod([len(dataloader.priv_one_way[frozenset([i])]) for i in marginal]) > np.sqrt(np.prod([len(dataloader.priv_one_way[frozenset([i])]) for i in dataloader.all_attrs])):
  if np.prod([len(dataloader.priv_one_way[frozenset([i])]) for i in marginal]) > np.sqrt(np.prod([len(dataloader.priv_one_way[frozenset([i])]) for i in dataloader.all_attrs])):


Iteration 3 of 5 to make the marginals consistent and normalise them
Iteration 4 of 5 to make the marginals consistent and normalise them
Iteration 5 of 5 to make the marginals consistent and normalise them
Iteration 1 of 100 completed to consist the marginal views
Iteration 2 of 100 completed to consist the marginal views
Iteration 3 of 100 completed to consist the marginal views
Iteration 4 of 100 completed to consist the marginal views
Iteration 5 of 100 completed to consist the marginal views
Iteration 6 of 100 completed to consist the marginal views
Iteration 7 of 100 completed to consist the marginal views
Iteration 8 of 100 completed to consist the marginal views
Iteration 9 of 100 completed to consist the marginal views
Iteration 10 of 100 completed to consist the marginal views
Iteration 11 of 100 completed to consist the marginal views
Iteration 12 of 100 completed to consist the marginal views
Iteration 13 of 100 completed to consist the marginal views
Iteration 14 of 100 co