In [1]:
import pandas as pd
from pysdg.synth.generate import Generator  
from pysdg.privacy.mmbrshp import calc_mmbrshp_risk



In [2]:
# Load the full population dataset from the CSV file into a DataFrame.
raw_data = pd.read_csv(filepath_or_buffer='raw_data.csv')

In [3]:
# Partition the population into a training split and a holdout split.
# Half of the rows are sampled (with a fixed seed so the split is repeatable) for training;
# the holdout is every row whose index label was not sampled. Adjust `frac` to your desired split ratio.
raw_train = raw_data.sample(frac=0.5, random_state=1)
raw_holdout = raw_data.drop(index=raw_train.index)

# Persist both splits as CSV files (without the index column) so they can be loaded by file path later.
for split_frame, split_path in ((raw_train, 'raw_train.csv'), (raw_holdout, 'raw_holdout.csv')):
    split_frame.to_csv(split_path, index=False)

In [4]:
# Train the model on the training split only: create the generator, load the training CSV together with its metadata file, then fit.
gen1=Generator("synthcity_bayesian_network")
gen1.load('raw_train.csv', 'raw_info.json')
gen1.train()

# Extract the ENCODED version of the real training dataset. Only the ENCODED versions should be used to calculate the membership risk, otherwise an error will be raised.
enc_real_train=gen1.enc_real

# Generate the synthetic data. In this example, we request one synthetic dataset (num_synths=1) with the same number of rows as the encoded real training dataset.
gen1.gen(num_rows=len(enc_real_train), num_synths=1)

# Extract the ENCODED version of the synthetic dataset: the first entry of enc_synths (presumably the only one, since num_synths=1).
enc_synth=gen1.enc_synths[0]

2025-02-04 19:05:26,227 - pysdg - INFO - 1222045 - generate.py:88 - **************Started logging the generator: synthcity_bayesian_network, num_cores= None.**************
2025-02-04 19:05:26,269 - pysdg - INFO - 1222045 - generate.py:209 - Checking the input metadata for any conflict in variable indexes - Passed.
2025-02-04 19:05:27,335 - pysdg - INFO - 1222045 - generate.py:277 - The dataset ['tutorial_data'] is loaded into the generator synthcity_bayesian_network
[2025-02-04T19:05:29.154864-0500][1222045][CRITICAL] module disabled: /share/personal/skababji/conda_envs/pysdg_dev/lib/python3.10/site-packages/synthcity/plugins/generic/plugin_goggle.py
2025-02-04 19:05:35,061 - pysdg - INFO - 1222045 - generate.py:662 - Started training using synthcity_bayesian_network...
INFO:pysdg:Started training using synthcity_bayesian_network...
2025-02-04 19:06:33,132 - pysdg - INFO - 1222045 - generate.py:667 - Completed training using synthcity_bayesian_network.
INFO:pysdg:Completed training using synthcity_bayesian_network.

In [5]:
# Create a second generator with no model name (the log output reports it as 'dummy') solely to obtain the ENCODED version of the holdout dataset.
# It is loaded with the same 'raw_info.json' metadata file as the training split; no training or generation is performed with it here.
gen2=Generator()
gen2.load('raw_holdout.csv', 'raw_info.json')
enc_real_holdout=gen2.enc_real

2025-02-04 19:06:34,256 - pysdg - INFO - 1222045 - generate.py:88 - **************Started logging the generator: dummy, num_cores= None.**************
INFO:pysdg:**************Started logging the generator: dummy, num_cores= None.**************
2025-02-04 19:06:34,276 - pysdg - INFO - 1222045 - generate.py:209 - Checking the input metadata for any conflict in variable indexes - Passed.
INFO:pysdg:Checking the input metadata for any conflict in variable indexes - Passed.


2025-02-04 19:06:35,403 - pysdg - INFO - 1222045 - generate.py:277 - The dataset ['tutorial_data'] is loaded into the generator dummy
INFO:pysdg:The dataset ['tutorial_data'] is loaded into the generator dummy


In [6]:
# Strip the tracking columns from every ENCODED dataset before computing the risk.
# pysdg adds columns whose names include '_missing' for tracking purposes; they are not part of
# the original dataset, so any column whose name contains that literal substring is dropped.
enc_real_train = enc_real_train.loc[
    :, ~enc_real_train.columns.str.contains('_missing', regex=False)
]
enc_synth = enc_synth.loc[
    :, ~enc_synth.columns.str.contains('_missing', regex=False)
]
enc_real_holdout = enc_real_holdout.loc[
    :, ~enc_real_holdout.columns.str.contains('_missing', regex=False)
]


In [7]:
# Read the population size from the ENCODED metadata held by the trained generator
# (the first element of its 'population_size' entry).
population_size = gen1.enc_real_info['population_size'][0]

# Choose the size of the attack dataset. In this example it is 20% of the number of
# training rows, truncated to an integer.
attack_data_size = int(len(enc_real_train) * 0.2)

In [8]:
# Membership risk with the default quasi-identifier setting: no quasi_vars argument is passed,
# so all variable names are considered by default.
res = calc_mmbrshp_risk(
    enc_synth,
    enc_real_train,
    enc_real_holdout,
    population_size=population_size,
    attack_size=attack_data_size,
)
# NOTE(review): the label below says "Relative" while the key read is 'f1' — confirm against
# the calc_mmbrshp_risk documentation that this is the intended metric name.
print(f"Relative F1 score = {res['f1']}")
print(f"Naive F1 score = {res['f_naive']}")


Calculating membership disclosure risk
Relative F1 score = 0.0099601593625498
Naive F1 score = 0.010272899577270183


In [9]:
# Membership risk with every variable explicitly passed as a quasi-identifier:
# the quasi-identifiers are the column labels of the (already filtered) encoded training data.
quasi_identifiers = enc_real_train.columns

res = calc_mmbrshp_risk(
    enc_synth,
    enc_real_train,
    enc_real_holdout,
    population_size=population_size,
    attack_size=attack_data_size,
    quasi_vars=quasi_identifiers,
)
print(f"Relative F1 score = {res['f1']}")
print(f"Naive F1 score = {res['f_naive']}")


Calculating membership disclosure risk
Relative F1 score = 0.009950248756218907
Naive F1 score = 0.010272899577270183


In [10]:
# Membership risk with a selected subset of variables as quasi-identifiers.
# Here the subset is the list of ENCODED variable names stored under 'quasi_names' in the
# generator's metadata, corresponding to the indexes initially defined in the input json file.
quasi_identifiers = gen1.enc_real_info['quasi_names']

res = calc_mmbrshp_risk(
    enc_synth,
    enc_real_train,
    enc_real_holdout,
    population_size=population_size,
    attack_size=attack_data_size,
    quasi_vars=quasi_identifiers,
)
print(f"Relative F1 score = {res['f1']}")
print(f"Naive F1 score = {res['f_naive']}")

Calculating membership disclosure risk
Relative F1 score = 0.009950248756218907
Naive F1 score = 0.010272899577270183
