In [1]:
import pandas as pd
import json
import yaml
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import config

from preprocess.impute import impute_data

import torch 


from itertools import product
from pathlib import Path

from sdmetrics.reports.single_table import QualityReport
from sdmetrics.reports.single_table import DiagnosticReport
from sdmetrics.single_table import DCRBaselineProtection, DCROverfittingProtection
from sdmetrics.single_column import KSComplement, TVComplement
from sdv.metadata import SingleTableMetadata


from synthcity.metrics.eval_statistical import KolmogorovSmirnovTest, InverseKLDivergence, SurvivalKMDistance
from synthcity.metrics.eval_privacy import DomiasMIAPrior, kAnonymization, kMap, DeltaPresence
from synthcity.plugins.core.dataloader import GenericDataLoader, SurvivalAnalysisDataLoader

from syntheval.metrics.privacy.metric_nn_adversarial_accuracy import NearestNeighbourAdversarialAccuracy
from syntheval.metrics.privacy.metric_MIA_classification import MIAClassifier
from sklearn.model_selection import KFold
import seaborn as sns
from scipy.stats import ks_2samp

# synthcity absolute
from evaluate import DataLeakageLinear, DataLeakageXGB, exact_match, membership_inference, NearestNeighbourAdversarialAccuracy
from skimage.exposure import match_histograms



In [None]:
# Column classification, read from the YAML file that config.COLUMNS_CLASS points to.
# NOTE(review): if that file only holds plain mappings/lists, yaml.safe_load would suffice - confirm.
with open(config.COLUMNS_CLASS, 'r') as yaml_file:
    column_class = yaml.load(yaml_file, Loader=yaml.FullLoader)

# Run settings for this cell.
impute_method = 'mice'              # forwarded to impute_data(method=...)
data_impute_method = 'add_missing'  # appended to every synthetic CSV / index path prefix
apply_histogram_equalize = False    # when True, synthetic 'Days' is histogram-matched to the real data

# Real cohort and its train / validation / test index arrays.
real_baseline_df = pd.read_parquet(config.BASELINE_COMBINE_FILE).reset_index()
real_train_idx = np.load(config.INDEX_TRAIN)
real_val_idx = np.load(config.INDEX_VAL)
real_test_idx = np.load(config.INDEX_TEST)

# Path prefixes per generator: (synthetic CSV, train index, validation index, test index).
# The insertion order is the order in which the frames are imputed further down.
_synthetic_sources = {
    'adgan': (config.ADGAN_SYNTHETIC_BASELINE, config.ADGAN_INDEX_TRAIN,
              config.ADGAN_INDEX_VAL, config.ADGAN_INDEX_TEST),
    'cgan': (config.CGAN_SYNTHETIC_BASELINE, config.CGAN_INDEX_TRAIN,
             config.CGAN_INDEX_VAL, config.CGAN_INDEX_TEST),
    'survadgan': (config.SURVADGAN_SYNTHETIC_BASELINE, config.SURVADGAN_INDEX_TRAIN,
                  config.SURVADGAN_INDEX_VAL, config.SURVADGAN_INDEX_TEST),
    'tvae': (config.TVAE_SYNTHETIC_BASELINE, config.TVAE_INDEX_TRAIN,
             config.TVAE_INDEX_VAL, config.TVAE_INDEX_TEST),
    'ddpm': (config.DDPM_SYNTHETIC_BASELINE, config.DDPM_INDEX_TRAIN,
             config.DDPM_INDEX_VAL, config.DDPM_INDEX_TEST),
    'nflow': (config.NFLOW_SYNTHETIC_BASELINE, config.NFLOW_INDEX_TRAIN,
              config.NFLOW_INDEX_VAL, config.NFLOW_INDEX_TEST),
}

# Load every synthetic table together with its (train, val, test) index arrays.
_synthetic_frames = {}
_synthetic_splits = {}
for _model, (_csv_prefix, *_index_prefixes) in _synthetic_sources.items():
    _frame = pd.read_csv(_csv_prefix + data_impute_method + '.csv')
    # Make the column names match the real table by removing '_1' from them.
    # NOTE(review): str.replace strips every '_1' occurrence, not only a trailing
    # suffix (e.g. 'x_10' would become 'x0') - confirm no column is affected.
    _frame.columns = [col.replace('_1', '') for col in _frame.columns]
    _synthetic_frames[_model] = _frame
    _synthetic_splits[_model] = tuple(
        np.load(_prefix + data_impute_method + '.npy') for _prefix in _index_prefixes
    )

# Impute each synthetic table using its own split indices.
for _model in list(_synthetic_frames):
    _train_idx, _val_idx, _test_idx = _synthetic_splits[_model]
    _synthetic_frames[_model] = impute_data(
        _synthetic_frames[_model], _train_idx, _val_idx, _test_idx, method=impute_method
    )

# Impute the real table with 'ENC_HN' as index, then restore it as a column.
real_baseline_df = impute_data(
    real_baseline_df.set_index('ENC_HN'),
    real_train_idx, real_val_idx, real_test_idx,
    method=impute_method,
).reset_index()
real_baseline_df['dead'] = real_baseline_df['dead'].astype(int)
real_baseline_df = real_baseline_df.drop(columns=['admit'], errors='ignore')

# Give every table the real table's column order, index it by 'ENC_HN',
# and store the 'dead' column as int.
column_order = real_baseline_df.columns.tolist()
for _model in list(_synthetic_frames):
    _frame = _synthetic_frames[_model][column_order].set_index('ENC_HN')
    _frame['dead'] = _frame['dead'].astype(int)
    _synthetic_frames[_model] = _frame
real_baseline_df = real_baseline_df[column_order].set_index('ENC_HN')

if apply_histogram_equalize:
    # Reference distribution: real 'Days' values selected by the train + validation indices.
    use_index = np.concatenate([real_train_idx, real_val_idx])
    _reference_days = real_baseline_df['Days'][use_index].to_numpy()
    for _frame in _synthetic_frames.values():
        _frame['Days'] = match_histograms(_frame['Days'].to_numpy(), _reference_days)

# Publish the per-generator objects under the names the following cells use.
adgan_df = _synthetic_frames['adgan']
cgan_df = _synthetic_frames['cgan']
survadgan_df = _synthetic_frames['survadgan']
tvae_df = _synthetic_frames['tvae']
ddpm_df = _synthetic_frames['ddpm']
nflow_df = _synthetic_frames['nflow']

adgan_train_idx, adgan_val_idx, adgan_test_idx = _synthetic_splits['adgan']
cgan_train_idx, cgan_val_idx, cgan_test_idx = _synthetic_splits['cgan']
survadgan_train_idx, survadgan_val_idx, survadgan_test_idx = _synthetic_splits['survadgan']
tvae_train_idx, tvae_val_idx, tvae_test_idx = _synthetic_splits['tvae']
ddpm_train_idx, ddpm_val_idx, ddpm_test_idx = _synthetic_splits['ddpm']
nflow_train_idx, nflow_val_idx, nflow_test_idx = _synthetic_splits['nflow']

# synthcity data loaders, one per table.
# NOTE(review): time_to_event_column is passed to GenericDataLoader here although
# SurvivalAnalysisDataLoader is also imported - confirm the argument is honoured.
_loader_options = dict(target_column="dead", time_to_event_column='Days')
real = GenericDataLoader(real_baseline_df, **_loader_options)
cgan = GenericDataLoader(cgan_df, **_loader_options)
tvae = GenericDataLoader(tvae_df, **_loader_options)
survadgan = GenericDataLoader(survadgan_df, **_loader_options)
adgan = GenericDataLoader(adgan_df, **_loader_options)
ddpm = GenericDataLoader(ddpm_df, **_loader_options)
nflow = GenericDataLoader(nflow_df, **_loader_options)

# From here on "train" means train + validation for the real cohort.
real_train_idx = np.concatenate([real_train_idx, real_val_idx])

In [3]:
def prepare_sdv(df):
    """Pair a dataframe with SDV metadata auto-detected from it.

    Parameters
    ----------
    df
        Table handed to ``SingleTableMetadata.detect_from_dataframe``; the same
        object is returned as the first element of the result.

    Returns
    -------
    tuple
        ``(df, metadata)`` where ``metadata`` is a fresh ``SingleTableMetadata``
        on which ``detect_from_dataframe(data=df)`` has been called.
    """
    detected = SingleTableMetadata()
    detected.detect_from_dataframe(data=df)
    return (df, detected)

# Synthetic tables to evaluate, keyed by the label used in the reports.
# CTGAN (cgan_df) is currently excluded.
datasets = dict(
    TVAE=tvae_df,
    SurvicalAdGAN=survadgan_df,
    ADSGAN=adgan_df,
    DDPM=ddpm_df,
    NFlow=nflow_df,
)




### Membership inference

In [4]:
# Synthetic tables to evaluate, keyed by report label (CTGAN / cgan_df is currently excluded).
datasets = dict(
    TVAE=tvae_df,
    SurvicalAdGAN=survadgan_df,
    ADSGAN=adgan_df,
    DDPM=ddpm_df,
    NFlow=nflow_df,
)
kf = KFold(n_splits=4, shuffle=True, random_state=42)

# One membership_inference result per (model, fold); a failing fold is reported and skipped.
fold_results = {}
for name, synth_df in datasets.items():
    for _, fold_positions in kf.split(real_train_idx):
        # KFold yields positions into real_train_idx; map them back to the
        # index labels that real_baseline_df.loc expects.
        fold_labels = real_train_idx[fold_positions]
        try:
            outcome = membership_inference(
                real_baseline_df.loc[fold_labels],
                real_baseline_df.loc[real_test_idx],
                synth_df,
                synth_df.columns.tolist(),
                percentile_threshold=0.5,
            )
        except Exception as e:
            print(f"Failed for {name}: {e}")
        else:
            fold_results.setdefault(name, []).append(outcome)

# Average the per-fold results of each model; final frame has one row per model.
mia_results = pd.DataFrame(
    {
        model: pd.DataFrame(records).mean().to_dict()
        for model, records in fold_results.items()
    }
).T

Threshold for membership inference: 0.25339610662201706, min: 0.1867480696580879, max: 2.2918881037324996
Threshold for membership inference: 0.24654745942607903, min: 0.17748376188343173, max: 2.376296646743714
Threshold for membership inference: 0.23322751645492634, min: 0.17745494586900706, max: 2.3002320698836116
Threshold for membership inference: 0.24168552252068692, min: 0.1284353829636191, max: 2.287897730019802
Threshold for membership inference: 0.2908520465205538, min: 0.1516015345008695, max: 1.9500324616895972
Threshold for membership inference: 0.283485867093262, min: 0.1534380115944476, max: 1.960966206522294
Threshold for membership inference: 0.2872587253717827, min: 0.1519571023146024, max: 1.9516631376900937
Threshold for membership inference: 0.2831165209502698, min: 0.14882708233506609, max: 1.9528096401347237
Threshold for membership inference: 0.2920605468325524, min: 0.21777380516681366, max: 2.1299888560452773
Threshold for membership inference: 0.2850831046743

### Nearest-Neighbour Adversarial Accuracy (NNAA)

In [5]:

# Synthetic tables to evaluate, keyed by report label (CTGAN / cgan_df is currently excluded).
datasets = dict(
    TVAE=tvae_df,
    SurvicalAdGAN=survadgan_df,
    ADSGAN=adgan_df,
    DDPM=ddpm_df,
    NFlow=nflow_df,
)

# NOTE: NearestNeighbourAdversarialAccuracy is imported twice in the import cell
# (from syntheval, then from the local `evaluate` module); the later import wins,
# so the class used here is the one from `evaluate`.
results = {}
for name, synth_df in datasets.items():
    # Only pass discrete columns that actually exist in this synthetic table.
    # NOTE(review): the continuous list is passed through unfiltered - confirm
    # every entry of column_class['continuous'] is present in the tables.
    discrete_present = [c for c in column_class['discrete'] if c in synth_df.columns]
    evaluator = NearestNeighbourAdversarialAccuracy(
        real_baseline_df.loc[real_train_idx],
        synth_df,
        hout_data=real_baseline_df.loc[real_test_idx],
        cat_cols=discrete_present,
        num_cols=column_class['continuous'],
        nn_dist='euclid',
    )
    priv_loss = evaluator.evaluate()['priv_loss']
    results.setdefault(name, []).append(priv_loss)

# Rows = models while averaging; the result is transposed to a single 'NNAA' row.
per_model = pd.DataFrame(results).T
nnaa_results = per_model.mean(axis=1).to_frame(name='NNAA').T

In [6]:
# Display the NNAA summary: a single row named 'NNAA' with one column per model.
nnaa_results

Unnamed: 0,TVAE,SurvicalAdGAN,ADSGAN,DDPM,NFlow
NNAA,0.007039,1.8e-05,-0.00485,0.012333,-0.009634


### Attribute inference

In [7]:

# Attribute inference: score each synthetic table (and the real hold-out as a
# baseline) with the DataLeakage* metrics against the real training rows.
# Every column that is not in well_known_columns is declared sensitive.
well_known_columns = ['HIGH', 'BW', 'Age', 'Gender']
sensitive_features = [col for col in real_baseline_df.columns if col not in well_known_columns]


kf = KFold(n_splits=5, shuffle=True, random_state=42)
datasets = {
    'TVAE': tvae_df,
    'SurvicalAdGAN': survadgan_df,
    'ADSGAN': adgan_df,
    # 'CTGAN': cgan_df,
    'DDPM': ddpm_df,
    'NFlow': nflow_df
}

real_generic_train = GenericDataLoader(real_baseline_df.loc[real_train_idx], target_column="dead", time_to_event_column="Days", sensitive_features=sensitive_features)
real_generic_test = GenericDataLoader(real_baseline_df.loc[real_test_idx], target_column="dead", time_to_event_column="Days", sensitive_features=sensitive_features)

metrics = {
    'DataLeakageLinear': DataLeakageLinear(use_cache=False),
    'DataLeakageXGB': DataLeakageXGB(use_cache=False),
}
results = {}
for name, df in datasets.items():
    model_scores = results.setdefault(name, {})
    for _, test_index in kf.split(df):
        # KFold returns positional indices, so rows are selected by position.
        # Going through df.loc[df.index[...]] would pull in extra rows whenever
        # an index label occurs more than once.
        df_test = df.iloc[test_index]
        loader = GenericDataLoader(
            df_test,
            target_column="dead",
            time_to_event_column="Days",
            sensitive_features=sensitive_features
        )
        for metric_name, metric in metrics.items():
            # Create the list up front so the metric is present even if every fold fails.
            fold_scores = model_scores.setdefault(metric_name, [])
            try:
                score = metric.evaluate(real_generic_train, loader)
                fold_scores.append(score['mean'])
            except Exception as e:
                # Report and skip the fold. Storing the error text next to the
                # numeric scores would make the averaging step below raise.
                print(f"Error in {name} with {metric_name}: {e}")
    # average the results across folds
    for metric_name in metrics.keys():
        fold_scores = model_scores.get(metric_name, [])
        print(f"Average {metric_name} for {name}: {fold_scores}")
        # NaN marks a metric for which no fold produced a score.
        model_scores[metric_name] = np.mean(fold_scores) if fold_scores else np.nan

# Baseline: the same metrics with the real hold-out set in place of synthetic data.
real_scores = {}
for metric_name, metric in metrics.items():
    score = metric.evaluate(real_generic_train, real_generic_test)
    real_scores[metric_name] = np.mean([score['mean']])
results['Real'] = real_scores

# Final table: one row per model (plus 'Real'), one column per metric.
results = pd.DataFrame(results).T

Average DataLeakageLinear for TVAE: [0.510909432559009, 0.5130745768357917, 0.5122072944080933, 0.512303613622577, 0.510539951741288]
Average DataLeakageXGB for TVAE: [0.5148245352722909, 0.5170378649098596, 0.516363284575169, 0.5157819058364934, 0.5166009997042079]
Average DataLeakageLinear for SurvicalAdGAN: [0.5031638299240276, 0.5032248523867799, 0.5025824911753275, 0.5028361856586261, 0.5044970166777859]
Average DataLeakageXGB for SurvicalAdGAN: [0.5010751023309395, 0.5002720076525125, 0.49980621113942564, 0.4990448208864398, 0.5005386492901894]
Average DataLeakageLinear for ADSGAN: [0.5009023437184554, 0.5005971888553601, 0.503677958961009, 0.5004140424469119, 0.5021840790501714]
Average DataLeakageXGB for ADSGAN: [0.49810367025618796, 0.49713993744688734, 0.49950437224507577, 0.4977374756546709, 0.5004005805415724]
Average DataLeakageLinear for DDPM: [0.5063153055050696, 0.5055699678679781, 0.5069963709340624, 0.5076484769026776, 0.5056728371204835]
Average DataLeakageXGB for DD

In [8]:
# `results` is expected to be the attribute-inference table (models as rows);
# transposing puts metrics on the rows so it lines up with the other two tables.
priv1 = pd.DataFrame(results).T

# Stack attribute-inference, NNAA and membership-inference rows (columns = models).
combined = pd.concat([priv1, nnaa_results, mia_results.T], axis=0, sort=False).round(3)

# Create the output folder first: to_excel fails if the directory does not exist.
report_path = Path("results/privacy_report.xlsx")
report_path.parent.mkdir(parents=True, exist_ok=True)
combined.to_excel(report_path)

### Exact match

In [9]:
# Synthetic tables to evaluate, keyed by report label (CTGAN / cgan_df is currently excluded).
datasets = dict(
    TVAE=tvae_df,
    SurvicalAdGAN=survadgan_df,
    ADSGAN=adgan_df,
    DDPM=ddpm_df,
    NFlow=nflow_df,
)

# Print the exact_match result of each synthetic table against the full real
# table, in the order of `datasets`.
for name, df in datasets.items():
    match_result = exact_match(real_baseline_df, df, chunk_size=2000, rtol=5e-2)
    print(match_result)

0
0
0
0
0
