In [None]:
## Importing libraries
import pandas as pd
import numpy as np
import pathlib

from sklearn.model_selection import train_test_split

import os

## For synthesis
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer, TVAESynthesizer
from sdv.datasets.local import load_csvs
from sdv.metadata import Metadata
## For evaluating

from sdv.evaluation.single_table import evaluate_quality
from syntheval import SynthEval

In [None]:
#Reading in different datasets
def read_Data(dependency):
    """Load the nine datasets belonging to one dependency level.

    The folder is resolved relative to the module-level ``parent_dir`` and
    read with ``load_csvs``; the result is indexed by file name, combining
    a level-specific prefix with every sample-size / missing-percentage
    pair.

    Parameters
    ----------
    dependency : str
        "simulated", "high" or "moderate". Any other value is treated as
        the low-dependency case.

    Returns
    -------
    tuple
        Nine datasets ordered by sample size (10000, 25000, 50000) and,
        within each size, by missing percentage (0, 10, 20), followed by
        the ``dependency`` argument itself.
    """
    # (level, file-name prefix, folder below parent_dir)
    known_levels = (
        ("simulated", "", "Simulated Data/"),
        ("high", "high_", "Simulating Data/Dependency/DataWithRelations/"),
        ("moderate", "moderate_", "Simulating Data/Dependency/DataModerateRelations/"),
    )

    # Fallback used when no known level matches
    prefix, folder = "low_", "Simulating Data/Dependency/DataNoRelations/"
    for level, level_prefix, level_folder in known_levels:
        if dependency == level:
            prefix, folder = level_prefix, level_folder
            break

    all_files = load_csvs(f'{parent_dir}/{folder}')

    # Every sample size combined with every missing percentage
    frames = [
        all_files[f'{prefix}{n_obs}_obs_{pct_missing}_percent_missing']
        for n_obs in (10000, 25000, 50000)
        for pct_missing in (0, 10, 20)
    ]
    return (*frames, dependency)


In [None]:
# Helper functions

def syntheval_preprocess(df, syn_df, holdout_df=None):
    """Strip identifier and date columns before the SynthEval evaluation.

    The columns 'id', 'hashedId', 'arcsId' and 'dateOfBirth' are removed
    from every frame that is passed in. The inputs themselves are not
    modified; new frames are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Real data.
    syn_df : pandas.DataFrame
        Synthetic data.
    holdout_df : pandas.DataFrame, optional
        Holdout data; when omitted only two frames are returned.

    Returns
    -------
    tuple
        ``(real, fake)`` or, when ``holdout_df`` is given,
        ``(real, fake, holdout)``.

    Raises
    ------
    KeyError
        If one of the four columns is absent from a frame.
    """
    excluded = ['id', 'hashedId', 'arcsId', 'dateOfBirth']
    stripped = [frame.drop(columns=excluded) for frame in (df, syn_df)]

    if holdout_df is None:
        return stripped[0], stripped[1]
    return stripped[0], stripped[1], holdout_df.drop(columns=excluded)

#Calculating MIA score
def mia_score(recall,precision):
    """Combine recall and precision into their harmonic mean.

    Computes ``2 * recall * precision / (recall + precision)``.

    Parameters
    ----------
    recall, precision : float
        The two values to combine; the formula is symmetric in them.

    Returns
    -------
    float
        The harmonic mean, or 0.0 when ``recall + precision`` is zero so
        that the score stays defined instead of dividing by zero.
    """
    denominator = recall + precision
    # Both inputs zero: report a score of 0 rather than raising / yielding NaN
    if denominator == 0:
        return 0.0
    return 2 * (recall * precision) / denominator

# Synthesize and privacy protection metrics
def synthesize_data(train_df, n_rows, holdout_df=None, seed=42):
    """Fit three synthesizers on ``train_df`` and score each one's privacy.

    A Gaussian copula, a CTGAN and a TVAE synthesizer are fitted on the
    training data, ``n_rows`` rows are sampled from each, and every sample
    is evaluated with SynthEval using the presets in 'privacyMetrics.json'.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data. Must contain the columns 'arcsId', 'hashedId',
        'countryCode', 'Language' and 'dateOfBirth'. It is not modified.
    n_rows : int
        Number of synthetic rows sampled from each fitted synthesizer.
    holdout_df : pandas.DataFrame, optional
        Holdout data handed to SynthEval. When omitted, SynthEval is
        created without a holdout frame.
    seed : int, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    dict
        Maps synthesizer name ('GC', 'CTGAN', 'TVAE') to a dict with the
        keys 'nndr', 'nnaa' and 'mia'. A synthesizer whose evaluation
        raised is reported on stdout and left out of the result.
    """
    # Creating metadata for synthesizers
    metadata = Metadata.detect_from_dataframe(data=train_df)

    # Override the detected types for the identifier and categorical columns
    metadata.update_column(column_name="arcsId", sdtype="id")
    metadata.update_column(column_name="hashedId", sdtype="id")
    metadata.update_column(column_name="countryCode",sdtype="categorical")
    metadata.update_column(column_name="Language",sdtype="categorical")

    synthesizers = {
        'GC': GaussianCopulaSynthesizer(metadata),
        'CTGAN': CTGANSynthesizer(metadata,cuda=True),
        'TVAE': TVAESynthesizer(metadata,cuda=True)
    }

    print('Training Synthetic Data')

    # Fit each synthesizer and draw its synthetic sample
    synthetic_data = {}
    for name, syn in synthesizers.items():
        print(f'Training {name}')
        syn.fit(train_df)
        print(f'Finished Training {name}')
        sampled = syn.sample(num_rows=n_rows)
        sampled['dateOfBirth'] = sampled['dateOfBirth'].astype(str)
        synthetic_data[name] = sampled

    # Privacy evaluation
    results = {}
    # NOTE(review): `sens` is passed as the second positional argument of
    # SynthEval.evaluate - confirm that parameter is meant to receive these
    # column names.
    sens = ['id', 'dateOfBirth', 'emailAddress']
    # Convert on a new frame so the caller's train_df keeps its original dtype
    train_eval = train_df.assign(dateOfBirth=train_df['dateOfBirth'].astype(str))

    for name, syn_df in synthetic_data.items():
        try:
            # NNDR, NNAA, and MIA via Syntheval
            print(f'NNDR and NNAA {name}')
            if holdout_df is None:
                # Without a holdout frame the helper returns only two frames
                real, fake = syntheval_preprocess(train_eval, syn_df)
                holdout = None
            else:
                real, fake, holdout = syntheval_preprocess(train_eval, syn_df, holdout_df)
            evaluator = SynthEval(real,holdout_dataframe=holdout)
            privacy_metrics = evaluator.evaluate(fake, sens, presets_file='privacyMetrics.json')

            print(f'Finished NNDR and NNAA {name}')

            # Rows are addressed by position; presumably their order is fixed
            # by privacyMetrics.json - verify if that file changes.
            mia=mia_score(privacy_metrics.iloc[4]['val'],privacy_metrics.iloc[5]['val'])

            results[name] = {
                'nndr': privacy_metrics.iloc[0]['val'],
                'nnaa': privacy_metrics.iloc[2]['val'],
                'mia': mia
            }

        except Exception as e:
            # Best effort: report the failure and carry on with the next synthesizer
            print(f"Error evaluating {name}: {str(e)}")
            continue

    return results



In [None]:
# Saving privacy metrics
def save_Metrics(dependency,df_results):
    """Write the privacy metrics table to 'privacy_metrics1.csv'.

    For "simulated" the file is written to the current working directory.
    For every other value it is written to a 'Privacy Metrics' folder
    below the module-level ``parent_dir``: 'High' for "high", 'Moderate'
    for "moderate" and 'Low' for anything else. That folder is created if
    it does not exist yet, so a finished run is not lost to a missing
    directory.

    Parameters
    ----------
    dependency : str
        Dependency level that selects the output location.
    df_results : pandas.DataFrame
        Table to save; written without its index.
    """
    if dependency == "simulated":
        path = ''
    else:
        if dependency == "high":
            level = 'High'
        elif dependency == "moderate":
            level = 'Moderate'
        else:
            level = 'Low'
        path = f'{parent_dir}/Simulating Data/Dependency/Dependency Synthetic Data/{level}/Privacy Metrics/'
        # to_csv does not create missing folders
        os.makedirs(path, exist_ok=True)

    df_results.to_csv(f'{path}privacy_metrics1.csv',index=False)

In [None]:
def entirePipeline(dependency):
    """Run loading, splitting, synthesis/evaluation and saving for one level.

    Steps:
      1. Load the nine datasets with ``read_Data``.
      2. Parse 'dateOfBirth' as datetime (unparseable values become NaT)
         and split every dataset 80/20 into a training and a holdout part
         with a fixed random state.
      3. Call ``synthesize_data`` per dataset to obtain the privacy metrics.
      4. Flatten the metrics into one table, format them with three
         decimals and hand the table to ``save_Metrics``.

    Parameters
    ----------
    dependency : str
        Dependency level forwarded to ``read_Data`` and ``save_Metrics``.
    """
    # Reading data
    (obs10k_m0, obs10k_m10, obs10k_m20,
     obs25k_m0, obs25k_m10, obs25k_m20,
     obs50k_m0, obs50k_m10, obs50k_m20,
     dependency) = read_Data(dependency)

    labels = (
        '10000_0', '10000_10', '10000_20',
        '25000_0', '25000_10', '25000_20',
        '50000_0', '50000_10', '50000_20',
    )
    loaded = (
        obs10k_m0, obs10k_m10, obs10k_m20,
        obs25k_m0, obs25k_m10, obs25k_m20,
        obs50k_m0, obs50k_m10, obs50k_m20,
    )
    df_dict = dict(zip(labels, loaded))

    # Train / holdout split
    train_sets, holdout_sets = {}, {}
    for label, frame in df_dict.items():
        # Parse dateOfBirth as datetime; values that cannot be parsed become NaT
        frame['dateOfBirth'] = pd.to_datetime(frame['dateOfBirth'], errors='coerce')
        train_sets[label], holdout_sets[label] = train_test_split(
            frame, test_size=0.2, random_state=42
        )

    # Train, synthesize and evaluate each dataset
    all_results = {}
    for label, train_part in train_sets.items():
        print(f"Processing {label}...")
        all_results[label] = synthesize_data(
            train_df=train_part,
            n_rows=len(train_part),
            holdout_df=holdout_sets[label],
        )

    # One output row per (dataset, synthesizer) pair
    rows = [
        {
            'Dataset': label,
            'Method': method,
            'NNDR': f"{scores['nndr']:.3f}",
            'NNAA': f"{scores['nnaa']:.3f}",
            'MIA Risk': f"{scores['mia']:.3f}" if scores['mia'] is not None else "N/A",
        }
        for label, per_method in all_results.items()
        for method, scores in per_method.items()
    ]

    save_Metrics(dependency, pd.DataFrame(rows))

In [None]:
# Resolve the current working directory; parent_dir is the base folder that
# the data-loading and saving functions build their paths from.
script_dir = pathlib.Path().resolve()
os.chdir(script_dir)
parent_dir = script_dir.parent

# Running entire pipeline for all association strengths
for strength in ("simulated", "high", "moderate", "low"):
    entirePipeline(strength)