# Imports

In [1]:
import sys
sys.path.append("..")

from pathlib import Path
import os
import shutil
import pickle 

# 3rd party
import pandas as pd
import numpy as np
import copy
import json
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

# Local
from src.data import process

midst_dir = '/data8/projets/dev_synthetic_data/code/lherbault/MIDSTModels'
sys.path.append(midst_dir)

clover_dir = '/data8/projets/dev_synthetic_data/code/lherbault/git_dev_synthetic_data'
sys.path.append(clover_dir)

from metrics.utility.population import Distinguishability

from midst_models.single_table_TabDDPM.complex_pipeline import (
    clava_clustering,
    clava_training,
    clava_fine_tuning,
    clava_load_pretrained,
    clava_synthesizing,
    load_configs,
    CustomUnpickler
)
from midst_models.single_table_TabDDPM.pipeline_modules import load_multi_table

In [2]:
data_dir = "/data8/projets/dev_synthetic_data/data/MIDST/tabddpm_black_box/train/tabddpm_1"

# Data processing

We want to test our attack idea. For that we need to produce 3 different training sets:
- tabddpm_1 without all challenge observations
- tabddpm_1 with only the challenge observations that were in the original training set
- tabddpm_1 with only the challenge observations that were NOT in the original training set

We will also produce smaller variants of these training sets, in which the first set (tabddpm_1 without the challenge observations) is limited to 900 observations, and then to 100.

In [3]:
# Load the original training set, the challenge points and the challenge labels
train_original, challenge, challenge_labels = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('train_with_id.csv', 'challenge_with_id.csv', 'challenge_label.csv')
)

In [4]:
# Training sets with full length: drop every row whose trans_id is a challenge point
in_challenge = train_original['trans_id'].isin(challenge['trans_id'])
train_wo = train_original.loc[~in_challenge].copy()

In [3]:
# Split the challenge points by their `is_train` label.
# The masks come from `challenge_labels` and are applied to `challenge`, so both
# frames are expected to share the same row index.
challenge_targets = challenge[challenge_labels['is_train'] == 1]
challenge_traps = challenge[challenge_labels['is_train'] == 0]

# Training sets with full length
train_with_targets = train_original.copy()
train_with_traps = pd.concat([train_wo, challenge_traps], axis=0).copy()

# Training sets with len(train_wo)=900
train_wo_s = train_wo.sample(n=900, random_state=14).copy()
train_with_targets_s = pd.concat([train_wo_s, challenge_targets], axis=0).copy()
train_with_traps_s = pd.concat([train_wo_s, challenge_traps], axis=0).copy()

# Training sets with len(train_wo)=100
train_wo_xs = train_wo.sample(n=100, random_state=14).copy()
train_with_targets_xs = pd.concat([train_wo_xs, challenge_targets], axis=0).copy()
train_with_traps_xs = pd.concat([train_wo_xs, challenge_traps], axis=0).copy()

# Create the necessary folders and config files

In [5]:
base_path = '/data8/projets/dev_synthetic_data/data/MIDST/tabddpm_black_box/train/tabddpm_1_attack'
json_file_path = '/data8/projets/dev_synthetic_data/code/lherbault/github_ensemble_mia/configs/trans.json'

In [None]:
# For every training set: create its folder, write train.csv, and derive a
# per-experiment copy of the base config that points at that folder.
for string, tset in {
             'train_wo': train_wo, 
             'train_with_targets': train_with_targets, 
             'train_with_traps': train_with_traps,
             'train_wo_s': train_wo_s, 
             'train_with_targets_s': train_with_targets_s, 
             'train_with_traps_s': train_with_traps_s,
             'train_wo_xs': train_wo_xs, 
             'train_with_targets_xs': train_with_targets_xs, 
             'train_with_traps_xs': train_with_traps_xs
    }.items():
    new_folder = os.path.join(base_path, string)

    # create the new folder if it doesn't exist (no separate existence check needed)
    os.makedirs(new_folder, exist_ok=True)

    # save the training data
    tset.to_csv(os.path.join(new_folder, 'train.csv'), index=False)

    # copy the original config file to the new folder; the destination file name is
    # given explicitly so that the copy and the path opened below always agree,
    # whatever the base config file is called
    new_json_file_path = os.path.join(new_folder, 'trans.json')
    shutil.copy(json_file_path, new_json_file_path)

    # modify the config file to give the correct training data and saving directory
    with open(new_json_file_path, "r") as file:
        data = json.load(file)
    data['general']['data_dir'] = new_folder
    data['general']['exp_name'] = string
    data['general']['workspace_dir'] = os.path.join(base_path, 'workspace')

    # modify the model parameters for smaller sets
    if string.endswith(('_s', '_xs')):
        data['diffusion']['d_layers'] = [32, 64, 64, 64, 64, 32]
        data['diffusion']['iterations'] = 10000
        data['classifier']['d_layers'] = [16, 32, 64, 128, 64, 32, 16]
        data['classifier']['dim_t'] = 16
        data['classifier']['iterations'] = 1000

    # save the changes to the new json file
    with open(new_json_file_path, "w") as file:
        json.dump(data, file, indent=4)

    print("Changes made successfully in folder ", new_folder)

# Train all tabDDPM models

In [6]:
# Per-experiment artefacts: the outer key is the artefact kind, the inner dicts
# are filled later with one entry per training-set name.
material = {
    artefact: {}
    for artefact in (
        'tables',
        'relation_order',
        'save_dir',
        'all_group_lengths_prob_dicts',
        'models',
        'configs',
        'synth_data',
        'distinguishability',
    )
}

In [None]:
# For every training set: load its config, cluster, train the models and sample
# synthetic data. Each intermediate result is stored in `material[<kind>][tset]`.
# NOTE: `configs`, `save_dir`, `tables`, `models`, ... are notebook globals and stay
# bound to the values of the LAST training set once the loop has finished.
for tset in ['train_wo', 'train_with_targets', 'train_with_traps',
             'train_wo_s', 'train_with_targets_s', 'train_with_traps_s',
             'train_wo_xs', 'train_with_targets_xs', 'train_with_traps_xs']:

    # Set up the config (written by the folder-preparation cell as <base_path>/<tset>/trans.json)
    folder = os.path.join(base_path, tset)
    configs, save_dir = load_configs(os.path.join(folder, 'trans.json'))

    material['configs'][tset] = configs
    material['save_dir'][tset] = save_dir

    # Load tables (`dataset_meta` is not used afterwards)
    tables, relation_order, dataset_meta = load_multi_table(configs["general"]["data_dir"])
    material['relation_order'][tset] = relation_order

    # Clustering on the multi-table dataset
    tables, all_group_lengths_prob_dicts = clava_clustering(
        tables, relation_order, save_dir, configs
    )
    material['tables'][tset] = tables
    material['all_group_lengths_prob_dicts'][tset] = all_group_lengths_prob_dicts

    # Train models
    models = clava_training(tables, relation_order, save_dir, configs)
    material['models'][tset] = models

    # Determine the sample scale
    # We want the final synthetic data = len(provided_synth_data) = 20,000
    sample_scale = 20000/len(tables['trans']['df'])

    # Generate synthetic data from scratch (the two timing values are not used afterwards)
    cleaned_tables, synthesizing_time_spent, matching_time_spent = clava_synthesizing(
        tables,
        relation_order,
        save_dir,
        all_group_lengths_prob_dicts,
        models,
        configs,
        sample_scale=sample_scale,
        )

    # Keep only the generated `trans` table
    material['synth_data'][tset] = cleaned_tables['trans']

# Compute the distinguishability of all freshly generated data vs. provided synthetic data

In [7]:
# Provided synthetic data: used below as the reference (`df_real`) side of the
# distinguishability metric.
synth_data = pd.read_csv(os.path.join(base_path, 'trans_synthetic.csv'))

# Fixed 70/30 split (random_state=42) so every comparison uses the same reference split
synth_train, synth_test = train_test_split(synth_data, test_size=0.3, random_state=42)

In [8]:
# Column roles handed to the distinguishability metric
metadata = dict(
    continuous=["trans_date", "amount", "balance", "bank"],
    categorical=["trans_type", "operation", "k_symbol", "account"],
    variable_to_predict="trans_type",
)

In [108]:
list(synth_data.columns)

['trans_date',
 'trans_type',
 'operation',
 'amount',
 'balance',
 'k_symbol',
 'bank',
 'account']

In [235]:
# Columns and dtypes of the provided synthetic data; the generated tables are
# aligned to them before being compared.
reference_columns = list(synth_data.columns)
reference_dtypes = synth_data.dtypes.to_dict()

for tset in ['train_wo', 'train_with_targets', 'train_with_traps',
             'train_wo_s', 'train_with_targets_s', 'train_with_traps_s',
             'train_wo_xs', 'train_with_targets_xs', 'train_with_traps_xs']:

    print(tset)

    # Same 70/30 split settings as the reference data
    generated = material['synth_data'][tset][reference_columns].astype(reference_dtypes)
    temp_train, temp_test = train_test_split(generated, test_size=0.3, random_state=42)

    # Provided synthetic data plays the "real" role, freshly generated data the "synthetic" one
    material['distinguishability'][tset] = Distinguishability().compute(
        df_real=dict(train=synth_train, test=synth_test),
        df_synthetic=dict(train=temp_train, test=temp_test),
        metadata=metadata,
        optimize_xgb=False,
    )

train_wo
train_with_targets
train_with_traps
train_wo_s
train_with_targets_s
train_with_traps_s
train_wo_xs
train_with_targets_xs
train_with_traps_xs


In [120]:
# Save distinguishability as it was super long to run
part_dict_2 = dict(
    original_synth_train=synth_train,
    original_synth_test=synth_test,
    other_synth_data=material['synth_data'],
    distinguishability=material['distinguishability'],
)

# Written next to the experiment folders, under base_path
with open(os.path.join(base_path, 'synth_data_dict_2.pkl'), 'wb') as file:
    pickle.dump(part_dict_2, file)

# Compare distinguishability for all sets

In [125]:
tset_list = ['train_wo', 'train_with_targets', 'train_with_traps',
             'train_wo_s', 'train_with_targets_s', 'train_with_traps_s',
             'train_wo_xs', 'train_with_targets_xs', 'train_with_traps_xs']

In [236]:
pd.DataFrame({tset: material['distinguishability'][tset]['average'] 
              for tset in ['train_wo', 'train_with_targets', 'train_with_traps']})

Unnamed: 0,train_wo,train_with_targets,train_with_traps
propensity_mse,0.00967,0.008447,0.007644
prediction_mse_real,0.009595,0.008498,0.007719
prediction_mse_synth,0.009745,0.008396,0.007569
prediction_auc_rescaled,0.043739,0.009333,0.054753
prediction_mse,0.005161,0.004152,0.004166


In [237]:
pd.DataFrame({tset: material['distinguishability'][tset]['average'] 
              for tset in ['train_wo_s', 'train_with_targets_s', 'train_with_traps_s']})

Unnamed: 0,train_wo_s,train_with_targets_s,train_with_traps_s
propensity_mse,0.23683,0.209051,0.279447
prediction_mse_real,0.144025,0.14424,0.18041
prediction_mse_synth,0.329634,0.273863,0.378483
prediction_auc_rescaled,0.63891,0.666148,0.717327
prediction_mse,0.223014,0.197152,0.263401


In [238]:
pd.DataFrame({tset: material['distinguishability'][tset]['average'] 
              for tset in ['train_wo_xs', 'train_with_targets_xs', 'train_with_traps_xs']})

Unnamed: 0,train_wo_xs,train_with_targets_xs,train_with_traps_xs
propensity_mse,0.413087,0.239378,0.236189
prediction_mse_real,0.310126,0.210041,0.19183
prediction_mse_synth,0.516049,0.268716,0.280549
prediction_auc_rescaled,0.846837,0.659184,0.698354
prediction_mse,0.40029,0.22222,0.220978


# Next steps
1. Repeat the same experiment, starting not from the original training set but from a random subset of the whole population (remove all the challenge data, then add the targets, then the traps, and a mix of both).
2. If this is conclusive, choose an algorithm (genetic?) to select the binary vector that picks observations from the challenge set.

In [9]:
pop_dir = '/data8/projets/dev_synthetic_data/data/MIDST/tabddpm_black_box/population/50000_rows_real_train/'

In [10]:
# Start from the population file this time; the challenge set and its labels are
# the same files as before (read from data_dir).
train_original = pd.read_csv(os.path.join(pop_dir, 'real_train.csv'))
challenge, challenge_labels = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('challenge_with_id.csv', 'challenge_label.csv')
)

# Training set with full length
in_challenge = train_original['trans_id'].isin(challenge['trans_id'])
train_wo = train_original.loc[~in_challenge].copy()

In [17]:
# Training sets with len(train_wo)=400 (the `_xs` suffix is reused from the earlier
# experiment, where the sample size was 100)
train_wo_xs = train_wo.sample(n=400, random_state=16).copy()
train_with_targets_xs = pd.concat([train_wo_xs, challenge[challenge_labels['is_train']==1]], axis=0).copy()
train_with_traps_xs = pd.concat([train_wo_xs, challenge[challenge_labels['is_train']==0]], axis=0).copy()
# Mixed set: 50 randomly drawn targets plus 50 randomly drawn traps
train_with_both_xs = pd.concat([train_wo_xs,
                                challenge[challenge_labels['is_train']==1].sample(n=50, random_state=14),
                                challenge[challenge_labels['is_train']==0].sample(n=50, random_state=14)],
                               axis=0)

In [18]:
# For every population-based training set: create its folder, write train.csv,
# and derive a per-experiment copy of the base config that points at that folder.
for string, tset in {
             'train_pop_with_targets_xs': train_with_targets_xs, 
             'train_pop_with_traps_xs': train_with_traps_xs,
             'train_pop_with_both_xs': train_with_both_xs,
    }.items():
    new_folder = os.path.join(base_path, string)

    # create the new folder if it doesn't exist (no separate existence check needed)
    os.makedirs(new_folder, exist_ok=True)

    # save the training data
    tset.to_csv(os.path.join(new_folder, 'train.csv'), index=False)

    # copy the original config file to the new folder; the destination file name is
    # given explicitly so that the copy and the path opened below always agree,
    # whatever the base config file is called
    new_json_file_path = os.path.join(new_folder, 'trans.json')
    shutil.copy(json_file_path, new_json_file_path)

    # modify the config file to give the correct training data and saving directory
    with open(new_json_file_path, "r") as file:
        data = json.load(file)
    data['general']['data_dir'] = new_folder
    data['general']['exp_name'] = string
    data['general']['workspace_dir'] = os.path.join(base_path, 'workspace')

    # modify the model parameters for smaller sets
    if string.endswith(('_s', '_xs')):
        data['diffusion']['d_layers'] = [32, 64, 64, 64, 64, 32]
        data['diffusion']['iterations'] = 10000
        data['classifier']['d_layers'] = [16, 32, 64, 128, 64, 32, 16]
        data['classifier']['dim_t'] = 16
        data['classifier']['iterations'] = 1000

    # save the changes to the new json file
    with open(new_json_file_path, "w") as file:
        json.dump(data, file, indent=4)

    print("Changes made successfully in folder ", new_folder)

Changes made successfully in folder  /data8/projets/dev_synthetic_data/data/MIDST/tabddpm_black_box/train/tabddpm_1_attack/train_pop_with_targets_xs
Changes made successfully in folder  /data8/projets/dev_synthetic_data/data/MIDST/tabddpm_black_box/train/tabddpm_1_attack/train_pop_with_traps_xs
Changes made successfully in folder  /data8/projets/dev_synthetic_data/data/MIDST/tabddpm_black_box/train/tabddpm_1_attack/train_pop_with_both_xs


In [19]:
# Same pipeline as for the first experiment, applied to the population-based sets:
# load config, cluster, train, sample; results go into `material[<kind>][tset]`.
# NOTE: `configs`, `save_dir`, `tables`, `models`, ... are notebook globals and stay
# bound to the values of the LAST training set once the loop has finished.
for tset in ['train_pop_with_targets_xs', 
             'train_pop_with_traps_xs',
             'train_pop_with_both_xs']:

    # Set up the config (written by the folder-preparation cell as <base_path>/<tset>/trans.json)
    folder = os.path.join(base_path, tset)
    configs, save_dir = load_configs(os.path.join(folder, 'trans.json'))

    material['configs'][tset] = configs
    material['save_dir'][tset] = save_dir

    # Load tables (`dataset_meta` is not used afterwards)
    tables, relation_order, dataset_meta = load_multi_table(configs["general"]["data_dir"])
    material['relation_order'][tset] = relation_order

    # Clustering on the multi-table dataset
    # NOTE(review): the captured output shows "Clustering checkpoint found, loading...",
    # i.e. an earlier checkpoint can be reused - confirm it matches the current train.csv.
    tables, all_group_lengths_prob_dicts = clava_clustering(
        tables, relation_order, save_dir, configs
    )
    material['tables'][tset] = tables
    material['all_group_lengths_prob_dicts'][tset] = all_group_lengths_prob_dicts

    # Train models
    models = clava_training(tables, relation_order, save_dir, configs)
    material['models'][tset] = models

    # Determine the sample scale
    # We want the final synthetic data = len(provided_synth_data) = 20,000
    sample_scale = 20000/len(tables['trans']['df'])

    # Generate synthetic data from scratch (the two timing values are not used afterwards)
    cleaned_tables, synthesizing_time_spent, matching_time_spent = clava_synthesizing(
        tables,
        relation_order,
        save_dir,
        all_group_lengths_prob_dicts,
        models,
        configs,
        sample_scale=sample_scale,
        )

    # Keep only the generated `trans` table
    material['synth_data'][tset] = cleaned_tables['trans']

Table name: trans, Total dataframe shape: (500, 8), Numerical data shape: (500, 4), Categorical data shape: (500, 4)
Clustering checkpoint found, loading...
Training None -> trans model from scratch
Model params: {'num_classes': 0, 'is_y_cond': 'none', 'rtdl_params': {'d_layers': [32, 64, 64, 64, 64, 32], 'dropout': 0.0}, 'd_in': 8}
mlp
Step 500/10000 MLoss: 0.0 GLoss: 0.5356 Sum: 0.5356
Step 1000/10000 MLoss: 0.0 GLoss: 0.4394 Sum: 0.4394
Step 1500/10000 MLoss: 0.0 GLoss: 0.3945 Sum: 0.3945
Step 2000/10000 MLoss: 0.0 GLoss: 0.3491 Sum: 0.3491
Step 2500/10000 MLoss: 0.0 GLoss: 0.3277 Sum: 0.3277
Step 3000/10000 MLoss: 0.0 GLoss: 0.311 Sum: 0.311
Step 3500/10000 MLoss: 0.0 GLoss: 0.3 Sum: 0.3
Step 4000/10000 MLoss: 0.0 GLoss: 0.295 Sum: 0.295
Step 4500/10000 MLoss: 0.0 GLoss: 0.2901 Sum: 0.2901
Step 5000/10000 MLoss: 0.0 GLoss: 0.2843 Sum: 0.2843
Step 5500/10000 MLoss: 0.0 GLoss: 0.2776 Sum: 0.2776
Step 6000/10000 MLoss: 0.0 GLoss: 0.2754 Sum: 0.2754
Step 6500/10000 MLoss: 0.0 GLoss: 0.

In [20]:
# One metric object is shared by all three comparisons
dist = Distinguishability(use_gpu=True)

# Columns and dtypes of the provided synthetic data; the generated tables are
# aligned to them before being compared.
reference_columns = list(synth_data.columns)
reference_dtypes = synth_data.dtypes.to_dict()

for tset in ['train_pop_with_targets_xs', 
             'train_pop_with_traps_xs',
             'train_pop_with_both_xs']:

    print(tset)

    # Same 70/30 split settings as the reference data
    generated = material['synth_data'][tset][reference_columns].astype(reference_dtypes)
    temp_train, temp_test = train_test_split(generated, test_size=0.3, random_state=42)

    # Provided synthetic data plays the "real" role, freshly generated data the "synthetic" one
    material['distinguishability'][tset] = dist.compute(
        df_real=dict(train=synth_train, test=synth_test),
        df_synthetic=dict(train=temp_train, test=temp_test),
        metadata=metadata,
        optimize_xgb=False,
    )

train_pop_with_targets_xs
train_pop_with_traps_xs
train_pop_with_both_xs


In [21]:
pd.DataFrame({tset: material['distinguishability'][tset]['average']
              for tset in [ 
             'train_pop_with_targets_xs', 
             'train_pop_with_traps_xs',
             'train_pop_with_both_xs']})

Unnamed: 0,train_pop_with_targets_xs,train_pop_with_traps_xs,train_pop_with_both_xs
propensity_mse,0.368401,0.305286,0.372941
prediction_mse_real,0.28522,0.242248,0.316865
prediction_mse_synth,0.451582,0.368324,0.429016
prediction_auc_rescaled,0.760666,0.72931,0.77215
prediction_mse,0.348458,0.285901,0.350656


# Code structure for actual attack

pop_train = all training population excluding the challenge points

for n_reps:
- pick out a random slice of pop_train with n_train = 100
- optimize a binary vector for the inclusion of the challenge points
  - training set = slice of pop_train + included challenge_points
  - tabddpm is trained on the training set
  - distinguishability(generated 20k; provided 20k) is the lowest
- store the random slice, the best binary vector, the final distinguishability scores

at the end:
- compute the average of all the binary vectors to get the probability of inclusion
- compute the tpr_at_fpr_0.1

In [11]:
from pymoo.algorithms.soo.nonconvex.ga import GA
from pymoo.factory import get_problem, get_termination
from pymoo.optimize import minimize
from pymoo.core.problem import ElementwiseProblem, StarmapParallelization
from pymoo.core.population import Population
from pymoo.core.individual import Individual
from pymoo.core.evaluator import Evaluator
from multiprocessing.pool import ThreadPool

In [12]:
def config_tabddpm(
    data_dir='/data8/projets/dev_synthetic_data/data/MIDST/tabddpm_black_box/train/tabddpm_1_attack/temp_dir'
):
    """Rewrite ``<data_dir>/trans.json`` for a temporary run and load it.

    The JSON file is edited in place: the ``general`` section is pointed at
    ``data_dir`` (experiment name ``'tmp'``, workspace ``<data_dir>/tmp_workspace``)
    and the ``diffusion`` / ``classifier`` sections receive the reduced model
    parameters also used for the small training sets in this notebook.

    Parameters
    ----------
    data_dir : str
        Folder that already contains a ``trans.json`` file.

    Returns
    -------
    tuple
        ``(configs, save_dir)`` as returned by ``load_configs`` for the rewritten file.
    """
    temp_json_file_path = os.path.join(data_dir, 'trans.json')

    with open(temp_json_file_path, "r") as file:
        data = json.load(file)

    # point the run at the temporary folder
    data['general'].update(
        data_dir=data_dir,
        exp_name='tmp',
        workspace_dir=os.path.join(data_dir, 'tmp_workspace'),
    )

    # reduced model parameters for smaller sets
    data['diffusion'].update(
        d_layers=[32, 64, 64, 64, 64, 32],
        iterations=10000,
    )
    data['classifier'].update(
        d_layers=[16, 32, 64, 128, 64, 32, 16],
        dim_t=16,
        iterations=1000,
    )

    # overwrite the json file with the modified content
    with open(temp_json_file_path, "w") as file:
        json.dump(data, file, indent=4)

    print("Changes made successfully in path ", temp_json_file_path)

    # Set up the config
    configs, save_dir = load_configs(temp_json_file_path)
    return configs, save_dir
    

In [13]:
def train_tabddpm(
    train_set,
    configs,
    save_dir,
    data_dir='/data8/projets/dev_synthetic_data/data/MIDST/tabddpm_black_box/train/tabddpm_1_attack/temp_dir',
):
    """Cluster, train and sample from scratch on ``train_set``.

    Parameters
    ----------
    train_set : DataFrame
        Training data, forwarded to ``load_multi_table`` as ``train_df``.
    configs : dict
        Loaded configuration; ``configs["general"]["data_dir"]`` is the folder
        given to ``load_multi_table``.
    save_dir : str
        Passed unchanged to the clustering, training and synthesizing steps.
        NOTE(review): every call in this notebook uses the same ``save_dir``;
        if the pipeline reuses checkpoints found there, later calls may not
        reflect ``train_set`` - verify against the pipeline code.
    data_dir : str
        Not used by this function.

    Returns
    -------
    dict
        Keys ``tables``, ``relation_order``, ``save_dir``,
        ``all_group_lengths_prob_dicts``, ``models``, ``configs`` and
        ``synth_data`` (the generated ``trans`` table).
    """
    # Load tables
    tables, relation_order, _ = load_multi_table(configs["general"]["data_dir"], train_df=train_set)

    # Clustering on the multi-table dataset
    tables, group_length_probs = clava_clustering(tables, relation_order, save_dir, configs)

    # Train models
    models = clava_training(tables, relation_order, save_dir, configs)

    # We want the final synthetic data = len(provided_synth_data) = 20,000
    sample_scale = 20000 / len(tables['trans']['df'])

    # Generate synthetic data from scratch (timings are discarded)
    cleaned_tables, _, _ = clava_synthesizing(
        tables,
        relation_order,
        save_dir,
        group_length_probs,
        models,
        configs,
        sample_scale=sample_scale,
    )

    return {
        'tables': tables,
        'relation_order': relation_order,
        'save_dir': save_dir,
        'all_group_lengths_prob_dicts': group_length_probs,
        'models': models,
        'configs': configs,
        'synth_data': cleaned_tables['trans'],
    }

In [14]:
def fine_tune_tabddpm(
    trained_models,
    new_train_set,
    configs,
    save_dir,
    new_diffusion_iterations = 100, 
    new_classifier_iterations = 10,
    data_dir='/data8/projets/dev_synthetic_data/data/MIDST/tabddpm_black_box/train/tabddpm_1_attack/temp_dir',
):
    """Fine-tune already trained models on ``new_train_set`` and sample from them.

    Parameters
    ----------
    trained_models
        Models returned by a previous training run. They are deep-copied before
        fine-tuning, so the caller's object is not passed to ``clava_fine_tuning``.
    new_train_set : DataFrame
        Data to fine-tune on, forwarded to ``load_multi_table`` as ``train_df``.
    configs : dict
        Loaded configuration; ``configs["general"]["data_dir"]`` is the folder
        given to ``load_multi_table``.
    save_dir : str
        Passed unchanged to the clustering, fine-tuning and synthesizing steps.
    new_diffusion_iterations, new_classifier_iterations : int
        Forwarded to ``clava_fine_tuning``.
    data_dir : str
        Not used by this function.

    Returns
    -------
    dict
        Keys ``tables``, ``relation_order``, ``save_dir``,
        ``all_group_lengths_prob_dicts``, ``models``, ``configs``,
        ``synth_data`` (the generated ``trans`` table) and ``new_models``.
        The fine-tuned models are available under both ``models`` (same key as
        ``train_tabddpm``) and ``new_models`` (kept for existing callers).
    """
    # Load tables
    new_tables, relation_order, _ = load_multi_table(configs["general"]["data_dir"], train_df=new_train_set)

    # Clustering on the multi-table dataset
    new_tables, all_group_lengths_prob_dicts = clava_clustering(
        new_tables, relation_order, save_dir, configs
    )

    # Fine-tune a private copy of the trained models
    copied_models = copy.deepcopy(trained_models)
    new_models = clava_fine_tuning(copied_models, new_tables, relation_order, save_dir, configs, 
                                   new_diffusion_iterations, new_classifier_iterations)

    # Determine the sample scale
    # We want the final synthetic data = len(provided_synth_data) = 20,000
    sample_scale = 20000/len(new_tables['trans']['df'])

    # Generate synthetic data from scratch (timings are discarded)
    cleaned_tables, _, _ = clava_synthesizing(
        new_tables,
        relation_order,
        save_dir,
        all_group_lengths_prob_dicts,
        new_models,
        configs,
        sample_scale=sample_scale,
        )

    return {
        'tables': new_tables,
        'relation_order': relation_order,
        'save_dir': save_dir,
        'all_group_lengths_prob_dicts': all_group_lengths_prob_dicts,
        # previously left as an empty dict; now filled like in train_tabddpm
        'models': new_models,
        'configs': configs,
        'synth_data': cleaned_tables['trans'],
        'new_models': new_models,
    }

In [15]:
def evaluate_subset(
    generated_data,
    provided_data,
    metadata):
    """Score how distinguishable ``generated_data`` is from ``provided_data``.

    Both tables are split 70/30 with ``random_state=42``. Before splitting,
    ``generated_data`` is restricted to the columns of ``provided_data`` and
    cast to its dtypes. ``provided_data`` is passed as ``df_real`` and the
    generated table as ``df_synthetic``.

    Returns
    -------
    The ``['average']['prediction_auc_rescaled']`` entry of the metric output
    (the value the optimisation cells of this notebook minimise).
    """
    split_options = dict(test_size=0.3, random_state=42)

    synth_train, synth_test = train_test_split(provided_data, **split_options)

    # align the generated table with the reference one
    reference_columns = list(provided_data.columns)
    aligned = generated_data[reference_columns].astype(provided_data.dtypes.to_dict())
    temp_train, temp_test = train_test_split(aligned, **split_options)

    scores = Distinguishability(use_gpu=True).compute(
        df_real=dict(train=synth_train, test=synth_test),
        df_synthetic=dict(train=temp_train, test=temp_test),
        metadata=metadata,
        optimize_xgb=False,
    )

    return scores['average']['prediction_auc_rescaled']

In [16]:
class SubsetProblem(ElementwiseProblem):
    """Single-objective problem: which challenge points to add to ``train``.

    Each candidate is a vector with one entry per row of ``challenge_df``
    (bounds 0..1). A candidate is only evaluated when exactly ``n_selected``
    entries are equal to 1; it is then scored by retraining from scratch on
    ``train`` plus the selected rows and measuring the distinguishability of
    the generated data against ``ref_synth_data``.
    """

    def __init__(self, train, challenge_df, ref_synth_data, metadata, n_selected, configs, save_dir):
        # train: DataFrame of the training data
        # challenge_df: DataFrame of the challenge observations
        # ref_synth_data: reference synthetic data the generated data is compared to
        # metadata: column roles passed to the distinguishability metric
        # n_selected: number of challenge observations a valid candidate must select
        # configs, save_dir: forwarded to train_tabddpm for every evaluation
        self.train = train
        self.challenge_df = challenge_df
        self.n_selected = n_selected
        self.ref_synth_data = ref_synth_data
        self.metadata = metadata
        # Stored so that _evaluate uses what was passed in instead of whatever
        # `configs` / `save_dir` happen to be bound to at module level.
        self.configs = configs
        self.save_dir = save_dir
        n_var = len(challenge_df)  # Number of decision variables (one per challenge observation)
        xl = np.zeros(n_var)  # 0 means no selection
        xu = np.ones(n_var)  # 1 means select the challenge observation

        # initialize the thread pool and create the runner
        # NOTE(review): all worker threads train with the same save_dir; confirm
        # the pipeline tolerates concurrent runs writing to one workspace.
        n_threads = 4
        pool = ThreadPool(n_threads)
        runner = StarmapParallelization(pool.starmap)
        
        super().__init__(n_var=n_var, n_obj=1, n_constr=0, xl=xl, xu=xu, elementwise_runner = runner)

    def _evaluate(self, x, out, *args, **kwargs):
        # Select observations from challenge_df based on x
        # NOTE(review): only entries exactly equal to 1 count as selected; results
        # are rounded elsewhere in the notebook, so x may hold non-binary values -
        # confirm this exact comparison is intended.
        selected_indices = np.where(x == 1)[0]
        selected_challenges = self.challenge_df.iloc[selected_indices]

        if len(selected_challenges) != self.n_selected:
            out["F"] = np.inf  # Penalize if subset size is not n_selected
        else:
            # Concatenate train data with selected challenges
            augmented_train = pd.concat([self.train, selected_challenges], axis=0, ignore_index=True)
    
            # Train the model on the augmented training set
            train_result = train_tabddpm(augmented_train, self.configs, self.save_dir)
    
            # Compute the objective function
            out["F"] = evaluate_subset(generated_data=train_result['synth_data'],
                                       provided_data=self.ref_synth_data,
                                       metadata=self.metadata)

In [17]:
class FineSubsetProblem(ElementwiseProblem):
    """Single-objective problem scored by fine-tuning instead of retraining.

    A model is trained once on ``train`` when the problem is constructed. Each
    candidate vector (one entry per row of ``challenge_df``, bounds 0..1) is
    only evaluated when exactly ``n_selected`` entries are equal to 1; the
    initial model is then fine-tuned on the selected challenge rows only and
    the generated data is compared with ``ref_synth_data``.
    """

    def __init__(self, train, challenge_df, ref_synth_data, metadata, n_selected, configs, save_dir):
        # train: DataFrame of the training data
        # challenge_df: DataFrame of the challenge observations
        # ref_synth_data: reference synthetic data the generated data is compared to
        # metadata: column roles passed to the distinguishability metric
        # n_selected: number of challenge observations a valid candidate must select
        # configs, save_dir: forwarded to the training / fine-tuning helpers
        self.train = train
        self.challenge_df = challenge_df
        self.n_selected = n_selected
        self.ref_synth_data = ref_synth_data
        self.metadata = metadata
        # Stored so that _evaluate uses what was passed in instead of whatever
        # `configs` / `save_dir` happen to be bound to at module level.
        self.configs = configs
        self.save_dir = save_dir
        n_var = len(challenge_df)  # Number of decision variables (one per challenge observation)
        xl = np.zeros(n_var)  # 0 means no selection
        xu = np.ones(n_var)  # 1 means select the challenge observation

        # initialize the thread pool and create the runner
        # NOTE(review): all worker threads fine-tune with the same save_dir; confirm
        # the pipeline tolerates concurrent runs writing to one workspace.
        n_threads = 4
        pool = ThreadPool(n_threads)
        runner = StarmapParallelization(pool.starmap)

        # initialize the model with the given population (one full training run)
        self.initial_model = train_tabddpm(train, configs, save_dir)['models']
        
        super().__init__(n_var=n_var, n_obj=1, n_constr=0, xl=xl, xu=xu, elementwise_runner = runner)

    def _evaluate(self, x, out, *args, **kwargs):
        # Select observations from challenge_df based on x
        # NOTE(review): only entries exactly equal to 1 count as selected; confirm
        # this exact comparison is intended for non-binary candidate vectors.
        selected_indices = np.where(x == 1)[0]
        selected_challenges = self.challenge_df.iloc[selected_indices]

        if len(selected_challenges) != self.n_selected:
            out["F"] = np.inf  # Penalize if subset size is not n_selected
        else:
            # Fine-tune the initial model on the selected challenge points only
            # (fine_tune_tabddpm works on a deep copy of the initial model)
            train_result = fine_tune_tabddpm(trained_models=self.initial_model,
                                             new_train_set=selected_challenges,
                                             configs=self.configs,
                                             save_dir=self.save_dir,
                                             new_diffusion_iterations = 2500, 
                                             new_classifier_iterations = 250,
                                            )
    
            # Compute the objective function
            out["F"] = evaluate_subset(generated_data=train_result['synth_data'],
                                       provided_data=self.ref_synth_data,
                                       metadata=self.metadata)

In [18]:
# One entry per repetition: best selection vector and its objective value
optim_results = dict(X=[], dist_auc=[])

In [19]:
# Aliases (no copies are made) for the data used by the attack
challenge_points = challenge
train_pop = train_wo
n_reps = 1  # number of random population slices to run the optimisation on

n_challenges = len(challenge_points)
n_selected = 100  # number of challenge points a valid candidate must select
n_pop_init = 50  # number of hand-built candidates in the initial population

In [20]:
configs, save_dir = config_tabddpm(
    data_dir='/data8/projets/dev_synthetic_data/data/MIDST/tabddpm_black_box/train/tabddpm_1_attack/temp_dir'
)

Changes made successfully in path  /data8/projets/dev_synthetic_data/data/MIDST/tabddpm_black_box/train/tabddpm_1_attack/temp_dir/trans.json


# Code to retrain the whole thing every time we have a new solution

In [18]:
# One genetic-algorithm run per repetition. `res`, `problem` and `pop` stay bound to
# the last repetition and are used by later cells.
for _ in range(n_reps):
    # pick out a random slice of pop_train with n_train = 400
    # (no random_state: the slice differs on every run)
    train = train_pop.sample(n=400)
    
    # optimize a binary vector for the inclusion of the challenge points
    problem = SubsetProblem(train = train,
                            challenge_df = challenge_points,
                            n_selected = n_selected,
                            ref_synth_data = synth_data,
                            metadata = metadata,
                            configs=configs,
                            save_dir=save_dir)

    # Create the initial data: every row has exactly n_selected ones, shuffled in place
    # (np.random.shuffle uses NumPy's global RNG, which is not seeded in this cell)
    x_zeros = np.zeros((n_pop_init, n_challenges-n_selected))
    x_ones = np.ones((n_pop_init, n_selected))
    x = np.concatenate((x_zeros, x_ones), axis=1)
    for i in range(x.shape[0]):
        np.random.shuffle(x[i])
    pop = Population.new("X", x)

    # Evaluate the initial population up front (one full training run per row)
    Evaluator().eval(problem, pop)#, elementwise_runner=runner)
    
    # Define the genetic algorithm
    # NOTE(review): the initial population has n_pop_init rows while pop_size is 100 -
    # confirm the algorithm handles this mismatch as intended.
    algorithm = GA(
        pop_size=100,
        eliminate_duplicates=True,
        sampling=pop,
    )
    
    # Define the termination criterion
    termination = get_termination("n_gen", 250)

    # Perform the optimization
    res = minimize(problem,
                   algorithm,
                   termination,
                   seed=1,
                   save_history=True,
                   verbose=True)

    # Output the best solution found
    print("Best solution: ", res.X)
    optim_results['X'].append(res.X)
    print("Best objective value: ", res.F)
    optim_results['dist_auc'].append(res.F)

NameError: name 'n_reps' is not defined

to do : 
- get the config part out of the train function
- change ddpm code to stop reading the train.csv file - take it as given here

In [31]:
from sklearn.metrics import confusion_matrix

# Compare the challenge labels with the rounded first optimisation result
confusion_matrix(
    y_true=challenge_labels,
    y_pred=np.round(optim_results['X'][0]),
)

array([[54, 46],
       [46, 54]])

In [27]:
# Labels side by side with the rounded first optimisation result
conf_df = pd.concat(
    objs=[challenge_labels, pd.Series(np.round(optim_results['X'][0]))],
    axis='columns',
)

In [28]:
# Write the label/prediction table to the working directory
conf_df.to_csv(path_or_buf='optim_results.csv')

In [34]:
# Serialise the pymoo result object to disk
with Path('pymoo_result.pkl').open('wb') as result_file:
    pickle.dump(res, result_file)

# Code to only finetune the model

### Do it for each of the 200 observations (maybe a greedy version is enough)

In [21]:
# Number of random training slices to draw from train_pop in the loop below
n_reps = 1

In [22]:
# One list of AUC values per random slice
master_dist_results = []

for rep in range(n_reps):
    print('Starting on random slice number ', rep)
    # pick out a random slice of pop_train with n_train = 400
    initial_train = train_pop.sample(n=400)

    # Train the model on this initial training set
    initial_result = train_tabddpm(initial_train, configs, save_dir)
    # Keep only the columns whose name does not contain '_id' for the comparison
    final_columns = [col for col in initial_result['synth_data'].columns if '_id' not in col]

    # Initialize the distinguishability results
    dist_results = []

    for i in challenge_points.index:
        print('Evaluating challenge point number ', i)
        # Finetune the model with this new_data.
        # `i` is an index label, so the row is selected with .loc (label-based);
        # .iloc would pick the wrong row or fail if the index is not 0..n-1.
        # The double brackets keep the selection as a one-row DataFrame.
        new_result = fine_tune_tabddpm(
                        trained_models=initial_result['models'],
                        new_train_set=challenge_points.loc[[i]],
                        configs=configs,
                        save_dir=save_dir,
                        new_diffusion_iterations = 100, 
                        new_classifier_iterations = 10,
                        data_dir='/data8/projets/dev_synthetic_data/data/MIDST/tabddpm_black_box/train/tabddpm_1_attack/temp_dir',
                    )
        # Compute the objective function
        dist_auc = evaluate_subset(generated_data=new_result['synth_data'][final_columns],
                                   provided_data=synth_data[final_columns],
                                   metadata=metadata)
        dist_results.append(dist_auc)
        print('AUC of point number ', i, ' = ', dist_auc)
    
    # Store this slice's results; dist_results itself is reset on the next slice
    master_dist_results.append(dist_results)

Starting on random slice number  0
Table name: trans, Total dataframe shape: (400, 8), Numerical data shape: (400, 4), Categorical data shape: (400, 4)
Clustering checkpoint found, loading...
Training None -> trans model from scratch
Model params: {'num_classes': 0, 'is_y_cond': 'none', 'rtdl_params': {'d_layers': [32, 64, 64, 64, 64, 32], 'dropout': 0.0}, 'd_in': 8}
mlp
Step 500/10000 MLoss: 0.0 GLoss: 0.4949 Sum: 0.4949
Step 1000/10000 MLoss: 0.0 GLoss: 0.4132 Sum: 0.4132
Step 1500/10000 MLoss: 0.0 GLoss: 0.3489 Sum: 0.3489
Step 2000/10000 MLoss: 0.0 GLoss: 0.3123 Sum: 0.3123
Step 2500/10000 MLoss: 0.0 GLoss: 0.3012 Sum: 0.3012
Step 3000/10000 MLoss: 0.0 GLoss: 0.2929 Sum: 0.2929
Step 3500/10000 MLoss: 0.0 GLoss: 0.2885 Sum: 0.2885
Step 4000/10000 MLoss: 0.0 GLoss: 0.2845 Sum: 0.2845
Step 4500/10000 MLoss: 0.0 GLoss: 0.2773 Sum: 0.2773
Step 5000/10000 MLoss: 0.0 GLoss: 0.277 Sum: 0.277
Step 5500/10000 MLoss: 0.0 GLoss: 0.2768 Sum: 0.2768
Step 6000/10000 MLoss: 0.0 GLoss: 0.2738 Sum: 

In [23]:
# Show the per-challenge-point AUC values collected for the most recent slice
dist_results

[0.8547082527777778,
 0.8421981722222223,
 0.8255688694444444,
 0.8136444805555556,
 0.8564126805555556,
 0.8347500888888888,
 0.8331871027777777,
 0.8749438805555556,
 0.8076010861111111,
 0.8170259916666666,
 0.8436717777777778,
 0.8503717638888888,
 0.8356802583333334,
 0.8314477166666666,
 0.8419398166666664,
 0.837362875,
 0.8220970361111111,
 0.8495920972222223,
 0.8522501833333334,
 0.8412599638888889,
 0.8160522611111112,
 0.8282693805555554,
 0.8309256805555554,
 0.8310658722222224,
 0.8172244,
 0.8140687111111111,
 0.8409317222222222,
 0.8172015888888888,
 0.8259076999999999,
 0.8538980305555557,
 0.8461848277777776,
 0.8349992527777779,
 0.8369098249999999,
 0.813833547222222,
 0.8323884500000001,
 0.8597206888888888,
 0.8286155416666665,
 0.8333039027777778,
 0.8433493222222221,
 0.84534155,
 0.8498884916666665,
 0.8363242861111113,
 0.8663963722222222,
 0.8551421138888887,
 0.8060821611111113,
 0.8132033111111111,
 0.8243582722222221,
 0.8348363388888889,
 0.85629498611111

In [24]:
# Copy of the challenge points with the measured AUC attached as a new column
challenge_test = challenge_points.assign(auc=dist_results)

In [27]:
# Display the challenge points with the columns added so far
challenge_test

Unnamed: 0,trans_id,account_id,trans_date,trans_type,operation,amount,balance,k_symbol,bank,account,auc,pred,truth
0,1134831,3878,1529,2,1,1648.0,13237.2,0,13,38488340,0.854708,0,1
1,3576816,1412,272,0,0,152.2,36926.6,7,0,0,0.842198,0,1
2,583865,1987,750,2,4,2400.0,35598.1,1,0,0,0.825569,0,1
3,895287,3051,1382,2,1,4104.0,65575.1,5,12,89318686,0.813644,0,0
4,1501078,5118,1239,2,4,800.0,27560.7,1,0,0,0.856413,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,669747,2285,1580,2,4,14.6,20441.4,6,0,0,0.799496,0,0
196,453622,1540,1987,2,4,2500.0,80540.9,1,0,0,0.815839,0,0
197,918970,3132,2060,2,4,2400.0,23541.3,1,0,0,0.827752,0,1
198,265153,905,2015,2,1,53.0,21380.3,3,6,13783419,0.870562,0,0


In [28]:
# Index labels of the 100 challenge points with the smallest AUC
lowest_100_indices = challenge_test.nsmallest(n=100, columns='auc').index
# pred is 1 for those points and 0 for every other row
challenge_test['pred'] = challenge_test.index.isin(lowest_100_indices).astype('int64')
# Attach the challenge labels for comparison
challenge_test['truth'] = challenge_labels

In [30]:
# Number of challenge points whose prediction equals the label
int((challenge_test['truth'] == challenge_test['pred']).sum())

86

# Code with problem optimization and fine-tuning

In [None]:
for _ in range(n_reps):
    # Base training set: a random slice of 400 rows drawn from train_pop
    train = train_pop.sample(n=400)

    # Problem whose decision variable is a binary inclusion vector over the challenge points
    problem = FineSubsetProblem(
        train=train,
        challenge_df=challenge_points,
        n_selected=n_selected,
        ref_synth_data=synth_data,
        metadata=metadata,
        configs=configs,
        save_dir=save_dir,
    )

    # Initial population: every row holds exactly n_selected ones,
    # placed at independently shuffled positions
    x_zeros = np.zeros((n_pop_init, n_challenges - n_selected))
    x_ones = np.ones((n_pop_init, n_selected))
    x = np.hstack((x_zeros, x_ones))
    for individual in x:
        np.random.shuffle(individual)  # each row is a view, so x is shuffled in place
    pop = Population.new("X", x)

    # Evaluate the starting population before handing it to the GA
    Evaluator().eval(problem, pop)

    # Genetic algorithm that starts from the pre-evaluated population
    algorithm = GA(pop_size=50, eliminate_duplicates=True, sampling=pop)

    # Stop after 200 generations
    termination = get_termination("n_gen", 200)

    # Run the optimisation
    res = minimize(
        problem,
        algorithm,
        termination,
        seed=1,
        save_history=True,
        verbose=True,
    )

    # Report the best solution found and record it in optim_results
    print("Best solution: ", res.X)
    optim_results['X'].append(res.X)
    print("Best objective value: ", res.F)
    optim_results['dist_auc'].append(res.F)

Table name: trans, Total dataframe shape: (400, 8), Numerical data shape: (400, 4), Categorical data shape: (400, 4)
Clustering checkpoint found, loading...
Training None -> trans model from scratch
Model params: {'num_classes': 0, 'is_y_cond': 'none', 'rtdl_params': {'d_layers': [32, 64, 64, 64, 64, 32], 'dropout': 0.0}, 'd_in': 8}
mlp
Step 500/10000 MLoss: 0.0 GLoss: 0.4912 Sum: 0.4912
Step 1000/10000 MLoss: 0.0 GLoss: 0.3897 Sum: 0.3897
Step 1500/10000 MLoss: 0.0 GLoss: 0.3677 Sum: 0.3677
Step 2000/10000 MLoss: 0.0 GLoss: 0.3478 Sum: 0.3478
Step 2500/10000 MLoss: 0.0 GLoss: 0.3367 Sum: 0.3367
Step 3000/10000 MLoss: 0.0 GLoss: 0.3099 Sum: 0.3099
Step 3500/10000 MLoss: 0.0 GLoss: 0.3022 Sum: 0.3022
Step 4000/10000 MLoss: 0.0 GLoss: 0.2941 Sum: 0.2941
Step 4500/10000 MLoss: 0.0 GLoss: 0.2914 Sum: 0.2914
Step 5000/10000 MLoss: 0.0 GLoss: 0.2871 Sum: 0.2871
Step 5500/10000 MLoss: 0.0 GLoss: 0.2857 Sum: 0.2857
Step 6000/10000 MLoss: 0.0 GLoss: 0.283 Sum: 0.283
Step 6500/10000 MLoss: 0.0 G

  termination = get_termination("n_gen", 200)


n_gen  |  n_eval  |     f_avg     |     f_min    
     1 |        0 |  0.7753869399 |  0.7560120861
Table name: trans, Total dataframe shape: (100, 8), Numerical data shape: (100, 4), Categorical data shape: (100, 4)
Table name: trans, Total dataframe shape: (100, 8), Numerical data shape: (100, 4), Categorical data shape: (100, 4)
Clustering checkpoint found, loading...
Clustering checkpoint found, loading...
Table name: trans, Total dataframe shape: (100, 8), Numerical data shape: (100, 4), Categorical data shape: (100, 4)
Clustering checkpoint found, loading...
Fine Tuning None -> trans model from pretrained models
Fine Tuning None -> trans model from pretrained models
Fine Tuning None -> trans model from pretrained models
Model params: {'num_classes': 0, 'is_y_cond': 'none', 'rtdl_params': {'d_layers': [32, 64, 64, 64, 64, 32], 'dropout': 0.0}, 'd_in': 8}
mlp
Model params: {'num_classes': 0, 'is_y_cond': 'none', 'rtdl_params': {'d_layers': [32, 64, 64, 64, 64, 32], 'dropout': 0.0},