# TabDDPM Population
Train a TabDDPM model on the real population training data and generate the first-generation synthetic dataset.

# Import packages

In [1]:
# Standard
import sys

sys.path.append("..")
import json
import shutil
from pathlib import Path

# 3rd party
import pandas as pd

# Local
from src.externals.MIDSTModels.midst_models.single_table_TabDDPM.complex_pipeline import (
    clava_clustering,
    clava_synthesizing,
    clava_training,
    load_configs,
)
from src.externals.MIDSTModels.midst_models.single_table_TabDDPM.pipeline_modules import (
    load_multi_table,
)
from src.utils import standard

# 1. Configuration

In [2]:
# Locations
# NOTE(review): absolute, machine-specific data root; adjust for your environment.
input_dir = Path(
    "/data8/projets/dev_synthetic_data/data/MIDST/tabddpm_black_box/population/"
)
# Working folder for the first-generation run, nested under the input directory.
workspace = input_dir.joinpath("synth", "1st_gen")

# Folder containing the TabDDPM config files inside the MIDSTModels checkout.
config_file_path = Path(
    "..",
    "src",
    "externals",
    "MIDSTModels",
    "midst_models",
    "single_table_TabDDPM",
    "configs",
)


# File name of the real training table
real_train = "real_train_no_id.csv"

# 2. Load data
Load the real training data.

In [3]:
# Read the real training table from the input directory
real_train_path = input_dir.joinpath(real_train)
df_real_train = pd.read_csv(real_train_path)

In [4]:
# Sanity check: (n_rows, n_columns) of the loaded training table
df_real_train.shape

(230486, 8)

# 3. Create the necessary folders and config files

In [5]:
# Make sure the workspace folder exists
standard.create_directory(workspace)

# Save the training table into the workspace as train.csv
df_real_train.to_csv(workspace / "train.csv", index=False)

# Copy the original config files into the workspace
for config_name in ("trans.json", "dataset_meta.json", "trans_domain.json"):
    shutil.copy(config_file_path / config_name, workspace)

# Point the copied pipeline config at the workspace
trans_config_path = workspace / "trans.json"
with open(trans_config_path, "r") as config_in:
    trans_config = json.load(config_in)

trans_config["general"].update(
    {
        "data_dir": str(workspace),
        "workspace_dir": str(workspace),
        "test_data_dir": "",
    }
)

# Save the changes back to the workspace copy
with open(trans_config_path, "w") as config_out:
    json.dump(trans_config, config_out, indent=4)

# 4. Train the TabDDPM model

In [6]:
# Parse the workspace copy of the pipeline config
trans_config_path = workspace / "trans.json"
configs, save_dir = load_configs(trans_config_path)

# Load the tables from the data directory named in the config
data_dir = configs["general"]["data_dir"]
tables, relation_order, dataset_meta = load_multi_table(data_dir)

Table name: trans, Total dataframe shape: (230486, 8), Numerical data shape: (230486, 4), Categorical data shape: (230486, 4)


In [7]:
# Clustering on the multi-table dataset.
# The call returns the tables (rebound to `tables`) together with
# `all_group_lengths_prob_dicts`, which the synthesis step takes as input.
# NOTE(review): because `tables` is rebound, re-running this cell passes the
# already-clustered tables back in — TODO confirm that this is safe.
tables, all_group_lengths_prob_dicts = clava_clustering(
    tables, relation_order, save_dir, configs
)

In [8]:
# Train models
# NOTE: long-running cell — the captured log counts steps up to 200000.
# The returned `models` object is passed on to the synthesis step.
models = clava_training(tables, relation_order, save_dir, configs)

Training None -> trans model from scratch
Model params: {'num_classes': 0, 'is_y_cond': 'none', 'rtdl_params': {'d_layers': [512, 1024, 1024, 1024, 1024, 512], 'dropout': 0.0}, 'd_in': 8}
mlp
Step 500/200000 MLoss: 0.0 GLoss: 0.253 Sum: 0.253
Step 1000/200000 MLoss: 0.0 GLoss: 0.2399 Sum: 0.2399
Step 1500/200000 MLoss: 0.0 GLoss: 0.2337 Sum: 0.2337
Step 2000/200000 MLoss: 0.0 GLoss: 0.2304 Sum: 0.2304
Step 2500/200000 MLoss: 0.0 GLoss: 0.2292 Sum: 0.2292
Step 3000/200000 MLoss: 0.0 GLoss: 0.2275 Sum: 0.2275
Step 3500/200000 MLoss: 0.0 GLoss: 0.2275 Sum: 0.2275
Step 4000/200000 MLoss: 0.0 GLoss: 0.2259 Sum: 0.2259
Step 4500/200000 MLoss: 0.0 GLoss: 0.2251 Sum: 0.2251
Step 5000/200000 MLoss: 0.0 GLoss: 0.2245 Sum: 0.2245
Step 5500/200000 MLoss: 0.0 GLoss: 0.2241 Sum: 0.2241
Step 6000/200000 MLoss: 0.0 GLoss: 0.2245 Sum: 0.2245
Step 6500/200000 MLoss: 0.0 GLoss: 0.2236 Sum: 0.2236
Step 7000/200000 MLoss: 0.0 GLoss: 0.2244 Sum: 0.2244
Step 7500/200000 MLoss: 0.0 GLoss: 0.2229 Sum: 0.2229
S

# 5. Generate synthetic data

In [9]:
# Determine the sample scale
# (the captured run reports "Sample size: 345729", i.e. 1.5 x the 230486 training rows)
sample_scale = 1.5

# Generate synthetic data from scratch
# `cleaned_tables` is indexed by table name further down (e.g. "trans");
# the two other return values are presumably elapsed times — TODO confirm.
cleaned_tables, synthesizing_time_spent, matching_time_spent = clava_synthesizing(
    tables,
    relation_order,
    save_dir,
    all_group_lengths_prob_dicts,
    models,
    configs,
    sample_scale=sample_scale,
)

Generating None -> trans
Sample size: 345729
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0


In [11]:
# Type conversion: group the synthetic columns by the type they should take

float_columns = ["amount", "balance"]
int_columns = [
    "trans_date",
    "account",
    "trans_type",
    "operation",
    "k_symbol",
    "bank",
]
col_type = {"float": float_columns, "int": int_columns}

# NOTE(review): `decimal=1` presumably sets the decimals kept for float columns — TODO confirm
synth_trans = cleaned_tables["trans"]
df_synth = standard.trans_type(df=synth_trans, col_type=col_type, decimal=1)

In [13]:
# Preview the first rows of the type-converted synthetic table
df_synth.head()

Unnamed: 0,trans_date,trans_type,operation,amount,balance,k_symbol,bank,account
0,1845,0,0,60.3,14474.2,7,0,0
1,1276,0,0,129.9,28752.8,7,0,0
2,2117,0,3,9892.5,85662.2,1,0,0
3,1773,2,4,13433.8,24741.1,1,0,0
4,1764,2,1,9445.3,42860.4,5,5,18258595


In [12]:
# Write the full first-generation synthetic table into the workspace
synth_output_path = workspace / "all_1st_gen.csv"
df_synth.to_csv(synth_output_path, index=False)