# Working pipeline

- **step 0**: get environment

```bash
source /home/ruoheng/anaconda3/bin/activate test3
```

- **step 1**: tune on 215 and save raw result

``` bash
python tune.py --model_name {model_name} --data_name {data_name} --object_name Distance --n_trials=10
! scp -r {source_path} {target_path}
```

``` python
# Models to tune, and the datasets each of them is tuned on.
model_list = ['ctgan', 'tvae']
data_list = [
    'company_bankruptcy_prediction',
    'credit_card_customers',
    'darwin',
    'dna',
    'jannis',
    'parkinsons_updrs',
    'piechart3',
]

# Remote `tune` directory on 10.77.110.215 that the generated scp commands copy from.
source_path = 'ruc@10.77.110.215:/home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/src/tune'

# Command-line templates; the placeholders are filled in by the loop below.
tune_cmd = "time python tune.py --model_name {} --data_name {} --object_name Distance --n_trials=10"
scp_cmd = "! scp -r {} {}"

for data_name in data_list:
    # One tuning command per model for the current dataset ...
    for model_name in model_list:
        print(tune_cmd.format(model_name, data_name))
    # ... followed by a single copy command into a per-dataset local folder.
    target_path = 'ctgan_tvae_10_' + data_name
    print(scp_cmd.format(source_path, target_path))
    print("\n")
```

- **step 3**: reconstruct the folder

```bash
# Create one result folder per model. -p also creates the parent exe1_sum
# directory when it is missing and keeps the step safe to re-run.
mkdir -p exe1_sum/ctgan exe1_sum/tvae

# {source_file} is a placeholder for the file to copy into the ctgan folder.
cp {source_file} exe1_sum/ctgan/{source_file}
```

- **step 4**: evaluate, get json and csv

``` python
# get json
# Datasets and models for which an eval_json.py command line is generated.
data_list = [
    'company_bankruptcy_prediction',
    'credit_card_customers',
    'darwin',
    'dna',
    'jannis',
    'parkinsons_updrs',
    'piechart3',
]
model_list = ['ctgan', 'tvae']

# Command-line template; the placeholders are filled in by the loop below.
eval_cmd = 'time python eval_json.py --data_name {} --model_name {} --fake_path {} --eval_path {}'

for model_name in model_list:
    # The same per-model folder is passed as both --fake_path and --eval_path.
    fake_path = eval_path = './tune/exe1_sum/{}'.format(model_name)
    for data_name in data_list:
        print(eval_cmd.format(data_name, model_name, fake_path, eval_path))
        print('')
```

```bash
# set eval_config.json
python eval_csv.py
```

- **step 5**: analyze and discuss

# Tune on 215 and save raw result

In [67]:
# Models to tune, and the datasets each of them is tuned on.
model_list = ['ctgan', 'tvae']
data_list = [
    'company_bankruptcy_prediction',
    'credit_card_customers',
    'darwin',
    'dna',
    'jannis',
    'parkinsons_updrs',
    'piechart3',
]

# Remote `tune` directory on 10.77.110.215 that the generated scp commands copy from.
source_path = 'ruc@10.77.110.215:/home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/src/tune'

# Command-line templates; the placeholders are filled in by the loop below.
tune_cmd = "time python tune.py --model_name {} --data_name {} --object_name Distance --n_trials=10"
scp_cmd = "! scp -r {} {}"

for data_name in data_list:
    # One tuning command per model for the current dataset ...
    for model_name in model_list:
        print(tune_cmd.format(model_name, data_name))
    # ... followed by a single copy command into a per-dataset local folder.
    target_path = 'ctgan_tvae_10_' + data_name
    print(scp_cmd.format(source_path, target_path))
    print("\n")

time python tune.py --model_name ctgan --data_name company_bankruptcy_prediction --object_name Distance --n_trials=10
time python tune.py --model_name tvae --data_name company_bankruptcy_prediction --object_name Distance --n_trials=10
! scp -r ruc@10.77.110.215:/home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/src/tune ctgan_tvae_10_company_bankruptcy_prediction


time python tune.py --model_name ctgan --data_name credit_card_customers --object_name Distance --n_trials=10
time python tune.py --model_name tvae --data_name credit_card_customers --object_name Distance --n_trials=10
! scp -r ruc@10.77.110.215:/home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/src/tune ctgan_tvae_10_credit_card_customers


time python tune.py --model_name ctgan --data_name darwin --object_name Distance --n_trials=10
time python tune.py --model_name tvae --data_name darwin --object_name Distance --n_trials=10
! scp -r ruc@10.77.110.215:/home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/src/tune ctgan_tvae_10_darwin
... (output truncated)

## ctgan tune result

```bash
time python tune.py --model_name ctgan --data_name company_bankruptcy_prediction --object_name Distance --n_trials=10
# real    170m32.660s
# user    267m30.563s
# sys     365m19.834s
# Best is trial 7 with value: 19793763.49429695.

time python tune.py --model_name ctgan --data_name credit_card_customers --object_name Distance --n_trials=10
# real    98m7.668s
# user    122m48.764s
# sys     82m6.909s
# Best is trial 4 with value: 29849.284444265642

time python tune.py --model_name ctgan --data_name darwin --object_name Distance --n_trials=10
# real    79m0.969s
# user    101m40.844s
# sys     4m24.075s
# Best is trial 1 with value: 2561.403212364159

time python tune.py --model_name ctgan --data_name dna --object_name Distance --n_trials=10
# real    149m26.658s
# user    194m59.621s
# sys     83m38.586s

time python tune.py --model_name ctgan --data_name jannis --object_name Distance --n_trials=10
# real    1412m12.679s
# user    1895m29.898s
# sys     2251m55.500s
# Best is trial 5 with value: 0.39704495690594926

time python tune.py --model_name ctgan --data_name parkinsons_updrs --object_name Distance --n_trials=10
# real    60m46.769s
# user    94m18.712s
# sys     128m28.509s
# Best is trial 5 with value: 0.32615180799474275.

time python tune.py --model_name ctgan --data_name piechart3 --object_name Distance --n_trials=10
# real    16m11.441s
# user    29m39.967s
# sys     5m52.807s
# Best is trial 4 with value: 324.43073889421
```

## tvae tune result
```bash
time python tune.py --model_name tvae --data_name company_bankruptcy_prediction --object_name Distance --n_trials=10
# real    50m33.271s
# user    154m0.317s
# sys     376m39.269s
# Best is trial 4 with value: 62930031.14287921.

time python tune.py --model_name tvae --data_name credit_card_customers --object_name Distance --n_trials=10
# real    31m44.083s
# user    57m54.129s
# sys     82m28.416s
# Best is trial 4 with value: 80628.99949147343

time python tune.py --model_name tvae --data_name darwin --object_name Distance --n_trials=10
# real    53m11.911s
# user    75m54.990s
# sys     4m15.883s
# Best is trial 4 with value: 3039.585556867186.

time python tune.py --model_name tvae --data_name dna --object_name Distance --n_trials=10
# real    46m14.887s
# user    95m44.456s
# sys     87m38.484s
# Best is trial 4 with value: 0.0724393792685642

time python tune.py --model_name tvae --data_name jannis --object_name Distance --n_trials=10
# real    312m41.458s
# user    783m44.144s
# sys     2176m54.819s
# Best is trial 4 with value: 1.2573201746107179.

time python tune.py --model_name tvae --data_name parkinsons_updrs --object_name Distance --n_trials=10
#   File "/home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/src/tune.py", line 180, in object
#     evaluate_result = eval.evaluate()
# evaluate err: evaluate() raised an error; it still fails after changing metricslist=["Distance"]
# real    22m11.641s
# user    54m10.787s
# sys     116m45.918s
# Best trial: 4. Best value: 0.378574

time python tune.py --model_name tvae --data_name piechart3 --object_name Distance --n_trials=10
# real    8m9.211s
# user    22m5.403s
# sys     5m55.596s
# Best is trial 4 with value: 304.6982483835972
```

# Evaluate: get json and csv

In [65]:
# Datasets and models for which an eval_json.py command line is generated.
data_list = [
    'company_bankruptcy_prediction',
    'credit_card_customers',
    'darwin',
    'dna',
    'jannis',
    'parkinsons_updrs',
    'piechart3',
]
model_list = ['ctgan', 'tvae']

# Command-line template; the placeholders are filled in by the loop below.
eval_cmd = 'time python eval_json.py --data_name {} --model_name {} --fake_path {} --eval_path {}'

for model_name in model_list:
    # The same per-model folder is passed as both --fake_path and --eval_path.
    fake_path = eval_path = './tune/exe1_sum/{}'.format(model_name)
    for data_name in data_list:
        print(eval_cmd.format(data_name, model_name, fake_path, eval_path))
        print('')

time python eval_json.py --data_name company_bankruptcy_prediction --model_name ctgan --fake_path ./tune/exe1_sum/ctgan --eval_path ./tune/exe1_sum/ctgan

time python eval_json.py --data_name credit_card_customers --model_name ctgan --fake_path ./tune/exe1_sum/ctgan --eval_path ./tune/exe1_sum/ctgan

time python eval_json.py --data_name darwin --model_name ctgan --fake_path ./tune/exe1_sum/ctgan --eval_path ./tune/exe1_sum/ctgan

time python eval_json.py --data_name dna --model_name ctgan --fake_path ./tune/exe1_sum/ctgan --eval_path ./tune/exe1_sum/ctgan

time python eval_json.py --data_name jannis --model_name ctgan --fake_path ./tune/exe1_sum/ctgan --eval_path ./tune/exe1_sum/ctgan

time python eval_json.py --data_name parkinsons_updrs --model_name ctgan --fake_path ./tune/exe1_sum/ctgan --eval_path ./tune/exe1_sum/ctgan

time python eval_json.py --data_name piechart3 --model_name ctgan --fake_path ./tune/exe1_sum/ctgan --eval_path ./tune/exe1_sum/ctgan

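time python eval_json.py --data_name company_bankruptcy_prediction --model_name tvae --fake_path ./tune/exe1_sum/tvae --eval_path ./tune/exe1_sum/tvae

... (output truncated)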

Takes 304s in total

``` bash
time python eval_json.py --data_name credit_card_customers --model_name ctgan --fake_path ./tune/exe1_sum/ctgan --eval_path ./tune/exe1_sum/ctgan
# real    0m30.416s
# user    17m46.117s
# sys     0m25.793s

time python eval_json.py --data_name darwin --model_name ctgan --fake_path ./tune/exe1_sum/ctgan --eval_path ./tune/exe1_sum/ctgan
# real    0m11.620s
# user    3m0.805s
# sys     0m5.635s

time python eval_json.py --data_name dna --model_name ctgan --fake_path ./tune/exe1_sum/ctgan --eval_path ./tune/exe1_sum/ctgan
# real    0m16.360s
# user    7m28.503s
# sys     0m13.742s

time python eval_json.py --data_name jannis --model_name ctgan --fake_path ./tune/exe1_sum/ctgan --eval_path ./tune/exe1_sum/ctgan
# real    1m7.110s
# user    27m18.753s
# sys     0m39.153s

time python eval_json.py --data_name parkinsons_updrs --model_name ctgan --fake_path ./tune/exe1_sum/ctgan --eval_path ./tune/exe1_sum/ctgan
# real    0m17.644s
# user    9m30.133s
# sys     0m15.220s

time python eval_json.py --data_name piechart3 --model_name ctgan --fake_path ./tune/exe1_sum/ctgan --eval_path ./tune/exe1_sum/ctgan
# real    0m6.824s
# user    2m14.768s
# sys     0m6.295s

time python eval_json.py --data_name company_bankruptcy_prediction --model_name tvae --fake_path ./tune/exe1_sum/tvae --eval_path ./tune/exe1_sum/tvae
# real    0m21.225s
# user    11m7.445s
# sys     0m17.943s

time python eval_json.py --data_name credit_card_customers --model_name tvae --fake_path ./tune/exe1_sum/tvae --eval_path ./tune/exe1_sum/tvae
# real    0m30.153s
# user    17m10.766s
# sys     0m27.060s

time python eval_json.py --data_name darwin --model_name tvae --fake_path ./tune/exe1_sum/tvae --eval_path ./tune/exe1_sum/tvae
# real    0m13.339s
# user    3m26.506s
# sys     0m5.182s

time python eval_json.py --data_name dna --model_name tvae --fake_path ./tune/exe1_sum/tvae --eval_path ./tune/exe1_sum/tvae
# real    0m14.101s
# user    6m36.598s
# sys     0m12.366s

time python eval_json.py --data_name jannis --model_name tvae --fake_path ./tune/exe1_sum/tvae --eval_path ./tune/exe1_sum/tvae
# real    0m54.117s
# user    24m39.153s
# sys     0m35.320s

time python eval_json.py --data_name parkinsons_updrs --model_name tvae --fake_path ./tune/exe1_sum/tvae --eval_path ./tune/exe1_sum/tvae
# real    0m18.734s
# user    9m57.441s
# sys     0m17.612s

time python eval_json.py --data_name piechart3 --model_name tvae --fake_path ./tune/exe1_sum/tvae --eval_path ./tune/exe1_sum/tvae
# real    0m7.105s
# user    2m41.615s
# sys     0m6.868s
```

# Appendix: ctgan & tvae without tune

## ctgan
以default.json为参数设定，训练模型生成数据，并将生成数据进行备份。
```bash
python train.py --data_name company_bankruptcy_prediction --model_name ctgan --model_params=../params/ctgan/default.json --times=5 --gpu=0 
python train.py --data_name credit_card_customers --model_name ctgan --model_params=../params/ctgan/default.json --times=5 --gpu=0
python train.py --data_name darwin --model_name ctgan --model_params=../params/ctgan/default.json --times=5 --gpu=0
python train.py --data_name dna --model_name ctgan --model_params=../params/ctgan/default.json --times=5 --gpu=2
python train.py --data_name jannis --model_name ctgan --model_params=../params/ctgan/default.json --times=5 --gpu=3
python train.py --data_name parkinsons_updrs --model_name ctgan --model_params=../params/ctgan/default.json --times=5 --gpu=0
python train.py --data_name piechart3 --model_name ctgan --model_params=../params/ctgan/default.json --times=5 --gpu=0
```

## tvae
以default.json为参数设定，训练模型生成数据，并将生成数据进行备份。
```bash
python train.py --data_name company_bankruptcy_prediction --model_name tvae --model_params=../params/tvae/default.json --times=5 --gpu=0
python train.py --data_name credit_card_customers --model_name tvae --model_params=../params/tvae/default.json --times=5 --gpu=1
python train.py --data_name darwin --model_name tvae --model_params=../params/tvae/default.json --times=5 --gpu=3
python train.py --data_name dna --model_name tvae --model_params=../params/tvae/default.json --times=5 --gpu=2

python train.py --data_name jannis --model_name tvae --model_params=../params/tvae/default.json --times=5 --gpu=0
python train.py --data_name parkinsons_updrs --model_name tvae --model_params=../params/tvae/default.json --times=5 --gpu=2
python train.py --data_name piechart3 --model_name tvae --model_params=../params/tvae/default.json --times=5 --gpu=3
```

In [39]:
import os

# Local folder that receives the copied tvae training outputs.
save_path = 'tvae_train_generated'
# exist_ok=True keeps re-running the cell harmless when the folder already
# exists, while genuine failures (e.g. permission denied) are no longer
# silently swallowed by a bare `except`.
os.makedirs(save_path, exist_ok=True)

# Names of the generated-data folders to fetch from the remote tvae directory.
file_list = [
    'company_bankruptcy_prediction20231203-152117',
    'company_bankruptcy_prediction20231203-153313',
    'company_bankruptcy_prediction20231204-145611',
    'company_bankruptcy_prediction20231205-092404',
    'credit_card_customers20231203-154010',
    'darwin20231203-154032',
    'dna20231203-154148',
    'jannis20231204-090946',
    'parkinsons_updrs20231204-091027',
    'piechart320231204-091047'
]
# scp invocation plus the remote directory; the folder name is appended per entry.
prefix = 'scp -r ruc@10.77.110.215:/home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/synthesized_data/tvae'

# Print one notebook shell command ("! ...") per folder to copy.
for file_name in file_list:
    print("! {}/{} {}/{}".format(prefix, file_name, save_path, file_name))

! scp -r ruc@10.77.110.215:/home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/synthesized_data/tvae/company_bankruptcy_prediction20231203-152117 tvae_train_generated/company_bankruptcy_prediction20231203-152117
! scp -r ruc@10.77.110.215:/home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/synthesized_data/tvae/company_bankruptcy_prediction20231203-153313 tvae_train_generated/company_bankruptcy_prediction20231203-153313
! scp -r ruc@10.77.110.215:/home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/synthesized_data/tvae/company_bankruptcy_prediction20231204-145611 tvae_train_generated/company_bankruptcy_prediction20231204-145611
! scp -r ruc@10.77.110.215:/home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/synthesized_data/tvae/company_bankruptcy_prediction20231205-092404 tvae_train_generated/company_bankruptcy_prediction20231205-092404
! scp -r ruc@10.77.110.215:/home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/synthesized_data/tvae/credit_card_customers20231203-154010 tvae_train_generated/credit_card_customers20231203-154010
... (output truncated)

## ctgan vs tvae

ctgan和tvae的训练时间相差甚多。猜想：tvae的时间开销约为ctgan的35倍。以下实验验证了这一猜想（real时间：tvae 157m39s，ctgan 4m36s，约34倍）：

```bash
time python train.py --data_name company_bankruptcy_prediction --model_name ctgan --model_params=../params/ctgan/default.json --times=3 --gpu=2 
# ../synthesized_data/ctgan/company_bankruptcy_prediction20231204-145553
# real    4m35.853s
# user    32m52.212s
# sys     137m34.304s

time python train.py --data_name company_bankruptcy_prediction --model_name tvae --model_params=../params/tvae/default.json --times=3 --gpu=3
# ../synthesized_data/tvae/company_bankruptcy_prediction20231205-092404
# real    157m39.073s
# user    187m43.554s
# sys     130m33.935s
```

**❗️Attention: 无论怎样设置gpu，ctgan和tvae只运行在0号显卡上❗️**