## Using Optuna's Bayesian optimization to tune hyperparameters 

- It is highly recommended to run the Bayesian optimization routine 
in an environment with access to CUDA and/or OpenMP, as this greatly 
speeds up the entire process.

In [1]:
from pysdg.synth.generate import Generator
from pysdg.synth.optimize import BayesianOptimizationRoutine



In [2]:
# Create the generator, then load the raw dataset together with its metadata file.
DATA_PATH = "./raw_data.csv"
INFO_PATH = "./raw_info.json"

gen = Generator(gen_name="synthcity_ctgan")
real = gen.load(DATA_PATH, INFO_PATH)
real


2024-11-26 10:57:59,356 - pysdg - INFO - 1882007 - **************Started logging the generator: synthcity_ctgan
2024-11-26 10:57:59,398 - pysdg - INFO - 1882007 - Checking the input metadata for any conflict in variable indexes - Passed.
2024-11-26 10:58:01,388 - pysdg - INFO - 1882007 - The dataset ['tutorial_data'] is loaded into the generator synthcity_ctgan


Unnamed: 0,outc_cod_0,event_dt,wt,wt_cod,age,age_cod,drugname_0,indi_pt_0,sex
0,,NaT,,,,,ZANTAC,,
1,DE,NaT,,,18,YR,OXYCONTIN,Drug abuse,M
2,OT,NaT,,,,,LEMTRADA,,
3,OT,2019-09-17,,,46,YR,COSENTYX,Psoriatic arthropathy,M
4,DE,2016-12-01,110.0,KG,73,YR,ENTRESTO,Cardiac failure,M
...,...,...,...,...,...,...,...,...,...
9995,,NaT,,,,,ELIQUIS,Product used for unknown indication,
9996,OT,NaT,,,26,YR,ISENTRESS,,M
9997,,NaT,,,,,MYCOPHENOLATE MOFETIL,,
9998,,NaT,,,,,AZOPT,Product used for unknown indication,F


In [None]:
# Custom evaluation function to be used by the optimization. It is for demo
# purposes only: typically you would apply your evaluation measure of interest
# to the encoded versions of the datasets, as shown below.
def my_eval_function(gen: Generator):
    """Count the cells in which the encoded real and synthetic data differ.

    Args:
        gen: Generator exposing ``enc_real`` and ``enc_synths``.
            NOTE(review): assumes both are identically-labelled pandas
            DataFrames (or equally shaped arrays) -- confirm against pysdg.

    Returns:
        The total number of elementwise mismatches between ``gen.enc_real``
        and ``gen.enc_synths[0]``.
    """
    # For simplicity only the first synthetic dataset is compared, i.e. we
    # assume that a single dataset is being generated.
    encoded_real, encoded_synth = gen.enc_real, gen.enc_synths[0]
    mismatch_mask = encoded_real != encoded_synth
    mismatches_per_column = mismatch_mask.sum()
    return mismatches_per_column.sum()

In [None]:
# Let's look for the set of parameters that minimizes the number of mismatches.
# At the very end of the optimization, the model with the best set of
# parameters is retrained and stored inside bayes_opt.
# NOTE(review): the retraining behaviour is taken from the original author's
# note; it is not visible in this notebook -- confirm against pysdg.
bayes_opt = BayesianOptimizationRoutine(
    gen=gen,
    eval_function=my_eval_function,
    objective="minimize",
    n_trials=1,  # a single trial, so that the demo finishes faster
    study_name="mismatches_study",
    dump_csv=False,  # a CSV dump only happens at the end of the optimization
    dump_sqlite=False,  # an SQLite dump happens after each trial
)

[2024-11-26T10:58:06.961473-0500][1882007][CRITICAL] module disabled: /share/personal/skababji/conda_envs/pysdg_dev/lib/python3.10/site-packages/synthcity/plugins/generic/plugin_goggle.py
2024-11-26 10:58:16,321 - pysdg - INFO - 1882007 - No of Iterations=50, Batch Size=256
INFO:pysdg:No of Iterations=50, Batch Size=256
 96%|█████████▌| 48/50 [06:29<00:16,  8.11s/it]


In [None]:
# Use the generator held by bayes_opt (the best model) to produce a single
# synthetic dataset with as many rows as the real data, then display it.
best_gen = bayes_opt.gen
best_gen.gen(num_rows=len(real), num_synths=1)
synths = best_gen.unload()
synths[0]

- Generating synth no. 0 of size (10000, 12) -- Completed!


Unnamed: 0,outc_cod_0,event_dt,wt,wt_cod,age,age_cod,drugname_0,indi_pt_0,sex
0,HO,NaT,57.141148,KG,,YR,Carboplatin Concentrate for solution for infusion,B-cell small lymphocytic lymphoma,F
1,,2019-06-09,,KG,64,,ATORVASTATINE,Back injury,F
2,HO,2019-05-10,,KG,59,YR,CYCLOPHOSPHAMIDE.,Cryptococcosis,F
3,OT,2019-05-06,56.073206,KG,57,YR,SPINRAZA,Somatic symptom disorder,F
4,HO,2019-06-11,,KG,63,,OXYCODONE,Stent placement,F
...,...,...,...,...,...,...,...,...,...
9995,HO,2015-11-18,73.713297,,63,YR,HYDROXYCHLOROQUINE SULFATE,Smoking cessation therapy,F
9996,,NaT,,,62,YR,MAVYRET,Cystic fibrosis,F
9997,DE,2019-05-14,,KG,62,YR,INFANTS TYLENOL,Hyperparathyroidism secondary,F
9998,,2019-05-19,97.824914,KG,59,YR,ASPIRIN,Cystitis interstitial,F


In [None]:
# Show the hyperparameter values of the best trial found by the study
# (assumes bayes_opt.study is an Optuna Study -- TODO confirm).
bayes_opt.study.best_params

{'generator_n_layers_hidden': 7,
 'generator_n_units_hidden': 128,
 'generator_nonlin': 'leaky_relu',
 'generator_dropout': 0,
 'discriminator_n_layers_hidden': 5,
 'discriminator_n_units_hidden': 256,
 'discriminator_nonlin': 'leaky_relu',
 'discriminator_dropout': 0.1,
 'n_iter': 15,
 'lr': 0.001,
 'weight_decay': 0.003889457912935765,
 'batch_size': 512,
 'clipping_value': 1,
 'encoder_max_clusters': 8,
 'adjust_inference_sampling': False}

In [None]:
# Look at the optimization results and at the values of our eval function.
# The results are fetched once and reused for both displayed items, so the
# routine is not asked to build the same results table twice.
opt_results = bayes_opt.get_optimization_results()
opt_results, opt_results.user_attrs_my_eval_function

(   number     value             datetime_start          datetime_complete  \
 0       0  100007.0 2024-11-25 09:15:16.744372 2024-11-25 09:15:56.260456   
 
                 duration  params_adjust_inference_sampling  params_batch_size  \
 0 0 days 00:00:39.516084                             False                512   
 
    params_clipping_value  params_discriminator_dropout  \
 0                      1                           0.1   
 
    params_discriminator_n_layers_hidden  ...  params_encoder_max_clusters  \
 0                                     5  ...                            8   
 
   params_generator_dropout  params_generator_n_layers_hidden  \
 0                        0                                 7   
 
    params_generator_n_units_hidden  params_generator_nonlin  params_lr  \
 0                              128               leaky_relu      0.001   
 
   params_n_iter  params_weight_decay  user_attrs_my_eval_function     state  
 0            15             0.0038