## Using Optuna's Bayesian optimization to tune hyperparameters 

- It is highly recommended that the Bayesian optimization routine 
is executed in an environment with access to CUDA and/or OpenMP, as it greatly 
accelerates the entire process.

In [1]:
from pysdg.synth.generate import Generator
from pysdg.synth.optimize import BayesianOptimizationRoutine



In [2]:
# load some data and define the generator 
gen = Generator(gen_name="synthcity_ctgan")
real=gen.load("./raw_data.csv", "./raw_info.json")
real


2025-03-10 15:54:08,925 - pysdg - INFO - 3007576 - generate.py:88 - **************Started logging the generator: synthcity/ctgan, num_cores= None.**************
2025-03-10 15:54:08,955 - pysdg - INFO - 3007576 - generate.py:250 - Checking the input metadata for any conflict in variable indexes - Passed.
2025-03-10 15:54:11,094 - pysdg - INFO - 3007576 - generate.py:318 - The dataset ['tutorial_data'] is loaded into the generator synthcity_ctgan


Unnamed: 0,outc_cod_0,event_dt,wt,wt_cod,age,age_cod,drugname_0,indi_pt_0,sex
0,,NaT,,,,,ZANTAC,,
1,DE,NaT,,,18,YR,OXYCONTIN,Drug abuse,M
2,OT,NaT,,,,,LEMTRADA,,
3,OT,2019-09-17,,,46,YR,COSENTYX,Psoriatic arthropathy,M
4,DE,2016-12-01,110.0,KG,73,YR,ENTRESTO,Cardiac failure,M
...,...,...,...,...,...,...,...,...,...
9995,,NaT,,,,,ELIQUIS,Product used for unknown indication,
9996,OT,NaT,,,26,YR,ISENTRESS,,M
9997,,NaT,,,,,MYCOPHENOLATE MOFETIL,,
9998,,NaT,,,,,AZOPT,Product used for unknown indication,F


In [3]:
# Write a custom evaluation function to be used in optimisaion. The shown function is just for demo purposes. Typically, you use the encoded versions of the datasets (as show below) with your evaluation measure of interest. 
def my_eval_function(gen: Generator):
    real_data = gen.enc_real
    synth_data = gen.enc_synths[0] # we'll be assuming that we're generating only one dataset and we compare the encoded datasets, for simplicity
    n_mismatches  = (real_data != synth_data).sum().sum()
    return n_mismatches

In [4]:
# Lets try finding such set of parameters that minimizes the number of mismatches
# At the very end of the optimization, the model with best set of parameters will be 
# retrained and stored inside bayes_opt

bayes_opt = BayesianOptimizationRoutine(
                                        gen=gen, 
                                        eval_function=my_eval_function,
                                        objective="minimize",
                                        n_trials=1, # to make it finish faster
                                        study_name="mismatches_study",
                                        dump_csv=False, # dumping csv will only happen at the end of the optimization
                                        dump_sqlite=False # dumping sql happens after each trial
                                        )

[2025-03-10T15:54:14.341149-0400][3007576][CRITICAL] module disabled: /share/personal/skababji/conda_envs/pysdg_dev/lib/python3.10/site-packages/synthcity/plugins/generic/plugin_goggle.py
2025-03-10 15:54:26,966 - pysdg - INFO - 3007576 - generate.py:720 - Started training using synthcity_ctgan...
INFO:pysdg:Started training using synthcity_ctgan...
[2025-03-10T15:54:26.970870-0400][3007576][CRITICAL] module disabled: /share/personal/skababji/conda_envs/pysdg_dev/lib/python3.10/site-packages/synthcity/plugins/generic/plugin_goggle.py
2025-03-10 15:54:26,982 - pysdg - INFO - 3007576 - generate.py:724 - No of Iterations=2000, Batch Size=200
INFO:pysdg:No of Iterations=2000, Batch Size=200
 40%|███▉      | 799/2000 [26:20<39:36,  1.98s/it]  
2025-03-10 16:20:57,066 - pysdg - INFO - 3007576 - generate.py:732 - Completed training using synthcity_ctgan.
INFO:pysdg:Completed training using synthcity_ctgan.
2025-03-10 16:20:57,292 - pysdg - INFO - 3007576 - generate.py:756 - Generating synth n

In [5]:
# We can use the best model to generate data
bayes_opt.best_gen.gen(num_rows=len(real), num_synths=1)
synths=bayes_opt.gen.unload()
synths[0]

AttributeError: 'NoneType' object has no attribute 'gen'

In [None]:
bayes_opt.study.best_params

In [None]:
# look at the results and the values of our eval function
bayes_opt.get_optimization_results(), bayes_opt.get_optimization_results().user_attrs_my_eval_function