In [1]:
import pandas as pd
import numpy as np
from sdv.metadata import Metadata
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer, TVAESynthesizer
import pathlib
import os


In [None]:
def loadTrainedSynthesizers(dependency):
    """Load every trained CTGAN and TVAE synthesizer for one dependency type.

    For each synthesizer prefix ('ctgan', 'tvae'), row count (10000, 25000,
    50000) and missing level (0, 10, 20) the pickle
    "<prefix> <rows> row <missing> missing.pkl" is loaded from the folder
    selected by `dependency`, and the result is stored as a module-level
    global named "<prefix>_<rows>_<missing>". Each name is printed before
    its file is loaded.

    Parameters
    ----------
    dependency : str
        "simulated" reads directly from `parent_dir`; "high" and "moderate"
        read from the matching "Dependency Synthetic Data" sub-folder; any
        other value reads from the "Low" sub-folder.

    Returns
    -------
    The `dependency` argument, unchanged.
    """
    # Pick the folder holding the pickled synthesizers for this dependency type
    if dependency == "simulated":
        path = f'{parent_dir}/'
    else:
        if dependency == "high":
            level = 'High'
        elif dependency == "moderate":
            level = 'Moderate'
        else:
            level = 'Low'
        path = f'{parent_dir}/../Simulating Data/Dependency/Dependency Synthetic Data/{level}/'

    # Class used to load each kind of synthesizer, in the order they are read
    loader_by_prefix = {'ctgan': CTGANSynthesizer, 'tvae': TVAESynthesizer}
    module_namespace = globals()

    for prefix, loader_cls in loader_by_prefix.items():
        for row_count in (10000, 25000, 50000):
            for missing_level in (0, 10, 20):
                name = f"{prefix}_{row_count}_{missing_level}"
                print(name)
                pickle_file = f"{prefix} {row_count} row {missing_level} missing.pkl"
                # Published as a global because main() refers to these names directly
                module_namespace[name] = loader_cls.load(path + pickle_file)

    return dependency

In [None]:
#Sampling datasets from the trained synthesizers and saving to csvs
def synthesizingDatasets(synthesizer_dict,iters,synthesizer_type,dependency):
    """Sample `iters` datasets from each synthesizer and write every sample to a CSV.

    Files are written to
    "<save_path><rows> row <missing> missing/<synthesizer_type>/<rows> rows_synthetic_<synthesizer_type lower-cased>_<i>.csv".
    The output folder is created when it does not exist yet, so a fresh
    checkout no longer fails inside `to_csv`.

    Parameters
    ----------
    synthesizer_dict : dict
        Maps "<rows>_<missing>" keys to objects exposing `sample(num_rows=...)`.
        The part before the first underscore is parsed as the number of rows
        to sample.
    iters : int
        Number of datasets sampled (and files written) per synthesizer.
    synthesizer_type : str
        Used verbatim as the sub-folder name and lower-cased in the file name.
    dependency : str
        "simulated" writes relative to the current working directory; "high"
        and "moderate" write under the matching "Synthetic Data Multiple
        Iterations" folder; any other value writes under "Low".
    """
    #Setting save path of csvs based on dependency type
    if dependency=="simulated":
        save_path=''
    elif dependency=="high":
        save_path=f'{parent_dir}/../Simulating Data/Dependency/Synthetic Data Multiple Iterations/High/'
    elif dependency=="moderate":
        save_path=f'{parent_dir}/../Simulating Data/Dependency/Synthetic Data Multiple Iterations/Moderate/'
    else:
        save_path=f'{parent_dir}/../Simulating Data/Dependency/Synthetic Data Multiple Iterations/Low/'

    # Loop-invariant: only the file name uses the lower-cased type
    synthesizer_type_lower=synthesizer_type.lower()

    #Sampling datasets from the trained synthesizers and saving to csvs
    for name,synthesizer in synthesizer_dict.items():
        split_name=name.split('_')
        row=split_name[0]
        n=int(row)
        missing=split_name[1]

        # to_csv does not create missing parent folders, so make sure the target exists
        out_dir=f'{save_path}{row} row {missing} missing/{synthesizer_type}'
        os.makedirs(out_dir,exist_ok=True)

        for i in range(iters):
            df=synthesizer.sample(num_rows=n)
            df.to_csv(f'{out_dir}/{row} rows_synthetic_{synthesizer_type_lower}_{i}.csv',index=False)
       


In [None]:
# Resolve the notebook's working directory once; the dataset and model
# folders used by the functions in this notebook are addressed through parent_dir.
script_dir = pathlib.Path.cwd().resolve()
parent_dir = script_dir.parent
os.chdir(script_dir)

def main(dependency, iters=100):
    """Load the trained synthesizers for `dependency` and sample datasets from them.

    Calls `loadTrainedSynthesizers`, which publishes every model as a
    module-level global named "<type>_<rows>_<missing>", gathers those models
    into one dict per synthesizer type keyed by "<rows>_<missing>", and passes
    each dict to `synthesizingDatasets` (TVAE first, then CTGAN).

    Parameters
    ----------
    dependency : str
        Dependency type forwarded to the loader and the sampler
        (the notebook calls this with "simulated", "high", "moderate", "low").
    iters : int, optional
        Number of datasets sampled per synthesizer. Defaults to 100, the value
        that was previously hard-coded.

    Raises
    ------
    KeyError
        If an expected synthesizer global was not created by the loader.
    """
    dependency=loadTrainedSynthesizers(dependency)

    # Must mirror the row counts / missing levels the loader iterates over
    row_counts=(10000,25000,50000)
    missing_levels=(0,10,20)
    module_namespace=globals()

    def _collect(prefix):
        # Gather the globals for one synthesizer type, keyed by "<rows>_<missing>"
        return {
            f"{rows}_{missing}": module_namespace[f"{prefix}_{rows}_{missing}"]
            for rows in row_counts
            for missing in missing_levels
        }

    ctgan_synthesizers=_collect('ctgan')
    tvae_synthesizers=_collect('tvae')

    synthesizingDatasets(tvae_synthesizers,iters,'TVAE',dependency)
    synthesizingDatasets(ctgan_synthesizers,iters,'CTGAN',dependency)

In [None]:
# Run the full load-and-sample pipeline once per dependency type, in this order
for _dependency_type in ("simulated", "high", "moderate", "low"):
    main(_dependency_type)