In [None]:
## Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import os
import pathlib

## For data synthesis
from sdv.metadata import Metadata
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer, TVAESynthesizer
from sdv.datasets.local import load_csvs


In [None]:
## Load data
def read_Data(dependency):
    """Load the nine simulated tables that belong to one dependency level.

    Parameters
    ----------
    dependency : str
        "simulated", "high" or "moderate" select their own folder and
        file-name prefix. Any other value (e.g. "low") falls back to the
        low-dependency folder and the "low_" prefix.

    Returns
    -------
    tuple
        ``(tables, dependency)``. ``tables`` is a list with the nine entries
        looked up in the result of ``load_csvs``, ordered by observation count
        (10000, 25000, 50000) and, within each count, by missing percentage
        (0, 10, 20). ``dependency`` is handed back unchanged.

    Notes
    -----
    Reads the notebook-level global ``parent_dir``, so that name has to be
    defined before this function is called.
    """
    # dependency level -> (file-name prefix, folder below parent_dir)
    known_sources = {
        "simulated": ("", "Simulated Data/"),
        "high": ("high_", "Simulating Data/Dependency/DataWithRelations/"),
        "moderate": ("moderate_", "Simulating Data/Dependency/DataModerateRelations/"),
    }
    # Everything that is not listed above is treated as the low-dependency data
    fallback_source = ("low_", "Simulating Data/Dependency/DataNoRelations/")

    name_prefix, folder = known_sources.get(dependency, fallback_source)
    loaded = load_csvs(f'{parent_dir}/{folder}')

    # Table names follow "<prefix><rows>_obs_<pct>_percent_missing"
    tables = [
        loaded[f'{name_prefix}{n_obs}_obs_{pct_missing}_percent_missing']
        for n_obs in (10000, 25000, 50000)
        for pct_missing in (0, 10, 20)
    ]

    return tables, dependency


In [None]:
#-----------------------------------------------------
## Function synthesize_data: Takes in a dataframe, the number of rows to synthesize, the output subfolder name (index),
## and the dependency level, which selects the folder the synthetic data is saved to

def synthesize_data(df, n, index, dependency):
    """Fit three SDV synthesizers on ``df`` and export ``n`` sampled rows from each.

    Parameters
    ----------
    df : pandas.DataFrame
        Table the synthesizers are fitted on. The sdtype of the columns
        "arcsId", "hashedId", "countryCode" and "Language" is overridden in
        the metadata, so those columns are presumably required (confirm
        against the SDV version in use).
    n : int
        Number of rows sampled from every fitted synthesizer; also part of
        the exported file names.
    index : object
        Converted with ``str`` and used as the subfolder name below the
        output root.
    dependency : str
        "simulated" writes below ``script_dir``; "high" and "moderate" write
        below ``parent_dir`` in the "High" / "Moderate" folder. Any other
        value writes to the "Low" folder.

    Returns
    -------
    None
        The results are three CSV files, one each in the "GC", "CTGAN" and
        "TVAE" subfolders, which are created when missing.

    Notes
    -----
    Reads the notebook-level globals ``script_dir`` / ``parent_dir``.
    """
    # Auto-detect the metadata, then override the sdtype of four columns
    table_meta = Metadata.detect_from_dataframe(data=df)
    sdtype_overrides = (
        ("arcsId", "id"),
        ("hashedId", "id"),
        ("countryCode", "categorical"),
        ("Language", "categorical"),
    )
    for column, sdtype in sdtype_overrides:
        table_meta.update_column(column_name=column, sdtype=sdtype)

    # (output folder, file-name tag, factory); order GC -> CTGAN -> TVAE.
    # Each synthesizer is built immediately before it is fitted.
    recipes = (
        ("GC", "gc", lambda: GaussianCopulaSynthesizer(table_meta)),
        ("CTGAN", "ctgan", lambda: CTGANSynthesizer(table_meta, cuda=True)),
        ("TVAE", "tvae", lambda: TVAESynthesizer(table_meta, cuda=True)),
    )
    fitted = []
    for folder, tag, build in recipes:
        model = build()
        model.fit(df)
        fitted.append((folder, tag, model))
    print("Training synthesizers completed!")

    # Root folder for this table, chosen by dependency level
    subfolder = str(index) + '/'
    if dependency == "simulated":
        output_path = f'{script_dir}/Synthetic Datasets/' + subfolder
    else:
        # Anything other than "high" / "moderate" ends up in "Low"
        level_folder = {"high": "High", "moderate": "Moderate"}.get(dependency, "Low")
        output_path = (
            f'{parent_dir}/Simulating Data/Dependency/Dependency Synthetic Data/{level_folder}/'
            + subfolder
        )

    # Draw n rows from every fitted synthesizer, same order as training
    sampled = [
        (output_path + folder + "/", tag, model.sample(num_rows=n))
        for folder, tag, model in fitted
    ]
    print("Data generation completed!")

    # Create all target folders first, then write the CSV files
    for target_dir, _, _ in sampled:
        os.makedirs(target_dir, exist_ok=True)

    for target_dir, tag, frame in sampled:
        frame.to_csv(target_dir + str(n) + f' rows_synthetic_{tag}.csv', index=False)
    print("Data exported completed!")



In [None]:
# Full pipeline from reading to synthesizing and saving synthetic datasets
def fullPipeline(dependency):
    """Read, prepare and synthesize every table of one dependency level.

    Parameters
    ----------
    dependency : str
        Passed to ``read_Data`` to pick the input tables and forwarded to
        ``synthesize_data`` to pick the output folder.

    Returns
    -------
    None
        All results are written to disk by ``synthesize_data``.
    """
    # Reading simulated data
    tables, dependency = read_Data(dependency)

    # Parse the birth dates in place for the synthesizers;
    # values that cannot be parsed become NaT (errors='coerce')
    for table in tables:
        table['dateOfBirth'] = pd.to_datetime(table['dateOfBirth'], errors='coerce')

    # One (rows to synthesize, label) pair per table, in the same order as
    # the tables: 10000/25000/50000 rows, each with 0/10/20 percent missing.
    # The label doubles as the output subfolder name.
    jobs = [
        (n_rows, f'{n_rows} row {pct_missing} missing')
        for n_rows in (10000, 25000, 50000)
        for pct_missing in (0, 10, 20)
    ]

    # Generating synthetic data
    print(f'Generating synthetic data for {dependency} data')
    for position, (n_rows, label) in enumerate(jobs):
        print("-----------Generating dataframe " + label + ' ----------------')
        synthesize_data(tables[position], n_rows, label, dependency)

In [None]:
# Setting up paths: the resolved current working directory and its parent.
# read_Data and synthesize_data read these two names as globals.
script_dir = pathlib.Path().resolve()
os.chdir(script_dir)
parent_dir = script_dir.parent

# Full pipelines for synthesizing data, one run per dependency level
for _dependency_level in ("simulated", "high", "moderate", "low"):
    fullPipeline(_dependency_level)