# Synthetic Data Generation

This notebook standardizes the imputed data and then generates synthetic data using CTGAN, TVAE, and ADASYN for each of the imputed datasets (MICE, KNN, SoftImpute, GAIN, PMM). Generated features are inverse-transformed back to the original scale before being saved.
Results are saved in `data/synthetic/{imputation_method}/`.

In [1]:
import pandas as pd
import numpy as np
from sdv.single_table import CTGANSynthesizer, TVAESynthesizer
from sdv.metadata import SingleTableMetadata
from imblearn.over_sampling import ADASYN
from sklearn.preprocessing import StandardScaler
import os

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Imputed datasets to process, one processed CSV per entry.
imputation_methods = [
    'mice',
    'knn',
    'softimpute',
    'gain',
    'pmm',
]

# Label column: kept out of feature scaling and used as the class label for ADASYN.
target_col = 'mayo'

# Create the root output folder up front (no error if it already exists).
os.makedirs('../data/synthetic', exist_ok=True)

In [3]:
for method in imputation_methods:
    print(f"\n{'='*40}\nProcessing Imputation Method: {method.upper()}\n{'='*40}")

    # Each imputation method has its own processed CSV; a missing file is
    # reported and skipped rather than aborting the whole run.
    input_path = f'../data/processed/uc_diagnostic_tests_{method}.csv'
    if os.path.exists(input_path):
        df = pd.read_csv(input_path)
    else:
        print(f"File not found: {input_path} -- Skipping.")
        continue
    print(f"Data loaded: {df.shape}")

    # --- STANDARDIZATION ---
    print("  > Standardizing data...")

    # Hold the target out (when present) so that only feature columns get scaled.
    has_target = target_col in df.columns
    X = df.drop(columns=[target_col]) if has_target else df
    y = df[target_col] if has_target else None

    # One scaler per imputed dataset; the same fitted scaler is used further
    # down to map generated features back to the original scale.
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    # Rebuild the full table for SDV, which learns features and target jointly.
    if has_target:
        # Give both pieces a fresh positional index so concat pairs rows by position.
        X_scaled = X_scaled.reset_index(drop=True)
        y = y.reset_index(drop=True)
        df_scaled = pd.concat([X_scaled, y], axis=1)
    else:
        df_scaled = X_scaled

    print("    Data standardized.")

    # Per-method output folder, created on demand.
    output_dir = f'../data/synthetic/{method}'
    os.makedirs(output_dir, exist_ok=True)

    # SDV metadata is detected from the scaled table, i.e. the exact frame the
    # synthesizers are fitted on.
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data=df_scaled)
    
    # Helper function to inverse transform and save
    def inverse_transform_and_save(synthetic_data, filename, scaler, target_col, y_original=None):
        """Undo feature scaling on a generated table and write it to CSV.

        Parameters
        ----------
        synthetic_data : pandas.DataFrame
            Generated table in the scaled feature space. May or may not
            contain ``target_col``. It is not modified.
        filename : str or path-like
            Destination CSV path (written without the index).
        scaler : object
            Fitted scaler exposing ``inverse_transform``. If it also exposes
            ``feature_names_in_``, the feature columns are put into that
            order before inverting, so a reordered input cannot silently be
            un-scaled with another column's parameters.
        target_col : str
            Name of the target column. It is passed through untouched and
            appended as the last column when present.
        y_original : optional
            Unused; kept only so existing calls that pass it keep working.

        Returns
        -------
        pandas.DataFrame
            The inverse-transformed table that was written to ``filename``.

        Raises
        ------
        KeyError
            If the scaler records feature names and one of them is missing
            from ``synthetic_data``.
        """
        has_target = target_col in synthetic_data.columns
        X_syn = synthetic_data.drop(columns=[target_col]) if has_target else synthetic_data

        # Align features with the order the scaler was fitted on (when known).
        fitted_names = getattr(scaler, 'feature_names_in_', None)
        if fitted_names is not None:
            X_syn = X_syn[list(fitted_names)]

        # np.asarray drops any index the scaler might attach to its output, so
        # the rebuilt frame always has a fresh positional index.
        X_syn_inv = pd.DataFrame(
            np.asarray(scaler.inverse_transform(X_syn)),
            columns=X_syn.columns,
        )

        if has_target:
            # Non-mutating reset: the caller's frame keeps its own index while
            # the copy lines up positionally with X_syn_inv.
            y_syn = synthetic_data[target_col].reset_index(drop=True)
            df_syn_inv = pd.concat([X_syn_inv, y_syn], axis=1)
        else:
            df_syn_inv = X_syn_inv

        df_syn_inv.to_csv(filename, index=False)
        return df_syn_inv
    
    # --- CTGAN ---
    print(f"  > Generating CTGAN for {method}...")
    try:
        ctgan = CTGANSynthesizer(metadata, epochs=300, verbose=True)
        ctgan.fit(df_scaled)
        synthetic_ctgan = ctgan.sample(num_rows=len(df))
        
        inverse_transform_and_save(synthetic_ctgan, f'{output_dir}/uc_diagnostics_ctgan.csv', scaler, target_col)
        print("    CTGAN Saved (Standardized -> Generated -> Inverse Transformed).")
    except Exception as e:
        print(f"    CTGAN Failed: {e}")
    
    # --- TVAE ---
    print(f"  > Generating TVAE for {method}...")
    try:
        tvae = TVAESynthesizer(metadata, epochs=300, verbose=True)
        tvae.fit(df_scaled)
        synthetic_tvae = tvae.sample(num_rows=len(df))
        
        inverse_transform_and_save(synthetic_tvae, f'{output_dir}/uc_diagnostics_tvae.csv', scaler, target_col)
        print("    TVAE Saved (Standardized -> Generated -> Inverse Transformed).")
    except Exception as e:
        print(f"    TVAE Failed: {e}")
    
    # --- ADASYN ---
    print(f"  > Generating ADASYN for {method}...")
    try:
        if y is None or y.isnull().any():
             print("    ADASYN Skipping: Target column missing or contains NaNs.")
        else:
            # ADASYN on Scaled X and original y
            adasyn = ADASYN(sampling_strategy='not majority', random_state=42)
            X_res, y_res = adasyn.fit_resample(X_scaled, y)
            
            synthetic_adasyn = pd.concat([X_res, y_res], axis=1)
            
            inverse_transform_and_save(synthetic_adasyn, f'{output_dir}/uc_diagnostics_adasyn.csv', scaler, target_col)
            print(f"    ADASYN Saved (Standardized -> Generated -> Inverse Transformed). New shape: {synthetic_adasyn.shape}")
    except Exception as e:
        print(f"    ADASYN Failed: {e}")

print("\nAll synthetic data generation tasks completed.")


Processing Imputation Method: MICE
Data loaded: (252, 56)
  > Standardizing data...
    Data standardized.
  > Generating CTGAN for mice...


Gen. (-3.79) | Discrim. (-0.30): 100%|██████████| 300/300 [00:36<00:00,  8.14it/s]


    CTGAN Saved (Standardized -> Generated -> Inverse Transformed).
  > Generating TVAE for mice...


Loss: -62.697: 100%|██████████| 300/300 [00:13<00:00, 21.54it/s]


    TVAE Saved (Standardized -> Generated -> Inverse Transformed).
  > Generating ADASYN for mice...
    ADASYN Saved (Standardized -> Generated -> Inverse Transformed). New shape: (340, 56)

Processing Imputation Method: KNN
Data loaded: (252, 56)
  > Standardizing data...
    Data standardized.
  > Generating CTGAN for knn...


Gen. (-3.96) | Discrim. (-0.25): 100%|██████████| 300/300 [00:36<00:00,  8.29it/s]


    CTGAN Saved (Standardized -> Generated -> Inverse Transformed).
  > Generating TVAE for knn...


Loss: -51.820: 100%|██████████| 300/300 [00:14<00:00, 21.19it/s]


    TVAE Saved (Standardized -> Generated -> Inverse Transformed).
  > Generating ADASYN for knn...
    ADASYN Saved (Standardized -> Generated -> Inverse Transformed). New shape: (351, 56)

Processing Imputation Method: SOFTIMPUTE
Data loaded: (252, 56)
  > Standardizing data...
    Data standardized.
  > Generating CTGAN for softimpute...


Gen. (-4.44) | Discrim. (0.42): 100%|██████████| 300/300 [00:35<00:00,  8.48it/s] 


    CTGAN Saved (Standardized -> Generated -> Inverse Transformed).
  > Generating TVAE for softimpute...


Loss: -62.924: 100%|██████████| 300/300 [00:13<00:00, 21.46it/s]


    TVAE Saved (Standardized -> Generated -> Inverse Transformed).
  > Generating ADASYN for softimpute...
    ADASYN Saved (Standardized -> Generated -> Inverse Transformed). New shape: (346, 56)

Processing Imputation Method: GAIN
Data loaded: (252, 56)
  > Standardizing data...
    Data standardized.
  > Generating CTGAN for gain...


Gen. (-4.77) | Discrim. (-0.26): 100%|██████████| 300/300 [00:35<00:00,  8.41it/s]


    CTGAN Saved (Standardized -> Generated -> Inverse Transformed).
  > Generating TVAE for gain...


Loss: -54.350: 100%|██████████| 300/300 [00:14<00:00, 21.24it/s]


    TVAE Saved (Standardized -> Generated -> Inverse Transformed).
  > Generating ADASYN for gain...
    ADASYN Saved (Standardized -> Generated -> Inverse Transformed). New shape: (351, 56)

Processing Imputation Method: PMM
Data loaded: (252, 56)
  > Standardizing data...
    Data standardized.
  > Generating CTGAN for pmm...


Gen. (-4.66) | Discrim. (0.72): 100%|██████████| 300/300 [00:36<00:00,  8.24it/s] 


    CTGAN Saved (Standardized -> Generated -> Inverse Transformed).
  > Generating TVAE for pmm...


Loss: -43.910: 100%|██████████| 300/300 [00:14<00:00, 20.64it/s]


    TVAE Saved (Standardized -> Generated -> Inverse Transformed).
  > Generating ADASYN for pmm...
    ADASYN Saved (Standardized -> Generated -> Inverse Transformed). New shape: (347, 56)

All synthetic data generation tasks completed.
