# Synthetic Data Generation

This notebook standardizes the imputed data and then generates synthetic data using CTGAN, TVAE, ADASYN, and SMOTE for each of the imputed datasets (MICE, KNN, SoftImpute, GAIN, PMM).
Results are saved in `data/synthetic/{imputation_method}/`.

In [13]:
import pandas as pd
import numpy as np
from sdv.single_table import CTGANSynthesizer, TVAESynthesizer
from sdv.metadata import SingleTableMetadata
from imblearn.over_sampling import ADASYN, SMOTE
from sklearn.preprocessing import StandardScaler
import os

import warnings
warnings.filterwarnings("ignore")

In [14]:
# Label column: kept out of the scaler and used as the class target
# by the oversamplers.
target_col = 'mayo'

# One imputed dataset per entry; the name is interpolated into the
# input file name and into the per-method output directory.
imputation_methods = [
    'mice',
    'knn',
    'softimpute',
    'gain',
    'pmm',
]

# Create the root output folder up front (no-op when it already exists).
os.makedirs('../data/synthetic', exist_ok=True)

In [15]:
def inverse_transform_data(synthetic_data, scaler, target_col):
    """Map standardized feature columns back to their original scale.

    Parameters
    ----------
    synthetic_data : pd.DataFrame
        Frame whose feature columns are in the scaler's standardized space.
        It may or may not contain ``target_col``.
    scaler : object
        Fitted scaler exposing ``inverse_transform`` (a ``StandardScaler``
        fitted on the feature columns in this notebook).
    target_col : str
        Column that was excluded from scaling; it is passed through unchanged.

    Returns
    -------
    pd.DataFrame
        Inverse-transformed features with a fresh 0..n-1 index, followed by
        the target column when it was present in ``synthetic_data``.
    """
    if target_col in synthetic_data.columns:
        X_syn = synthetic_data.drop(columns=[target_col])
        # Re-index so the target lines up with the freshly built feature frame.
        y_syn = synthetic_data[target_col].reset_index(drop=True)
    else:
        X_syn = synthetic_data
        y_syn = None

    X_syn_inv = pd.DataFrame(scaler.inverse_transform(X_syn), columns=X_syn.columns)
    if y_syn is None:
        return X_syn_inv
    return pd.concat([X_syn_inv, y_syn], axis=1)


for method in imputation_methods:
    print(f"\n{'='*40}\nProcessing Imputation Method: {method.upper()}\n{'='*40}")

    input_path = f'../data/processed/uc_diagnostic_tests_{method}.csv'
    if not os.path.exists(input_path):
        print(f"File not found: {input_path} -- Skipping.")
        continue

    df_original = pd.read_csv(input_path)
    print(f"Original data loaded: {df_original.shape}")

    # --- STANDARDIZATION ---
    # Only the features are scaled; the target is carried along untouched.
    print("  > Standardizing data...")
    if target_col in df_original.columns:
        X = df_original.drop(columns=[target_col])
        y = df_original[target_col].reset_index(drop=True)
    else:
        X = df_original
        y = None

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    df_scaled = X_scaled if y is None else pd.concat([X_scaled, y], axis=1)

    print("    Data standardized.")

    output_dir = f'../data/synthetic/{method}'
    os.makedirs(output_dir, exist_ok=True)

    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data=df_scaled)

    # --- CTGAN ---
    print(f"  > Generating CTGAN for {method} (Augmentation)...")
    try:
        ctgan = CTGANSynthesizer(metadata, epochs=300, verbose=True)
        ctgan.fit(df_scaled)
        # Sample as many synthetic rows as there are original rows.
        synthetic_ctgan_scaled = ctgan.sample(num_rows=len(df_original))
        synthetic_ctgan_inv = inverse_transform_data(synthetic_ctgan_scaled, scaler, target_col)

        # COMBINE: Original + Synthetic
        df_augmented_ctgan = pd.concat([df_original, synthetic_ctgan_inv], axis=0).reset_index(drop=True)
        df_augmented_ctgan.to_csv(f'{output_dir}/uc_diagnostics_ctgan.csv', index=False)
        print(f"    CTGAN Augmented Saved. Final shape: {df_augmented_ctgan.shape}")
    except Exception as e:
        # Best-effort: report the failure and continue with the next generator.
        print(f"    CTGAN Failed: {e}")

    # --- TVAE ---
    print(f"  > Generating TVAE for {method} (Augmentation)...")
    try:
        tvae = TVAESynthesizer(metadata, epochs=300, verbose=True)
        tvae.fit(df_scaled)
        # Sample as many synthetic rows as there are original rows.
        synthetic_tvae_scaled = tvae.sample(num_rows=len(df_original))
        synthetic_tvae_inv = inverse_transform_data(synthetic_tvae_scaled, scaler, target_col)

        # COMBINE: Original + Synthetic
        df_augmented_tvae = pd.concat([df_original, synthetic_tvae_inv], axis=0).reset_index(drop=True)
        df_augmented_tvae.to_csv(f'{output_dir}/uc_diagnostics_tvae.csv', index=False)
        print(f"    TVAE Augmented Saved. Final shape: {df_augmented_tvae.shape}")
    except Exception as e:
        print(f"    TVAE Failed: {e}")

    # --- ADASYN ---
    print(f"  > Generating ADASYN for {method} (Oversampling)...")
    try:
        if y is None or y.isnull().any():
            print("    ADASYN Skipping: Target column missing or contains NaNs.")
        else:
            # ADASYN on scaled X and original y
            adasyn = ADASYN(sampling_strategy='not majority', random_state=42)
            X_res, y_res = adasyn.fit_resample(X_scaled, y)

            # Inverse transform the combined (original + resampled) dataset
            X_res_inv = pd.DataFrame(scaler.inverse_transform(X_res), columns=X_res.columns)
            df_adasyn_inv = pd.concat([X_res_inv, y_res.reset_index(drop=True)], axis=1)

            df_adasyn_inv.to_csv(f'{output_dir}/uc_diagnostics_adasyn.csv', index=False)
            print(f"    ADASYN Saved. Final shape: {df_adasyn_inv.shape}")
    except Exception as e:
        print(f"    ADASYN Failed: {e}")

    # --- SMOTE ---
    print(f"  > Generating SMOTE for {method}...")
    try:
        # Same precondition as ADASYN: a complete target column is required.
        if y is None or y.isnull().any():
            print("    SMOTE Skipping: Target column missing or contains NaNs.")
        else:
            smote = SMOTE(sampling_strategy='not majority', random_state=42)
            X_res, y_res = smote.fit_resample(X_scaled, y)

            synthetic_smote = pd.concat([X_res, y_res.reset_index(drop=True)], axis=1)

            # Undo the scaling with the helper defined in this cell and write
            # the result; the previous code called a helper that is not
            # defined anywhere in the notebook, so a fresh kernel never
            # produced this file.
            df_smote_inv = inverse_transform_data(synthetic_smote, scaler, target_col)
            df_smote_inv.to_csv(f'{output_dir}/uc_diagnostics_smote.csv', index=False)
            print(f"    SMOTE Saved. New shape: {synthetic_smote.shape}")
    except Exception as e:
        print(f"    SMOTE Failed: {e}")


Processing Imputation Method: MICE
Original data loaded: (252, 56)
  > Standardizing data...
    Data standardized.
  > Generating CTGAN for mice (Augmentation)...


Gen. (-4.46) | Discrim. (0.01): 100%|██████████| 300/300 [00:37<00:00,  8.07it/s] 


    CTGAN Augmented Saved. Final shape: (504, 56)
  > Generating TVAE for mice (Augmentation)...


Loss: -64.783: 100%|██████████| 300/300 [00:14<00:00, 21.23it/s]


    TVAE Augmented Saved. Final shape: (504, 56)
  > Generating ADASYN for mice (Oversampling)...
    ADASYN Saved. Final shape: (340, 56)
  > Generating SMOTE for mice...
    SMOTE Saved. New shape: (352, 56)

Processing Imputation Method: KNN
Original data loaded: (252, 56)
  > Standardizing data...
    Data standardized.
  > Generating CTGAN for knn (Augmentation)...


Gen. (-3.32) | Discrim. (0.01): 100%|██████████| 300/300 [00:37<00:00,  8.07it/s] 


    CTGAN Augmented Saved. Final shape: (504, 56)
  > Generating TVAE for knn (Augmentation)...


Loss: -53.084: 100%|██████████| 300/300 [00:14<00:00, 21.28it/s]


    TVAE Augmented Saved. Final shape: (504, 56)
  > Generating ADASYN for knn (Oversampling)...
    ADASYN Saved. Final shape: (351, 56)
  > Generating SMOTE for knn...
    SMOTE Saved. New shape: (352, 56)

Processing Imputation Method: SOFTIMPUTE
Original data loaded: (252, 56)
  > Standardizing data...
    Data standardized.
  > Generating CTGAN for softimpute (Augmentation)...


Gen. (-5.04) | Discrim. (0.08): 100%|██████████| 300/300 [00:37<00:00,  8.06it/s] 


    CTGAN Augmented Saved. Final shape: (504, 56)
  > Generating TVAE for softimpute (Augmentation)...


Loss: -62.355: 100%|██████████| 300/300 [00:14<00:00, 21.06it/s]


    TVAE Augmented Saved. Final shape: (504, 56)
  > Generating ADASYN for softimpute (Oversampling)...
    ADASYN Saved. Final shape: (346, 56)
  > Generating SMOTE for softimpute...
    SMOTE Saved. New shape: (352, 56)

Processing Imputation Method: GAIN
Original data loaded: (252, 56)
  > Standardizing data...
    Data standardized.
  > Generating CTGAN for gain (Augmentation)...


Gen. (-4.73) | Discrim. (0.34): 100%|██████████| 300/300 [00:37<00:00,  7.99it/s] 


    CTGAN Augmented Saved. Final shape: (504, 56)
  > Generating TVAE for gain (Augmentation)...


Loss: -51.664: 100%|██████████| 300/300 [00:14<00:00, 20.47it/s]


    TVAE Augmented Saved. Final shape: (504, 56)
  > Generating ADASYN for gain (Oversampling)...
    ADASYN Saved. Final shape: (351, 56)
  > Generating SMOTE for gain...
    SMOTE Saved. New shape: (352, 56)

Processing Imputation Method: PMM
Original data loaded: (252, 56)
  > Standardizing data...
    Data standardized.
  > Generating CTGAN for pmm (Augmentation)...


Gen. (-4.09) | Discrim. (0.15): 100%|██████████| 300/300 [00:37<00:00,  8.00it/s] 


    CTGAN Augmented Saved. Final shape: (504, 56)
  > Generating TVAE for pmm (Augmentation)...


Loss: -43.744: 100%|██████████| 300/300 [00:14<00:00, 21.07it/s]


    TVAE Augmented Saved. Final shape: (504, 56)
  > Generating ADASYN for pmm (Oversampling)...
    ADASYN Saved. Final shape: (347, 56)
  > Generating SMOTE for pmm...
    SMOTE Saved. New shape: (352, 56)
