In [3]:
import pandas as pd
from sdv.single_table import CTGANSynthesizer, TVAESynthesizer
from sdv.metadata import SingleTableMetadata
from imblearn.over_sampling import ADASYN
import os
import warnings

# Silence every warning category for the rest of the session so library
# chatter does not drown the training progress output.
warnings.simplefilter("ignore")

# All inputs and outputs of this notebook live in this folder; create it
# (and any missing parents) if it is not there yet.
os.makedirs(name='../data/processed', exist_ok=True)

In [4]:
input_path = '../data/processed/uc_diagnostic_tests_mice.csv'
metadata_path = '../data/processed/metadata.json'

# Real (non-synthetic) table that every synthesizer below is fitted on.
df = pd.read_csv(input_path)

# Let SDV infer column types (sdtypes) directly from the loaded frame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df)

# SDV's save_to_json raises if the target file already exists, which made this
# cell fail on every run after the first. Drop any stale copy so the notebook
# survives Restart & Run All and the JSON always reflects the current data.
if os.path.exists(metadata_path):
    os.remove(metadata_path)
metadata.save_to_json(metadata_path)

print(f"Data loaded: {df.shape}")

Data loaded: (252, 56)


In [5]:
print("--- CTGAN ---")

# Where the CTGAN-generated table is written.
output_path = '../data/processed/synthetic_ctgan.csv'

# Fit a CTGAN model on the real table (300 epochs, progress bar enabled).
ctgan = CTGANSynthesizer(metadata=metadata, epochs=300, verbose=True)
ctgan.fit(df)

# Draw exactly as many synthetic rows as the real table contains and
# persist them without the pandas index column.
synthetic_ctgan = ctgan.sample(num_rows=df.shape[0])
synthetic_ctgan.to_csv(output_path, index=False)

print("CTGAN Saved.")

--- CTGAN ---


Gen. (-4.32) | Discrim. (0.15): 100%|██████████| 300/300 [00:35<00:00,  8.50it/s] 


CTGAN Saved.


In [6]:
print("--- TVAE ---")

# Where the TVAE-generated table is written.
output_path = '../data/processed/synthetic_tvae.csv'

# Fit a TVAE model on the real table (300 epochs, progress bar enabled).
tvae = TVAESynthesizer(metadata=metadata, epochs=300, verbose=True)
tvae.fit(df)

# Draw exactly as many synthetic rows as the real table contains and
# persist them without the pandas index column.
synthetic_tvae = tvae.sample(num_rows=df.shape[0])
synthetic_tvae.to_csv(output_path, index=False)

print("TVAE Saved.")

--- TVAE ---


Loss: -60.691: 100%|██████████| 300/300 [00:13<00:00, 21.77it/s]


TVAE Saved.


In [7]:
print("--- ADASYN ---")

# ADASYN is a supervised over-sampler: it needs a feature matrix and a class
# label. 'mayo' is used as that label; every other column is a feature.
target_col = 'mayo'

X = df.drop(columns=[target_col])
y = df[target_col]

try:
    # 'not majority' resamples every class except the largest one;
    # random_state pins the generated samples for reproducibility.
    adasyn = ADASYN(sampling_strategy='not majority', random_state=42)
    X_res, y_res = adasyn.fit_resample(X, y)
except (ValueError, RuntimeError) as e:
    # Best-effort step: only the resampling call is guarded, and only for the
    # error types an over-sampler is expected to raise on unsuitable data
    # (e.g. too few neighbours, invalid inputs). File-write problems and
    # programming errors are no longer swallowed.
    print(f"ADASYN failed ({type(e).__name__}): {e}")
else:
    # NOTE(review): fit_resample returns the resampled data set, i.e. the
    # result is expected to hold the original rows plus the generated ones,
    # not generated rows only - confirm this is what downstream steps expect.
    # Restore the original column order so this CSV has the same layout as
    # the CTGAN / TVAE outputs (the target would otherwise end up last).
    synthetic_adasyn = pd.concat([X_res, y_res], axis=1)[df.columns]

    output_path = '../data/processed/synthetic_adasyn.csv'
    synthetic_adasyn.to_csv(output_path, index=False)
    print("ADASYN Saved.")

--- ADASYN ---
ADASYN Saved.
