In [None]:
!git clone https://github.com/sdv-dev/CTGAN.git /content/CTGAN
%cd /content/CTGAN
!pip install -r latest_requirements.txt
!pip install -e .
!pip install scikit-learn pandas numpy torch tqdm rdt joblib


Cloning into '/content/CTGAN'...
remote: Enumerating objects: 2271, done.[K
remote: Counting objects: 100% (980/980), done.[K
remote: Compressing objects: 100% (294/294), done.[K
remote: Total 2271 (delta 883), reused 687 (delta 686), pack-reused 1291 (from 3)[K
Receiving objects: 100% (2271/2271), 1.93 MiB | 4.18 MiB/s, done.
Resolving deltas: 100% (1425/1425), done.
/content/CTGAN
Collecting pandas==2.2.3 (from -r latest_requirements.txt (line 2))
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rdt==1.16.0 (from -r latest_requirements.txt (line 3))
  Downloading rdt-1.16.0-py3-none-any.whl.metadata (10 kB)
Collecting torch==2.7.0 (from -r latest_requirements.txt (line 4))
  Downloading torch-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting Faker>=17 (from rdt==1.16.0->

In [None]:
from google.colab import files
import pandas as pd

uploaded = files.upload()
file_name = list(uploaded.keys())[0]
df = pd.read_csv(file_name)

print("Shape:", df.shape)
df.head()


Saving adult.csv to adult.csv
Shape: (48842, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [None]:
import numpy as np
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from ctgan import CTGAN

SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

target_col = "income"

# Detect categorical features (excluding the target)
discrete_cols = df.select_dtypes(include=['object']).columns.tolist()
if target_col in discrete_cols:
    discrete_cols.remove(target_col)

# Encode target column for stratified CV
le = LabelEncoder()
y_encoded = le.fit_transform(df[target_col])

# Use 150 epochs if large dataset
epochs = 150 if len(df) > 10000 else 100
print(f"Training CTGAN with {epochs} epochs")

kf = StratifiedKFold(n_splits=2, shuffle=True, random_state=SEED)
repeats = 3


Training CTGAN with 150 epochs


In [None]:
syn_results = []

for rep in range(repeats):
    print(f"\n=== Repetition {rep + 1} ===")
    for fold, (train_idx, test_idx) in enumerate(kf.split(df, y_encoded)):
        print(f"-- Fold {fold + 1} --")

        train_df = df.iloc[train_idx].reset_index(drop=True)
        test_df = df.iloc[test_idx].reset_index(drop=True)

        model = CTGAN(
            embedding_dim=128,
            generator_dim=(256, 256),
            discriminator_dim=(256, 256),
            generator_lr=2e-4,
            discriminator_lr=2e-4,
            batch_size=500,
            epochs=epochs,
            pac=10,
            log_frequency=True,
            verbose=False,
            cuda=torch.cuda.is_available()
        )
        model.set_random_state(SEED)

        #  exclude target column from training
        model.fit(train_df.drop(columns=[target_col]), discrete_cols)

        # Generate synthetic data (50% of original train size)
        syn_df = model.sample(len(train_df) // 2)
        syn_df[target_col] = train_df[target_col].sample(len(syn_df), replace=True).reset_index(drop=True)

        syn_results.append((train_df, test_df, syn_df))



=== Repetition 1 ===
-- Fold 1 --


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


-- Fold 2 --

=== Repetition 2 ===
-- Fold 1 --
-- Fold 2 --

=== Repetition 3 ===
-- Fold 1 --
-- Fold 2 --


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import jensenshannon
from scipy.stats import wasserstein_distance
import numpy as np

# === Prepare result containers ===
results = {
    "RandomForest": [],
    "XGBoost": [],
    "MLP": [],
    "LogisticRegression": [],
    "JSD": [],
    "WD": []
}

# === JSD Evaluation ===
def evaluate_jsd(real, syn):
    scores = []
    for col in real.select_dtypes(include='number').columns:
        try:
            scores.append(jensenshannon(real[col].dropna(), syn[col].dropna(), base=2))
        except:
            continue
    return np.nanmean(scores) if scores else np.nan

# === WD Evaluation ===
def evaluate_wd(real, syn):
    scores = []
    for col in real.select_dtypes(include='number').columns:
        try:
            scores.append(wasserstein_distance(real[col].dropna(), syn[col].dropna()))
        except:
            continue
    return np.nanmean(scores) if scores else np.nan

# === Main Evaluation Loop ===
for train_df, test_df, syn_df in syn_results:
    le = LabelEncoder()
    y_syn = le.fit_transform(syn_df[target_col])
    y_test = le.transform(test_df[target_col])

    # One-hot encode and align features
    X_syn = pd.get_dummies(syn_df.drop(columns=[target_col]), drop_first=True)
    X_test = pd.get_dummies(test_df.drop(columns=[target_col]), drop_first=True)
    X_syn, X_test = X_syn.align(X_test, join='outer', axis=1, fill_value=0)

    # Define all models
    models = {
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=SEED),
        "XGBoost": XGBClassifier(eval_metric='logloss', random_state=SEED),
        "MLP": MLPClassifier(hidden_layer_sizes=(50,), max_iter=300, random_state=SEED),
        "LogisticRegression": LogisticRegression(max_iter=2000, random_state=SEED)
    }

    # Train & evaluate
    for name, model in models.items():
        model.fit(X_syn, y_syn[:len(X_syn)])
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        results[name].append(acc)

    # Distribution metrics
    results["JSD"].append(evaluate_jsd(train_df, syn_df))
    results["WD"].append(evaluate_wd(train_df, syn_df))

# === Print Final Averages ===
print("\n=== FINAL AVERAGED RESULTS ===")
for name, scores in results.items():
    print(f"{name}: {np.mean(scores):.4f}")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


=== FINAL AVERAGED RESULTS ===
RandomForest: 0.7591
XGBoost: 0.7501
MLP: 0.6740
LogisticRegression: 0.7616
JSD: nan
WD: 3729.8213


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
