In [1]:
!git clone https://github.com/sdv-dev/CTGAN.git /content/CTGAN
%cd /content/CTGAN
!pip install -r latest_requirements.txt
!pip install -e .
!pip install scikit-learn pandas numpy torch tqdm rdt joblib xgboost


Cloning into '/content/CTGAN'...
remote: Enumerating objects: 2271, done.[K
remote: Counting objects: 100% (980/980), done.[K
remote: Compressing objects: 100% (294/294), done.[K
remote: Total 2271 (delta 883), reused 687 (delta 686), pack-reused 1291 (from 3)[K
Receiving objects: 100% (2271/2271), 1.93 MiB | 11.29 MiB/s, done.
Resolving deltas: 100% (1425/1425), done.
/content/CTGAN
Collecting pandas==2.2.3 (from -r latest_requirements.txt (line 2))
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rdt==1.16.0 (from -r latest_requirements.txt (line 3))
  Downloading rdt-1.16.0-py3-none-any.whl.metadata (10 kB)
Collecting torch==2.7.0 (from -r latest_requirements.txt (line 4))
  Downloading torch-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting Faker>=17 (from rdt==1.16.0-

In [3]:
from google.colab import files
import pandas as pd
import numpy as np

uploaded = files.upload()
file_path = list(uploaded.keys())[0]

# Read data
df = pd.read_csv(file_path, header=None)

# Rename last two columns for clarity
df.columns = list(range(33)) + ['Age', 'class']

# Handle missing 'Age' values
df = df[df['Age'] != '?']
df['Age'] = df['Age'].astype(float)

# Convert all columns to numeric
df = df.apply(pd.to_numeric)

# Set target
target_col = 'class'
print("Shape after cleaning:", df.shape)
df.head()


Saving dermatology.data to dermatology (1).data
Shape after cleaning: (358, 35)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,Age,class
0,2,2,0,3,0,0,0,0,1,0,...,0,0,3,0,0,0,1,0,55.0,2
1,3,3,3,2,1,0,0,0,1,1,...,0,0,0,0,0,0,1,0,8.0,1
2,2,1,2,3,1,3,0,3,0,0,...,0,2,3,2,0,0,2,3,26.0,3
3,2,2,2,0,0,0,0,0,3,2,...,3,0,0,0,0,0,3,0,40.0,1
4,2,3,2,2,2,2,0,2,0,0,...,2,3,2,3,0,0,2,3,45.0,3


In [4]:
import numpy as np
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from ctgan import CTGAN

SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

target_col = 'class'
y_encoded = df[target_col].values

epochs = 100
kf = StratifiedKFold(n_splits=2, shuffle=True, random_state=SEED)
repeats = 3


In [6]:
syn_results = []

for rep in range(repeats):
    print(f"\n=== Repetition {rep + 1} ===")
    for fold, (train_idx, test_idx) in enumerate(kf.split(df, y_encoded)):
        print(f"-- Fold {fold + 1} --")

        train_df = df.iloc[train_idx].reset_index(drop=True)
        test_df = df.iloc[test_idx].reset_index(drop=True)

        train_features = train_df.drop(columns=[target_col]).copy()
        train_features.columns = train_features.columns.astype(str)

        model = CTGAN(
            embedding_dim=128,
            generator_dim=(256, 256),
            discriminator_dim=(256, 256),
            generator_lr=2e-4,
            discriminator_lr=2e-4,
            batch_size=64,
            epochs=epochs,
            pac=1,
            verbose=False,
            cuda=torch.cuda.is_available()
        )
        model.set_random_state(SEED)

        model.fit(train_features)

        syn_df = model.sample(len(train_df) // 2)
        syn_df.columns = train_features.columns
        syn_df[target_col] = train_df[target_col].sample(len(syn_df), replace=True).reset_index(drop=True)

        syn_results.append((train_df, test_df, syn_df))



=== Repetition 1 ===
-- Fold 1 --


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


-- Fold 2 --

=== Repetition 2 ===
-- Fold 1 --
-- Fold 2 --

=== Repetition 3 ===
-- Fold 1 --
-- Fold 2 --


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import jensenshannon
from scipy.stats import wasserstein_distance

results = {
    "RandomForest": [],
    "XGBoost": [],
    "MLP": [],
    "LogisticRegression": [],
    "JSD": [],
    "WD": []
}

def evaluate_jsd(real, syn):
    scores = []
    for col in real.select_dtypes(include='number').columns:
        try:
            scores.append(jensenshannon(real[col].dropna(), syn[col].dropna(), base=2))
        except:
            continue
    return np.nanmean(scores) if scores else np.nan

def evaluate_wd(real, syn):
    scores = []
    for col in real.select_dtypes(include='number').columns:
        try:
            scores.append(wasserstein_distance(real[col].dropna(), syn[col].dropna()))
        except:
            continue
    return np.nanmean(scores) if scores else np.nan

label_encoder = LabelEncoder().fit(df[target_col])

for train_df, test_df, syn_df in syn_results:
    y_syn = label_encoder.transform(syn_df[target_col])
    y_test = label_encoder.transform(test_df[target_col])

    X_syn = syn_df.drop(columns=[target_col])
    X_test = test_df.drop(columns=[target_col])
    X_syn.columns = X_syn.columns.astype(str)
    X_test.columns = X_test.columns.astype(str)
    X_syn, X_test = X_syn.align(X_test, join='outer', axis=1, fill_value=0)

    models = {
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=SEED),
        "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=SEED),
        "MLP": MLPClassifier(hidden_layer_sizes=(50,), max_iter=300, random_state=SEED),
        "LogisticRegression": LogisticRegression(max_iter=2000, random_state=SEED)
    }

    for name, model in models.items():
        try:
            model.fit(X_syn, y_syn[:len(X_syn)])
            preds = model.predict(X_test)
            acc = accuracy_score(y_test, preds)
            results[name].append(acc)
        except ValueError as e:
            print(f"[Skipped {name}] - {e}")
            results[name].append(np.nan)

    results["JSD"].append(evaluate_jsd(train_df, syn_df))
    results["WD"].append(evaluate_wd(train_df, syn_df))

print("\n=== FINAL AVERAGED RESULTS ===")
for name, scores in results.items():
    print(f"{name}: {np.nanmean(scores):.4f}")





=== FINAL AVERAGED RESULTS ===
RandomForest: 0.2719
XGBoost: 0.2542
MLP: 0.2598
LogisticRegression: 0.2076
JSD: nan
WD: 7.3968


  print(f"{name}: {np.nanmean(scores):.4f}")
