In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import entropy, wasserstein_distance
from scipy.io import arff
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# ===== Load + Clean ARFF =====
data, meta = arff.loadarff("adult 1.arff")
df = pd.DataFrame(data)
for col in df.select_dtypes([object]).columns:
    df[col] = df[col].str.decode("utf-8").str.replace(r"[\\'\"]", "", regex=True)
df = df.dropna().reset_index(drop=True)

# ===== Encode Categorical =====
encoders = {}
for col in df.columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

X = df.drop(columns=["class"])
y = df["class"]
input_dim = X.shape[1]
num_classes = y.nunique()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ===== Metrics =====
def compute_jsd(p, q):
    p, q = np.array(p) + 1e-10, np.array(q) + 1e-10
    p, q = p / p.sum(), q / q.sum()
    m = 0.5 * (p + q)
    return 0.5 * (entropy(p, m) + entropy(q, m))

def evaluate_jsd_wd(real_df, synth_df):
    jsd_scores, wd_scores = [], []
    for col in real_df.columns:
        real, synth = real_df[col].values, synth_df[col].values
        jsd = compute_jsd(np.histogram(real, bins=20)[0], np.histogram(synth, bins=20)[0])
        wd = wasserstein_distance(real, synth)
        jsd_scores.append(jsd)
        wd_scores.append(wd)
    return np.mean(jsd_scores), np.mean(wd_scores)

# ===== CW-GAN Models =====
class Generator(nn.Module):
    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes, num_classes)
        self.model = nn.Sequential(
            nn.Linear(32 + num_classes, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )
    def forward(self, z, labels):
        c = self.label_emb(labels)
        x = torch.cat((z, c), dim=1)
        return self.model(x)

class Critic(nn.Module):
    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes, num_classes)
        self.model = nn.Sequential(
            nn.Linear(input_dim + num_classes, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.2),
            nn.Linear(64, 1)
        )
    def forward(self, x, labels):
        c = self.label_emb(labels)
        d_in = torch.cat((x, c), dim=1)
        return self.model(d_in)

def compute_gp(critic, real_samples, fake_samples, labels, device):
    alpha = torch.rand(real_samples.size(0), 1).to(device)
    interpolates = (alpha * real_samples + (1 - alpha) * fake_samples).requires_grad_(True)
    d_interpolates = critic(interpolates, labels)
    fake = torch.ones_like(d_interpolates)
    gradients = torch.autograd.grad(
        outputs=d_interpolates, inputs=interpolates, grad_outputs=fake,
        create_graph=True, retain_graph=True, only_inputs=True
    )[0]
    gradients = gradients.view(gradients.size(0), -1)
    return ((gradients.norm(2, dim=1) - 1) ** 2).mean()

# ===== Experiment Setup =====
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
all_results = defaultdict(list)

for repeat in range(3):
    print(f"\n🔁 Repeat {repeat+1}/3")

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        print(f"\n🔄 Fold {fold+1}/2")

        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_tensor = torch.tensor(y_train.values, dtype=torch.long)
        loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=128, shuffle=True)

        generator = Generator().to(device)
        critic = Critic().to(device)
        opt_G = torch.optim.Adam(generator.parameters(), lr=1e-4, betas=(0.5, 0.9))
        opt_C = torch.optim.Adam(critic.parameters(), lr=1e-4, betas=(0.5, 0.9))

        for epoch in range(100):
            for i, (real_samples, labels) in enumerate(loader):
                real_samples, labels = real_samples.to(device), labels.to(device)
                opt_C.zero_grad()
                z = torch.randn(real_samples.size(0), 32).to(device)
                fake_samples = generator(z, labels)
                real_validity = critic(real_samples, labels)
                fake_validity = critic(fake_samples.detach(), labels)
                gp = compute_gp(critic, real_samples, fake_samples, labels, device)
                c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
                c_loss.backward()
                opt_C.step()
                if i % 5 == 0:
                    opt_G.zero_grad()
                    gen_samples = generator(z, labels)
                    g_loss = -torch.mean(critic(gen_samples, labels))
                    g_loss.backward()
                    opt_G.step()

        # Generate synthetic data
        synth_size = len(X_train) // 2
        z = torch.randn(synth_size, 32).to(device)
        synth_labels = torch.randint(0, num_classes, (synth_size,), dtype=torch.long).to(device)
        gen_data = generator(z, synth_labels).detach().cpu().numpy()
        synth_df = pd.DataFrame(gen_data, columns=X.columns)
        synth_df["class"] = synth_labels.cpu().numpy()

        # Classifier Evaluation (TSTR)
        print("\n📊 TSTR Accuracy:")
        models = {
            "LogReg": LogisticRegression(max_iter=300),
            "MLP": MLPClassifier(max_iter=300),
            "RF": RandomForestClassifier(),
            "XGBT": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
        }

        for name, model in models.items():
            model.fit(synth_df.drop(columns=["class"]), synth_df["class"])
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            all_results[f"{name}_acc"].append(acc)
            print(f"{name}: {acc:.4f}")

        # JSD + WD
        jsd, wd = evaluate_jsd_wd(X_train, synth_df.drop(columns=["class"]))
        all_results["jsd"].append(jsd)
        all_results["wd"].append(wd)
        print(f"\n🔬 JSD: {jsd:.4f} | WD: {wd:.4f}")

# ===== Final Average Summary =====
print("\n📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:")
for name in ["LogReg", "MLP", "RF", "XGBT"]:
    avg = np.mean(all_results[f"{name}_acc"])
    print(f"{name} TSTR Accuracy: {avg:.4f}")

print(f"\nJSD: {np.mean(all_results['jsd']):.4f}")
print(f"Wasserstein Distance: {np.mean(all_results['wd']):.4f}")


🔁 Repeat 1/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.6497
MLP: 0.6627
RF: 0.6850
XGBT: 0.6276

🔬 JSD: 0.4973 | WD: 1.1072

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.7597
MLP: 0.7631
RF: 0.7752
XGBT: 0.7635

🔬 JSD: 0.5044 | WD: 1.4226

🔁 Repeat 2/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.6664
MLP: 0.6609
RF: 0.7410
XGBT: 0.7204

🔬 JSD: 0.4996 | WD: 1.0518

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.6411
MLP: 0.6382
RF: 0.7165
XGBT: 0.6962

🔬 JSD: 0.4903 | WD: 0.8455

🔁 Repeat 3/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.7030
MLP: 0.7116
RF: 0.7642
XGBT: 0.7181

🔬 JSD: 0.4930 | WD: 0.9391

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.6567
MLP: 0.6698
RF: 0.7659
XGBT: 0.7097

🔬 JSD: 0.4929 | WD: 0.9036

📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:
LogReg TSTR Accuracy: 0.6794
MLP TSTR Accuracy: 0.6844
RF TSTR Accuracy: 0.7413
XGBT TSTR Accuracy: 0.7059

JSD: 0.4963
Wasserstein Distance: 1.0450


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import entropy, wasserstein_distance
from scipy.io import arff
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# ===== Load + Clean ARFF =====
data, meta = arff.loadarff("adult 1.arff")
df = pd.DataFrame(data)
for col in df.select_dtypes([object]).columns:
    df[col] = df[col].str.decode("utf-8").str.replace(r"[\\'\"]", "", regex=True)
df = df.dropna().reset_index(drop=True)

# ===== Encode Categorical =====
encoders = {}
for col in df.columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

X = df.drop(columns=["class"])
y = df["class"]
input_dim = X.shape[1]
num_classes = y.nunique()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ===== Metrics =====
def compute_jsd(p, q):
    p, q = np.array(p) + 1e-10, np.array(q) + 1e-10
    p, q = p / p.sum(), q / q.sum()
    m = 0.5 * (p + q)
    return 0.5 * (entropy(p, m) + entropy(q, m))

def evaluate_jsd_wd(real_df, synth_df):
    jsd_scores, wd_scores = [], []
    for col in real_df.columns:
        real, synth = real_df[col].values, synth_df[col].values
        jsd = compute_jsd(np.histogram(real, bins=20)[0], np.histogram(synth, bins=20)[0])
        wd = wasserstein_distance(real, synth)
        jsd_scores.append(jsd)
        wd_scores.append(wd)
    return np.mean(jsd_scores), np.mean(wd_scores)

# ===== CW-GAN Models =====
class Generator(nn.Module):
    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes, num_classes)
        self.model = nn.Sequential(
            nn.Linear(32 + num_classes, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )
    def forward(self, z, labels):
        c = self.label_emb(labels)
        x = torch.cat((z, c), dim=1)
        return self.model(x)

class Critic(nn.Module):
    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes, num_classes)
        self.model = nn.Sequential(
            nn.Linear(input_dim + num_classes, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.2),
            nn.Linear(64, 1)
        )
    def forward(self, x, labels):
        c = self.label_emb(labels)
        d_in = torch.cat((x, c), dim=1)
        return self.model(d_in)

def compute_gp(critic, real_samples, fake_samples, labels, device):
    alpha = torch.rand(real_samples.size(0), 1).to(device)
    interpolates = (alpha * real_samples + (1 - alpha) * fake_samples).requires_grad_(True)
    d_interpolates = critic(interpolates, labels)
    fake = torch.ones_like(d_interpolates)
    gradients = torch.autograd.grad(
        outputs=d_interpolates, inputs=interpolates, grad_outputs=fake,
        create_graph=True, retain_graph=True, only_inputs=True
    )[0]
    gradients = gradients.view(gradients.size(0), -1)
    return ((gradients.norm(2, dim=1) - 1) ** 2).mean()

# ===== Experiment Setup =====
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
all_results = defaultdict(list)

for repeat in range(3):
    print(f"\n🔁 Repeat {repeat+1}/3")
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        print(f"\n🔄 Fold {fold+1}/2")
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_tensor = torch.tensor(y_train.values, dtype=torch.long)
        loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=128, shuffle=True)

        generator = Generator().to(device)
        critic = Critic().to(device)
        opt_G = torch.optim.Adam(generator.parameters(), lr=1e-4, betas=(0.5, 0.9))
        opt_C = torch.optim.Adam(critic.parameters(), lr=1e-4, betas=(0.5, 0.9))

        for epoch in range(100):
            for i, (real_samples, labels) in enumerate(loader):
                real_samples, labels = real_samples.to(device), labels.to(device)
                opt_C.zero_grad()
                z = torch.randn(real_samples.size(0), 32).to(device)
                fake_samples = generator(z, labels)
                real_validity = critic(real_samples, labels)
                fake_validity = critic(fake_samples.detach(), labels)
                gp = compute_gp(critic, real_samples, fake_samples, labels, device)
                c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
                c_loss.backward()
                opt_C.step()

                if i % 5 == 0:
                    opt_G.zero_grad()
                    gen_samples = generator(z, labels)
                    g_loss = -torch.mean(critic(gen_samples, labels))
                    g_loss.backward()
                    opt_G.step()

        synth_size = len(X_train) // 2
        z = torch.randn(synth_size, 32).to(device)
        synth_labels = torch.randint(0, num_classes, (synth_size,), dtype=torch.long).to(device)
        gen_data = generator(z, synth_labels).detach().cpu().numpy()
        synth_df = pd.DataFrame(gen_data, columns=X.columns)
        synth_df["class"] = synth_labels.cpu().numpy()

        print("\n📊 TSTR Accuracy:")
        models = {
            "LogReg": LogisticRegression(max_iter=300),
            "MLP": MLPClassifier(max_iter=300),
            "RF": RandomForestClassifier(),
            "XGBT": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
        }
        for name, model in models.items():
            model.fit(synth_df.drop(columns=["class"]), synth_df["class"])
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            all_results[f"{name}_acc"].append(acc)
            print(f"{name}: {acc:.4f}")

        jsd, wd = evaluate_jsd_wd(X_train, synth_df.drop(columns=["class"]))
        all_results["jsd"].append(jsd)
        all_results["wd"].append(wd)
        print(f"\n🔬 JSD: {jsd:.4f} | WD: {wd:.4f}")

# ===== Final Average Summary =====
print("\n📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:")
for name in ["LogReg", "MLP", "RF", "XGBT"]:
    avg = np.mean(all_results[f"{name}_acc"])
    print(f"{name} TSTR Accuracy: {avg:.4f}")
print(f"\nJSD: {np.mean(all_results['jsd']):.4f}")
print(f"Wasserstein Distance: {np.mean(all_results['wd']):.4f}")

# ===== Final Generator Training on Full Dataset =====
print("\n🚀 Training generator on full dataset for final synthetic data...")
X_tensor_full = torch.tensor(X.values, dtype=torch.float32)
y_tensor_full = torch.tensor(y.values, dtype=torch.long)
loader_full = DataLoader(TensorDataset(X_tensor_full, y_tensor_full), batch_size=128, shuffle=True)

generator = Generator().to(device)
critic = Critic().to(device)
opt_G = torch.optim.Adam(generator.parameters(), lr=1e-4, betas=(0.5, 0.9))
opt_C = torch.optim.Adam(critic.parameters(), lr=1e-4, betas=(0.5, 0.9))

for epoch in range(100):
    for i, (real_samples, labels) in enumerate(loader_full):
        real_samples, labels = real_samples.to(device), labels.to(device)
        opt_C.zero_grad()
        z = torch.randn(real_samples.size(0), 32).to(device)
        fake_samples = generator(z, labels)
        real_validity = critic(real_samples, labels)
        fake_validity = critic(fake_samples.detach(), labels)
        gp = compute_gp(critic, real_samples, fake_samples, labels, device)
        c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
        c_loss.backward()
        opt_C.step()

        if i % 5 == 0:
            opt_G.zero_grad()
            gen_samples = generator(z, labels)
            g_loss = -torch.mean(critic(gen_samples, labels))
            g_loss.backward()
            opt_G.step()

# ===== Generate and Save Final Synthetic Dataset =====
print("\n💾 Generating final synthetic dataset (50% size of original)")
synth_size = len(X) // 2
z = torch.randn(synth_size, 32).to(device)
synth_labels = torch.randint(0, num_classes, (synth_size,), dtype=torch.long).to(device)
gen_data = generator(z, synth_labels).detach().cpu().numpy()
synth_df_final = pd.DataFrame(gen_data, columns=X.columns)
synth_df_final["class"] = synth_labels.cpu().numpy()

# ===== Postprocess Categorical + Clamp Continuous Columns =====
categorical_columns = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country"
]

for col in categorical_columns:
    synth_df_final[col] = synth_df_final[col].round().astype(int)

# ✅ ADD THESE RIGHT HERE
for col in categorical_columns:
    synth_df_final[col] = synth_df_final[col].clip(0, df[col].max())
    
# Clamp continuous columns
for col in ["fnlwgt", "capital-gain", "capital-loss", "hours-per-week"]:
    min_val = df[col].min()
    max_val = df[col].max()
    synth_df_final[col] = synth_df_final[col].clip(min_val, max_val)


    synth_df_final.to_csv("synthetic_dataset_half.csv", index=False)
print("✅ Saved synthetic dataset as 'synthetic_dataset_half.csv'")


🔁 Repeat 1/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.7636
MLP: 0.7734
RF: 0.7829
XGBT: 0.7581

🔬 JSD: 0.4944 | WD: 1.1796

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.6364
MLP: 0.6224
RF: 0.7422
XGBT: 0.6911

🔬 JSD: 0.4998 | WD: 1.1183

🔁 Repeat 2/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.6774
MLP: 0.6521
RF: 0.7806
XGBT: 0.7205

🔬 JSD: 0.4994 | WD: 1.3236

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.7582
MLP: 0.7705
RF: 0.7609
XGBT: 0.7627

🔬 JSD: 0.5011 | WD: 1.4967

🔁 Repeat 3/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.7392
MLP: 0.6616
RF: 0.7188
XGBT: 0.6166

🔬 JSD: 0.4988 | WD: 1.1839

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.6254
MLP: 0.6511
RF: 0.7006
XGBT: 0.6392

🔬 JSD: 0.4990 | WD: 1.1816

📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:
LogReg TSTR Accuracy: 0.7000
MLP TSTR Accuracy: 0.6885
RF TSTR Accuracy: 0.7477
XGBT TSTR Accuracy: 0.6980

JSD: 0.4987
Wasserstein Distance: 1.2473

🚀 Training generator on full dataset for final synthetic data...

💾 Generating final synthetic dataset (50% size o

In [12]:
# ===== Postprocess Categorical + Clamp Continuous Columns =====
categorical_columns = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country"
]

for col in categorical_columns:
    synth_df_final[col] = synth_df_final[col].round().astype(int)

# ✅ ADD THESE RIGHT HERE
for col in categorical_columns:
    synth_df_final[col] = synth_df_final[col].clip(0, df[col].max())
    
# Clamp continuous columns
for col in ["fnlwgt", "capital-gain", "capital-loss", "hours-per-week"]:
    min_val = df[col].min()
    max_val = df[col].max()
    synth_df_final[col] = synth_df_final[col].clip(min_val, max_val)


    synth_df_final.to_csv("synthetic_dataset_half.csv", index=False)
print("✅ Saved synthetic dataset as 'synthetic_dataset_half.csv'")

✅ Saved synthetic dataset as 'synthetic_dataset_half.csv'


In [19]:
# ===== Define Critic for CAR Dataset =====
class CarCritic(nn.Module):
    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes_car, num_classes_car)
        self.model = nn.Sequential(
            nn.Linear(input_dim_car + num_classes_car, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.2),
            nn.Linear(64, 1)
        )
    def forward(self, x, labels):
        c = self.label_emb(labels)
        d_in = torch.cat((x, c), dim=1)
        return self.model(d_in)

# ===== Reusable Metrics =====
def compute_jsd(p, q):
    p, q = np.array(p) + 1e-10, np.array(q) + 1e-10
    p, q = p / p.sum(), q / q.sum()
    m = 0.5 * (p + q)
    return 0.5 * (entropy(p, m) + entropy(q, m))

def evaluate_jsd_wd(real_df, synth_df):
    jsd_scores, wd_scores = [], []
    for col in real_df.columns:
        real, synth = real_df[col].values, synth_df[col].values
        jsd = compute_jsd(np.histogram(real, bins=20)[0], np.histogram(synth, bins=20)[0])
        wd = wasserstein_distance(real, synth)
        jsd_scores.append(jsd)
        wd_scores.append(wd)
    return np.mean(jsd_scores), np.mean(wd_scores)

# ===== 3x2 CV + Evaluation =====
skf_car = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
all_results_car = defaultdict(list)

for repeat in range(3):
    print(f"\n🔁 Repeat {repeat+1}/3")
    for fold, (train_idx, test_idx) in enumerate(skf_car.split(X_car, y_car)):
        print(f"\n🔄 Fold {fold+1}/2")

        X_train, X_test = X_car.iloc[train_idx], X_car.iloc[test_idx]
        y_train, y_test = y_car.iloc[train_idx], y_car.iloc[test_idx]

        X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_tensor = torch.tensor(y_train.values, dtype=torch.long)
        loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=128, shuffle=True)

        generator_car = CarGenerator().to(device)
        critic_car = CarCritic().to(device)
        opt_G = torch.optim.Adam(generator_car.parameters(), lr=1e-4, betas=(0.5, 0.9))
        opt_C = torch.optim.Adam(critic_car.parameters(), lr=1e-4, betas=(0.5, 0.9))

        for epoch in range(100):
            for i, (real_samples, labels) in enumerate(loader):
                real_samples, labels = real_samples.to(device), labels.to(device)
                opt_C.zero_grad()
                z = torch.randn(real_samples.size(0), 32).to(device)
                fake_samples = generator_car(z, labels)
                real_validity = critic_car(real_samples, labels)
                fake_validity = critic_car(fake_samples.detach(), labels)
                gp = compute_gp(critic_car, real_samples, fake_samples, labels, device)
                c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
                c_loss.backward()
                opt_C.step()

                if i % 5 == 0:
                    opt_G.zero_grad()
                    gen_samples = generator_car(z, labels)
                    g_loss = -torch.mean(critic_car(gen_samples, labels))
                    g_loss.backward()
                    opt_G.step()

        # ===== Generate Synthetic Data =====
        synth_size = len(X_train) // 2
        z = torch.randn(synth_size, 32).to(device)
        synth_labels = torch.randint(0, num_classes_car, (synth_size,), dtype=torch.long).to(device)
        gen_data = generator_car(z, synth_labels).detach().cpu().numpy()

        synth_df = pd.DataFrame(gen_data, columns=X_car.columns)
        synth_df["class"] = synth_labels.cpu().numpy()

        # Postprocess
        for col in synth_df.columns:
            synth_df[col] = synth_df[col].round().astype(int)
            synth_df[col] = synth_df[col].clip(0, car_df[col].max())

        # ===== TSTR Evaluation =====
        print("\n📊 TSTR Accuracy:")
        models = {
            "LogReg": LogisticRegression(max_iter=300),
            "MLP": MLPClassifier(max_iter=300),
            "RF": RandomForestClassifier(),
            "XGBT": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
        }

        for name, model in models.items():
            model.fit(synth_df.drop(columns=["class"]), synth_df["class"])
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            all_results_car[f"{name}_acc"].append(acc)
            print(f"{name}: {acc:.4f}")

        # ===== JSD + WD
        jsd, wd = evaluate_jsd_wd(X_train, synth_df.drop(columns=["class"]))
        all_results_car["jsd"].append(jsd)
        all_results_car["wd"].append(wd)
        print(f"\n🔬 JSD: {jsd:.4f} | WD: {wd:.4f}")

# ===== Final Summary =====
print("\n📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:")
for name in ["LogReg", "MLP", "RF", "XGBT"]:
    avg = np.mean(all_results_car[f"{name}_acc"])
    print(f"{name} TSTR Accuracy: {avg:.4f}")
print(f"\nJSD: {np.mean(all_results_car['jsd']):.4f}")
print(f"Wasserstein Distance: {np.mean(all_results_car['wd']):.4f}")

# ===== Final Training + Save CSV =====
print("\n🚀 Training generator on full dataset for final synthetic data...")
X_tensor_full_car = torch.tensor(X_car.values, dtype=torch.float32)
y_tensor_full_car = torch.tensor(y_car.values, dtype=torch.long)
loader_full_car = DataLoader(TensorDataset(X_tensor_full_car, y_tensor_full_car), batch_size=128, shuffle=True)

generator_car = CarGenerator().to(device)
critic_car = CarCritic().to(device)
opt_G = torch.optim.Adam(generator_car.parameters(), lr=1e-4, betas=(0.5, 0.9))
opt_C = torch.optim.Adam(critic_car.parameters(), lr=1e-4, betas=(0.5, 0.9))

for epoch in range(100):
    for i, (real_samples, labels) in enumerate(loader_full_car):
        real_samples, labels = real_samples.to(device), labels.to(device)
        opt_C.zero_grad()
        z = torch.randn(real_samples.size(0), 32).to(device)
        fake_samples = generator_car(z, labels)
        real_validity = critic_car(real_samples, labels)
        fake_validity = critic_car(fake_samples.detach(), labels)
        gp = compute_gp(critic_car, real_samples, fake_samples, labels, device)
        c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
        c_loss.backward()
        opt_C.step()

        if i % 5 == 0:
            opt_G.zero_grad()
            gen_samples = generator_car(z, labels)
            g_loss = -torch.mean(critic_car(gen_samples, labels))
            g_loss.backward()
            opt_G.step()

# ===== Generate Final Synthetic Data =====
print("\n💾 Generating final synthetic dataset (50% size of original)")
synth_size_final = len(X_car) // 2
z = torch.randn(synth_size_final, 32).to(device)
synth_labels_final = torch.randint(0, num_classes_car, (synth_size_final,), dtype=torch.long).to(device)
gen_data_final = generator_car(z, synth_labels_final).detach().cpu().numpy()

synth_df_final = pd.DataFrame(gen_data_final, columns=X_car.columns)
synth_df_final["class"] = synth_labels_final.cpu().numpy()

# Postprocess
for col in synth_df_final.columns:
    synth_df_final[col] = synth_df_final[col].round().astype(int)
    synth_df_final[col] = synth_df_final[col].clip(0, car_df[col].max())

synth_df_final.to_csv("synthetic_car_dataset_half.csv", index=False)
print("✅ Saved synthetic car dataset as 'synthetic_car_dataset_half.csv'")


🔁 Repeat 1/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.3634
MLP: 0.2928
RF: 0.4271
XGBT: 0.3299

🔬 JSD: 0.3243 | WD: 0.7552

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.3125
MLP: 0.2535
RF: 0.2836
XGBT: 0.2627

🔬 JSD: 0.3755 | WD: 0.8764

🔁 Repeat 2/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.1412
MLP: 0.1308
RF: 0.0949
XGBT: 0.1597

🔬 JSD: 0.2786 | WD: 0.7483

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.3252
MLP: 0.3252
RF: 0.2523
XGBT: 0.3438

🔬 JSD: 0.3940 | WD: 0.7359

🔁 Repeat 3/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.3958
MLP: 0.3854
RF: 0.3553
XGBT: 0.3345

🔬 JSD: 0.2860 | WD: 0.6831

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.3218
MLP: 0.1968
RF: 0.2558
XGBT: 0.2431

🔬 JSD: 0.2457 | WD: 0.7884

📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:
LogReg TSTR Accuracy: 0.3100
MLP TSTR Accuracy: 0.2641
RF TSTR Accuracy: 0.2782
XGBT TSTR Accuracy: 0.2789

JSD: 0.3173
Wasserstein Distance: 0.7645

🚀 Training generator on full dataset for final synthetic data...

💾 Generating final synthetic dataset (50% size o

In [24]:
import pandas as pd
from scipy.io import arff
from sklearn.preprocessing import LabelEncoder

# ===== Load and Clean CREDIT Dataset =====
credit_data, credit_meta = arff.loadarff("credit-a.arff")
credit_df = pd.DataFrame(credit_data)

# Decode byte strings to normal strings
for col in credit_df.select_dtypes([object]).columns:
    credit_df[col] = credit_df[col].str.decode("utf-8").str.replace(r"[\\'\"]", "", regex=True)

# Encode all categorical columns
encoders_credit = {}
for col in credit_df.columns:
    le = LabelEncoder()
    credit_df[col] = le.fit_transform(credit_df[col])
    encoders_credit[col] = le

# Define input features and target
X_credit = credit_df.drop(columns=["class"])
y_credit = credit_df["class"]

# Dimensions for GAN
input_dim_credit = X_credit.shape[1]
num_classes_credit = y_credit.nunique()

print("✅ CREDIT dataset ready — encoded and split.")

✅ CREDIT dataset ready — encoded and split.


In [25]:
# ===== Define Critic for CREDIT Dataset =====
class CreditCritic(nn.Module):
    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes_credit, num_classes_credit)
        self.model = nn.Sequential(
            nn.Linear(input_dim_credit + num_classes_credit, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.2),
            nn.Linear(64, 1)
        )
    def forward(self, x, labels):
        c = self.label_emb(labels)
        d_in = torch.cat((x, c), dim=1)
        return self.model(d_in)

# ===== Reusable Metrics =====
def compute_jsd(p, q):
    p, q = np.array(p) + 1e-10, np.array(q) + 1e-10
    p, q = p / p.sum(), q / q.sum()
    m = 0.5 * (p + q)
    return 0.5 * (entropy(p, m) + entropy(q, m))

def evaluate_jsd_wd(real_df, synth_df):
    jsd_scores, wd_scores = [], []
    for col in real_df.columns:
        real, synth = real_df[col].values, synth_df[col].values
        jsd = compute_jsd(np.histogram(real, bins=20)[0], np.histogram(synth, bins=20)[0])
        wd = wasserstein_distance(real, synth)
        jsd_scores.append(jsd)
        wd_scores.append(wd)
    return np.mean(jsd_scores), np.mean(wd_scores)

# ===== 3x2 CV + Evaluation =====
skf_credit = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
all_results_credit = defaultdict(list)

for repeat in range(3):
    print(f"\n🔁 Repeat {repeat+1}/3")
    for fold, (train_idx, test_idx) in enumerate(skf_credit.split(X_credit, y_credit)):
        print(f"\n🔄 Fold {fold+1}/2")

        X_train, X_test = X_credit.iloc[train_idx], X_credit.iloc[test_idx]
        y_train, y_test = y_credit.iloc[train_idx], y_credit.iloc[test_idx]

        X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_tensor = torch.tensor(y_train.values, dtype=torch.long)
        loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=128, shuffle=True)

        generator_credit = CreditGenerator().to(device)
        critic_credit = CreditCritic().to(device)
        opt_G = torch.optim.Adam(generator_credit.parameters(), lr=1e-4, betas=(0.5, 0.9))
        opt_C = torch.optim.Adam(critic_credit.parameters(), lr=1e-4, betas=(0.5, 0.9))

        for epoch in range(100):
            for i, (real_samples, labels) in enumerate(loader):
                real_samples, labels = real_samples.to(device), labels.to(device)
                opt_C.zero_grad()
                z = torch.randn(real_samples.size(0), 32).to(device)
                fake_samples = generator_credit(z, labels)
                real_validity = critic_credit(real_samples, labels)
                fake_validity = critic_credit(fake_samples.detach(), labels)
                gp = compute_gp(critic_credit, real_samples, fake_samples, labels, device)
                c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
                c_loss.backward()
                opt_C.step()

                if i % 5 == 0:
                    opt_G.zero_grad()
                    gen_samples = generator_credit(z, labels)
                    g_loss = -torch.mean(critic_credit(gen_samples, labels))
                    g_loss.backward()
                    opt_G.step()

        # ===== Generate Synthetic Data =====
        synth_size = len(X_train) // 2
        z = torch.randn(synth_size, 32).to(device)
        synth_labels = torch.randint(0, num_classes_credit, (synth_size,), dtype=torch.long).to(device)
        gen_data = generator_credit(z, synth_labels).detach().cpu().numpy()

        synth_df = pd.DataFrame(gen_data, columns=X_credit.columns)
        synth_df["class"] = synth_labels.cpu().numpy()

        # Postprocess
        for col in synth_df.columns:
            synth_df[col] = synth_df[col].round().astype(int)
            synth_df[col] = synth_df[col].clip(0, credit_df[col].max())

        # ===== TSTR Evaluation =====
        print("\n📊 TSTR Accuracy:")
        models = {
            "LogReg": LogisticRegression(max_iter=300),
            "MLP": MLPClassifier(max_iter=300),
            "RF": RandomForestClassifier(),
            "XGBT": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
        }

        for name, model in models.items():
            model.fit(synth_df.drop(columns=["class"]), synth_df["class"])
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            all_results_credit[f"{name}_acc"].append(acc)
            print(f"{name}: {acc:.4f}")

        # ===== JSD + WD
        jsd, wd = evaluate_jsd_wd(X_train, synth_df.drop(columns=["class"]))
        all_results_credit["jsd"].append(jsd)
        all_results_credit["wd"].append(wd)
        print(f"\n🔬 JSD: {jsd:.4f} | WD: {wd:.4f}")

# ===== Final Summary =====
print("\n📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:")
for name in ["LogReg", "MLP", "RF", "XGBT"]:
    avg = np.mean(all_results_credit[f"{name}_acc"])
    print(f"{name} TSTR Accuracy: {avg:.4f}")
print(f"\nJSD: {np.mean(all_results_credit['jsd']):.4f}")
print(f"Wasserstein Distance: {np.mean(all_results_credit['wd']):.4f}")

# ===== Final Training + Save CSV =====
print("\n🚀 Training generator on full dataset for final synthetic data...")
X_tensor_full_credit = torch.tensor(X_credit.values, dtype=torch.float32)
y_tensor_full_credit = torch.tensor(y_credit.values, dtype=torch.long)
loader_full_credit = DataLoader(TensorDataset(X_tensor_full_credit, y_tensor_full_credit), batch_size=128, shuffle=True)

generator_credit = CreditGenerator().to(device)
critic_credit = CreditCritic().to(device)
opt_G = torch.optim.Adam(generator_credit.parameters(), lr=1e-4, betas=(0.5, 0.9))
opt_C = torch.optim.Adam(critic_credit.parameters(), lr=1e-4, betas=(0.5, 0.9))

for epoch in range(100):
    for i, (real_samples, labels) in enumerate(loader_full_credit):
        real_samples, labels = real_samples.to(device), labels.to(device)
        opt_C.zero_grad()
        z = torch.randn(real_samples.size(0), 32).to(device)
        fake_samples = generator_credit(z, labels)
        real_validity = critic_credit(real_samples, labels)
        fake_validity = critic_credit(fake_samples.detach(), labels)
        gp = compute_gp(critic_credit, real_samples, fake_samples, labels, device)
        c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
        c_loss.backward()
        opt_C.step()

        if i % 5 == 0:
            opt_G.zero_grad()
            gen_samples = generator_credit(z, labels)
            g_loss = -torch.mean(critic_credit(gen_samples, labels))
            g_loss.backward()
            opt_G.step()

# ===== Generate Final Synthetic Data =====
print("\n💾 Generating final synthetic dataset (50% size of original)")
synth_size_final = len(X_credit) // 2
z = torch.randn(synth_size_final, 32).to(device)
synth_labels_final = torch.randint(0, num_classes_credit, (synth_size_final,), dtype=torch.long).to(device)
gen_data_final = generator_credit(z, synth_labels_final).detach().cpu().numpy()

synth_df_final = pd.DataFrame(gen_data_final, columns=X_credit.columns)
synth_df_final["class"] = synth_labels_final.cpu().numpy()

# Postprocess
for col in synth_df_final.columns:
    synth_df_final[col] = synth_df_final[col].round().astype(int)
    synth_df_final[col] = synth_df_final[col].clip(0, credit_df[col].max())

synth_df_final.to_csv("synthetic_credit_dataset_half.csv", index=False)
print("✅ Saved synthetic credit dataset as 'synthetic_credit_dataset_half.csv'")


🔁 Repeat 1/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.5507
MLP: 0.5507
RF: 0.5101
XGBT: 0.4000

🔬 JSD: 0.4958 | WD: 1.2855

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.4725
MLP: 0.4812
RF: 0.5217
XGBT: 0.5188

🔬 JSD: 0.4970 | WD: 1.2699

🔁 Repeat 2/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.4464
MLP: 0.4464
RF: 0.5043
XGBT: 0.5275

🔬 JSD: 0.3654 | WD: 1.2408

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.5565
MLP: 0.5536
RF: 0.5565
XGBT: 0.5565

🔬 JSD: 0.4796 | WD: 1.2458

🔁 Repeat 3/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.5565
MLP: 0.5623
RF: 0.5594
XGBT: 0.2899

🔬 JSD: 0.3906 | WD: 1.2084

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.4435
MLP: 0.4435
RF: 0.4406
XGBT: 0.4261

🔬 JSD: 0.3817 | WD: 1.2346

📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:
LogReg TSTR Accuracy: 0.5043
MLP TSTR Accuracy: 0.5063
RF TSTR Accuracy: 0.5155
XGBT TSTR Accuracy: 0.4531

JSD: 0.4350
Wasserstein Distance: 1.2475

🚀 Training generator on full dataset for final synthetic data...

💾 Generating final synthetic dataset (50% size o

In [26]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import entropy, wasserstein_distance
from scipy.io import arff
from collections import defaultdict

# ===== Load and Encode Letter Dataset =====
data, meta = arff.loadarff("letter-recog.arff")
letter_df = pd.DataFrame(data)
for col in letter_df.select_dtypes([object]).columns:
    letter_df[col] = letter_df[col].str.decode("utf-8")
encoders = {}
for col in letter_df.columns:
    le = LabelEncoder()
    letter_df[col] = le.fit_transform(letter_df[col])
    encoders[col] = le

X_letter = letter_df.drop(columns=["class"])
y_letter = letter_df["class"]
input_dim_letter = X_letter.shape[1]
num_classes_letter = y_letter.nunique()
device = torch.device("cpu")  # Force CPU use

# ===== Define Generator and Critic =====
class LetterGenerator(nn.Module):
    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes_letter, num_classes_letter)
        self.model = nn.Sequential(
            nn.Linear(32 + num_classes_letter, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim_letter)
        )
    def forward(self, z, labels):
        c = self.label_emb(labels)
        x = torch.cat((z, c), dim=1)
        return self.model(x)

class LetterCritic(nn.Module):
    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes_letter, num_classes_letter)
        self.model = nn.Sequential(
            nn.Linear(input_dim_letter + num_classes_letter, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.2),
            nn.Linear(64, 1)
        )
    def forward(self, x, labels):
        c = self.label_emb(labels)
        d_in = torch.cat((x, c), dim=1)
        return self.model(d_in)

def compute_gp(critic, real_samples, fake_samples, labels, device):
    alpha = torch.rand(real_samples.size(0), 1).to(device)
    interpolates = (alpha * real_samples + (1 - alpha) * fake_samples).requires_grad_(True)
    d_interpolates = critic(interpolates, labels)
    fake = torch.ones_like(d_interpolates)
    gradients = torch.autograd.grad(outputs=d_interpolates, inputs=interpolates,
                                    grad_outputs=fake, create_graph=True,
                                    retain_graph=True, only_inputs=True)[0]
    gradients = gradients.view(gradients.size(0), -1)
    return ((gradients.norm(2, dim=1) - 1) ** 2).mean()

def compute_jsd(p, q):
    p, q = np.array(p) + 1e-10, np.array(q) + 1e-10
    p, q = p / p.sum(), q / p.sum()
    m = 0.5 * (p + q)
    return 0.5 * (entropy(p, m) + entropy(q, m))

def evaluate_jsd_wd(real_df, synth_df):
    jsd_scores, wd_scores = [], []
    for col in real_df.columns:
        real, synth = real_df[col].values, synth_df[col].values
        jsd = compute_jsd(np.histogram(real, bins=20)[0], np.histogram(synth, bins=20)[0])
        wd = wasserstein_distance(real, synth)
        jsd_scores.append(jsd)
        wd_scores.append(wd)
    return np.mean(jsd_scores), np.mean(wd_scores)

# ===== 3x2 CV + Evaluation =====
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
all_results_letter = defaultdict(list)

for repeat in range(3):
    print(f"\n🔁 Repeat {repeat+1}/3")
    for fold, (train_idx, test_idx) in enumerate(skf.split(X_letter, y_letter)):
        print(f"\n🔄 Fold {fold+1}/2")
        X_train, X_test = X_letter.iloc[train_idx], X_letter.iloc[test_idx]
        y_train, y_test = y_letter.iloc[train_idx], y_letter.iloc[test_idx]

        X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_tensor = torch.tensor(y_train.values, dtype=torch.long)
        loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=128, shuffle=True)

        generator = LetterGenerator().to(device)
        critic = LetterCritic().to(device)
        opt_G = torch.optim.Adam(generator.parameters(), lr=1e-4, betas=(0.5, 0.9))
        opt_C = torch.optim.Adam(critic.parameters(), lr=1e-4, betas=(0.5, 0.9))

        for epoch in range(100):
            for i, (real_samples, labels) in enumerate(loader):
                real_samples, labels = real_samples.to(device), labels.to(device)
                opt_C.zero_grad()
                z = torch.randn(real_samples.size(0), 32).to(device)
                fake_samples = generator(z, labels)
                real_validity = critic(real_samples, labels)
                fake_validity = critic(fake_samples.detach(), labels)
                gp = compute_gp(critic, real_samples, fake_samples, labels, device)
                c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
                c_loss.backward()
                opt_C.step()
                if i % 5 == 0:
                    opt_G.zero_grad()
                    gen_samples = generator(z, labels)
                    g_loss = -torch.mean(critic(gen_samples, labels))
                    g_loss.backward()
                    opt_G.step()

        # Generate and evaluate synthetic data
        synth_size = len(X_train) // 2
        z = torch.randn(synth_size, 32).to(device)
        synth_labels = torch.randint(0, num_classes_letter, (synth_size,), dtype=torch.long).to(device)
        gen_data = generator(z, synth_labels).detach().cpu().numpy()
        synth_df = pd.DataFrame(gen_data, columns=X_letter.columns)
        synth_df["class"] = synth_labels.cpu().numpy()
        for col in synth_df.columns:
            synth_df[col] = synth_df[col].round().astype(int)
            synth_df[col] = synth_df[col].clip(0, letter_df[col].max())

        print("\n📊 TSTR Accuracy:")
        models = {
            "LogReg": LogisticRegression(max_iter=300),
            "MLP": MLPClassifier(max_iter=300),
            "RF": RandomForestClassifier(),
            "XGBT": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
        }
        for name, model in models.items():
            model.fit(synth_df.drop(columns=["class"]), synth_df["class"])
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            all_results_letter[f"{name}_acc"].append(acc)
            print(f"{name}: {acc:.4f}")

        jsd, wd = evaluate_jsd_wd(X_train, synth_df.drop(columns=["class"]))
        all_results_letter["jsd"].append(jsd)
        all_results_letter["wd"].append(wd)
        print(f"\n🔬 JSD: {jsd:.4f} | WD: {wd:.4f}")

# ===== Final Report + Save Dataset =====
print("\n📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:")
for name in ["LogReg", "MLP", "RF", "XGBT"]:
    print(f"{name} TSTR Accuracy: {np.mean(all_results_letter[f'{name}_acc']):.4f}")
print(f"\nJSD: {np.mean(all_results_letter['jsd']):.4f}")
print(f"Wasserstein Distance: {np.mean(all_results_letter['wd']):.4f}")

print("\n🚀 Training on full dataset for final synthetic sample...")
X_tensor_full = torch.tensor(X_letter.values, dtype=torch.float32)
y_tensor_full = torch.tensor(y_letter.values, dtype=torch.long)
loader_full = DataLoader(TensorDataset(X_tensor_full, y_tensor_full), batch_size=128, shuffle=True)

generator = LetterGenerator().to(device)
critic = LetterCritic().to(device)
opt_G = torch.optim.Adam(generator.parameters(), lr=1e-4, betas=(0.5, 0.9))
opt_C = torch.optim.Adam(critic.parameters(), lr=1e-4, betas=(0.5, 0.9))

for epoch in range(100):
    for i, (real_samples, labels) in enumerate(loader_full):
        real_samples, labels = real_samples.to(device), labels.to(device)
        opt_C.zero_grad()
        z = torch.randn(real_samples.size(0), 32).to(device)
        fake_samples = generator(z, labels)
        real_validity = critic(real_samples, labels)
        fake_validity = critic(fake_samples.detach(), labels)
        gp = compute_gp(critic, real_samples, fake_samples, labels, device)
        c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
        c_loss.backward()
        opt_C.step()
        if i % 5 == 0:
            opt_G.zero_grad()
            gen_samples = generator(z, labels)
            g_loss = -torch.mean(critic(gen_samples, labels))
            g_loss.backward()
            opt_G.step()

print("\n💾 Generating and saving final dataset")
synth_size_final = len(X_letter) // 2
z = torch.randn(synth_size_final, 32).to(device)
synth_labels_final = torch.randint(0, num_classes_letter, (synth_size_final,), dtype=torch.long).to(device)
gen_data_final = generator(z, synth_labels_final).detach().cpu().numpy()

synth_df_final = pd.DataFrame(gen_data_final, columns=X_letter.columns)
synth_df_final["class"] = synth_labels_final.cpu().numpy()
for col in synth_df_final.columns:
    synth_df_final[col] = synth_df_final[col].round().astype(int)
    synth_df_final[col] = synth_df_final[col].clip(0, letter_df[col].max())

synth_df_final.to_csv("synthetic_letter_dataset_half.csv", index=False)
print("✅ Saved synthetic letter dataset as 'synthetic_letter_dataset_half.csv'")


🔁 Repeat 1/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.1554
MLP: 0.1574
RF: 0.1627
XGBT: 0.1326

🔬 JSD: 0.1773 | WD: 0.5672

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.1843
MLP: 0.1705
RF: 0.1959
XGBT: 0.1752

🔬 JSD: 0.1673 | WD: 0.5455

🔁 Repeat 2/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.2058
MLP: 0.2175
RF: 0.2149
XGBT: 0.2229

🔬 JSD: 0.1510 | WD: 0.5500

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.2184
MLP: 0.2162
RF: 0.1912
XGBT: 0.1965

🔬 JSD: 0.2151 | WD: 0.5812

🔁 Repeat 3/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.2047
MLP: 0.2273
RF: 0.1953
XGBT: 0.2087

🔬 JSD: 0.2071 | WD: 0.5677

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.2000
MLP: 0.1995
RF: 0.2173
XGBT: 0.2036

🔬 JSD: 0.1713 | WD: 0.5397

📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:
LogReg TSTR Accuracy: 0.1948
MLP TSTR Accuracy: 0.1981
RF TSTR Accuracy: 0.1962
XGBT TSTR Accuracy: 0.1899

JSD: 0.1815
Wasserstein Distance: 0.5586

🚀 Training on full dataset for final synthetic sample...

💾 Generating and saving final dataset
✅ Saved synthetic 

In [27]:
# ===== Encode and Prepare CHESS Dataset =====
from sklearn.preprocessing import LabelEncoder

chess_df = df.copy()
encoders = {}
for col in chess_df.columns:
    le = LabelEncoder()
    chess_df[col] = le.fit_transform(chess_df[col])
    encoders[col] = le

X_chess = chess_df.drop(columns=["class"])
y_chess = chess_df["class"]
input_dim_chess = X_chess.shape[1]
num_classes_chess = y_chess.nunique()

# ===== Generator for CHESS =====
class ChessGenerator(nn.Module):
    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes_chess, num_classes_chess)
        self.model = nn.Sequential(
            nn.Linear(32 + num_classes_chess, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim_chess)
        )
    def forward(self, z, labels):
        c = self.label_emb(labels)
        x = torch.cat((z, c), dim=1)
        return self.model(x)

# ===== Critic for CHESS =====
class ChessCritic(nn.Module):
    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes_chess, num_classes_chess)
        self.model = nn.Sequential(
            nn.Linear(input_dim_chess + num_classes_chess, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.2),
            nn.Linear(64, 1)
        )
    def forward(self, x, labels):
        c = self.label_emb(labels)
        d_in = torch.cat((x, c), dim=1)
        return self.model(d_in)

# ===== 3x2 Stratified Evaluation =====
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
all_results_chess = defaultdict(list)

for repeat in range(3):
    print(f"\n🔁 Repeat {repeat+1}/3")
    for fold, (train_idx, test_idx) in enumerate(skf.split(X_chess, y_chess)):
        print(f"\n🔄 Fold {fold+1}/2")
        X_train, X_test = X_chess.iloc[train_idx], X_chess.iloc[test_idx]
        y_train, y_test = y_chess.iloc[train_idx], y_chess.iloc[test_idx]

        X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_tensor = torch.tensor(y_train.values, dtype=torch.long)
        loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=128, shuffle=True)

        generator = ChessGenerator().to(device)
        critic = ChessCritic().to(device)
        opt_G = torch.optim.Adam(generator.parameters(), lr=1e-4, betas=(0.5, 0.9))
        opt_C = torch.optim.Adam(critic.parameters(), lr=1e-4, betas=(0.5, 0.9))

        for epoch in range(100):
            for i, (real_samples, labels) in enumerate(loader):
                real_samples, labels = real_samples.to(device), labels.to(device)
                opt_C.zero_grad()
                z = torch.randn(real_samples.size(0), 32).to(device)
                fake_samples = generator(z, labels)
                real_validity = critic(real_samples, labels)
                fake_validity = critic(fake_samples.detach(), labels)
                gp = compute_gp(critic, real_samples, fake_samples, labels, device)
                c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
                c_loss.backward()
                opt_C.step()

                if i % 5 == 0:
                    opt_G.zero_grad()
                    gen_samples = generator(z, labels)
                    g_loss = -torch.mean(critic(gen_samples, labels))
                    g_loss.backward()
                    opt_G.step()

        # ===== Generate Synthetic Data =====
        synth_size = len(X_train) // 2
        z = torch.randn(synth_size, 32).to(device)
        synth_labels = torch.randint(0, num_classes_chess, (synth_size,), dtype=torch.long).to(device)
        gen_data = generator(z, synth_labels).detach().cpu().numpy()

        synth_df = pd.DataFrame(gen_data, columns=X_chess.columns)
        synth_df["class"] = synth_labels.cpu().numpy()

        for col in synth_df.columns:
            synth_df[col] = synth_df[col].round().astype(int)
            synth_df[col] = synth_df[col].clip(0, chess_df[col].max())

        # ===== TSTR Evaluation =====
        print("\n📊 TSTR Accuracy:")
        models = {
            "LogReg": LogisticRegression(max_iter=300),
            "MLP": MLPClassifier(max_iter=300),
            "RF": RandomForestClassifier(),
            "XGBT": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
        }
        for name, model in models.items():
            model.fit(synth_df.drop(columns=["class"]), synth_df["class"])
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            all_results_chess[f"{name}_acc"].append(acc)
            print(f"{name}: {acc:.4f}")

        jsd, wd = evaluate_jsd_wd(X_train, synth_df.drop(columns=["class"]))
        all_results_chess["jsd"].append(jsd)
        all_results_chess["wd"].append(wd)
        print(f"\n🔬 JSD: {jsd:.4f} | WD: {wd:.4f}")

# ===== Final Evaluation =====
print("\n📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:")
for name in ["LogReg", "MLP", "RF", "XGBT"]:
    print(f"{name} TSTR Accuracy: {np.mean(all_results_chess[f'{name}_acc']):.4f}")
print(f"\nJSD: {np.mean(all_results_chess['jsd']):.4f}")
print(f"Wasserstein Distance: {np.mean(all_results_chess['wd']):.4f}")

# ===== Final Generator Training =====
print("\n🚀 Training generator on full CHESS dataset...")
X_tensor_full = torch.tensor(X_chess.values, dtype=torch.float32)
y_tensor_full = torch.tensor(y_chess.values, dtype=torch.long)
loader_full = DataLoader(TensorDataset(X_tensor_full, y_tensor_full), batch_size=128, shuffle=True)

generator = ChessGenerator().to(device)
critic = ChessCritic().to(device)
opt_G = torch.optim.Adam(generator.parameters(), lr=1e-4, betas=(0.5, 0.9))
opt_C = torch.optim.Adam(critic.parameters(), lr=1e-4, betas=(0.5, 0.9))

for epoch in range(100):
    for i, (real_samples, labels) in enumerate(loader_full):
        real_samples, labels = real_samples.to(device), labels.to(device)
        opt_C.zero_grad()
        z = torch.randn(real_samples.size(0), 32).to(device)
        fake_samples = generator(z, labels)
        real_validity = critic(real_samples, labels)
        fake_validity = critic(fake_samples.detach(), labels)
        gp = compute_gp(critic, real_samples, fake_samples, labels, device)
        c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
        c_loss.backward()
        opt_C.step()

        if i % 5 == 0:
            opt_G.zero_grad()
            gen_samples = generator(z, labels)
            g_loss = -torch.mean(critic(gen_samples, labels))
            g_loss.backward()
            opt_G.step()

# ===== Save Final CHESS Dataset =====
print("\n💾 Generating final synthetic dataset (50% size of original)")
synth_size_final = len(X_chess) // 2
z = torch.randn(synth_size_final, 32).to(device)
synth_labels_final = torch.randint(0, num_classes_chess, (synth_size_final,), dtype=torch.long).to(device)
gen_data_final = generator(z, synth_labels_final).detach().cpu().numpy()

synth_df_final = pd.DataFrame(gen_data_final, columns=X_chess.columns)
synth_df_final["class"] = synth_labels_final.cpu().numpy()

for col in synth_df_final.columns:
    synth_df_final[col] = synth_df_final[col].round().astype(int)
    synth_df_final[col] = synth_df_final[col].clip(0, chess_df[col].max())

synth_df_final.to_csv("synthetic_chess_dataset_half.csv", index=False)
print("✅ Saved synthetic chess dataset as 'synthetic_chess_dataset_half.csv'")


🔁 Repeat 1/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.7652
MLP: 0.7765
RF: 0.7774
XGBT: 0.7643

🔬 JSD: 0.1400 | WD: 0.9349

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.7182
MLP: 0.7059
RF: 0.7252
XGBT: 0.6474

🔬 JSD: 0.1436 | WD: 1.0236

🔁 Repeat 2/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.6544
MLP: 0.5350
RF: 0.6144
XGBT: 0.5811

🔬 JSD: 0.1339 | WD: 1.0025

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.7621
MLP: 0.7423
RF: 0.7682
XGBT: 0.7460

🔬 JSD: 0.1324 | WD: 0.9667

🔁 Repeat 3/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.7498
MLP: 0.7283
RF: 0.7666
XGBT: 0.7282

🔬 JSD: 0.1395 | WD: 0.9409

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.7385
MLP: 0.7408
RF: 0.7334
XGBT: 0.6936

🔬 JSD: 0.1402 | WD: 0.8349

📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:
LogReg TSTR Accuracy: 0.7314
MLP TSTR Accuracy: 0.7048
RF TSTR Accuracy: 0.7309
XGBT TSTR Accuracy: 0.6934

JSD: 0.1382
Wasserstein Distance: 0.9506

🚀 Training generator on full CHESS dataset...

💾 Generating final synthetic dataset (50% size of original)
✅ Saved

In [28]:
# ===== Encode and Prepare NURSERY Dataset =====
from sklearn.preprocessing import LabelEncoder

nursery_df = df.copy()
nursery_encoders = {}
for col in nursery_df.columns:
    le = LabelEncoder()
    nursery_df[col] = le.fit_transform(nursery_df[col])
    nursery_encoders[col] = le

X_nursery = nursery_df.drop(columns=["class"])
y_nursery = nursery_df["class"]
input_dim_nursery = X_nursery.shape[1]
num_classes_nursery = y_nursery.nunique()

# ===== Generator for NURSERY =====
class NurseryGenerator(nn.Module):
    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes_nursery, num_classes_nursery)
        self.model = nn.Sequential(
            nn.Linear(32 + num_classes_nursery, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim_nursery)
        )
    def forward(self, z, labels):
        c = self.label_emb(labels)
        x = torch.cat((z, c), dim=1)
        return self.model(x)

# ===== Critic for NURSERY =====
class NurseryCritic(nn.Module):
    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes_nursery, num_classes_nursery)
        self.model = nn.Sequential(
            nn.Linear(input_dim_nursery + num_classes_nursery, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.2),
            nn.Linear(64, 1)
        )
    def forward(self, x, labels):
        c = self.label_emb(labels)
        d_in = torch.cat((x, c), dim=1)
        return self.model(d_in)

# ===== 3x2 Stratified Evaluation =====
skf_nursery = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
all_results_nursery = defaultdict(list)

for repeat in range(3):
    print(f"\n🔁 Repeat {repeat+1}/3")
    for fold, (train_idx, test_idx) in enumerate(skf_nursery.split(X_nursery, y_nursery)):
        print(f"\n🔀 Fold {fold+1}/2")
        X_train, X_test = X_nursery.iloc[train_idx], X_nursery.iloc[test_idx]
        y_train, y_test = y_nursery.iloc[train_idx], y_nursery.iloc[test_idx]

        X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_tensor = torch.tensor(y_train.values, dtype=torch.long)
        loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=128, shuffle=True)

        generator = NurseryGenerator().to(device)
        critic = NurseryCritic().to(device)
        opt_G = torch.optim.Adam(generator.parameters(), lr=1e-4, betas=(0.5, 0.9))
        opt_C = torch.optim.Adam(critic.parameters(), lr=1e-4, betas=(0.5, 0.9))

        for epoch in range(100):
            for i, (real_samples, labels) in enumerate(loader):
                real_samples, labels = real_samples.to(device), labels.to(device)
                opt_C.zero_grad()
                z = torch.randn(real_samples.size(0), 32).to(device)
                fake_samples = generator(z, labels)
                real_validity = critic(real_samples, labels)
                fake_validity = critic(fake_samples.detach(), labels)
                gp = compute_gp(critic, real_samples, fake_samples, labels, device)
                c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
                c_loss.backward()
                opt_C.step()

                if i % 5 == 0:
                    opt_G.zero_grad()
                    gen_samples = generator(z, labels)
                    g_loss = -torch.mean(critic(gen_samples, labels))
                    g_loss.backward()
                    opt_G.step()

        synth_size = len(X_train) // 2
        z = torch.randn(synth_size, 32).to(device)
        synth_labels = torch.randint(0, num_classes_nursery, (synth_size,), dtype=torch.long).to(device)
        gen_data = generator(z, synth_labels).detach().cpu().numpy()

        synth_df = pd.DataFrame(gen_data, columns=X_nursery.columns)
        synth_df["class"] = synth_labels.cpu().numpy()

        for col in synth_df.columns:
            synth_df[col] = synth_df[col].round().astype(int)
            synth_df[col] = synth_df[col].clip(0, nursery_df[col].max())

        print("\n📊 TSTR Accuracy:")
        models = {
            "LogReg": LogisticRegression(max_iter=300),
            "MLP": MLPClassifier(max_iter=300),
            "RF": RandomForestClassifier(),
            "XGBT": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
        }
        for name, model in models.items():
            model.fit(synth_df.drop(columns=["class"]), synth_df["class"])
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            all_results_nursery[f"{name}_acc"].append(acc)
            print(f"{name}: {acc:.4f}")

        jsd, wd = evaluate_jsd_wd(X_train, synth_df.drop(columns=["class"]))
        all_results_nursery["jsd"].append(jsd)
        all_results_nursery["wd"].append(wd)
        print(f"\n🔬 JSD: {jsd:.4f} | WD: {wd:.4f}")

print("\n📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:")
for name in ["LogReg", "MLP", "RF", "XGBT"]:
    print(f"{name} TSTR Accuracy: {np.mean(all_results_nursery[f'{name}_acc']):.4f}")
print(f"\nJSD: {np.mean(all_results_nursery['jsd']):.4f}")
print(f"Wasserstein Distance: {np.mean(all_results_nursery['wd']):.4f}")

print("\n🚀 Training final generator on full nursery dataset...")
X_tensor_full = torch.tensor(X_nursery.values, dtype=torch.float32)
y_tensor_full = torch.tensor(y_nursery.values, dtype=torch.long)
loader_full = DataLoader(TensorDataset(X_tensor_full, y_tensor_full), batch_size=128, shuffle=True)

generator = NurseryGenerator().to(device)
critic = NurseryCritic().to(device)
opt_G = torch.optim.Adam(generator.parameters(), lr=1e-4, betas=(0.5, 0.9))
opt_C = torch.optim.Adam(critic.parameters(), lr=1e-4, betas=(0.5, 0.9))

for epoch in range(100):
    for i, (real_samples, labels) in enumerate(loader_full):
        real_samples, labels = real_samples.to(device), labels.to(device)
        opt_C.zero_grad()
        z = torch.randn(real_samples.size(0), 32).to(device)
        fake_samples = generator(z, labels)
        real_validity = critic(real_samples, labels)
        fake_validity = critic(fake_samples.detach(), labels)
        gp = compute_gp(critic, real_samples, fake_samples, labels, device)
        c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
        c_loss.backward()
        opt_C.step()

        if i % 5 == 0:
            opt_G.zero_grad()
            gen_samples = generator(z, labels)
            g_loss = -torch.mean(critic(gen_samples, labels))
            g_loss.backward()
            opt_G.step()

print("\n📏 Generating final synthetic nursery dataset...")
synth_size_final = len(X_nursery) // 2
z = torch.randn(synth_size_final, 32).to(device)
synth_labels_final = torch.randint(0, num_classes_nursery, (synth_size_final,), dtype=torch.long).to(device)
gen_data_final = generator(z, synth_labels_final).detach().cpu().numpy()

synth_df_final = pd.DataFrame(gen_data_final, columns=X_nursery.columns)
synth_df_final["class"] = synth_labels_final.cpu().numpy()

for col in synth_df_final.columns:
    synth_df_final[col] = synth_df_final[col].round().astype(int)
    synth_df_final[col] = synth_df_final[col].clip(0, nursery_df[col].max())

synth_df_final.to_csv("synthetic_nursery_dataset_half.csv", index=False)
print("✅ Saved synthetic nursery dataset as 'synthetic_nursery_dataset_half.csv'")



🔁 Repeat 1/3

🔀 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.7568
MLP: 0.7724
RF: 0.7680
XGBT: 0.7322

🔬 JSD: 0.1257 | WD: 0.8123

🔀 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.7127
MLP: 0.6508
RF: 0.7115
XGBT: 0.6971

🔬 JSD: 0.1179 | WD: 0.7782

🔁 Repeat 2/3

🔀 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.7662
MLP: 0.7755
RF: 0.7727
XGBT: 0.7390

🔬 JSD: 0.1330 | WD: 0.8792

🔀 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.7644
MLP: 0.7493
RF: 0.7749
XGBT: 0.7378

🔬 JSD: 0.1393 | WD: 1.1428

🔁 Repeat 3/3

🔀 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.7057
MLP: 0.7046
RF: 0.7218
XGBT: 0.6736

🔬 JSD: 0.1360 | WD: 0.8897

🔀 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.5085
MLP: 0.4768
RF: 0.4845
XGBT: 0.4519

🔬 JSD: 0.1324 | WD: 0.7952

📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:
LogReg TSTR Accuracy: 0.7024
MLP TSTR Accuracy: 0.6882
RF TSTR Accuracy: 0.7056
XGBT TSTR Accuracy: 0.6719

JSD: 0.1307
Wasserstein Distance: 0.8829

🚀 Training final generator on full nursery dataset...

📏 Generating final synthetic nursery dataset...
✅ Saved syn