In [8]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import entropy, wasserstein_distance
from scipy.io import arff
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# ===== Load + Clean ARFF =====
data, meta = arff.loadarff("adult 1.arff")
df = pd.DataFrame(data)
for col in df.select_dtypes([object]).columns:
    df[col] = df[col].str.decode("utf-8").str.replace(r"[\\'\"]", "", regex=True)
df = df.dropna().reset_index(drop=True)

# ===== Encode Categorical =====
encoders = {}
for col in df.columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

X = df.drop(columns=["class"])
y = df["class"]
input_dim = X.shape[1]
num_classes = y.nunique()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ===== Metrics =====
def compute_jsd(p, q):
    p, q = np.array(p) + 1e-10, np.array(q) + 1e-10
    p, q = p / p.sum(), q / q.sum()
    m = 0.5 * (p + q)
    return 0.5 * (entropy(p, m) + entropy(q, m))

def evaluate_jsd_wd(real_df, synth_df):
    jsd_scores, wd_scores = [], []
    for col in real_df.columns:
        real, synth = real_df[col].values, synth_df[col].values
        jsd = compute_jsd(np.histogram(real, bins=20)[0], np.histogram(synth, bins=20)[0])
        wd = wasserstein_distance(real, synth)
        jsd_scores.append(jsd)
        wd_scores.append(wd)
    return np.mean(jsd_scores), np.mean(wd_scores)

# ===== CW-GAN Models =====
class Generator(nn.Module):
    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes, num_classes)
        self.model = nn.Sequential(
            nn.Linear(32 + num_classes, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )
    def forward(self, z, labels):
        c = self.label_emb(labels)
        x = torch.cat((z, c), dim=1)
        return self.model(x)

class Critic(nn.Module):
    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes, num_classes)
        self.model = nn.Sequential(
            nn.Linear(input_dim + num_classes, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.2),
            nn.Linear(64, 1)
        )
    def forward(self, x, labels):
        c = self.label_emb(labels)
        d_in = torch.cat((x, c), dim=1)
        return self.model(d_in)

def compute_gp(critic, real_samples, fake_samples, labels, device):
    alpha = torch.rand(real_samples.size(0), 1).to(device)
    interpolates = (alpha * real_samples + (1 - alpha) * fake_samples).requires_grad_(True)
    d_interpolates = critic(interpolates, labels)
    fake = torch.ones_like(d_interpolates)
    gradients = torch.autograd.grad(
        outputs=d_interpolates, inputs=interpolates, grad_outputs=fake,
        create_graph=True, retain_graph=True, only_inputs=True
    )[0]
    gradients = gradients.view(gradients.size(0), -1)
    return ((gradients.norm(2, dim=1) - 1) ** 2).mean()

# ===== Experiment Setup =====
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
all_results = defaultdict(list)

for repeat in range(3):
    print(f"\n🔁 Repeat {repeat+1}/3")

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        print(f"\n🔄 Fold {fold+1}/2")

        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_tensor = torch.tensor(y_train.values, dtype=torch.long)
        loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=128, shuffle=True)

        generator = Generator().to(device)
        critic = Critic().to(device)
        opt_G = torch.optim.Adam(generator.parameters(), lr=1e-4, betas=(0.5, 0.9))
        opt_C = torch.optim.Adam(critic.parameters(), lr=1e-4, betas=(0.5, 0.9))

        for epoch in range(100):
            for i, (real_samples, labels) in enumerate(loader):
                real_samples, labels = real_samples.to(device), labels.to(device)
                opt_C.zero_grad()
                z = torch.randn(real_samples.size(0), 32).to(device)
                fake_samples = generator(z, labels)
                real_validity = critic(real_samples, labels)
                fake_validity = critic(fake_samples.detach(), labels)
                gp = compute_gp(critic, real_samples, fake_samples, labels, device)
                c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
                c_loss.backward()
                opt_C.step()
                if i % 5 == 0:
                    opt_G.zero_grad()
                    gen_samples = generator(z, labels)
                    g_loss = -torch.mean(critic(gen_samples, labels))
                    g_loss.backward()
                    opt_G.step()

        # Generate synthetic data
        synth_size = len(X_train) // 2
        z = torch.randn(synth_size, 32).to(device)
        synth_labels = torch.randint(0, num_classes, (synth_size,), dtype=torch.long).to(device)
        gen_data = generator(z, synth_labels).detach().cpu().numpy()
        synth_df = pd.DataFrame(gen_data, columns=X.columns)
        synth_df["class"] = synth_labels.cpu().numpy()

        # Classifier Evaluation (TSTR)
        print("\n📊 TSTR Accuracy:")
        models = {
            "LogReg": LogisticRegression(max_iter=300),
            "MLP": MLPClassifier(max_iter=300),
            "RF": RandomForestClassifier(),
            "XGBT": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
        }

        for name, model in models.items():
            model.fit(synth_df.drop(columns=["class"]), synth_df["class"])
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            all_results[f"{name}_acc"].append(acc)
            print(f"{name}: {acc:.4f}")

        # JSD + WD
        jsd, wd = evaluate_jsd_wd(X_train, synth_df.drop(columns=["class"]))
        all_results["jsd"].append(jsd)
        all_results["wd"].append(wd)
        print(f"\n🔬 JSD: {jsd:.4f} | WD: {wd:.4f}")

# ===== Final Average Summary =====
print("\n📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:")
for name in ["LogReg", "MLP", "RF", "XGBT"]:
    avg = np.mean(all_results[f"{name}_acc"])
    print(f"{name} TSTR Accuracy: {avg:.4f}")

print(f"\nJSD: {np.mean(all_results['jsd']):.4f}")
print(f"Wasserstein Distance: {np.mean(all_results['wd']):.4f}")


🔁 Repeat 1/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.7588
MLP: 0.7528
RF: 0.7758
XGBT: 0.7722

🔬 JSD: 0.5025 | WD: 1.2743

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.7656
MLP: 0.7702
RF: 0.7782
XGBT: 0.7784

🔬 JSD: 0.5116 | WD: 1.4590

🔁 Repeat 2/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.7273
MLP: 0.6779
RF: 0.7786
XGBT: 0.7743

🔬 JSD: 0.5047 | WD: 1.4928

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.7028
MLP: 0.7229
RF: 0.7424
XGBT: 0.7180

🔬 JSD: 0.5056 | WD: 1.3454

🔁 Repeat 3/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.6712
MLP: 0.6716
RF: 0.7740
XGBT: 0.7140

🔬 JSD: 0.5030 | WD: 1.2092

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.6829
MLP: 0.6840
RF: 0.7758
XGBT: 0.7252

🔬 JSD: 0.5002 | WD: 1.3722

📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:
LogReg TSTR Accuracy: 0.7181
MLP TSTR Accuracy: 0.7132
RF TSTR Accuracy: 0.7708
XGBT TSTR Accuracy: 0.7470

JSD: 0.5046
Wasserstein Distance: 1.3588
