In [None]:
# ===== CW-GAN-GP on Dermatology Dataset with Final Average Metric Summary =====

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import entropy, wasserstein_distance
from scipy.io import arff
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")

# ===== Load and Preprocess =====
data, meta = arff.loadarff("dermatology.arff")
df = pd.DataFrame(data)


def _age_as_text(value):
    """Return an ARFF cell as ``str``, decoding UTF-8 bytes when necessary."""
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return str(value)


# Bring the Age column to plain text, drop stray quotes/backslashes, trim.
age_text = df["Age"].apply(_age_as_text)
age_text = age_text.str.replace(r"[\"'\\\\]", "", regex=True)
df["Age"] = age_text.str.strip()

# The two interval labels are mapped to fixed integers; "missing" becomes NaN
# so that those rows can be dropped before the integer cast.
age_mapping = {"missing": np.nan, "(-inf-14]": 10, "(14-inf)": 30}
df["Age"] = df["Age"].replace(age_mapping)
df = df.dropna(subset=["Age"])
df["Age"] = df["Age"].astype(int)

# Object columns are label-encoded (the fitted encoder is kept per column);
# every other column is cast to int.
encoders = {}
for col in df.columns:
    if df[col].dtype != object:
        df[col] = df[col].astype(int)
        continue
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# Feature matrix / target split and the sizes the networks are built from.
y = df["class"]
X = df.drop(columns=["class"])
num_classes = y.nunique()
input_dim = X.shape[1]
device = torch.device("cpu")

# ===== Metrics =====
def compute_jsd(p, q):
    """Return the Jensen-Shannon divergence between two histograms.

    Args:
        p: Non-negative counts or weights of the first distribution.
        q: Non-negative counts or weights of the second distribution; must
            have the same length as ``p``.

    Returns:
        The divergence in nats (natural logarithm), a value in ``[0, ln 2]``.
    """
    # A tiny offset keeps empty bins from producing log(0) / division by zero.
    p = np.asarray(p, dtype=float) + 1e-10
    q = np.asarray(q, dtype=float) + 1e-10
    # Each histogram is normalised by its OWN mass. Dividing q by p's total
    # (as a single tuple assignment would) skews the mixture whenever the two
    # inputs contain a different number of samples.
    p = p / p.sum()
    q = q / q.sum()
    m = 0.5 * (p + q)
    return 0.5 * (entropy(p, m) + entropy(q, m))

def evaluate_jsd_wd(real_df, synth_df, bins=20):
    """Average per-column JSD and Wasserstein distance between two frames.

    Args:
        real_df: DataFrame of real samples; its columns drive the comparison.
        synth_df: DataFrame of synthetic samples containing the same columns.
        bins: Number of histogram bins used for the JSD estimate.

    Returns:
        Tuple ``(mean_jsd, mean_wd)`` averaged over the columns of ``real_df``.
    """
    jsd_scores, wd_scores = [], []
    for col in real_df.columns:
        real, synth = real_df[col].values, synth_df[col].values
        # Both histograms must share the same bin edges; binning each sample
        # over its own min/max would compare bins that cover different value
        # ranges and make the divergence meaningless.
        edges = np.histogram_bin_edges(np.concatenate([real, synth]), bins=bins)
        real_hist = np.histogram(real, bins=edges)[0]
        synth_hist = np.histogram(synth, bins=edges)[0]
        jsd_scores.append(compute_jsd(real_hist, synth_hist))
        wd_scores.append(wasserstein_distance(real, synth))
    return np.mean(jsd_scores), np.mean(wd_scores)

# ===== Models =====
class Generator(nn.Module):
    """Label-conditioned generator network.

    The class label is embedded into 32 dimensions and concatenated with the
    noise vector; the first linear layer expects 64 inputs, so the noise must
    be 32 wide. The output has ``input_dim`` columns. Reads the module-level
    ``num_classes`` and ``input_dim`` at construction time.
    """

    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes, 32)
        layers = [
            # noise + label embedding -> hidden
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.2),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 128),
            nn.ReLU(),
            # hidden -> one value per feature column (no output activation)
            nn.Linear(128, input_dim),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, z, labels):
        """Map noise ``z`` and integer ``labels`` to a batch of samples."""
        label_vec = self.label_emb(labels)
        conditioned = torch.cat((z, label_vec), dim=1)
        return self.model(conditioned)

class Critic(nn.Module):
    """Label-conditioned critic producing one unbounded score per sample.

    The class label is embedded into 32 dimensions and appended to the
    feature vector before the MLP. Reads the module-level ``num_classes``
    and ``input_dim`` at construction time.
    """

    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes, 32)
        layers = [
            nn.Linear(input_dim + 32, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.2),
            # single raw score, no sigmoid
            nn.Linear(64, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x, labels):
        """Score samples ``x`` conditioned on integer ``labels``."""
        label_vec = self.label_emb(labels)
        critic_input = torch.cat((x, label_vec), dim=1)
        return self.model(critic_input)

def compute_gp(critic, real_samples, fake_samples, labels):
    """Return the WGAN-GP gradient penalty for one batch.

    The critic is evaluated on random per-sample interpolations between the
    real and fake batches, and the squared deviation of the input-gradient
    L2 norm from 1 is averaged over the batch.

    Args:
        critic: Callable ``critic(x, labels)`` returning one score per row.
        real_samples: 2-D tensor of real rows.
        fake_samples: Tensor of generated rows with the same shape.
        labels: Conditioning labels forwarded unchanged to ``critic``.

    Returns:
        Scalar tensor, differentiable with respect to the critic's parameters.
    """
    # The penalty only regularises the critic, so cut the graph to the
    # generator: nothing is backpropagated into it from here.
    real = real_samples.detach()
    fake = fake_samples.detach()
    # One mixing coefficient per row, created on the inputs' own device
    # rather than relying on a module-level device global.
    alpha = torch.rand(real.size(0), 1, device=real.device, dtype=real.dtype)
    interpolates = (alpha * real + (1 - alpha) * fake).requires_grad_(True)
    d_interpolates = critic(interpolates, labels)
    gradients = torch.autograd.grad(
        outputs=d_interpolates,
        inputs=interpolates,
        grad_outputs=torch.ones_like(d_interpolates),
        # Keep the graph so the penalty itself can be differentiated.
        create_graph=True,
        retain_graph=True,
    )[0]
    # reshape (not view) tolerates a non-contiguous gradient tensor.
    grad_norm = gradients.reshape(gradients.size(0), -1).norm(2, dim=1)
    return ((grad_norm - 1) ** 2).mean()

# ===== Train and Evaluate =====
# Per-fold metrics keyed by "<model>_acc", "jsd" and "wd".
results = defaultdict(list)
# NOTE(review): the splitter is seeded once, so every repeat iterates over the
# same two folds; the repeats only capture GAN / classifier randomness.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

for repeat in range(3):
    print(f"🔁 Repeat {repeat+1}/3")
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        print(f"🔄 Fold {fold+1}/2")
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        loader = DataLoader(TensorDataset(torch.tensor(X_train.values, dtype=torch.float32),
                                          torch.tensor(y_train.values, dtype=torch.long)), batch_size=64, shuffle=True)

        # A fresh generator/critic pair is trained for every fold.
        generator, critic = Generator().to(device), Critic().to(device)
        opt_G = torch.optim.Adam(generator.parameters(), lr=2e-4, betas=(0.5, 0.9))
        opt_C = torch.optim.Adam(critic.parameters(), lr=2e-4, betas=(0.5, 0.9))

        generator.train()
        for epoch in range(200):
            for i, (real_samples, labels) in enumerate(loader):
                real_samples, labels = real_samples.to(device), labels.to(device)

                # ---- critic: 5 updates on the current batch ----
                for _ in range(5):
                    z = torch.randn(real_samples.size(0), 32).to(device)
                    # The critic update never touches the generator, so the
                    # fakes are detached once; this also keeps the gradient
                    # penalty from backpropagating through the generator.
                    fake_samples = generator(z, labels).detach()
                    real_validity = critic(real_samples, labels)
                    fake_validity = critic(fake_samples, labels)
                    gp = compute_gp(critic, real_samples, fake_samples, labels)
                    c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
                    opt_C.zero_grad()
                    c_loss.backward()
                    opt_C.step()

                # ---- generator ----
                # NOTE(review): on top of the 5 critic steps per batch, the
                # generator is only updated on every 5th batch of an epoch.
                # Left unchanged; confirm this ratio is intended.
                if i % 5 == 0:
                    z = torch.randn(real_samples.size(0), 32).to(device)
                    gen_samples = generator(z, labels)
                    g_loss = -torch.mean(critic(gen_samples, labels))
                    opt_G.zero_grad()
                    g_loss.backward()
                    opt_G.step()

        # Draw a synthetic set half the size of the training split, with
        # labels sampled from the training label distribution.
        synth_size = len(X_train) // 2
        real_dist = y_train.value_counts(normalize=True).sort_index().values
        synth_labels = torch.tensor(np.random.choice(num_classes, size=synth_size, p=real_dist), dtype=torch.long).to(device)
        z = torch.randn(synth_size, 32).to(device)
        # Sample in eval mode: otherwise Dropout keeps zeroing activations and
        # BatchNorm normalises with the statistics of the sampled batch.
        generator.eval()
        with torch.no_grad():
            gen_data = generator(z, synth_labels).cpu().numpy()
        synth_df = pd.DataFrame(gen_data, columns=X.columns)
        synth_df["class"] = synth_labels.cpu().numpy()

        # Train-on-synthetic, test-on-real accuracy for each classifier.
        print("📊 TSTR Accuracy:")
        models = {
            "LogReg": LogisticRegression(max_iter=300),
            "MLP": MLPClassifier(max_iter=300),
            "RF": RandomForestClassifier(),
            "XGBT": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
        }
        for name, model in models.items():
            model.fit(synth_df.drop(columns=["class"]), synth_df["class"])
            acc = accuracy_score(y_test, model.predict(X_test))
            results[f"{name}_acc"].append(acc)
            print(f"{name}: {acc:.4f}")

        # Distributional similarity between real training rows and synthetic rows.
        jsd, wd = evaluate_jsd_wd(X_train, synth_df.drop(columns=["class"]))
        results["jsd"].append(jsd)
        results["wd"].append(wd)
        print(f"🔬 JSD: {jsd:.4f} | WD: {wd:.4f}")

# ===== Summary =====
# Averages are taken over every (repeat, fold) entry collected in `results`.
# "\n" must be a real newline escape; a doubled backslash prints the two
# characters backslash + n verbatim.
print("\n📊 AVERAGE TSTR Accuracy:")
for name in ["LogReg", "MLP", "RF", "XGBT"]:
    avg_score = np.mean(results[f"{name}_acc"])
    print(f"{name}: {avg_score:.4f}")

print(f"\n🔬 Average JSD: {np.mean(results['jsd']):.4f}")
print(f"🔬 Average WD: {np.mean(results['wd']):.4f}")

# ===== Final Synthetic Dataset Generation =====
# NOTE(review): `generator` is whatever model the last training fold left
# behind, i.e. it was fitted on that fold's training split only, not on all
# of X. Confirm this is the model that should produce the exported dataset.
print("\n💾 Generating final synthetic dataset (50% size of original)")
synth_size = len(X) // 2
z = torch.randn(synth_size, 32).to(device)
# Labels follow the class distribution of the full dataset.
real_dist = y.value_counts(normalize=True).sort_index().values
synth_labels = torch.tensor(np.random.choice(num_classes, size=synth_size, p=real_dist), dtype=torch.long).to(device)
# Sample in eval mode so Dropout is disabled and BatchNorm uses its running
# statistics; no autograd graph is needed for pure inference.
generator.eval()
with torch.no_grad():
    gen_data = generator(z, synth_labels).cpu().numpy()
synth_df_final = pd.DataFrame(gen_data, columns=X.columns)
synth_df_final["class"] = synth_labels.cpu().numpy()

# Postprocess: round to integers and clip every feature to the range observed
# in the real data. The lower bound uses the observed minimum (not a fixed 0)
# so that columns whose real values start above 0 cannot receive unseen values.
for col in X.columns:
    synth_df_final[col] = synth_df_final[col].round().astype(int)
    synth_df_final[col] = synth_df_final[col].clip(lower=df[col].min(), upper=df[col].max())
synth_df_final["class"] = synth_df_final["class"].clip(lower=0, upper=df["class"].max())

synth_df_final.to_csv("synthetic_dermatology_half_verified V2.csv", index=False)
print("✅ Saved to synthetic_dermatology_half_verified V2.csv")
print(f"Original dataset size: {len(X)}")
print(f"Synthetic dataset size: {len(synth_df_final)}")
print(f"Percentage of original: {round(100 * len(synth_df_final) / len(X), 2)}%")

🔁 Repeat 1/3
🔄 Fold 1/2
📊 TSTR Accuracy:
LogReg: 0.4693
MLP: 0.6927
RF: 0.9441
XGBT: 0.7374
🔬 JSD: 0.5549 | WD: 0.6076
🔄 Fold 2/2
📊 TSTR Accuracy:
LogReg: 0.6927
MLP: 0.8883
RF: 0.8156
XGBT: 0.4581
🔬 JSD: 0.5665 | WD: 0.5486
🔁 Repeat 2/3
🔄 Fold 1/2
📊 TSTR Accuracy:
LogReg: 0.4916
MLP: 0.8547
RF: 0.8268
XGBT: 0.5698
🔬 JSD: 0.5422 | WD: 0.5840
🔄 Fold 2/2
📊 TSTR Accuracy:
LogReg: 0.3631
MLP: 0.6983
RF: 0.8212
XGBT: 0.7542
🔬 JSD: 0.5404 | WD: 0.6827
🔁 Repeat 3/3
🔄 Fold 1/2
📊 TSTR Accuracy:
LogReg: 0.8045
MLP: 0.7989
RF: 0.8268
XGBT: 0.6145
🔬 JSD: 0.5656 | WD: 0.5613
🔄 Fold 2/2
📊 TSTR Accuracy:
LogReg: 0.8659
MLP: 0.8380
RF: 0.7709
XGBT: 0.6480
🔬 JSD: 0.5627 | WD: 0.4822
\n📊 AVERAGE TSTR Accuracy:
LogReg: 0.6145
MLP: 0.7952
RF: 0.8343
XGBT: 0.6304
\n🔬 Average JSD: 0.5554
🔬 Average WD: 0.5777
\n💾 Generating final synthetic dataset (50% size of original)
✅ Saved to synthetic_dermatology_half_verified V2.csv
Original dataset size: 358
Synthetic dataset size: 179
Percentage of original: 50.0%
