In [1]:
# Full updated CW-GAN-GP code with improved architecture and training strategy for connect-4.arff

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import entropy, wasserstein_distance
from scipy.io import arff
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# ===== Load + Preprocess ARFF =====
data, meta = arff.loadarff("connect-4.arff")
df = pd.DataFrame(data)
for col in df.select_dtypes([object]).columns:
    df[col] = df[col].str.decode("utf-8").str.replace(r"[\\'\"]", "", regex=True)
df = df.dropna().reset_index(drop=True)

encoders = {}
for col in df.columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

X = df.drop(columns=["class"])
y = df["class"]
input_dim = X.shape[1]
num_classes = y.nunique()
device = torch.device("cpu")

# ===== Metrics =====
def compute_jsd(p, q):
    p, q = np.array(p) + 1e-10, np.array(q) + 1e-10
    p, q = p / p.sum(), q / q.sum()
    m = 0.5 * (p + q)
    return 0.5 * (entropy(p, m) + entropy(q, m))

def evaluate_jsd_wd(real_df, synth_df):
    jsd_scores, wd_scores = [], []
    for col in real_df.columns:
        real, synth = real_df[col].values, synth_df[col].values
        jsd = compute_jsd(np.histogram(real, bins=20)[0], np.histogram(synth, bins=20)[0])
        wd = wasserstein_distance(real, synth)
        jsd_scores.append(jsd)
        wd_scores.append(wd)
    return np.mean(jsd_scores), np.mean(wd_scores)

# ===== Improved CW-GAN Models =====
class Generator(nn.Module):
    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes, 32)
        self.model = nn.Sequential(
            nn.Linear(32 + 32, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )
    def forward(self, z, labels):
        c = self.label_emb(labels)
        x = torch.cat((z, c), dim=1)
        return self.model(x)

class Critic(nn.Module):
    def __init__(self):
        super().__init__()
        self.label_emb = nn.Embedding(num_classes, 32)
        self.model = nn.Sequential(
            nn.Linear(input_dim + 32, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.2),
            nn.Linear(64, 1)
        )
    def forward(self, x, labels):
        c = self.label_emb(labels)
        d_in = torch.cat((x, c), dim=1)
        return self.model(d_in)

def compute_gp(critic, real_samples, fake_samples, labels, device):
    alpha = torch.rand(real_samples.size(0), 1).to(device)
    interpolates = (alpha * real_samples + (1 - alpha) * fake_samples).requires_grad_(True)
    d_interpolates = critic(interpolates, labels)
    fake = torch.ones_like(d_interpolates)
    gradients = torch.autograd.grad(
        outputs=d_interpolates, inputs=interpolates, grad_outputs=fake,
        create_graph=True, retain_graph=True, only_inputs=True
    )[0]
    gradients = gradients.view(gradients.size(0), -1)
    return ((gradients.norm(2, dim=1) - 1) ** 2).mean()

# ===== Train + Evaluate =====
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
all_results = defaultdict(list)

for repeat in range(3):
    print(f"\n🔁 Repeat {repeat+1}/3")
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        print(f"\n🔄 Fold {fold+1}/2")
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_tensor = torch.tensor(y_train.values, dtype=torch.long)
        loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=128, shuffle=True)

        generator = Generator().to(device)
        critic = Critic().to(device)
        opt_G = torch.optim.Adam(generator.parameters(), lr=2e-4, betas=(0.5, 0.9))
        opt_C = torch.optim.Adam(critic.parameters(), lr=2e-4, betas=(0.5, 0.9))

        for epoch in range(200):
            for i, (real_samples, labels) in enumerate(loader):
                real_samples, labels = real_samples.to(device), labels.to(device)
                for _ in range(5):  # more critic updates
                    z = torch.randn(real_samples.size(0), 32).to(device)
                    fake_samples = generator(z, labels)
                    real_validity = critic(real_samples, labels)
                    fake_validity = critic(fake_samples.detach(), labels)
                    gp = compute_gp(critic, real_samples, fake_samples, labels, device)
                    c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
                    opt_C.zero_grad()
                    c_loss.backward()
                    opt_C.step()

                if i % 5 == 0:
                    z = torch.randn(real_samples.size(0), 32).to(device)
                    opt_G.zero_grad()
                    gen_samples = generator(z, labels)
                    g_loss = -torch.mean(critic(gen_samples, labels))
                    g_loss.backward()
                    opt_G.step()

        synth_size = len(X_train) // 2
        real_dist = y_train.value_counts(normalize=True).sort_index().values
        synth_labels = torch.tensor(np.random.choice(num_classes, size=synth_size, p=real_dist), dtype=torch.long).to(device)
        z = torch.randn(synth_size, 32).to(device)
        gen_data = generator(z, synth_labels).detach().cpu().numpy()
        synth_df = pd.DataFrame(gen_data, columns=X.columns)
        synth_df["class"] = synth_labels.cpu().numpy()

        print("\n📊 TSTR Accuracy:")
        models = {
            "LogReg": LogisticRegression(max_iter=300),
            "MLP": MLPClassifier(max_iter=300),
            "RF": RandomForestClassifier(),
            "XGBT": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
        }
        for name, model in models.items():
            model.fit(synth_df.drop(columns=["class"]), synth_df["class"])
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            all_results[f"{name}_acc"].append(acc)
            print(f"{name}: {acc:.4f}")

        jsd, wd = evaluate_jsd_wd(X_train, synth_df.drop(columns=["class"]))
        all_results["jsd"].append(jsd)
        all_results["wd"].append(wd)
        print(f"\n🔬 JSD: {jsd:.4f} | WD: {wd:.4f}")

# ===== Summary =====
print("\n📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:")
for name in ["LogReg", "MLP", "RF", "XGBT"]:
    avg = np.mean(all_results[f"{name}_acc"])
    print(f"{name} TSTR Accuracy: {avg:.4f}")
print(f"\nJSD: {np.mean(all_results['jsd']):.4f}")
print(f"Wasserstein Distance: {np.mean(all_results['wd']):.4f}")

# ===== Final Generator Training =====
print("\n🚀 Training generator on full dataset for final synthetic data...")
X_tensor_full = torch.tensor(X.values, dtype=torch.float32)
y_tensor_full = torch.tensor(y.values, dtype=torch.long)
loader_full = DataLoader(TensorDataset(X_tensor_full, y_tensor_full), batch_size=128, shuffle=True)

generator = Generator().to(device)
critic = Critic().to(device)
opt_G = torch.optim.Adam(generator.parameters(), lr=2e-4, betas=(0.5, 0.9))
opt_C = torch.optim.Adam(critic.parameters(), lr=2e-4, betas=(0.5, 0.9))

for epoch in range(200):
    for i, (real_samples, labels) in enumerate(loader_full):
        real_samples, labels = real_samples.to(device), labels.to(device)
        for _ in range(5):
            z = torch.randn(real_samples.size(0), 32).to(device)
            fake_samples = generator(z, labels)
            real_validity = critic(real_samples, labels)
            fake_validity = critic(fake_samples.detach(), labels)
            gp = compute_gp(critic, real_samples, fake_samples, labels, device)
            opt_C.zero_grad()
            c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + 10 * gp
            c_loss.backward()
            opt_C.step()

        if i % 5 == 0:
            z = torch.randn(real_samples.size(0), 32).to(device)
            opt_G.zero_grad()
            gen_samples = generator(z, labels)
            g_loss = -torch.mean(critic(gen_samples, labels))
            g_loss.backward()
            opt_G.step()

# ===== Generate Final Dataset =====
print("\n💾 Generating final synthetic dataset (50% size of original)")
synth_size = len(X) // 2
z = torch.randn(synth_size, 32).to(device)
real_dist = y.value_counts(normalize=True).sort_index().values
synth_labels = torch.tensor(np.random.choice(num_classes, size=synth_size, p=real_dist), dtype=torch.long).to(device)
gen_data = generator(z, synth_labels).detach().cpu().numpy()
synth_df_final = pd.DataFrame(gen_data, columns=X.columns)
synth_df_final["class"] = synth_labels.cpu().numpy()

# ===== Postprocess =====
for col in X.columns:
    synth_df_final[col] = synth_df_final[col].round().astype(int)
    synth_df_final[col] = synth_df_final[col].clip(0, df[col].max())
synth_df_final["class"] = synth_df_final["class"].clip(0, df["class"].max())

# Save
final_path = "synthetic_connect4_half_improved.csv"
synth_df_final.to_csv(final_path, index=False)
final_path



🔁 Repeat 1/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.6442
MLP: 0.6358
RF: 0.6640
XGBT: 0.6841

🔬 JSD: 0.5872 | WD: 0.0711

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.6456
MLP: 0.6537
RF: 0.6668
XGBT: 0.6781

🔬 JSD: 0.6349 | WD: 0.0718

🔁 Repeat 2/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.6345
MLP: 0.6171
RF: 0.6585
XGBT: 0.6822

🔬 JSD: 0.6488 | WD: 0.0724

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.6347
MLP: 0.6228
RF: 0.6745
XGBT: 0.6940

🔬 JSD: 0.6113 | WD: 0.0725

🔁 Repeat 3/3

🔄 Fold 1/2

📊 TSTR Accuracy:
LogReg: 0.6474
MLP: 0.6527
RF: 0.6837
XGBT: 0.7049

🔬 JSD: 0.6350 | WD: 0.0729

🔄 Fold 2/2

📊 TSTR Accuracy:
LogReg: 0.6411
MLP: 0.5909
RF: 0.6912
XGBT: 0.6981

🔬 JSD: 0.6352 | WD: 0.0728

📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:
LogReg TSTR Accuracy: 0.6412
MLP TSTR Accuracy: 0.6289
RF TSTR Accuracy: 0.6731
XGBT TSTR Accuracy: 0.6902

JSD: 0.6254
Wasserstein Distance: 0.0723

🚀 Training generator on full dataset for final synthetic data...

💾 Generating final synthetic dataset (50% size o

'synthetic_connect4_half_improved.csv'

In [4]:
# ===== Final Average Summary =====
print("\n📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:")
avg_accuracies = {}

for name in ["LogReg", "MLP", "RF", "XGBT"]:
    acc_list = all_results[f"{name}_acc"]
    avg_acc = np.mean(acc_list)
    avg_accuracies[name] = avg_acc
    print(f"{name} TSTR Accuracy: {avg_acc:.4f}")

# Calculate overall average accuracy across all classifiers
overall_avg_acc = np.mean(list(avg_accuracies.values()))
print(f"\n⭐ Average Accuracy (All Models): {overall_avg_acc:.4f}")

# Print divergence scores
avg_jsd = np.mean(all_results['jsd'])
avg_wd = np.mean(all_results['wd'])
print(f"\n🔬 Average JSD: {avg_jsd:.4f}")
print(f"🔬 Average Wasserstein Distance: {avg_wd:.4f}")



📈 FINAL AVERAGE RESULTS ACROSS 3x2 CV:
LogReg TSTR Accuracy: 0.6412
MLP TSTR Accuracy: 0.6289
RF TSTR Accuracy: 0.6731
XGBT TSTR Accuracy: 0.6902

⭐ Average Accuracy (All Models): 0.6584

🔬 Average JSD: 0.6254
🔬 Average Wasserstein Distance: 0.0723
