In [1]:
import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load data
df = pd.read_csv("chess-official.csv")

# Encode the target column
le = LabelEncoder()
df['class'] = le.fit_transform(df['class'])

# Separate features and target
NUMERIC_COLS = df.columns[df.columns != 'class'].tolist()
TARGET_COL = 'class'

# Scale numeric features
scaler = StandardScaler()
df[NUMERIC_COLS] = scaler.fit_transform(df[NUMERIC_COLS])

# Save preprocessed dataset
out_path = "preprocessed_chess.csv"
df.to_csv(out_path, index=False)
print("✅ Preprocessed data saved to:", out_path)


Using device: cpu
✅ Preprocessed data saved to: preprocessed_chess.csv


In [2]:
# --- GAN Training & Evaluation ---

# Hyperparameters
PREPROCESSED_PATH = "preprocessed_chess.csv"
LATENT_DIM        = 100
BATCH_SIZE        = 64
EPOCHS            = 100
REPEATS           = 3
FOLDS             = 2
SYN_RATIO         = 0.5
TARGET_COL        = "class"

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from scipy.stats import wasserstein_distance
from scipy.spatial.distance import jensenshannon

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load data
df = pd.read_csv(PREPROCESSED_PATH)
X_full = df.drop(columns=[TARGET_COL]).values.astype(np.float32)
y_full = df[TARGET_COL].values.astype(int)
NUMERIC_COLS = df.columns[df.columns != TARGET_COL].tolist()
num_idx = [df.columns.get_loc(c) for c in NUMERIC_COLS]

# Define models
class Generator(nn.Module):
    def __init__(self, z_dim, out_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(z_dim, 256), nn.ReLU(),
            nn.Linear(256, 512), nn.ReLU(),
            nn.Linear(512, 256), nn.ReLU(),
            nn.Linear(256, out_dim),
            nn.Tanh()
        )
    def forward(self, z):
        return self.net(z)

class Discriminator(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 512), nn.ReLU(),
            nn.Linear(512, 256), nn.ReLU(),
            nn.Linear(256, 128), nn.ReLU(),
            nn.Linear(128, 1), nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

def train_cramer_gan(G, D, loader, epochs):
    G, D = G.to(device), D.to(device)
    optg = optim.Adam(G.parameters(), lr=2e-4)
    optd = optim.Adam(D.parameters(), lr=2e-4)
    loss_fn = nn.BCELoss()
    for ep in range(1, epochs + 1):
        for real_batch, _ in loader:
            real_batch = real_batch.to(device)
            bsz = real_batch.size(0)

            # Discriminator step
            optd.zero_grad()
            z = torch.randn(bsz, LATENT_DIM, device=device)
            fake = G(z).detach()
            d_real = D(real_batch)
            d_fake = D(fake)
            lossd = loss_fn(d_real, torch.ones_like(d_real)) + \
                    loss_fn(d_fake, torch.zeros_like(d_fake))
            lossd.backward()
            optd.step()

            # Generator step
            optg.zero_grad()
            z = torch.randn(bsz, LATENT_DIM, device=device)
            fake2 = G(z)
            dg = D(fake2)
            lossg = loss_fn(dg, torch.ones_like(dg))
            lossg.backward()
            optg.step()

        if ep % 20 == 0 or ep == 1 or ep == epochs:
            print(f"Epoch {ep}/{epochs} - D_loss={lossd.item():.4f}  G_loss={lossg.item():.4f}")
    return G, D

def generate_synthetic(G, n_samples):
    G = G.to(device).eval()
    with torch.no_grad():
        z = torch.randn(n_samples, LATENT_DIM, device=device)
        return G(z).cpu().numpy()

def compute_tstr_all(X_real, y_real, X_syn, y_syn):
    results = {}
    for name, clf in [
        ("LR", LogisticRegression(max_iter=5000)),
        ("MLP", MLPClassifier(hidden_layer_sizes=(128,64), max_iter=1000)),
        ("RF", RandomForestClassifier(n_estimators=200)),
        ("XGB", XGBClassifier(eval_metric="logloss"))
    ]:
        clf.fit(X_syn, y_syn)
        results[name] = clf.score(X_real, y_real) * 100.0
    return results

def compute_jsd_wd(X_real, X_syn, num_idx):
    jsd_list, wd_list = [], []
    for i in num_idx:
        p_real, _ = np.histogram(X_real[:, i], bins=50, density=True)
        p_syn, _ = np.histogram(X_syn[:, i], bins=50, density=True)
        jsd_list.append(jensenshannon(p_real, p_syn))
        wd_list.append(wasserstein_distance(X_real[:, i], X_syn[:, i]))
    return np.mean(jsd_list), np.mean(wd_list)

# Cross-validation Training
tstr_scores = {m: [] for m in ["LR", "MLP", "RF", "XGB"]}
jsd_scores, wd_scores = [], []
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

for rep in range(REPEATS):
    for fold, (train_idx, test_idx) in enumerate(kf.split(X_full), 1):
        print(f"\n— Rep {rep + 1}/{REPEATS} Fold {fold}/{FOLDS} —")
        X_tr, X_te = X_full[train_idx], X_full[test_idx]
        y_tr, y_te = y_full[train_idx], y_full[test_idx]

        loader = DataLoader(
            TensorDataset(torch.from_numpy(X_tr), torch.from_numpy(y_tr)),
            batch_size=BATCH_SIZE, shuffle=True
        )

        G = Generator(LATENT_DIM, X_tr.shape[1])
        D = Discriminator(X_tr.shape[1])
        G, D = train_cramer_gan(G, D, loader, EPOCHS)

        X_syn = generate_synthetic(G, int(SYN_RATIO * len(X_tr)))
        y_syn = np.random.choice(y_tr, size=X_syn.shape[0], replace=True)

        tstr = compute_tstr_all(X_te, y_te, X_syn, y_syn)
        for m, score in tstr.items():
            tstr_scores[m].append(score)

        js, wd = compute_jsd_wd(X_te, X_syn, num_idx)
        jsd_scores.append(js)
        wd_scores.append(wd)

# Report Results
print("\n=== CV Results (mean ± std) ===")
for m in tstr_scores:
    scores = np.array(tstr_scores[m])
    print(f" • {m} TSTR = {scores.mean():.2f}% ± {scores.std():.2f}%")
print(f" • JSD = {np.mean(jsd_scores):.4f} ± {np.std(jsd_scores):.4f}")
print(f" • WD  = {np.mean(wd_scores):.4f} ± {np.std(wd_scores):.4f}")



— Rep 1/3 Fold 1/2 —
Epoch 1/100 - D_loss=1.3021  G_loss=0.7138
Epoch 20/100 - D_loss=0.6076  G_loss=1.1980
Epoch 40/100 - D_loss=0.0911  G_loss=3.4959
Epoch 60/100 - D_loss=0.0298  G_loss=4.7595
Epoch 80/100 - D_loss=0.0009  G_loss=7.4344
Epoch 100/100 - D_loss=0.2912  G_loss=11.2808

— Rep 1/3 Fold 2/2 —
Epoch 1/100 - D_loss=1.3279  G_loss=0.7011
Epoch 20/100 - D_loss=0.8696  G_loss=1.0404
Epoch 40/100 - D_loss=0.1176  G_loss=3.3301
Epoch 60/100 - D_loss=0.0476  G_loss=5.2470
Epoch 80/100 - D_loss=0.1814  G_loss=5.9640
Epoch 100/100 - D_loss=0.1248  G_loss=12.7826

— Rep 2/3 Fold 1/2 —
Epoch 1/100 - D_loss=1.3345  G_loss=0.7323
Epoch 20/100 - D_loss=0.4959  G_loss=1.3732
Epoch 40/100 - D_loss=0.2457  G_loss=2.6262
Epoch 60/100 - D_loss=0.0283  G_loss=4.3815
Epoch 80/100 - D_loss=0.6308  G_loss=6.3929
Epoch 100/100 - D_loss=0.0359  G_loss=5.2464

— Rep 2/3 Fold 2/2 —
Epoch 1/100 - D_loss=1.3213  G_loss=0.7076
Epoch 20/100 - D_loss=0.8108  G_loss=1.1800
Epoch 40/100 - D_loss=0.1376  G

In [3]:
# Final model on all data
full_loader = DataLoader(
    TensorDataset(torch.from_numpy(X_full), torch.from_numpy(y_full)),
    batch_size=BATCH_SIZE, shuffle=True
)
Gf = Generator(LATENT_DIM, X_full.shape[1])
Df = Discriminator(X_full.shape[1])
Gf, Df = train_cramer_gan(Gf, Df, full_loader, EPOCHS)

# Generate final synthetic data
n_final = int(SYN_RATIO * len(X_full))
Xf_syn = generate_synthetic(Gf, n_final)
yf_syn = np.random.choice(y_full, size=n_final, replace=True)

# Save to CSV
syn_df = pd.DataFrame(Xf_syn, columns=NUMERIC_COLS)
syn_df[TARGET_COL] = yf_syn
out_path = "synthetic_chess_final.csv"
syn_df.to_csv(out_path, index=False)
print(f"\n✅ Final synthetic dataset saved to: {out_path}")


Epoch 1/100 - D_loss=1.2406  G_loss=0.6604
Epoch 20/100 - D_loss=0.9899  G_loss=1.8146
Epoch 40/100 - D_loss=0.4565  G_loss=5.3574
Epoch 60/100 - D_loss=0.0303  G_loss=6.0340
Epoch 80/100 - D_loss=0.3548  G_loss=10.2845
Epoch 100/100 - D_loss=0.0724  G_loss=6.3636

✅ Final synthetic dataset saved to: synthetic_chess_final.csv
