In [1]:
# --- Data Preprocessing ---
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load car dataset
df = pd.read_csv("car-official.csv")

# Encode target column
le = LabelEncoder()
df['class'] = le.fit_transform(df['class'])

# Categorical columns (all except 'class')
cat_cols = [col for col in df.columns if col != 'class']

# One-hot encode features
df_cat = pd.get_dummies(df[cat_cols], drop_first=False).astype(int)

# Target column
df_target = df[['class']]

# Record group sizes for GAN output splitting (if needed later)
cat_group_sizes = [
    len(pd.get_dummies(df[col], drop_first=False).columns)
    for col in cat_cols
]

# Combine features and target
df_processed = pd.concat([df_cat, df_target], axis=1)

# Save preprocessed data
out_path = "preprocessed_car.csv"
df_processed.to_csv(out_path, index=False)
print("✅ Preprocessed data saved to:", out_path)


✅ Preprocessed data saved to: preprocessed_car.csv


In [2]:
# --- GAN Training & Evaluation ---

# Installs & Imports
# !pip install torch torchvision scipy scikit-learn xgboost --quiet

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from scipy.stats import wasserstein_distance
from scipy.spatial.distance import jensenshannon
import numpy as np
import pandas as pd

# Hyperparameters
PREPROCESSED_PATH = "preprocessed_car.csv"
LATENT_DIM        = 100
BATCH_SIZE        = 64
EPOCHS            = 100
REPEATS           = 3
FOLDS             = 2
SYN_RATIO         = 0.5
NUMERIC_COLS      = []  # No numeric columns
TARGET_COL        = "class"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class Generator(nn.Module):
    def __init__(self, z_dim, out_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(z_dim,256), nn.ReLU(),
            nn.Linear(256,512),   nn.ReLU(),
            nn.Linear(512,256),   nn.ReLU(),
            nn.Linear(256,out_dim),
            nn.Tanh()
        )
    def forward(self, z):
        return self.net(z)

class Discriminator(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim,512), nn.ReLU(),
            nn.Linear(512,256),   nn.ReLU(),
            nn.Linear(256,128),   nn.ReLU(),
            nn.Linear(128,1),     nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

def train_cramer_gan(G, D, loader, epochs):
    G, D = G.to(device), D.to(device)
    optg = optim.Adam(G.parameters(), lr=2e-4)
    optd = optim.Adam(D.parameters(), lr=2e-4)
    loss_fn = nn.BCELoss()
    for ep in range(1, epochs+1):
        for real_batch, _ in loader:
            real_batch = real_batch.to(device)
            bsz = real_batch.size(0)
            # — D step
            optd.zero_grad()
            z      = torch.randn(bsz, LATENT_DIM, device=device)
            fake   = G(z).detach()
            d_real = D(real_batch)
            d_fake = D(fake)
            lossd  = loss_fn(d_real, torch.ones_like(d_real)) + \
                     loss_fn(d_fake, torch.zeros_like(d_fake))
            lossd.backward();  optd.step()
            # — G step
            optg.zero_grad()
            z     = torch.randn(bsz, LATENT_DIM, device=device)
            fake2 = G(z)
            dg    = D(fake2)
            lossg = loss_fn(dg, torch.ones_like(dg))
            lossg.backward(); optg.step()
        if ep%20==0 or ep==1 or ep==epochs:
            print(f" Ep {ep}/{epochs}  D_loss={lossd.item():.4f}  G_loss={lossg.item():.4f}")
    return G, D

def generate_synthetic(G, n_samples):
    G = G.to(device).eval()
    with torch.no_grad():
        z    = torch.randn(n_samples, LATENT_DIM, device=device)
        data = G(z).cpu().numpy()
    return data

def compute_tstr_all(X_real, y_real, X_syn, y_syn):
    # Train each classifier on synthetic → score on real
    results = {}
    for name, clf in [
        ("LR",  LogisticRegression(max_iter=5000)),
        ("MLP", MLPClassifier(hidden_layer_sizes=(128,64), max_iter=1000)),
        ("RF",  RandomForestClassifier(n_estimators=200)),
        ("XGB", XGBClassifier(eval_metric="logloss"))
    ]:
        clf.fit(X_syn, y_syn)
        results[name] = clf.score(X_real, y_real)*100.0
    return results

def compute_jsd_wd(X_real, X_syn, num_idx):
    if not num_idx:
        return 0.0, 0.0  # explicitly return zero if no numeric features
    jsd_list, wd_list = [], []
    for i in num_idx:
        p_real, _ = np.histogram(X_real[:, i], bins=50, density=True)
        p_syn, _ = np.histogram(X_syn[:, i], bins=50, density=True)
        jsd_list.append(jensenshannon(p_real, p_syn))
        wd_list.append(wasserstein_distance(X_real[:, i], X_syn[:, i]))
    return np.mean(jsd_list), np.mean(wd_list)

# Load Preprocessed Data
df = pd.read_csv(PREPROCESSED_PATH)
X_full = df.drop(columns=[TARGET_COL]).values.astype(np.float32)
y_full = df[TARGET_COL].values.astype(int)
num_idx = [df.columns.get_loc(c) for c in NUMERIC_COLS]

# 3×(2-Fold CV)
tstr_scores = {m:[] for m in ["LR","MLP","RF","XGB"]}
jsd_scores, wd_scores = [], []

kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
for rep in range(1, REPEATS+1):
    for fold,(train_idx, test_idx) in enumerate(kf.split(X_full),1):
        print(f"\n— Rep {rep}/{REPEATS}  Fold {fold}/{FOLDS} —")
        X_tr, X_te = X_full[train_idx], X_full[test_idx]
        y_tr, y_te = y_full[train_idx], y_full[test_idx]
        loader = DataLoader(
            TensorDataset(torch.from_numpy(X_tr), torch.from_numpy(y_tr)),
            batch_size=BATCH_SIZE, shuffle=True
        )
        # train
        G = Generator(LATENT_DIM, X_tr.shape[1])
        D = Discriminator(X_tr.shape[1])
        G, D = train_cramer_gan(G, D, loader, epochs=EPOCHS)
        # synth
        n_syn = int(SYN_RATIO * len(X_tr))
        X_syn = generate_synthetic(G, n_syn)
        y_syn = np.random.choice(y_tr, size=n_syn, replace=True)
        # metrics
        tstrs = compute_tstr_all(X_te, y_te, X_syn, y_syn)
        for m,sc in tstrs.items(): tstr_scores[m].append(sc)
        js, wd = compute_jsd_wd(X_te, X_syn, num_idx)
        jsd_scores.append(js);  wd_scores.append(wd)

# Report Results
print("\n=== CV Results (mean ± std) ===")
for m in ["LR","MLP","RF","XGB"]:
    arr = np.array(tstr_scores[m])
    print(f" • {m:4s} TSTR = {arr.mean():.2f}% ± {arr.std():.2f}%")
print(f" • JSD = {np.mean(jsd_scores):.4f} ± {np.std(jsd_scores):.4f}")
print(f" • WD  = {np.mean(wd_scores):.4f} ± {np.std(wd_scores):.4f}")

Using device: cpu

— Rep 1/3  Fold 1/2 —
 Ep 1/100  D_loss=1.2785  G_loss=0.6690
 Ep 20/100  D_loss=1.0441  G_loss=2.0299
 Ep 40/100  D_loss=1.2539  G_loss=2.2646
 Ep 60/100  D_loss=1.0598  G_loss=1.2805
 Ep 80/100  D_loss=1.1717  G_loss=2.0051
 Ep 100/100  D_loss=0.6089  G_loss=2.4384

— Rep 1/3  Fold 2/2 —
 Ep 1/100  D_loss=1.3130  G_loss=0.6244
 Ep 20/100  D_loss=0.7113  G_loss=2.5002
 Ep 40/100  D_loss=0.1990  G_loss=2.7012
 Ep 60/100  D_loss=0.4334  G_loss=1.8863
 Ep 80/100  D_loss=0.5669  G_loss=2.4862
 Ep 100/100  D_loss=0.7529  G_loss=1.3862

— Rep 2/3  Fold 1/2 —
 Ep 1/100  D_loss=1.3047  G_loss=0.6571
 Ep 20/100  D_loss=0.4094  G_loss=2.6086
 Ep 40/100  D_loss=1.6211  G_loss=1.5562
 Ep 60/100  D_loss=0.7200  G_loss=2.4813
 Ep 80/100  D_loss=0.5187  G_loss=2.7554
 Ep 100/100  D_loss=0.4123  G_loss=2.6142

— Rep 2/3  Fold 2/2 —
 Ep 1/100  D_loss=1.3148  G_loss=0.6733
 Ep 20/100  D_loss=0.5695  G_loss=1.9879
 Ep 40/100  D_loss=1.4279  G_loss=2.3275
 Ep 60/100  D_loss=1.6628  G_l

In [3]:
# Final Model Training & Save Synthetic Dataset

# Train on full dataset
full_loader = DataLoader(
    TensorDataset(torch.from_numpy(X_full), torch.from_numpy(y_full)),
    batch_size=BATCH_SIZE, shuffle=True
)
Gf = Generator(LATENT_DIM, X_full.shape[1])
Df = Discriminator(X_full.shape[1])
Gf, Df = train_cramer_gan(Gf, Df, full_loader, epochs=EPOCHS)

# Generate synthetic data
n_final = int(SYN_RATIO * len(X_full))
Xf_syn = generate_synthetic(Gf, n_final)
yf_syn = np.random.choice(y_full, size=n_final, replace=True)

# Save to CSV
cols = df.columns[:-1]
syn_df = pd.DataFrame(Xf_syn, columns=cols)
syn_df[TARGET_COL] = yf_syn
out_path = "synthetic_car_final.csv"
syn_df.to_csv(out_path, index=False)
print(f"\n✅ Saved final synthetic dataset ({n_final} rows) to:\n  {out_path}")


 Ep 1/100  D_loss=1.2215  G_loss=0.6590
 Ep 20/100  D_loss=0.3558  G_loss=2.3499
 Ep 40/100  D_loss=0.7506  G_loss=1.7841
 Ep 60/100  D_loss=1.0647  G_loss=1.9161
 Ep 80/100  D_loss=1.1154  G_loss=1.8145
 Ep 100/100  D_loss=0.3999  G_loss=2.9209

✅ Saved final synthetic dataset (864 rows) to:
  synthetic_car_final.csv
