# **CRGAN Model Architecture**

In [None]:


import torch
import torch.nn as nn
import torch.optim as optim


#Device configuration

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")



# Generator definition
class Generator(nn.Module):
    """Fully connected generator that maps latent noise to samples.

    The network is an MLP (latent_dim -> 256 -> 512 -> 256 -> output_dim)
    with in-place ReLU after each hidden layer and a final Tanh, so every
    output component lies in [-1, 1].
    """

    def __init__(self, latent_dim: int, output_dim: int):
        """
        Args:
            latent_dim: size of the input noise vector
            output_dim: dimensionality of each generated sample
        """
        super().__init__()
        # Hidden widths are listed once; consecutive pairs become Linear layers.
        widths = (latent_dim, 256, 512, 256)
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU(inplace=True))
        # Output head: project to the sample dimension and squash with Tanh.
        layers.append(nn.Linear(widths[-1], output_dim))
        layers.append(nn.Tanh())
        self.net = nn.Sequential(*layers)

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        """
        z: noise tensor of shape (batch_size, latent_dim)
        returns: generated samples of shape (batch_size, output_dim)
        """
        samples = self.net(z)
        return samples



#Discriminator definition

class Discriminator(nn.Module):
    """Fully connected discriminator that scores samples as real or fake.

    The network is an MLP (input_dim -> 512 -> 256 -> 128 -> 1) with in-place
    ReLU after each hidden layer and a final Sigmoid, so the output is a
    value in [0, 1] per sample.
    """

    def __init__(self, input_dim: int):
        """
        Args:
            input_dim: dimensionality of each sample (real or fake)
        """
        super().__init__()
        # Hidden widths are listed once; consecutive pairs become Linear layers.
        widths = (input_dim, 512, 256, 128)
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU(inplace=True))
        # Output head: a single logit squashed to a probability.
        layers.append(nn.Linear(widths[-1], 1))
        layers.append(nn.Sigmoid())
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: tensor of shape (batch_size, input_dim)
        returns: probability of realness, shape (batch_size, 1)
        """
        realness = self.net(x)
        return realness


# Training loop (named for Cramér GAN; the objective implemented below is the standard BCE GAN loss)
def train_cramer_gan(
    G: Generator,
    D: Discriminator,
    loader: torch.utils.data.DataLoader,
    *,
    latent_dim: int = 100,
    epochs: int = 100,
    lr: float = 2e-4
) -> Generator:
    """
    Train the Generator G and Discriminator D adversarially.

    Both models are moved to the module-level `device` and updated in place
    with Adam. Each batch performs one discriminator step (real samples
    labelled 1, detached fake samples labelled 0) followed by one generator
    step (fresh fake samples labelled 1).

    NOTE: despite the function name, the objective used here is binary
    cross-entropy on the discriminator output (`nn.BCELoss`); no Cramér
    distance term is computed.

    Args:
      G: Generator instance
      D: Discriminator instance
      loader: DataLoader yielding (real_batch, _)
      latent_dim: dimension of noise vector
      epochs: number of training epochs
      lr: learning rate for both optimizers

    Returns:
      The trained Generator G (the same object that was passed in). D is
      trained in place as well but is not returned.
    """
    G.to(device)
    D.to(device)
    opt_g = optim.Adam(G.parameters(), lr=lr)
    opt_d = optim.Adam(D.parameters(), lr=lr)
    criterion = nn.BCELoss()

    for ep in range(1, epochs + 1):
        # Reset per epoch so the logging below never reads an unbound name
        # (empty loader) or a stale loss carried over from an earlier epoch.
        loss_d = loss_g = None

        for real_batch, _ in loader:
            real_batch = real_batch.to(device).float()
            bsz = real_batch.size(0)

            # --- Discriminator step ---
            opt_d.zero_grad()
            # Real
            d_real = D(real_batch)
            # Fake: detach so this step does not backpropagate into G.
            z = torch.randn(bsz, latent_dim, device=device)
            fake = G(z).detach()
            d_fake = D(fake)
            loss_d = criterion(d_real, torch.ones_like(d_real)) + \
                     criterion(d_fake, torch.zeros_like(d_fake))
            loss_d.backward()
            opt_d.step()

            # --- Generator step ---
            # Fresh noise; G is rewarded when D labels its samples as real.
            opt_g.zero_grad()
            z2 = torch.randn(bsz, latent_dim, device=device)
            fake2 = G(z2)
            d_fake2 = D(fake2)
            loss_g = criterion(d_fake2, torch.ones_like(d_fake2))
            loss_g.backward()
            opt_g.step()

        # optional logging (losses are those of the last batch in the epoch)
        if ep == 1 or ep % 20 == 0 or ep == epochs:
            if loss_d is None or loss_g is None:
                print(f"Epoch {ep}/{epochs}  no batches were produced by the loader; nothing was trained")
            else:
                print(f"Epoch {ep}/{epochs}  D_loss={loss_d.item():.4f}  G_loss={loss_g.item():.4f}")

    return G



#Synthetic data generation

def generate_synthetic(
    G: Generator,
    n_samples: int,
    *,
    latent_dim: int = 100
) -> torch.Tensor:
    """
    Draw synthetic samples from a trained Generator.

    The generator is moved to the module-level `device` and switched to
    eval mode; it is left in that state when the function returns.

    Args:
      G: trained Generator
      n_samples: number of synthetic samples to generate
      latent_dim: dimension of noise vector

    Returns:
      A tensor of shape (n_samples, output_dim) on CPU.
    """
    G.to(device)
    G.eval()
    # Pure inference: no autograd graph is needed for sampling.
    with torch.no_grad():
        noise = torch.randn(n_samples, latent_dim, device=device)
        generated = G(noise)
        return generated.cpu()


# **CRGAN Training Procedure**

In [None]:
from torch.utils.data import DataLoader, TensorDataset
import torch
import pandas as pd

from cr_gan import Generator, Discriminator, train_cramer_gan, generate_synthetic

# Load your preprocessed DataFrame df (must include 'target' column)
df = pd.read_csv("preprocessed_data.csv")

# Create DataLoader over real features (ignore target).
# The second tensor is a dummy label so each batch is a (features, label) pair.
# NOTE(review): if the Generator ends in Tanh (as the architecture cell in this
# file does), features should be scaled to [-1, 1] — confirm the preprocessing.
features = df.drop(columns="target")
X = features.values.astype("float32")
loader = DataLoader(
    TensorDataset(torch.from_numpy(X), torch.zeros(len(X))),
    batch_size=64,
    shuffle=True,
)

# Instantiate models
G = Generator(latent_dim=100, output_dim=X.shape[1])
D = Discriminator(input_dim=X.shape[1])

# Train GAN
G_trained = train_cramer_gan(G, D, loader, latent_dim=100, epochs=100)

# Generate synthetic samples (half as many rows as the real data)
n_syn = int(0.5 * len(X))
X_synth = generate_synthetic(G_trained, n_samples=n_syn, latent_dim=100).numpy()

# Reattach target.
# Labels are drawn with replacement from the real label column. pandas is used
# for the draw because numpy is not imported in this cell.
# NOTE(review): the labels are sampled independently of the generated features,
# so synthetic rows carry no feature-to-target relationship.
y_real = df["target"].values
y_synth = df["target"].sample(n=n_syn, replace=True).to_numpy()
# Use the exact feature columns X was built from, so names stay correct even
# when 'target' is not the last column of df.
synth_df = pd.DataFrame(X_synth, columns=features.columns)
synth_df["target"] = y_synth

# Save or pass to downstream code
synth_df.to_csv("synthetic_data.csv", index=False)


## **CRGAN Model Evaluation**

In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model    import LogisticRegression
from sklearn.neural_network  import MLPClassifier
from sklearn.ensemble        import RandomForestClassifier
from xgboost                 import XGBClassifier
from sklearn.metrics         import accuracy_score, roc_auc_score
from scipy.stats             import entropy, wasserstein_distance


#Helper Functions for JSD & Wasserstein Distance
def compute_jsd(real: pd.DataFrame, synth: pd.DataFrame, bins: int = 20) -> float:
    """Average Jensen–Shannon divergence over all columns of `real`.

    For each column, both samples are binned on a shared grid of `bins`
    equal-width bins spanning their combined range. The bin counts are
    turned into probability masses, lightly smoothed, and compared with the
    Jensen–Shannon divergence (natural log, so each value is at most ln 2).

    Args:
        real: real data; its columns define which columns are compared.
        synth: synthetic data; must contain every column of `real`.
        bins: number of equal-width histogram bins per column.

    Returns:
        The mean per-column divergence as a Python float.
    """
    # Smoothing is applied to probability masses (not densities) so that its
    # effect does not depend on the numeric scale of the column.
    eps = 1e-8
    jsd_vals = []
    for col in real.columns:
        r = real[col].to_numpy()
        s = synth[col].to_numpy()
        mn, mx = min(r.min(), s.min()), max(r.max(), s.max())
        if mn == mx:
            # Both samples hold the same single value: the distributions are
            # identical, and zero-width bins would otherwise produce NaN.
            jsd_vals.append(0.0)
            continue
        # build histograms on a shared grid
        edges = np.linspace(mn, mx, bins + 1)
        counts_r, _ = np.histogram(r, bins=edges)
        counts_s, _ = np.histogram(s, bins=edges)
        # avoid zero-bins
        p_r = counts_r / counts_r.sum() + eps
        p_s = counts_s / counts_s.sum() + eps
        m = 0.5 * (p_r + p_s)
        # scipy's entropy(pk, qk) normalises both inputs and returns KL(pk || qk)
        jsd_vals.append(0.5 * (entropy(p_r, m) + entropy(p_s, m)))
    return float(np.mean(jsd_vals))

def compute_wd(real: pd.DataFrame, synth: pd.DataFrame) -> float:
    """Mean 1D Wasserstein (Earth-Mover) distance across the columns of `real`.

    Each column of `real` is compared with the same-named column of `synth`,
    and the per-column distances are averaged into one float.
    """
    per_column = [
        wasserstein_distance(real[name].to_numpy(), synth[name].to_numpy())
        for name in real.columns
    ]
    return float(np.mean(per_column))


# Load the real & synthetic datasets
# NOTE(review): these paths differ from the files the training cell reads and
# writes ("preprocessed_data.csv" / "synthetic_data.csv"), and the synthetic
# file name mentions TabPFN rather than CRGAN — confirm they are the intended
# inputs for this evaluation.

real_path  = "adult_preprocessed.csv"           # original preprocessed file
synth_path = "adult_tabpfn_synthetic.csv"      # synthetic data generated

df_real  = pd.read_csv(real_path)
df_synth = pd.read_csv(synth_path)

# Separate features & target
X_real, y_real   = df_real.drop("target", axis=1), df_real["target"]
X_synth, y_synth = df_synth.drop("target", axis=1), df_synth["target"]

# Align synthetic features to the real column order so that models fitted on
# X_synth see the same feature layout when they predict on X_real.
X_synth = X_synth[X_real.columns]

# Distributional fidelity: JSD & WD

jsd_score = compute_jsd(X_real, X_synth)
wd_score  = compute_wd(X_real, X_synth)

# TSTR: train-on-synthetic, test-on-real with 4 classifiers

# Fixed seed so the stochastic learners give the same table on every run.
SEED = 42

classifiers = {
    "LR":   LogisticRegression(max_iter=1000),
    "MLP":  MLPClassifier(hidden_layer_sizes=(128,64), max_iter=500, random_state=SEED),
    "RF":   RandomForestClassifier(n_estimators=200, random_state=SEED),
    "XGBT": XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

tstr_results = {}
for name, clf in classifiers.items():
    clf.fit(X_synth, y_synth)                         # train on synthetic
    y_pred = clf.predict(X_real)                      # test on real
    acc = accuracy_score(y_real, y_pred)
    # Column 1 of predict_proba is used as the positive-class score; this
    # presumes a binary target — verify against the dataset.
    auc = roc_auc_score(y_real, clf.predict_proba(X_real)[:,1])
    tstr_results[name] = {"Accuracy": acc, "AUC": auc}


print("\nDistributional Metrics:")
print(f"  • Jensen–Shannon Divergence (JSD): {jsd_score:.4f}")
print(f"  • Wasserstein Distance (WD):      {wd_score:.4f}")

print("\nTSTR Results (train on synthetic → test on real):")
print(f"{'Model':<6}  {'Accuracy':>8}   {'AUC':>6}")
print("-" * 26)
for name, metrics in tstr_results.items():
    print(f"{name:<6}  {metrics['Accuracy']*100:8.2f}%   {metrics['AUC']:6.3f}")
