In [1]:
%pip install xgboost

import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from torch.autograd import grad as torch_grad
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from scipy.stats import entropy
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time
from tqdm import tqdm
from google.colab import drive

drive.mount('/content/drive/')
%cd /content/drive/MyDrive/Colab Notebooks/Katabatic/CrGAN/adult/

# Suppress warnings
warnings.filterwarnings("ignore")

# Setting random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Check if CUDA is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Mounted at /content/drive/
/content/drive/MyDrive/Colab Notebooks/Katabatic/CrGAN/adult
Using device: cuda:0


In [2]:
# Load and preprocess data
def load_data(train_path, test_path):
    # Load data
    df_train = pd.read_csv(train_path)
    df_test = pd.read_csv(test_path)

    # Replace spaces in column names
    df_train.columns = [col.strip().replace(' ', '') for col in df_train.columns]
    df_test.columns = [col.strip().replace(' ', '') for col in df_test.columns]

    # Define categorical and numerical columns
    categorical_cols = ['workclass', 'education', 'marital.status', 'occupation',
                       'relationship', 'race', 'sex', 'native.country']
    numerical_cols = ['age', 'fnlwgt', 'education.num', 'capital.gain',
                     'capital.loss', 'hours.per.week']

    # Process target variable
    df_train['income'] = df_train['income'].map({' <=50K': 0, ' >50K': 1})
    df_test['income'] = df_test['income'].map({' <=50K': 0, ' >50K': 1})

    # Create preprocessing pipeline
    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Split features and target
    X_train = df_train.drop('income', axis=1)
    y_train = df_train['income']
    X_test = df_test.drop('income', axis=1)
    y_test = df_test['income']

    # Fit and transform the data
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)

    # Get output feature names
    cat_feature_names = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_cols)
    all_feature_names = list(numerical_cols) + list(cat_feature_names)

    print(f"Training data shape: {X_train_transformed.shape}")
    print(f"Testing data shape: {X_test_transformed.shape}")

    return X_train_transformed, y_train, X_test_transformed, y_test, preprocessor, all_feature_names

# Custom dataset class
class AdultDataset(Dataset):
    def __init__(self, features, labels=None):
        self.features = torch.tensor(features, dtype=torch.float32)
        if labels is not None:
            self.labels = torch.tensor(labels, dtype=torch.float32).view(-1, 1)
        else:
            self.labels = None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        else:
            return self.features[idx]

# Generator Network
class Generator(nn.Module):
    def __init__(self, latent_dim, output_dim):
        super(Generator, self).__init__()
        self.latent_dim = latent_dim
        self.output_dim = output_dim

        self.model = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2),

            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(0.2),

            nn.Linear(512, 1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(0.2),

            nn.Linear(1024, output_dim),
            nn.Tanh()  # Output layer - maps to (-1, 1) range
        )

    def forward(self, z):
        return self.model(z)

# Critic Network (Discriminator)
class Critic(nn.Module):
    def __init__(self, input_dim):
        super(Critic, self).__init__()
        self.input_dim = input_dim

        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),

            nn.Linear(512, 256),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),

            nn.Linear(128, 1)
        )

    def forward(self, x):
        return self.model(x)

# Cramer GAN Implementation
class CramerGAN:
    def __init__(self, data_dim, latent_dim=100, critic_iterations=5, lambda_gp=10):
        self.data_dim = data_dim
        self.latent_dim = latent_dim
        self.critic_iterations = critic_iterations
        self.lambda_gp = lambda_gp

        # Initialize networks
        self.generator = Generator(latent_dim, data_dim).to(device)
        self.critic = Critic(data_dim).to(device)

        # Setup optimizers
        self.g_optimizer = optim.Adam(self.generator.parameters(), lr=0.0002, betas=(0.5, 0.9))
        self.c_optimizer = optim.Adam(self.critic.parameters(), lr=0.0002, betas=(0.5, 0.9))

        # Initialize loss tracking
        self.g_losses = []
        self.c_losses = []

    def _critic_train_iteration(self, real_data, batch_size):
        # Generate random noise
        noise = torch.randn(batch_size, self.latent_dim).to(device)

        # Generate fake data
        fake_data = self.generator(noise)

        # Get critic outputs
        critic_real = self.critic(real_data)
        critic_fake = self.critic(fake_data)

        # Calculate Cramer distance
        critic_real2 = self.critic(torch.roll(real_data, shifts=1, dims=0))
        critic_fake2 = self.critic(torch.roll(fake_data, shifts=1, dims=0))

        # Cramer GAN loss function
        c_loss = torch.mean(critic_real - critic_fake) - 0.5 * torch.mean(torch.pow(critic_real - critic_real2, 2)) + 0.5 * torch.mean(torch.pow(critic_fake - critic_fake2, 2))

        # Calculate gradient penalty
        alpha = torch.rand(batch_size, 1).to(device)
        interpolates = alpha * real_data + ((1 - alpha) * fake_data)
        interpolates.requires_grad_(True)

        critic_interpolates = self.critic(interpolates)
        gradients = torch_grad(outputs=critic_interpolates, inputs=interpolates,
                              grad_outputs=torch.ones_like(critic_interpolates).to(device),
                              create_graph=True, retain_graph=True)[0]

        gradients = gradients.view(batch_size, -1)
        gradient_penalty = self.lambda_gp * ((gradients.norm(2, dim=1) - 1) ** 2).mean()

        # Update critic
        self.c_optimizer.zero_grad()
        c_loss_total = c_loss + gradient_penalty
        c_loss_total.backward()
        self.c_optimizer.step()

        return c_loss_total.item()

    def _generator_train_iteration(self, batch_size):
        # Generate random noise
        noise = torch.randn(batch_size, self.latent_dim).to(device)

        # Generate fake data
        fake_data = self.generator(noise)

        # Calculate critic outputs
        critic_fake = self.critic(fake_data)
        critic_fake2 = self.critic(torch.roll(fake_data, shifts=1, dims=0))

        # Generator loss is negative of critic loss
        g_loss = -torch.mean(critic_fake) + 0.5 * torch.mean(torch.pow(critic_fake - critic_fake2, 2))

        # Update generator
        self.g_optimizer.zero_grad()
        g_loss.backward()
        self.g_optimizer.step()

        return g_loss.item()

    def train(self, data_loader, epochs, save_interval=10, verbose=True):
        for epoch in range(epochs):
            epoch_start_time = time.time()
            c_loss_total = 0
            g_loss_total = 0
            num_batches = 0

            for i, (real_data, _) in enumerate(data_loader):
                batch_size = real_data.size(0)
                real_data = real_data.to(device)

                # Train critic
                for _ in range(self.critic_iterations):
                    c_loss = self._critic_train_iteration(real_data, batch_size)
                c_loss_total += c_loss

                # Train generator
                g_loss = self._generator_train_iteration(batch_size)
                g_loss_total += g_loss

                num_batches += 1

            # Calculate average loss for the epoch
            c_loss_avg = c_loss_total / num_batches
            g_loss_avg = g_loss_total / num_batches

            self.c_losses.append(c_loss_avg)
            self.g_losses.append(g_loss_avg)

            epoch_time = time.time() - epoch_start_time

            if verbose and (epoch % save_interval == 0 or epoch == epochs - 1):
                print(f"Epoch [{epoch+1}/{epochs}] | Critic Loss: {c_loss_avg:.4f} | Generator Loss: {g_loss_avg:.4f} | Time: {epoch_time:.2f}s")

    def generate_samples(self, num_samples):
        self.generator.eval()
        noise = torch.randn(num_samples, self.latent_dim).to(device)
        with torch.no_grad():
            generated_data = self.generator(noise).cpu().numpy()
        self.generator.train()
        return generated_data

    def save_model(self, path):
        torch.save({
            'generator_state_dict': self.generator.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'g_optimizer_state_dict': self.g_optimizer.state_dict(),
            'c_optimizer_state_dict': self.c_optimizer.state_dict(),
        }, path)

    def load_model(self, path):
        checkpoint = torch.load(path)
        self.generator.load_state_dict(checkpoint['generator_state_dict'])
        self.critic.load_state_dict(checkpoint['critic_state_dict'])
        self.g_optimizer.load_state_dict(checkpoint['g_optimizer_state_dict'])
        self.c_optimizer.load_state_dict(checkpoint['c_optimizer_state_dict'])



In [3]:
# 1. Machine Learning Utility (TSTR)
def evaluate_tstr(real_data, synthetic_data, real_labels, random_state=42):
    """
    Train classifiers on synthetic data and test on real data (TSTR)
    Returns accuracy for each classifier
    """
    # Train-test split for real data
    X_train, X_test, y_train, y_test = train_test_split(
        real_data, real_labels, test_size=0.2, random_state=random_state
    )

    # Synthetic data (all used for training)
    X_synth = synthetic_data

    # Ensure proper dimensions for labels
    if isinstance(y_train, pd.Series):
        y_train = y_train.values

    # Create synthetic labels based on real distribution
    np.random.seed(random_state)
    y_synth = np.random.choice([0, 1], size=len(X_synth), p=[
                               1-y_train.mean(), y_train.mean()])

    # Define classifiers
    classifiers = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=random_state),
        'MLP': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=random_state),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=random_state),
        'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=random_state)
    }

    results = {}

    for name, clf in classifiers.items():
        # Train on synthetic data
        clf.fit(X_synth, y_synth)

        # Test on real data
        y_pred = clf.predict(X_test)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        results[name] = {
            'accuracy': accuracy,
            'f1_score': f1
        }

    return results

# 2. Statistical Similarity


def jensen_shannon_divergence(p, q):
    """
    Calculate Jensen-Shannon Divergence between distributions p and q
    """
    # Ensure p and q are normalized
    p = p / np.sum(p)
    q = q / np.sum(q)

    m = 0.5 * (p + q)

    # Calculate JSD
    jsd = 0.5 * (entropy(p, m) + entropy(q, m))

    return jsd


def wasserstein_distance(p, q):
    """
    Calculate 1D Wasserstein distance (Earth Mover's Distance)
    """
    from scipy.stats import wasserstein_distance

    return wasserstein_distance(p, q)


def evaluate_statistical_similarity(real_data, synthetic_data, feature_names):
    """
    Calculate statistical similarity metrics between real and synthetic data
    """
    results = {'JSD': {}, 'WD': {}}

    # Calculate metrics for each feature
    for i in range(real_data.shape[1]):
        feature_name = feature_names[i] if i < len(
            feature_names) else f"feature_{i}"

        # Get feature values
        real_values = real_data[:, i]
        synth_values = synthetic_data[:, i]

        # Calculate histogram (discrete distribution)
        hist_bins = min(50, len(np.unique(real_values)))

        hist_real, bin_edges = np.histogram(
            real_values, bins=hist_bins, density=True)
        hist_synth, _ = np.histogram(
            synth_values, bins=bin_edges, density=True)

        # Add a small epsilon to avoid division by zero
        epsilon = 1e-10
        hist_real = hist_real + epsilon
        hist_synth = hist_synth + epsilon

        # Calculate JSD
        jsd = jensen_shannon_divergence(hist_real, hist_synth)
        results['JSD'][feature_name] = jsd

        # Calculate Wasserstein Distance
        wd = wasserstein_distance(real_values, synth_values)
        results['WD'][feature_name] = wd

    # Calculate average metrics
    results['JSD_avg'] = np.mean(list(results['JSD'].values()))
    results['WD_avg'] = np.mean(list(results['WD'].values()))

    return results


def plot_loss_curves(model):
    """
    Plot the loss curves for the generator and critic
    """
    plt.figure(figsize=(10, 5))
    plt.plot(model.g_losses, label='Generator Loss')
    plt.plot(model.c_losses, label='Critic Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('CramerGAN Training Loss')
    plt.legend()
    plt.grid(True)
    plt.savefig('crgan_loss_curves.png')
    plt.close()


def plot_feature_distributions(real_data, synthetic_data, feature_names, n_features=5):
    """
    Plot distributions of real vs synthetic data for selected features
    """
    if n_features > len(feature_names):
        n_features = len(feature_names)

    # Select a subset of features to visualize
    selected_indices = np.random.choice(
        range(len(feature_names)), size=n_features, replace=False)

    plt.figure(figsize=(15, 10))
    for i, idx in enumerate(selected_indices):
        feature_name = feature_names[idx]

        plt.subplot(n_features, 1, i+1)

        # Get feature values
        real_values = real_data[:, idx]
        synth_values = synthetic_data[:, idx]

        # Plot histograms
        sns.histplot(real_values, kde=True, stat="density",
                     label="Real", alpha=0.6, color="blue")
        sns.histplot(synth_values, kde=True, stat="density",
                     label="Synthetic", alpha=0.6, color="red")

        plt.title(f"Distribution for {feature_name}")
        plt.legend()

    plt.tight_layout()
    plt.savefig('feature_distributions.png')
    plt.close()

# Main function


def main():
    # File paths
    train_path = "data/adult-train.csv"
    test_path = "data/adult-test.csv"

    # Load and preprocess data
    X_train, y_train, X_test, y_test, preprocessor, feature_names = load_data(
        train_path, test_path)

    # Create dataset and dataloader
    train_dataset = AdultDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

    # Initialize and train the model
    data_dim = X_train.shape[1]
    latent_dim = 100

    print(f"Data dimension: {data_dim}")
    print(f"Latent dimension: {latent_dim}")

    crgan = CramerGAN(data_dim, latent_dim)

    # Train the model
    epochs = 300
    print(f"Training CramerGAN for {epochs} epochs...")
    crgan.train(train_loader, epochs, save_interval=10)

    # Save the model
    crgan.save_model('crgan_model.pt')

    # Plot loss curves
    plot_loss_curves(crgan)

    # Generate synthetic data
    num_samples = 1000
    print(f"Generating {num_samples} synthetic samples...")
    synthetic_data = crgan.generate_samples(num_samples)

    # Statistical similarity evaluation
    print("Evaluating statistical similarity...")
    stat_results = evaluate_statistical_similarity(
        X_train, synthetic_data, feature_names)

    print("\nJensen-Shannon Divergence (average):", stat_results['JSD_avg'])
    print("Wasserstein Distance (average):", stat_results['WD_avg'])

    # Machine Learning Utility (TSTR) evaluation
    print("\nEvaluating Machine Learning Utility (TSTR)...")
    tstr_results = evaluate_tstr(X_train, synthetic_data, y_train)

    print("\nTSTR Results:")
    for clf_name, metrics in tstr_results.items():
        print(
            f"{clf_name}: Accuracy = {metrics['accuracy']:.4f}, F1 Score = {metrics['f1_score']:.4f}")

    # Plot feature distributions
    plot_feature_distributions(X_train, synthetic_data, feature_names)

    print("\nEvaluation complete! Check the output directory for plots and saved model.")



In [4]:
if __name__ == "__main__":
    main()

Training data shape: (32561, 108)
Testing data shape: (16281, 108)
Data dimension: 108
Latent dimension: 100
Training CramerGAN for 300 epochs...
Epoch [1/300] | Critic Loss: -20886.9372 | Generator Loss: -0.7912 | Time: 26.98s
Epoch [11/300] | Critic Loss: -107412639743100.4844 | Generator Loss: 7836976108.9255 | Time: 12.70s
Epoch [21/300] | Critic Loss: -23363338636110892.0000 | Generator Loss: 1318491068054.5881 | Time: 12.91s
Epoch [31/300] | Critic Loss: -578347327770891392.0000 | Generator Loss: 40657577346180.5156 | Time: 12.70s
Epoch [41/300] | Critic Loss: -5402991783680845824.0000 | Generator Loss: 384980910632044.4375 | Time: 12.75s
Epoch [51/300] | Critic Loss: -33389915447510511616.0000 | Generator Loss: 2716727609663520.0000 | Time: 13.13s
Epoch [61/300] | Critic Loss: -141076770875941306368.0000 | Generator Loss: 11323803380350462.0000 | Time: 12.72s
Epoch [71/300] | Critic Loss: -473359144968169586688.0000 | Generator Loss: 45362624376342536.0000 | Time: 12.62s
Epoch [