In [1]:
%pip install xgboost

import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from scipy.stats import entropy
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time
from tqdm import tqdm
from google.colab import drive

drive.mount('/content/drive/')
%cd /content/drive/MyDrive/Colab Notebooks/Katabatic/MedGAN/adult/

# Suppress warnings
# NOTE: "ignore" with no category filter silences every warning for the rest
# of the session, not only the noisy ones.
warnings.filterwarnings("ignore")

# Setting random seeds for reproducibility
# Seeds PyTorch's and NumPy's global generators; both are relied on by the
# cells below (weight init / noise sampling and np.random.choice calls).
torch.manual_seed(42)
np.random.seed(42)

# Check if CUDA is available
# `device` is a module-level global read by the MedGAN class defined later.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Mounted at /content/drive/
/content/drive/MyDrive/Colab Notebooks/Katabatic/MedGAN/adult
Using device: cpu


In [2]:
# Load and preprocess data
def _encode_income(labels):
    """Encode raw income strings as 0 (<=50K) / 1 (>50K).

    Surrounding whitespace and a trailing '.' are removed before the lookup, so
    ' <=50K', '<=50K' and ' <=50K.' all map to 0.

    Raises:
        ValueError: if a label is not one of the two income classes. Without
            this check an unexpected spelling would silently become NaN.
    """
    normalized = labels.astype(str).str.strip().str.rstrip('.')
    encoded = normalized.map({'<=50K': 0, '>50K': 1})
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(normalized[unmapped].unique())
        raise ValueError(f"Unrecognised income labels: {unknown}")
    return encoded.astype(int)


def load_data(train_path, test_path):
    """Read the train/test CSV files and turn them into model-ready arrays.

    Numerical columns are standardised and categorical columns are one-hot
    encoded. The preprocessor is fitted on the training frame only and then
    applied to the test frame; categories unseen during fitting are encoded as
    all-zero rows (``handle_unknown='ignore'``).

    Args:
        train_path: path of the training CSV.
        test_path: path of the test CSV.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, preprocessor, feature_names)``
        where the X values are dense arrays, the y values are pandas Series of
        0/1 labels, ``preprocessor`` is the fitted ColumnTransformer and
        ``feature_names`` lists the output columns (numerical first, then the
        one-hot columns).

    Raises:
        ValueError: if the ``income`` column holds a value other than the two
            expected classes.
    """
    # Load data
    df_train = pd.read_csv(train_path)
    df_test = pd.read_csv(test_path)

    # Replace spaces in column names
    df_train.columns = [col.strip().replace(' ', '') for col in df_train.columns]
    df_test.columns = [col.strip().replace(' ', '') for col in df_test.columns]

    # Define categorical and numerical columns
    categorical_cols = ['workclass', 'education', 'marital.status', 'occupation',
                       'relationship', 'race', 'sex', 'native.country']
    numerical_cols = ['age', 'fnlwgt', 'education.num', 'capital.gain',
                     'capital.loss', 'hours.per.week']

    # Process target variable (tolerant of whitespace / trailing-period variants)
    df_train['income'] = _encode_income(df_train['income'])
    df_test['income'] = _encode_income(df_test['income'])

    # Create preprocessing pipeline
    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Split features and target
    X_train = df_train.drop('income', axis=1)
    y_train = df_train['income']
    X_test = df_test.drop('income', axis=1)
    y_test = df_test['income']

    # Fit on the training frame only, then reuse the fitted transform for test
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)

    # Get output feature names
    cat_feature_names = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_cols)
    all_feature_names = list(numerical_cols) + list(cat_feature_names)

    print(f"Training data shape: {X_train_transformed.shape}")
    print(f"Testing data shape: {X_test_transformed.shape}")

    return X_train_transformed, y_train, X_test_transformed, y_test, preprocessor, all_feature_names

# Custom dataset class
class AdultDataset(Dataset):
    """In-memory dataset of float32 feature rows with optional labels.

    Args:
        features: 2-D array-like (NumPy array, DataFrame, nested list) with one
            row per sample.
        labels: optional 1-D array-like with one label per sample. Stored as a
            float32 column tensor of shape ``(n, 1)``.

    Indexing returns ``(features[idx], labels[idx])`` when labels were given and
    ``features[idx]`` alone otherwise.

    Raises:
        ValueError: if ``labels`` does not contain one entry per feature row.
    """

    def __init__(self, features, labels=None):
        # Convert through NumPy first: handing a pandas object straight to
        # torch.tensor() treats it as a generic sequence, which can fail, e.g.
        # for a Series whose index is not 0..n-1 (after a split or filter).
        self.features = torch.tensor(np.asarray(features), dtype=torch.float32)
        if labels is not None:
            self.labels = torch.tensor(np.asarray(labels), dtype=torch.float32).view(-1, 1)
            if len(self.labels) != len(self.features):
                raise ValueError(
                    f"Got {len(self.features)} feature rows but {len(self.labels)} labels")
        else:
            self.labels = None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        else:
            return self.features[idx]

# Autoencoder for MedGAN
class Autoencoder(nn.Module):
    """Fully connected autoencoder used by MedGAN.

    The encoder maps ``input_dim -> hidden_dim -> latent_dim`` with BatchNorm1d
    and ReLU after each linear layer. The decoder maps
    ``latent_dim -> hidden_dim`` (BatchNorm1d, ReLU) ``-> input_dim`` and ends
    with a Sigmoid, so every reconstructed value lies in [0, 1].
    """

    def __init__(self, input_dim, hidden_dim=128, latent_dim=64):
        super().__init__()
        self.input_dim, self.hidden_dim, self.latent_dim = input_dim, hidden_dim, latent_dim

        # Encoder stack: data space -> latent code
        encoder_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.BatchNorm1d(latent_dim),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder stack: latent code -> data space, squashed by a sigmoid
        decoder_layers = [
            nn.Linear(latent_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def encode(self, x):
        """Return the latent code for a batch of inputs."""
        code = self.encoder(x)
        return code

    def decode(self, z):
        """Return the data-space reconstruction for a batch of latent codes."""
        reconstruction = self.decoder(z)
        return reconstruction

    def forward(self, x):
        """Encode then decode ``x``."""
        return self.decode(self.encode(x))

# Generator for MedGAN
class Generator(nn.Module):
    """MLP mapping a noise vector to a bounded code.

    Layout: ``latent_dim -> hidden_dim -> 2 * hidden_dim -> output_dim``. The
    two hidden layers use BatchNorm1d + ReLU; the final Tanh keeps every output
    in [-1, 1].
    """

    def __init__(self, latent_dim, hidden_dim=128, output_dim=64):
        super().__init__()
        self.latent_dim, self.hidden_dim, self.output_dim = latent_dim, hidden_dim, output_dim

        wide_dim = hidden_dim * 2
        layers = [
            # noise -> hidden
            nn.Linear(latent_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            # hidden -> widened hidden
            nn.Linear(hidden_dim, wide_dim),
            nn.BatchNorm1d(wide_dim),
            nn.ReLU(),
            # widened hidden -> output, bounded to [-1, 1]
            nn.Linear(wide_dim, output_dim),
            nn.Tanh(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, z):
        """Run a batch of noise vectors through the network."""
        generated = self.model(z)
        return generated

# Discriminator for MedGAN
class Discriminator(nn.Module):
    """MLP that scores a data-space sample with a value in [0, 1].

    Layout: ``input_dim -> hidden_dim -> hidden_dim // 2 -> 1``. Hidden layers
    use LeakyReLU(0.2) followed by Dropout(0.3); the output passes through a
    Sigmoid.
    """

    def __init__(self, input_dim, hidden_dim=128):
        super().__init__()
        self.input_dim = input_dim

        half_dim = hidden_dim // 2
        layers = [
            # first hidden layer
            nn.Linear(input_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            # second, narrower hidden layer
            nn.Linear(hidden_dim, half_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            # single sigmoid-activated score
            nn.Linear(half_dim, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return one score per row of ``x``."""
        score = self.model(x)
        return score

# MedGAN Implementation
class MedGAN:
    """Autoencoder plus latent-space GAN for generating synthetic rows.

    The generator maps Gaussian noise of size ``z_dim`` to a code of size
    ``latent_dim``; the autoencoder's decoder turns that code into a data-space
    sample, which is what the discriminator scores. All networks are created on
    the module-level ``device``.
    """

    def __init__(self, input_dim, hidden_dim=128, latent_dim=64, z_dim=128,
                 ae_pretrain_epochs=100, discriminator_steps=1, lambda_rec=0.2):
        """Build the three networks, their optimizers and the loss trackers.

        Args:
            input_dim: number of features per data row.
            hidden_dim: hidden width of the autoencoder and the generator.
            latent_dim: size of the autoencoder code (and generator output).
            z_dim: size of the generator's input noise vector.
            ae_pretrain_epochs: default epoch count for ``pretrain_autoencoder``.
            discriminator_steps: discriminator updates per generator update.
            lambda_rec: weight of the autoencoder reconstruction term added to
                the generator loss. That term does not depend on generator
                parameters, so it shifts the reported G loss but not the
                generator update.

        Raises:
            ValueError: if ``discriminator_steps`` is smaller than 1.
        """
        if discriminator_steps < 1:
            raise ValueError("discriminator_steps must be at least 1")

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim  # Autoencoder latent dimension
        self.z_dim = z_dim  # Generator input noise dimension
        self.ae_pretrain_epochs = ae_pretrain_epochs
        self.discriminator_steps = discriminator_steps
        self.lambda_rec = lambda_rec  # Weight for reconstruction loss

        # Initialize networks
        self.autoencoder = Autoencoder(input_dim, hidden_dim, latent_dim).to(device)
        self.generator = Generator(z_dim, hidden_dim, latent_dim).to(device)
        # hidden_dim is not forwarded here, so the discriminator keeps its own
        # default width; left as-is to stay compatible with saved checkpoints.
        self.discriminator = Discriminator(input_dim).to(device)

        # Setup optimizers
        self.ae_optimizer = optim.Adam(self.autoencoder.parameters(), lr=0.001)
        self.g_optimizer = optim.Adam(self.generator.parameters(), lr=0.0002, betas=(0.5, 0.999))
        self.d_optimizer = optim.Adam(self.discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))

        # Loss functions
        self.criterion_reconstruct = nn.MSELoss()
        self.criterion_gan = nn.BCELoss()

        # Per-epoch average losses, appended to by the training methods
        self.ae_losses = []
        self.g_losses = []
        self.d_losses = []

    def pretrain_autoencoder(self, data_loader, epochs=None):
        """Train the autoencoder alone on a reconstruction (MSE) objective.

        Args:
            data_loader: iterable yielding ``(features, labels)`` batches; the
                labels are ignored.
            epochs: number of epochs; defaults to ``self.ae_pretrain_epochs``.

        Raises:
            ValueError: if ``data_loader`` yields no batches.
        """
        if epochs is None:
            epochs = self.ae_pretrain_epochs

        print(f"Pretraining autoencoder for {epochs} epochs...")
        self.autoencoder.train()

        for epoch in range(epochs):
            epoch_loss = 0.0
            batch_count = 0
            for batch_data, _ in data_loader:
                batch_data = batch_data.to(device)

                # Forward pass
                reconstructed = self.autoencoder(batch_data)
                loss = self.criterion_reconstruct(reconstructed, batch_data)

                # Backward pass and optimize
                self.ae_optimizer.zero_grad()
                loss.backward()
                self.ae_optimizer.step()

                epoch_loss += loss.item()
                batch_count += 1

            if batch_count == 0:
                raise ValueError("data_loader produced no batches")

            # Record average epoch loss
            avg_loss = epoch_loss / batch_count
            self.ae_losses.append(avg_loss)

            if (epoch+1) % 10 == 0:
                print(f"Epoch [{epoch+1}/{epochs}], Reconstruction Loss: {avg_loss:.6f}")

    def train_gan(self, data_loader, epochs, save_interval=10):
        """Adversarially train the generator and discriminator.

        The autoencoder is used in eval mode and is never stepped here; only
        the generator and discriminator optimizers are updated.

        Args:
            data_loader: iterable yielding ``(features, labels)`` batches; the
                labels are ignored.
            epochs: number of passes over ``data_loader``.
            save_interval: print the average losses every this many epochs
                (and after the final epoch).

        Raises:
            ValueError: if ``data_loader`` yields no batches.
        """
        print(f"Training MedGAN for {epochs} epochs...")

        # Autoencoder in eval mode (fixed BatchNorm statistics); G and D train
        self.autoencoder.eval()
        self.generator.train()
        self.discriminator.train()

        # Target values for real and fake samples
        real_label = 1.0
        fake_label = 0.0

        for epoch in range(epochs):
            d_loss_sum = 0.0
            g_loss_sum = 0.0
            batch_count = 0

            for batch_data, _ in data_loader:
                batch_size = batch_data.size(0)
                batch_data = batch_data.to(device)
                labels_real = torch.full((batch_size, 1), real_label, device=device)
                labels_fake = torch.full((batch_size, 1), fake_label, device=device)

                # -----------------------
                # Train Discriminator
                # -----------------------
                for _ in range(self.discriminator_steps):
                    self.d_optimizer.zero_grad()

                    # Real data
                    output_real = self.discriminator(batch_data)
                    d_loss_real = self.criterion_gan(output_real, labels_real)

                    # Fake data: Generator -> Decoder -> Discriminator.
                    # No gradient is needed through G or the decoder for the
                    # discriminator update, so skip building that graph.
                    z = torch.randn(batch_size, self.z_dim, device=device)
                    with torch.no_grad():
                        fake_data = self.autoencoder.decode(self.generator(z))
                    output_fake = self.discriminator(fake_data)
                    d_loss_fake = self.criterion_gan(output_fake, labels_fake)

                    # Total discriminator loss
                    d_loss = d_loss_real + d_loss_fake
                    d_loss.backward()
                    self.d_optimizer.step()

                # -----------------------
                # Train Generator
                # -----------------------
                self.g_optimizer.zero_grad()

                # Generate fake data
                z = torch.randn(batch_size, self.z_dim, device=device)
                fake_data = self.autoencoder.decode(self.generator(z))

                # Adversarial loss: the generator wants fakes scored as real
                output_fake = self.discriminator(fake_data)
                g_loss_adv = self.criterion_gan(output_fake, labels_real)

                # Optional reconstruction term. It involves only the
                # autoencoder and real data, so it carries no gradient for the
                # generator; evaluating it under no_grad keeps the reported
                # loss unchanged without building an unused graph.
                if self.lambda_rec > 0:
                    with torch.no_grad():
                        rec_loss = self.criterion_reconstruct(
                            self.autoencoder(batch_data), batch_data)
                    g_loss = g_loss_adv + self.lambda_rec * rec_loss
                else:
                    g_loss = g_loss_adv

                g_loss.backward()
                self.g_optimizer.step()

                # Record statistics (d_loss is from the last discriminator step)
                d_loss_sum += d_loss.item()
                g_loss_sum += g_loss.item()
                batch_count += 1

            if batch_count == 0:
                raise ValueError("data_loader produced no batches")

            # Calculate average losses
            avg_d_loss = d_loss_sum / batch_count
            avg_g_loss = g_loss_sum / batch_count

            self.d_losses.append(avg_d_loss)
            self.g_losses.append(avg_g_loss)

            if (epoch+1) % save_interval == 0 or epoch == epochs-1:
                print(f"Epoch [{epoch+1}/{epochs}], D Loss: {avg_d_loss:.4f}, G Loss: {avg_g_loss:.4f}")

    def generate_samples(self, num_samples):
        """Return ``num_samples`` synthetic rows as a NumPy array.

        Switches the generator and autoencoder to eval mode and leaves them
        there.
        """
        self.generator.eval()
        self.autoencoder.eval()

        with torch.no_grad():
            # Generate random noise
            z = torch.randn(num_samples, self.z_dim).to(device)

            # Generate latent representations
            latent_fake = self.generator(z)

            # Decode to data space
            fake_data = self.autoencoder.decode(latent_fake).cpu().numpy()

        return fake_data

    def save_model(self, path):
        """Write the state of all three networks and optimizers to ``path``."""
        torch.save({
            'autoencoder_state_dict': self.autoencoder.state_dict(),
            'generator_state_dict': self.generator.state_dict(),
            'discriminator_state_dict': self.discriminator.state_dict(),
            'ae_optimizer_state_dict': self.ae_optimizer.state_dict(),
            'g_optimizer_state_dict': self.g_optimizer.state_dict(),
            'd_optimizer_state_dict': self.d_optimizer.state_dict(),
        }, path)

    def load_model(self, path):
        """Restore networks and optimizers from a file written by ``save_model``."""
        # map_location lets a checkpoint saved on a GPU runtime be restored on
        # a CPU-only runtime (and vice versa).
        checkpoint = torch.load(path, map_location=device)
        self.autoencoder.load_state_dict(checkpoint['autoencoder_state_dict'])
        self.generator.load_state_dict(checkpoint['generator_state_dict'])
        self.discriminator.load_state_dict(checkpoint['discriminator_state_dict'])
        self.ae_optimizer.load_state_dict(checkpoint['ae_optimizer_state_dict'])
        self.g_optimizer.load_state_dict(checkpoint['g_optimizer_state_dict'])
        self.d_optimizer.load_state_dict(checkpoint['d_optimizer_state_dict'])


In [3]:
# 1. Machine Learning Utility (TSTR)
def evaluate_tstr(real_data, synthetic_data, real_labels, random_state=42,
                  synthetic_labels=None):
    """
    Train classifiers on synthetic data and test on real data (TSTR).

    Args:
        real_data: real feature matrix; 20% of it is held out as the test set.
        synthetic_data: synthetic feature matrix, used in full for training.
        real_labels: 0/1 labels aligned with ``real_data``.
        random_state: seed for the split, the classifiers and the fallback
            label sampling.
        synthetic_labels: optional 0/1 labels aligned with ``synthetic_data``.
            Pass these whenever the generator produces labels. When omitted,
            labels are drawn at random from the real class prior. They are
            then independent of the synthetic features, so the scores are only
            a chance-level baseline and say nothing about the utility of the
            synthetic data.

    Returns:
        Dict mapping classifier name to ``{'accuracy': ..., 'f1_score': ...}``
        measured on the held-out real test split.

    Raises:
        ValueError: if ``synthetic_labels`` does not have one entry per
            synthetic row.
    """
    # Train-test split for real data
    X_train, X_test, y_train, y_test = train_test_split(
        real_data, real_labels, test_size=0.2, random_state=random_state
    )

    # Plain arrays avoid pandas index alignment surprises further down
    y_train = np.asarray(y_train)
    y_test = np.asarray(y_test)

    # Synthetic data (all used for training)
    X_synth = synthetic_data

    if synthetic_labels is not None:
        y_synth = np.asarray(synthetic_labels).ravel()
        if len(y_synth) != len(X_synth):
            raise ValueError(
                f"Got {len(X_synth)} synthetic rows but {len(y_synth)} synthetic labels")
    else:
        print("Note: no synthetic labels supplied; sampling them from the real class "
              "prior, so TSTR scores are a chance-level baseline.")
        # Legacy fallback: labels follow the real class prior only. The global
        # NumPy seed is reset here exactly as before.
        np.random.seed(random_state)
        positive_rate = y_train.mean()
        y_synth = np.random.choice([0, 1], size=len(X_synth), p=[
                                   1-positive_rate, positive_rate])

    # Define classifiers
    classifiers = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=random_state),
        'MLP': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=random_state),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=random_state),
        'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=random_state)
    }

    results = {}

    for name, clf in classifiers.items():
        # Train on synthetic data
        clf.fit(X_synth, y_synth)

        # Test on real data
        y_pred = clf.predict(X_test)

        # Calculate metrics
        results[name] = {
            'accuracy': accuracy_score(y_test, y_pred),
            'f1_score': f1_score(y_test, y_pred)
        }

    return results

# 2. Statistical Similarity
def jensen_shannon_divergence(p, q):
    """
    Jensen-Shannon divergence between two histograms p and q.

    Both inputs are rescaled to sum to one; the result is the mean of the two
    KL divergences to their midpoint mixture (natural logarithm).
    """
    # Rescale each input into a probability vector
    p_norm = p / np.sum(p)
    q_norm = q / np.sum(q)

    # Midpoint mixture of the two distributions
    mixture = (p_norm + q_norm) * 0.5

    kl_p = entropy(p_norm, mixture)
    kl_q = entropy(q_norm, mixture)
    return 0.5 * (kl_p + kl_q)


def wasserstein_distance(p, q):
    """
    1D Wasserstein (Earth Mover's) distance between samples p and q,
    delegated to SciPy.
    """
    # Imported under an alias because this wrapper reuses SciPy's function name.
    from scipy.stats import wasserstein_distance as _scipy_wasserstein

    distance = _scipy_wasserstein(p, q)
    return distance


def evaluate_statistical_similarity(real_data, synthetic_data, feature_names):
    """
    Calculate statistical similarity metrics between real and synthetic data.

    For every feature column the Jensen-Shannon divergence between binned
    distributions and the 1D Wasserstein distance between the raw values are
    computed.

    Args:
        real_data: 2-D array-like of real rows.
        synthetic_data: 2-D array-like of synthetic rows with the same column
            layout as ``real_data``.
        feature_names: names for the columns; columns beyond the end of this
            list are reported as ``feature_<index>``.

    Returns:
        Dict with per-feature results under ``'JSD'`` and ``'WD'`` plus their
        means under ``'JSD_avg'`` and ``'WD_avg'``.
    """
    real_data = np.asarray(real_data)
    synthetic_data = np.asarray(synthetic_data)

    results = {'JSD': {}, 'WD': {}}

    # Small constant that keeps empty bins away from exact zero
    epsilon = 1e-10

    # Calculate metrics for each feature
    for i in range(real_data.shape[1]):
        feature_name = feature_names[i] if i < len(
            feature_names) else f"feature_{i}"

        # Get feature values
        real_values = real_data[:, i]
        synth_values = synthetic_data[:, i]

        # Bin count follows the real data; the edges span BOTH samples so that
        # synthetic values outside the real range are counted instead of being
        # dropped (which would understate the divergence and could yield NaN).
        hist_bins = min(50, len(np.unique(real_values)))
        bin_edges = np.histogram_bin_edges(
            np.concatenate([real_values, synth_values]), bins=hist_bins)

        real_counts, _ = np.histogram(real_values, bins=bin_edges)
        synth_counts, _ = np.histogram(synth_values, bins=bin_edges)

        # Counts -> probabilities; every value lies inside the shared edges,
        # so the sums are non-zero for non-empty inputs.
        hist_real = real_counts / real_counts.sum() + epsilon
        hist_synth = synth_counts / synth_counts.sum() + epsilon

        # Calculate JSD
        results['JSD'][feature_name] = jensen_shannon_divergence(hist_real, hist_synth)

        # Calculate Wasserstein Distance on the raw (unbinned) values
        results['WD'][feature_name] = wasserstein_distance(real_values, synth_values)

    # Calculate average metrics
    results['JSD_avg'] = np.mean(list(results['JSD'].values()))
    results['WD_avg'] = np.mean(list(results['WD'].values()))

    return results


def plot_loss_curves(model):
    """
    Save a two-panel figure of the training losses to 'medgan_loss_curves.png'.

    Top panel: autoencoder pretraining loss. Bottom panel: generator and
    discriminator losses. Reads ``model.ae_losses``, ``model.g_losses`` and
    ``model.d_losses``; the figure is closed after saving.
    """
    fig = plt.figure(figsize=(10, 8))

    def _finish_panel(ax, title):
        # Shared axis labelling for both panels
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Loss')
        ax.set_title(title)
        ax.legend()
        ax.grid(True)

    # Autoencoder pretraining loss
    ax_pretrain = fig.add_subplot(2, 1, 1)
    ax_pretrain.plot(model.ae_losses, label='Autoencoder Loss')
    _finish_panel(ax_pretrain, 'MedGAN Autoencoder Pretraining Loss')

    # GAN training loss
    ax_gan = fig.add_subplot(2, 1, 2)
    ax_gan.plot(model.g_losses, label='Generator Loss')
    ax_gan.plot(model.d_losses, label='Discriminator Loss')
    _finish_panel(ax_gan, 'MedGAN Training Loss')

    fig.tight_layout()
    fig.savefig('medgan_loss_curves.png')
    plt.close(fig)


def plot_feature_distributions(real_data, synthetic_data, feature_names, n_features=5):
    """
    Save overlaid real/synthetic histograms for a random subset of features
    to 'feature_distributions.png'.

    At most ``len(feature_names)`` features are drawn, one stacked panel each,
    chosen without replacement via the global NumPy random state. The figure
    is closed after saving.
    """
    n_features = min(n_features, len(feature_names))

    # Pick which feature columns to visualize
    selected_indices = np.random.choice(
        range(len(feature_names)), size=n_features, replace=False)

    fig = plt.figure(figsize=(15, 10))
    for row, col in enumerate(selected_indices, start=1):
        ax = fig.add_subplot(n_features, 1, row)

        # Real first, synthetic drawn on top
        sns.histplot(real_data[:, col], kde=True, stat="density",
                     label="Real", alpha=0.6, color="blue", ax=ax)
        sns.histplot(synthetic_data[:, col], kde=True, stat="density",
                     label="Synthetic", alpha=0.6, color="red", ax=ax)

        ax.set_title(f"Distribution for {feature_names[col]}")
        ax.legend()

    fig.tight_layout()
    fig.savefig('feature_distributions.png')
    plt.close(fig)

# Main function
def main(train_path="data/adult-train.csv", test_path="data/adult-test.csv",
         batch_size=128, hidden_dim=128, latent_dim=64, z_dim=128,
         ae_pretrain_epochs=50, gan_epochs=300, num_samples=1000,
         lambda_rec=0.2, model_path='medgan_model.pt'):
    """Run the full pipeline: load data, train MedGAN, generate and evaluate.

    Every argument defaults to the value that used to be hard-coded, so
    ``main()`` behaves as before; pass smaller epoch counts for a quick run.

    Args:
        train_path: training CSV path.
        test_path: test CSV path.
        batch_size: mini-batch size of the training DataLoader.
        hidden_dim: hidden width passed to MedGAN.
        latent_dim: autoencoder code size.
        z_dim: generator noise size.
        ae_pretrain_epochs: autoencoder pretraining epochs.
        gan_epochs: adversarial training epochs.
        num_samples: number of synthetic rows to generate for evaluation.
        lambda_rec: reconstruction weight passed to MedGAN.
        model_path: where the trained model checkpoint is written.
    """
    # Load and preprocess data
    X_train, y_train, X_test, y_test, preprocessor, feature_names = load_data(
        train_path, test_path)

    # Create dataset and dataloader
    train_dataset = AdultDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Initialize model
    data_dim = X_train.shape[1]

    print(f"Data dimension: {data_dim}")
    print(f"Autoencoder latent dimension: {latent_dim}")
    print(f"Generator input dimension: {z_dim}")

    # Initialize MedGAN
    medgan = MedGAN(
        input_dim=data_dim,
        hidden_dim=hidden_dim,
        latent_dim=latent_dim,
        z_dim=z_dim,
        ae_pretrain_epochs=ae_pretrain_epochs,
        discriminator_steps=1,
        lambda_rec=lambda_rec
    )

    # 1. Pretrain autoencoder
    medgan.pretrain_autoencoder(train_loader)

    # 2. Train GAN
    print("\nTraining MedGAN...")
    medgan.train_gan(train_loader, gan_epochs, save_interval=10)

    # Save the model
    medgan.save_model(model_path)

    # Plot loss curves
    plot_loss_curves(medgan)

    # Generate synthetic data
    print(f"Generating {num_samples} synthetic samples...")
    synthetic_data = medgan.generate_samples(num_samples)

    # Statistical similarity evaluation
    print("Evaluating statistical similarity...")
    stat_results = evaluate_statistical_similarity(
        X_train, synthetic_data, feature_names)

    print("\nJensen-Shannon Divergence (average):", stat_results['JSD_avg'])
    print("Wasserstein Distance (average):", stat_results['WD_avg'])

    # Machine Learning Utility (TSTR) evaluation.
    # The model is trained on features only (labels in the loader are ignored
    # during training), so no generated labels exist to hand over here.
    print("\nEvaluating Machine Learning Utility (TSTR)...")
    tstr_results = evaluate_tstr(X_train, synthetic_data, y_train)

    print("\nTSTR Results:")
    for clf_name, metrics in tstr_results.items():
        print(
            f"{clf_name}: Accuracy = {metrics['accuracy']:.4f}, F1 Score = {metrics['f1_score']:.4f}")

    # Plot feature distributions
    plot_feature_distributions(X_train, synthetic_data, feature_names)

    print("\nEvaluation complete! Check the output directory for plots and saved model.")

In [4]:
# Entry point: runs the whole pipeline (data loading, training, evaluation)
# when this cell is executed or the file is run as a script.
if __name__ == "__main__":
    main()

Training data shape: (32561, 108)
Testing data shape: (16281, 108)
Data dimension: 108
Autoencoder latent dimension: 64
Generator input dimension: 128
Pretraining autoencoder for 50 epochs...
Epoch [10/50], Reconstruction Loss: 0.035789
Epoch [20/50], Reconstruction Loss: 0.035351
Epoch [30/50], Reconstruction Loss: 0.035337
Epoch [40/50], Reconstruction Loss: 0.035069
Epoch [50/50], Reconstruction Loss: 0.034908

Training MedGAN...
Training MedGAN for 300 epochs...
Epoch [10/300], D Loss: 0.0954, G Loss: 4.0298
Epoch [20/300], D Loss: 0.0245, G Loss: 6.2375
Epoch [30/300], D Loss: 0.0044, G Loss: 8.0276
Epoch [40/300], D Loss: 0.0017, G Loss: 9.0998
Epoch [50/300], D Loss: 0.0008, G Loss: 10.4458
Epoch [60/300], D Loss: 0.0007, G Loss: 10.6637
Epoch [70/300], D Loss: 0.0005, G Loss: 10.9912
Epoch [80/300], D Loss: 0.0003, G Loss: 10.9004
Epoch [90/300], D Loss: 0.0002, G Loss: 12.0342
Epoch [100/300], D Loss: 0.0002, G Loss: 12.4445
Epoch [110/300], D Loss: 0.0004, G Loss: 12.5494
Epo