In [7]:
#  %pip install xgboost
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from torch.autograd import grad as torch_grad
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from scipy.stats import entropy
import warnings
import time
from tqdm import tqdm

# from google.colab import drive

# drive.mount('/content/drive/')
# %cd /content/drive/MyDrive/Colab Notebooks/Katabatic/CrGAN/credit/cross/
# %cd /content/drive/MyDrive/

# Suppress warnings
warnings.filterwarnings("ignore")

# Setting random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Check if CUDA is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load and preprocess data
def load_credit_card_data(file_path, test_size=0.2, random_state=42, shuffle=True):
    # Load data
    df = pd.read_csv(file_path)

    print(f"Dataset shape: {df.shape}")

    # Drop ID column as it's not a feature
    if 'ID' in df.columns:
        df = df.drop('ID', axis=1)

    # Rename target variable for consistency
    if 'default.payment.next.month' in df.columns:
        df = df.rename(columns={'default.payment.next.month': 'TARGET'})

    # Split features and target
    X = df.drop('TARGET', axis=1)
    y = df['TARGET']

    # Define categorical and numerical columns
    categorical_cols = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3',
                         'PAY_4', 'PAY_5', 'PAY_6']
    numerical_cols = ['LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3',
                       'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
                       'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

    # Create preprocessing pipeline
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_cols),
            ('num', numerical_transformer, numerical_cols)
        ])

    # Return the full dataset, preprocessor, and feature info
    return X, y, preprocessor, categorical_cols, numerical_cols

# Custom dataset class
class CreditCardDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)

        if labels is not None:
            # Convert to numpy array and ensure float32 dtype
            self.labels = torch.tensor(labels.values.astype(np.float32)).view(-1, 1)
        else:
            self.labels = None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        else:
            return self.features[idx]


# Generator Network
class Generator(nn.Module):
    def __init__(self, latent_dim, output_dim):
        super(Generator, self).__init__()
        self.latent_dim = latent_dim
        self.output_dim = output_dim

        self.model = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2),

            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(0.2),

            nn.Linear(512, 1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(0.2),

            nn.Linear(1024, output_dim),
            nn.Tanh()  # Output layer - maps to (-1, 1) range
        )

    def forward(self, z):
        return self.model(z)

# Critic Network (Discriminator)
class Critic(nn.Module):
    def __init__(self, input_dim):
        super(Critic, self).__init__()
        self.input_dim = input_dim

        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),

            nn.Linear(512, 256),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),

            nn.Linear(128, 1)
        )

    def forward(self, x):
        return self.model(x)

# Cramer GAN Implementation
class CramerGAN:
    def __init__(self, data_dim, latent_dim=100, critic_iterations=5, lambda_gp=10):
        self.data_dim = data_dim
        self.latent_dim = latent_dim
        self.critic_iterations = critic_iterations
        self.lambda_gp = lambda_gp

        # Initialize networks
        self.generator = Generator(latent_dim, data_dim).to(device)
        self.critic = Critic(data_dim).to(device)

        # Setup optimizers
        self.g_optimizer = optim.Adam(self.generator.parameters(), lr=0.0002, betas=(0.5, 0.9))
        self.c_optimizer = optim.Adam(self.critic.parameters(), lr=0.0002, betas=(0.5, 0.9))

        # Initialize loss tracking
        self.g_losses = []
        self.c_losses = []

    def _critic_train_iteration(self, real_data, batch_size):
        # Generate random noise
        noise = torch.randn(batch_size, self.latent_dim).to(device)

        # Generate fake data
        fake_data = self.generator(noise)

        # Get critic outputs
        critic_real = self.critic(real_data)
        critic_fake = self.critic(fake_data)

        # Calculate Cramer distance
        critic_real2 = self.critic(torch.roll(real_data, shifts=1, dims=0))
        critic_fake2 = self.critic(torch.roll(fake_data, shifts=1, dims=0))

        # Cramer GAN loss function
        c_loss = torch.mean(critic_real - critic_fake) - 0.5 * torch.mean(torch.pow(critic_real - critic_real2, 2)) + 0.5 * torch.mean(torch.pow(critic_fake - critic_fake2, 2))

        # Calculate gradient penalty
        alpha = torch.rand(batch_size, 1).to(device)
        interpolates = alpha * real_data + ((1 - alpha) * fake_data)
        interpolates.requires_grad_(True)

        critic_interpolates = self.critic(interpolates)
        gradients = torch_grad(outputs=critic_interpolates, inputs=interpolates,
                              grad_outputs=torch.ones_like(critic_interpolates).to(device),
                              create_graph=True, retain_graph=True)[0]

        gradients = gradients.view(batch_size, -1)
        gradient_penalty = self.lambda_gp * ((gradients.norm(2, dim=1) - 1) ** 2).mean()

        # Update critic
        self.c_optimizer.zero_grad()
        c_loss_total = c_loss + gradient_penalty
        c_loss_total.backward()
        self.c_optimizer.step()

        return c_loss_total.item()

    def _generator_train_iteration(self, batch_size):
        # Generate random noise
        noise = torch.randn(batch_size, self.latent_dim).to(device)

        # Generate fake data
        fake_data = self.generator(noise)

        # Calculate critic outputs
        critic_fake = self.critic(fake_data)
        critic_fake2 = self.critic(torch.roll(fake_data, shifts=1, dims=0))

        # Generator loss is negative of critic loss
        g_loss = -torch.mean(critic_fake) + 0.5 * torch.mean(torch.pow(critic_fake - critic_fake2, 2))

        # Update generator
        self.g_optimizer.zero_grad()
        g_loss.backward()
        self.g_optimizer.step()

        return g_loss.item()

    def train(self, data_loader, epochs, save_interval=10, verbose=True):
        for epoch in range(epochs):
            epoch_start_time = time.time()
            c_loss_total = 0
            g_loss_total = 0
            num_batches = 0

            for i, (real_data, _) in enumerate(data_loader):
                batch_size = real_data.size(0)
                real_data = real_data.to(device)

                # Train critic
                for _ in range(self.critic_iterations):
                    c_loss = self._critic_train_iteration(real_data, batch_size)
                c_loss_total += c_loss

                # Train generator
                g_loss = self._generator_train_iteration(batch_size)
                g_loss_total += g_loss

                num_batches += 1

            # Calculate average loss for the epoch
            c_loss_avg = c_loss_total / num_batches
            g_loss_avg = g_loss_total / num_batches

            self.c_losses.append(c_loss_avg)
            self.g_losses.append(g_loss_avg)

            epoch_time = time.time() - epoch_start_time

            if verbose and (epoch % save_interval == 0 or epoch == epochs - 1):
                print(f"Epoch [{epoch+1}/{epochs}] | Critic Loss: {c_loss_avg:.4f} | Generator Loss: {g_loss_avg:.4f} | Time: {epoch_time:.2f}s")

    def generate_samples(self, num_samples):
        self.generator.eval()
        noise = torch.randn(num_samples, self.latent_dim).to(device)
        with torch.no_grad():
            generated_data = self.generator(noise).cpu().numpy()
        self.generator.train()
        return generated_data

# Post-process generated data to make it valid for credit card data
def post_process_credit_card_data(synthetic_data, preprocessor, feature_names):
    """
    Post-process generated data to ensure it makes sense for credit card data
    """
    # Get the number of categorical features (one-hot encoded)
    cat_feature_names = [name for name in feature_names if any(col in name for col in
                         ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_'])]
    num_cat_features = len(cat_feature_names)

    # Split the synthetic data into categorical and numerical
    synthetic_cat = synthetic_data[:, :num_cat_features]
    synthetic_num = synthetic_data[:, num_cat_features:]

    # Process categorical features
    processed_cat = np.zeros_like(synthetic_cat)

    # Get categorical column groups (for one-hot encoded features)
    cat_groups = {}
    current_idx = 0

    for col in ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']:
        col_features = [f for f in feature_names if f.startswith(f"{col}_")]
        if col_features:
            end_idx = current_idx + len(col_features)
            cat_groups[col] = (current_idx, end_idx)
            current_idx = end_idx

    # Process each categorical group to ensure one-hot encoding is maintained
    for col, (start_idx, end_idx) in cat_groups.items():
        # For each row, set the highest value to 1 and others to 0
        max_indices = np.argmax(synthetic_cat[:, start_idx:end_idx], axis=1)

        # Create a zeros array and set the max index to 1 for each row
        one_hot = np.zeros((synthetic_cat.shape[0], end_idx - start_idx))
        for i, idx in enumerate(max_indices):
            one_hot[i, idx] = 1

        processed_cat[:, start_idx:end_idx] = one_hot

    # Process numerical features
    processed_num = synthetic_num.copy()

    # Define column constraints for numerical columns
    # Each tuple contains (column name, min value, max value)
    num_constraints = [
        ("LIMIT_BAL", 10000, 1000000),  # Credit limit
        ("AGE", 21, 80),                # Age
        # Bill amounts can be negative (credit)
        ("BILL_AMT1", -100000, 1000000),
        ("BILL_AMT2", -100000, 1000000),
        ("BILL_AMT3", -100000, 1000000),
        ("BILL_AMT4", -100000, 1000000),
        ("BILL_AMT5", -100000, 1000000),
        ("BILL_AMT6", -100000, 1000000),
        # Payment amounts should be non-negative
        ("PAY_AMT1", 0, 1000000),
        ("PAY_AMT2", 0, 1000000),
        ("PAY_AMT3", 0, 1000000),
        ("PAY_AMT4", 0, 1000000),
        ("PAY_AMT5", 0, 1000000),
        ("PAY_AMT6", 0, 1000000)
    ]

    # Get the indices of numerical columns
    num_col_indices = {}
    for i, name in enumerate(feature_names[num_cat_features:]):
        num_col_indices[name] = i

    # Apply constraints for numerical columns
    for col, min_val, max_val in num_constraints:
        if col in num_col_indices:
            idx = num_col_indices[col]
            processed_num[:, idx] = np.clip(processed_num[:, idx], min_val, max_val)

            # Round specific columns that should be integers
            if col in ["AGE"]:
                processed_num[:, idx] = np.round(processed_num[:, idx])

    # Combine processed categorical and numerical data
    processed_data = np.hstack((processed_cat, processed_num))

    return processed_data

# Evaluation Metrics

# 1. Machine Learning Utility (TSTR)
def evaluate_tstr(real_data, synthetic_data, real_labels, random_state=42):
    """
    Train classifiers on synthetic data and test on real data (TSTR)
    Returns accuracy for each classifier
    """
    # Train-test split for real data
    X_train, X_test, y_train, y_test = train_test_split(
        real_data, real_labels, test_size=0.2, random_state=random_state
    )

    # Synthetic data (all used for training)
    X_synth = synthetic_data

    # Ensure proper dimensions for labels
    if isinstance(y_train, pd.Series):
        y_train = y_train.values

    # Create synthetic labels based on real distribution
    # For credit card default prediction, we need binary labels (0-1)
    class_distribution = np.bincount(y_train.astype(int)) / len(y_train)
    np.random.seed(random_state)
    y_synth = np.random.choice([0, 1], size=len(X_synth), p=class_distribution)

    # Define classifiers
    classifiers = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=random_state),
        'MLP': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=random_state),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=random_state),
        'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=random_state)
    }

    results = {}

    for name, clf in classifiers.items():
        # Train on synthetic data
        clf.fit(X_synth, y_synth)

        # Test on real data
        y_pred = clf.predict(X_test)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # For binary classification, also calculate AUC-ROC
        auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

        results[name] = {
            'accuracy': accuracy,
            'f1_score': f1,
            'auc_roc': auc
        }

    return results

# 2. Statistical Similarity
def jensen_shannon_divergence(p, q):
    """
    Calculate Jensen-Shannon Divergence between distributions p and q
    """
    # Ensure p and q are normalized
    p = p / np.sum(p)
    q = q / np.sum(q)

    m = 0.5 * (p + q)

    # Calculate JSD
    jsd = 0.5 * (entropy(p, m) + entropy(q, m))

    return jsd

def wasserstein_distance(p, q):
    """
    Calculate 1D Wasserstein distance (Earth Mover's Distance)
    """
    from scipy.stats import wasserstein_distance

    return wasserstein_distance(p, q)

def evaluate_statistical_similarity(real_data, synthetic_data, feature_names):
    """
    Calculate statistical similarity metrics between real and synthetic data
    """
    results = {'JSD': {}, 'WD': {}}

    # Calculate metrics for each feature
    for i in range(real_data.shape[1]):
        feature_name = feature_names[i] if i < len(feature_names) else f"feature_{i}"

        # Get feature values
        real_values = real_data[:, i]
        synth_values = synthetic_data[:, i]

        # Calculate histogram (discrete distribution)
        hist_bins = min(50, len(np.unique(real_values)))

        hist_real, bin_edges = np.histogram(real_values, bins=hist_bins, density=True)
        hist_synth, _ = np.histogram(synth_values, bins=bin_edges, density=True)

        # Add a small epsilon to avoid division by zero
        epsilon = 1e-10
        hist_real = hist_real + epsilon
        hist_synth = hist_synth + epsilon

        # Calculate JSD
        jsd = jensen_shannon_divergence(hist_real, hist_synth)
        results['JSD'][feature_name] = jsd

        # Calculate Wasserstein Distance
        wd = wasserstein_distance(real_values, synth_values)
        results['WD'][feature_name] = wd

    # Calculate average metrics
    results['JSD_avg'] = np.mean(list(results['JSD'].values()))
    results['WD_avg'] = np.mean(list(results['WD'].values()))

    return results

# Function to run a single fold
def run_fold(X_train, y_train, X_test, y_test, preprocessor, cat_cols, num_cols, fold_num, run_num):
    print(f"\nRunning Fold {fold_num} of Run {run_num}")

    # Fit preprocessor and transform data
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)

    # Get feature names
    cat_feature_names = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(cat_cols)
    all_feature_names = list(cat_feature_names) + list(num_cols)

    print(f"Processed training data shape: {X_train_transformed.shape}")
    print(f"Processed testing data shape: {X_test_transformed.shape}")

    # Create dataset and dataloader
    train_dataset = CreditCardDataset(X_train_transformed, y_train)
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

    # Initialize and train the model
    data_dim = X_train_transformed.shape[1]
    latent_dim = 100

    print(f"Data dimension: {data_dim}")
    print(f"Latent dimension: {latent_dim}")

    crgan = CramerGAN(data_dim, latent_dim)

    # Train the model
    epochs = 300

    print(f"Training CramerGAN for {epochs} epochs...")
    crgan.train(train_loader, epochs, save_interval=10, verbose=True)

    # Generate synthetic data
    num_samples = 1000
    print(f"Generating {num_samples} synthetic samples...")
    synthetic_data_raw = crgan.generate_samples(num_samples)

    # Post-process the synthetic data
    synthetic_data_processed = post_process_credit_card_data(synthetic_data_raw, preprocessor, all_feature_names)

    # Generate synthetic labels using a classifier trained on real data
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train_transformed, y_train)
    synthetic_labels = clf.predict(synthetic_data_processed)

    # Evaluate statistical similarity
    print("Evaluating statistical similarity...")
    stat_results = evaluate_statistical_similarity(X_train_transformed, synthetic_data_processed, all_feature_names)

    # Evaluate Machine Learning Utility (TSTR)
    print("Evaluating Machine Learning Utility (TSTR)...")
    tstr_results = evaluate_tstr(X_train_transformed, synthetic_data_processed, y_train)

    # Combine results
    results = {
        'statistical': stat_results,
        'tstr': tstr_results
    }

    return results

# Main function
def main():
    print("Starting CrGAN Cross-Validation on Credit Card Dataset")

    # File path
    file_path = 'UCI_Credit_Card.csv'

    # Run cross-validation twice (once with original data, once with shuffled data)
    all_results = []

    # Run 1: Original data order
    print("\n===== Run 1: Original Data Order =====")
    X, y, preprocessor, cat_cols, num_cols = load_credit_card_data(file_path, shuffle=True, random_state=42)

    # Perform 5-fold cross-validation
    kf = KFold(n_splits=5, shuffle=False)
    fold_num = 1

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Run this fold
        fold_results = run_fold(X_train, y_train, X_test, y_test, preprocessor, cat_cols, num_cols, fold_num, 1)
        all_results.append(fold_results)
        fold_num += 1

    # Run 2: Shuffled data with seed=123
    print("\n===== Run 2: Shuffled Data (seed=123) =====")
    X, y, preprocessor, cat_cols, num_cols = load_credit_card_data(file_path, shuffle=True, random_state=123)

    # Perform 5-fold cross-validation
    kf = KFold(n_splits=5, shuffle=False)
    fold_num = 1

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Run this fold
        fold_results = run_fold(X_train, y_train, X_test, y_test, preprocessor, cat_cols, num_cols, fold_num, 2)
        all_results.append(fold_results)
        fold_num += 1

    # Calculate average results
    print("\n===== Final Average Results (10 runs) =====")

    # Initialize dictionaries to store aggregate results
    agg_statistical = {
        'JSD_avg': 0.0,
        'WD_avg': 0.0
    }

    agg_tstr = {
        'Logistic Regression': {'accuracy': 0.0, 'f1_score': 0.0, 'auc_roc': 0.0},
        'MLP': {'accuracy': 0.0, 'f1_score': 0.0, 'auc_roc': 0.0},
        'Random Forest': {'accuracy': 0.0, 'f1_score': 0.0, 'auc_roc': 0.0},
        'XGBoost': {'accuracy': 0.0, 'f1_score': 0.0, 'auc_roc': 0.0}
    }

    # Sum up all results
    for result in all_results:
        # Statistical results
        agg_statistical['JSD_avg'] += result['statistical']['JSD_avg']
        agg_statistical['WD_avg'] += result['statistical']['WD_avg']

        # TSTR results
        for clf_name, metrics in result['tstr'].items():
            for metric_name, value in metrics.items():
                agg_tstr[clf_name][metric_name] += value

    # Calculate averages
    num_runs = len(all_results)
    agg_statistical['JSD_avg'] /= num_runs
    agg_statistical['WD_avg'] /= num_runs

    for clf_name in agg_tstr.keys():
        for metric_name in agg_tstr[clf_name].keys():
            agg_tstr[clf_name][metric_name] /= num_runs

    # Print final results
    print("\nAverage Statistical Similarity Metrics:")
    print(f"Average Jensen-Shannon Divergence: {agg_statistical['JSD_avg']:.6f}")
    print(f"Average Wasserstein Distance: {agg_statistical['WD_avg']:.6f}")

    print("\nAverage TSTR Results:")
    for clf_name, metrics in agg_tstr.items():
        print(f"{clf_name}:")
        print(f"  Accuracy: {metrics['accuracy']:.6f}")
        print(f"  F1 Score: {metrics['f1_score']:.6f}")
        print(f"  AUC-ROC: {metrics['auc_roc']:.6f}")

    print("\nCross-validation complete!")


Using device: cpu


In [8]:
if __name__ == "__main__":
    main()

Starting CrGAN Cross-Validation on Credit Card Dataset

===== Run 1: Original Data Order =====
Dataset shape: (30000, 25)

Running Fold 1 of Run 1
Processed training data shape: (24000, 91)
Processed testing data shape: (6000, 91)
Data dimension: 91
Latent dimension: 100
Training CramerGAN for 300 epochs...
Epoch [1/300] | Critic Loss: -1138.2170 | Generator Loss: -13.4279 | Time: 26.98s
Epoch [11/300] | Critic Loss: -1490200095286.4680 | Generator Loss: 7509217.6150 | Time: 25.65s
Epoch [21/300] | Critic Loss: -33722822418257.7031 | Generator Loss: 67179789.7553 | Time: 26.49s
Epoch [31/300] | Critic Loss: -7130078429315072.0000 | Generator Loss: 5553004187.2340 | Time: 27.95s
Epoch [41/300] | Critic Loss: -8304007664654118.0000 | Generator Loss: 133867073971.7447 | Time: 26.63s
Epoch [51/300] | Critic Loss: -72251315613580960.0000 | Generator Loss: 88186062956.9362 | Time: 28.44s
Epoch [61/300] | Critic Loss: -77655208374611536.0000 | Generator Loss: 9644158403605.7871 | Time: 27.66s