In [9]:
%pip install xgboost

import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from torch.autograd import grad as torch_grad
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from scipy.stats import entropy
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time
from tqdm import tqdm

from google.colab import drive

drive.mount('/content/drive/')
%cd /content/drive/MyDrive/Colab Notebooks/Katabatic/CrGAN/poker/

# Suppress warnings
warnings.filterwarnings("ignore")

# Setting random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Check if CUDA is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Katabatic/CrGAN/poker
Using device: cpu


In [10]:
# Load and preprocess data
def load_poker_data(train_path, test_path):
    # Column names based on the dataset description
    column_names = ['S1', 'C1', 'S2', 'C2', 'S3', 'C3', 'S4', 'C4', 'S5', 'C5', 'CLASS']

    # Load data
    df_train = pd.read_csv(train_path, header=None, names=column_names)
    df_test = pd.read_csv(test_path, header=None, names=column_names)

    print(f"Training data shape: {df_train.shape}")
    print(f"Testing data shape: {df_test.shape}")

    # Split features and target
    X_train = df_train.drop('CLASS', axis=1)
    y_train = df_train['CLASS']
    X_test = df_test.drop('CLASS', axis=1)
    y_test = df_test['CLASS']

    # Create preprocessing pipeline - for poker hands, we'll use one-hot encoding for suits
    # and normalize the card ranks

    # Define suit columns and rank columns
    suit_cols = ['S1', 'S2', 'S3', 'S4', 'S5']
    rank_cols = ['C1', 'C2', 'C3', 'C4', 'C5']

    # Create transformers
    suit_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    rank_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('suit', suit_transformer, suit_cols),
            ('rank', rank_transformer, rank_cols)
        ])

    # Fit and transform the data
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)

    # Get feature names
    suit_feature_names = preprocessor.named_transformers_['suit'].named_steps['onehot'].get_feature_names_out(suit_cols)
    all_feature_names = list(suit_feature_names) + list(rank_cols)

    print(f"Processed training data shape: {X_train_transformed.shape}")
    print(f"Processed testing data shape: {X_test_transformed.shape}")

    return X_train_transformed, y_train, X_test_transformed, y_test, preprocessor, all_feature_names

# Custom dataset class
class PokerDataset(Dataset):
    def __init__(self, features, labels=None):
        self.features = torch.tensor(features, dtype=torch.float32)
        if labels is not None:
            self.labels = torch.tensor(labels, dtype=torch.float32).view(-1, 1)
        else:
            self.labels = None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        else:
            return self.features[idx]

# Generator Network
class Generator(nn.Module):
    def __init__(self, latent_dim, output_dim):
        super(Generator, self).__init__()
        self.latent_dim = latent_dim
        self.output_dim = output_dim

        self.model = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2),

            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(0.2),

            nn.Linear(512, 1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(0.2),

            nn.Linear(1024, output_dim),
            nn.Tanh()  # Output layer - maps to (-1, 1) range
        )

    def forward(self, z):
        return self.model(z)

# Critic Network (Discriminator)
class Critic(nn.Module):
    def __init__(self, input_dim):
        super(Critic, self).__init__()
        self.input_dim = input_dim

        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),

            nn.Linear(512, 256),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),

            nn.Linear(128, 1)
        )

    def forward(self, x):
        return self.model(x)

# Cramer GAN Implementation
class CramerGAN:
    def __init__(self, data_dim, latent_dim=100, critic_iterations=5, lambda_gp=10):
        self.data_dim = data_dim
        self.latent_dim = latent_dim
        self.critic_iterations = critic_iterations
        self.lambda_gp = lambda_gp

        # Initialize networks
        self.generator = Generator(latent_dim, data_dim).to(device)
        self.critic = Critic(data_dim).to(device)

        # Setup optimizers
        self.g_optimizer = optim.Adam(self.generator.parameters(), lr=0.0002, betas=(0.5, 0.9))
        self.c_optimizer = optim.Adam(self.critic.parameters(), lr=0.0002, betas=(0.5, 0.9))

        # Initialize loss tracking
        self.g_losses = []
        self.c_losses = []

    def _critic_train_iteration(self, real_data, batch_size):
        # Generate random noise
        noise = torch.randn(batch_size, self.latent_dim).to(device)

        # Generate fake data
        fake_data = self.generator(noise)

        # Get critic outputs
        critic_real = self.critic(real_data)
        critic_fake = self.critic(fake_data)

        # Calculate Cramer distance
        critic_real2 = self.critic(torch.roll(real_data, shifts=1, dims=0))
        critic_fake2 = self.critic(torch.roll(fake_data, shifts=1, dims=0))

        # Cramer GAN loss function
        c_loss = torch.mean(critic_real - critic_fake) - 0.5 * torch.mean(torch.pow(critic_real - critic_real2, 2)) + 0.5 * torch.mean(torch.pow(critic_fake - critic_fake2, 2))

        # Calculate gradient penalty
        alpha = torch.rand(batch_size, 1).to(device)
        interpolates = alpha * real_data + ((1 - alpha) * fake_data)
        interpolates.requires_grad_(True)

        critic_interpolates = self.critic(interpolates)
        gradients = torch_grad(outputs=critic_interpolates, inputs=interpolates,
                              grad_outputs=torch.ones_like(critic_interpolates).to(device),
                              create_graph=True, retain_graph=True)[0]

        gradients = gradients.view(batch_size, -1)
        gradient_penalty = self.lambda_gp * ((gradients.norm(2, dim=1) - 1) ** 2).mean()

        # Update critic
        self.c_optimizer.zero_grad()
        c_loss_total = c_loss + gradient_penalty
        c_loss_total.backward()
        self.c_optimizer.step()

        return c_loss_total.item()

    def _generator_train_iteration(self, batch_size):
        # Generate random noise
        noise = torch.randn(batch_size, self.latent_dim).to(device)

        # Generate fake data
        fake_data = self.generator(noise)

        # Calculate critic outputs
        critic_fake = self.critic(fake_data)
        critic_fake2 = self.critic(torch.roll(fake_data, shifts=1, dims=0))

        # Generator loss is negative of critic loss
        g_loss = -torch.mean(critic_fake) + 0.5 * torch.mean(torch.pow(critic_fake - critic_fake2, 2))

        # Update generator
        self.g_optimizer.zero_grad()
        g_loss.backward()
        self.g_optimizer.step()

        return g_loss.item()

    def train(self, data_loader, epochs, save_interval=10, verbose=True):
        for epoch in range(epochs):
            epoch_start_time = time.time()
            c_loss_total = 0
            g_loss_total = 0
            num_batches = 0

            for i, (real_data, _) in enumerate(data_loader):
                batch_size = real_data.size(0)
                real_data = real_data.to(device)

                # Train critic
                for _ in range(self.critic_iterations):
                    c_loss = self._critic_train_iteration(real_data, batch_size)
                c_loss_total += c_loss

                # Train generator
                g_loss = self._generator_train_iteration(batch_size)
                g_loss_total += g_loss

                num_batches += 1

            # Calculate average loss for the epoch
            c_loss_avg = c_loss_total / num_batches
            g_loss_avg = g_loss_total / num_batches

            self.c_losses.append(c_loss_avg)
            self.g_losses.append(g_loss_avg)

            epoch_time = time.time() - epoch_start_time

            if verbose and (epoch % save_interval == 0 or epoch == epochs - 1):
                print(f"Epoch [{epoch+1}/{epochs}] | Critic Loss: {c_loss_avg:.4f} | Generator Loss: {g_loss_avg:.4f} | Time: {epoch_time:.2f}s")

    def generate_samples(self, num_samples):
        self.generator.eval()
        noise = torch.randn(num_samples, self.latent_dim).to(device)
        with torch.no_grad():
            generated_data = self.generator(noise).cpu().numpy()
        self.generator.train()
        return generated_data

    def save_model(self, path):
        torch.save({
            'generator_state_dict': self.generator.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'g_optimizer_state_dict': self.g_optimizer.state_dict(),
            'c_optimizer_state_dict': self.c_optimizer.state_dict(),
        }, path)

    def load_model(self, path):
        checkpoint = torch.load(path)
        self.generator.load_state_dict(checkpoint['generator_state_dict'])
        self.critic.load_state_dict(checkpoint['critic_state_dict'])
        self.g_optimizer.load_state_dict(checkpoint['g_optimizer_state_dict'])
        self.c_optimizer.load_state_dict(checkpoint['c_optimizer_state_dict'])

# Post-process generated data to make it valid poker hands
def post_process_poker_data(synthetic_data, preprocessor):
    """
    Post-process generated data to ensure it can be translated back into valid poker hands
    """
    # Get the number of suit features (one-hot encoded)
    n_suit_features = len(preprocessor.named_transformers_['suit'].named_steps['onehot'].get_feature_names_out(['S1', 'S2', 'S3', 'S4', 'S5']))

    # Split the synthetic data into suits and ranks
    synthetic_suits = synthetic_data[:, :n_suit_features]
    synthetic_ranks = synthetic_data[:, n_suit_features:]

    # For each card's suit (one-hot encoded), take the max value to get the most likely suit
    num_cards = 5
    n_suits = 4  # Hearts, Spades, Diamonds, Clubs

    processed_suits = np.zeros((len(synthetic_data), num_cards))

    for i in range(num_cards):
        one_hot_indices = np.argmax(synthetic_suits[:, i*n_suits:(i+1)*n_suits], axis=1)
        processed_suits[:, i] = one_hot_indices + 1  # Add 1 to match original encoding (1-4)

    # Inverse transform the rank data
    processed_ranks = preprocessor.named_transformers_['rank'].named_steps['scaler'].inverse_transform(synthetic_ranks)

    # Clip and round ranks to valid values (1-13)
    processed_ranks = np.clip(np.round(processed_ranks), 1, 13)

    # Combine suits and ranks
    processed_data = np.zeros((len(synthetic_data), num_cards * 2))

    for i in range(num_cards):
        processed_data[:, 2*i] = processed_suits[:, i]       # Suit
        processed_data[:, 2*i+1] = processed_ranks[:, i]     # Rank

    return processed_data

In [11]:
# Evaluation Metrics

# 1. Machine Learning Utility (TSTR)
def evaluate_tstr(real_data, synthetic_data, real_labels, random_state=42):
    """
    Train classifiers on synthetic data and test on real data (TSTR)
    Returns accuracy for each classifier
    """
    # Train-test split for real data
    X_train, X_test, y_train, y_test = train_test_split(
        real_data, real_labels, test_size=0.2, random_state=random_state
    )

    # Synthetic data (all used for training)
    X_synth = synthetic_data

    # Ensure proper dimensions for labels
    if isinstance(y_train, pd.Series):
        y_train = y_train.values

    # Create synthetic labels based on real distribution
    # Get class distribution from y_test (to ensure synthetic labels match test classes)
    class_distribution = np.bincount(y_test.astype(int), minlength=10).astype(float)
    class_distribution += 1e-6  # Add small value to ensure all classes have a non-zero probability
    class_distribution /= class_distribution.sum()  # Normalize

    # Sample synthetic labels using this adjusted distribution
    np.random.seed(random_state)
    y_synth = np.random.choice(range(10), size=len(X_synth), p=class_distribution)

    # Define classifiers
    classifiers = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=random_state),
        'MLP': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=random_state),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=random_state),
        'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=random_state)
    }

    results = {}

    for name, clf in classifiers.items():
        # Train on synthetic data
        clf.fit(X_synth, y_synth)

        # Test on real data
        y_pred = clf.predict(X_test)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)

        # We use weighted f1 since poker hands dataset is heavily imbalanced
        f1 = f1_score(y_test, y_pred, average='weighted')

        results[name] = {
            'accuracy': accuracy,
            'f1_score': f1
        }

    return results

# 2. Statistical Similarity
def jensen_shannon_divergence(p, q):
    """
    Calculate Jensen-Shannon Divergence between distributions p and q
    """
    # Ensure p and q are normalized
    p = p / np.sum(p)
    q = q / np.sum(q)

    m = 0.5 * (p + q)

    # Calculate JSD
    jsd = 0.5 * (entropy(p, m) + entropy(q, m))

    return jsd

def wasserstein_distance(p, q):
    """
    Calculate 1D Wasserstein distance (Earth Mover's Distance)
    """
    from scipy.stats import wasserstein_distance

    return wasserstein_distance(p, q)

def evaluate_statistical_similarity(real_data, synthetic_data, feature_names):
    """
    Calculate statistical similarity metrics between real and synthetic data
    """
    results = {'JSD': {}, 'WD': {}}

    # Calculate metrics for each feature
    for i in range(real_data.shape[1]):
        feature_name = feature_names[i] if i < len(feature_names) else f"feature_{i}"

        # Get feature values
        real_values = real_data[:, i]
        synth_values = synthetic_data[:, i]

        # Calculate histogram (discrete distribution)
        hist_bins = min(50, len(np.unique(real_values)))

        hist_real, bin_edges = np.histogram(real_values, bins=hist_bins, density=True)
        hist_synth, _ = np.histogram(synth_values, bins=bin_edges, density=True)

        # Add a small epsilon to avoid division by zero
        epsilon = 1e-10
        hist_real = hist_real + epsilon
        hist_synth = hist_synth + epsilon

        # Calculate JSD
        jsd = jensen_shannon_divergence(hist_real, hist_synth)
        results['JSD'][feature_name] = jsd

        # Calculate Wasserstein Distance
        wd = wasserstein_distance(real_values, synth_values)
        results['WD'][feature_name] = wd

    # Calculate average metrics
    results['JSD_avg'] = np.mean(list(results['JSD'].values()))
    results['WD_avg'] = np.mean(list(results['WD'].values()))

    return results

def plot_loss_curves(model):
    """
    Plot the loss curves for the generator and critic
    """
    plt.figure(figsize=(10, 5))
    plt.plot(model.g_losses, label='Generator Loss')
    plt.plot(model.c_losses, label='Critic Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('CramerGAN Training Loss for Poker Hand Dataset')
    plt.legend()
    plt.grid(True)
    plt.savefig('poker_crgan_loss_curves.png')
    plt.close()

def plot_feature_distributions(real_data, synthetic_data, feature_names, n_features=10):
    """
    Plot distributions of real vs synthetic data for selected features
    """
    if n_features > len(feature_names):
        n_features = len(feature_names)

    # Select a subset of features to visualize
    selected_indices = np.random.choice(range(len(feature_names)), size=n_features, replace=False)

    plt.figure(figsize=(15, 20))
    for i, idx in enumerate(selected_indices):
        feature_name = feature_names[idx]

        plt.subplot(n_features, 1, i+1)

        # Get feature values
        real_values = real_data[:, idx]
        synth_values = synthetic_data[:, idx]

        # Plot histograms
        sns.histplot(real_values, kde=True, stat="density", label="Real", alpha=0.6, color="blue")
        sns.histplot(synth_values, kde=True, stat="density", label="Synthetic", alpha=0.6, color="red")

        plt.title(f"Distribution for {feature_name}")
        plt.legend()

    plt.tight_layout()
    plt.savefig('poker_feature_distributions.png')
    plt.close()

def plot_class_distribution(real_labels, synthetic_labels):
    """
    Plot the class distribution of real vs synthetic data
    """
    plt.figure(figsize=(12, 6))

    # Get class counts
    real_class_counts = np.bincount(real_labels.astype(int), minlength=10)
    synth_class_counts = np.bincount(synthetic_labels.astype(int), minlength=10)

    # Normalize
    real_class_dist = real_class_counts / np.sum(real_class_counts)
    synth_class_dist = synth_class_counts / np.sum(synth_class_counts)

    # Define class names
    class_names = [
        'Nothing in hand',
        'One pair',
        'Two pairs',
        'Three of a kind',
        'Straight',
        'Flush',
        'Full house',
        'Four of a kind',
        'Straight flush',
        'Royal flush'
    ]

    # Plot
    bar_width = 0.35
    x = np.arange(10)

    plt.bar(x - bar_width/2, real_class_dist, bar_width, label='Real Data', color='blue', alpha=0.7)
    plt.bar(x + bar_width/2, synth_class_dist, bar_width, label='Synthetic Data', color='red', alpha=0.7)

    plt.xlabel('Poker Hand')
    plt.ylabel('Proportion')
    plt.title('Class Distribution: Real vs Synthetic Poker Hands')
    plt.xticks(x, class_names, rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()
    plt.savefig('poker_class_distribution.png')
    plt.close()

# Main function
def main():
    # File paths
    train_path = 'data/poker-hand-training-true.csv'
    test_path = 'data/poker-hand-testing.csv'

    # Load and preprocess data
    X_train, y_train, X_test, y_test, preprocessor, feature_names = load_poker_data(train_path, test_path)

    # Create dataset and dataloader
    train_dataset = PokerDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

    # Initialize and train the model
    data_dim = X_train.shape[1]
    latent_dim = 100

    print(f"Data dimension: {data_dim}")
    print(f"Latent dimension: {latent_dim}")

    crgan = CramerGAN(data_dim, latent_dim)

    # Train the model
    epochs = 100
    print(f"Training CramerGAN for {epochs} epochs...")
    crgan.train(train_loader, epochs, save_interval=10)

    # Save the model
    crgan.save_model('poker_crgan_model.pt')

    # Plot loss curves
    plot_loss_curves(crgan)

    # Generate synthetic data
    num_samples = 1000
    print(f"Generating {num_samples} synthetic samples...")
    synthetic_data_raw = crgan.generate_samples(num_samples)

    # Post-process the synthetic data to make it valid poker hands
    synthetic_data_processed = post_process_poker_data(synthetic_data_raw, preprocessor)

    # Generate synthetic labels using a classifier trained on real data
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    synthetic_labels = clf.predict(synthetic_data_raw)

    # Plot class distribution
    plot_class_distribution(y_train, synthetic_labels)

    # Statistical similarity evaluation
    print("Evaluating statistical similarity...")
    stat_results = evaluate_statistical_similarity(X_train, synthetic_data_raw, feature_names)

    print("\nJensen-Shannon Divergence (average):", stat_results['JSD_avg'])
    print("Wasserstein Distance (average):", stat_results['WD_avg'])

    # Machine Learning Utility (TSTR) evaluation
    print("\nEvaluating Machine Learning Utility (TSTR)...")
    tstr_results = evaluate_tstr(X_train, synthetic_data_raw, y_train)

    print("\nTSTR Results:")
    for clf_name, metrics in tstr_results.items():
        print(f"{clf_name}: Accuracy = {metrics['accuracy']:.4f}, F1 Score = {metrics['f1_score']:.4f}")

    # Plot feature distributions
    plot_feature_distributions(X_train, synthetic_data_raw, feature_names)

    print("\nEvaluation complete! Check the output directory for plots and saved model.")



In [12]:
if __name__ == "__main__":
    main()

Training data shape: (25010, 11)
Testing data shape: (1000000, 11)
Processed training data shape: (25010, 25)
Processed testing data shape: (1000000, 25)
Data dimension: 25
Latent dimension: 100
Training CramerGAN for 100 epochs...
Epoch [1/100] | Critic Loss: -6389.4067 | Generator Loss: 142.7483 | Time: 69.83s
Epoch [11/100] | Critic Loss: -12541250863772.7344 | Generator Loss: 453898789449.1429 | Time: 50.62s
Epoch [21/100] | Critic Loss: -2613237636217208.0000 | Generator Loss: 106842132433857.3125 | Time: 49.75s
Epoch [31/100] | Critic Loss: -63612343801053856.0000 | Generator Loss: 2636557737739243.0000 | Time: 48.80s
Epoch [41/100] | Critic Loss: -702356441053372928.0000 | Generator Loss: 9841405389412226.0000 | Time: 49.78s
Epoch [51/100] | Critic Loss: -4426850062399958016.0000 | Generator Loss: 59454153814431288.0000 | Time: 50.90s
Epoch [61/100] | Critic Loss: -20006992484721840128.0000 | Generator Loss: 277621657833869888.0000 | Time: 49.23s
Epoch [71/100] | Critic Loss: -7