In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Define the Generator network
class Generator(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dims=[256, 256]):
        super(Generator, self).__init__()
        
        self.layers = nn.ModuleList()
        prev_dim = input_dim
        
        # Hidden layers
        for hidden_dim in hidden_dims:
            self.layers.append(nn.Linear(prev_dim, hidden_dim))
            self.layers.append(nn.BatchNorm1d(hidden_dim))
            self.layers.append(nn.LeakyReLU(0.2))
            prev_dim = hidden_dim
            
        # Output layer
        self.layers.append(nn.Linear(prev_dim, output_dim))
        self.layers.append(nn.Tanh())  # Output values between -1 and 1
        
    def forward(self, x):
        for layer in self.layers:
            if isinstance(layer, nn.BatchNorm1d) and len(x) == 1:
                continue  # Skip batch norm for single sample
            x = layer(x)
        return x

# Define the Discriminator network
class Discriminator(nn.Module):
    def __init__(self, input_dim, hidden_dims=[256, 128]):
        super(Discriminator, self).__init__()
        
        self.layers = nn.ModuleList()
        prev_dim = input_dim
        
        # Hidden layers
        for hidden_dim in hidden_dims:
            self.layers.append(nn.Linear(prev_dim, hidden_dim))
            self.layers.append(nn.LeakyReLU(0.2))
            self.layers.append(nn.Dropout(0.3))
            prev_dim = hidden_dim
            
        # Output layer
        self.layers.append(nn.Linear(prev_dim, 1))
        self.layers.append(nn.Sigmoid())  # Output probability
        
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

# Simple CTGAN class
class SimpleCTGAN:
    def __init__(self, 
                 latent_dim=100, 
                 hidden_dims_gen=[256, 256],
                 hidden_dims_disc=[256, 128],
                 lr=0.0002,
                 beta1=0.5,
                 beta2=0.999):
        
        self.latent_dim = latent_dim
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        
        # Initialize networks
        self.generator = None
        self.discriminator = None
        
        # Hyperparameters
        self.hidden_dims_gen = hidden_dims_gen
        self.hidden_dims_disc = hidden_dims_disc
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        
        # Track losses
        self.g_losses = []
        self.d_losses = []
        
        # Column info
        self.columns = None
        self.continuous_columns = None
        self.categorical_columns = None
        self.categorical_dimensions = None
    
    def fit(self, data, continuous_columns=None, categorical_columns=None, epochs=300, batch_size=64):
        """
        Train the CTGAN on the provided data
        
        Parameters:
        -----------
        data: pandas DataFrame
            The training data
        continuous_columns: list
            List of column names with continuous data
        categorical_columns: list
            List of column names with categorical data
        epochs: int
            Number of training epochs
        batch_size: int
            Batch size for training
        """
        # Store column information
        self.columns = data.columns.tolist()
        
        # Identify continuous and categorical columns if not provided
        if continuous_columns is None and categorical_columns is None:
            self.continuous_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
            self.categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
        else:
            self.continuous_columns = continuous_columns if continuous_columns else []
            self.categorical_columns = categorical_columns if categorical_columns else []
        
        # Process categorical variables
        self.categorical_dimensions = {}
        transformed_data = data.copy()
        
        for column in self.categorical_columns:
            dummies = pd.get_dummies(data[column], prefix=column)
            transformed_data = pd.concat([transformed_data.drop(column, axis=1), dummies], axis=1)
            self.categorical_dimensions[column] = dummies.columns.tolist()
        
        # Only keep the relevant columns after transformation
        processed_columns = self.continuous_columns.copy()
        for column in self.categorical_columns:
            processed_columns.extend(self.categorical_dimensions[column])
        
        transformed_data = transformed_data[processed_columns]
        
        # Scale the data
        scaled_data = self.scaler.fit_transform(transformed_data)
        
        # Convert to PyTorch tensor
        tensor_data = torch.FloatTensor(scaled_data).to(self.device)
        dataset = TensorDataset(tensor_data)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        
        # Initialize networks
        output_dim = scaled_data.shape[1]
        self.generator = Generator(self.latent_dim, output_dim, self.hidden_dims_gen).to(self.device)
        self.discriminator = Discriminator(output_dim, self.hidden_dims_disc).to(self.device)
        
        # Initialize optimizers
        g_optimizer = optim.Adam(self.generator.parameters(), lr=self.lr, betas=(self.beta1, self.beta2))
        d_optimizer = optim.Adam(self.discriminator.parameters(), lr=self.lr, betas=(self.beta1, self.beta2))
        
        # Loss function
        criterion = nn.BCELoss()
        
        # Training loop
        for epoch in range(epochs):
            g_epoch_loss = 0.0
            d_epoch_loss = 0.0
            
            for batch_idx, (real_data,) in enumerate(dataloader):
                batch_size = real_data.size(0)
                
                # Create labels
                real_labels = torch.ones(batch_size, 1).to(self.device)
                fake_labels = torch.zeros(batch_size, 1).to(self.device)
                
                # Train Discriminator
                d_optimizer.zero_grad()
                
                # With real data
                real_outputs = self.discriminator(real_data)
                d_real_loss = criterion(real_outputs, real_labels)
                
                # With fake data
                noise = torch.randn(batch_size, self.latent_dim).to(self.device)
                fake_data = self.generator(noise)
                fake_outputs = self.discriminator(fake_data.detach())
                d_fake_loss = criterion(fake_outputs, fake_labels)
                
                # Combine losses and backpropagate
                d_loss = d_real_loss + d_fake_loss
                d_loss.backward()
                d_optimizer.step()
                
                d_epoch_loss += d_loss.item()
                
                # Train Generator
                g_optimizer.zero_grad()
                
                # Generate fake data and try to fool discriminator
                noise = torch.randn(batch_size, self.latent_dim).to(self.device)
                fake_data = self.generator(noise)
                fake_outputs = self.discriminator(fake_data)
                
                # Generator wants discriminator to classify fake as real
                g_loss = criterion(fake_outputs, real_labels)
                g_loss.backward()
                g_optimizer.step()
                
                g_epoch_loss += g_loss.item()
            
            # Record losses
            avg_g_loss = g_epoch_loss / len(dataloader)
            avg_d_loss = d_epoch_loss / len(dataloader)
            self.g_losses.append(avg_g_loss)
            self.d_losses.append(avg_d_loss)
            
            # Print progress
            if (epoch + 1) % 50 == 0 or epoch == 0:
                print(f'Epoch [{epoch+1}/{epochs}], G Loss: {avg_g_loss:.4f}, D Loss: {avg_d_loss:.4f}')
    
    def generate(self, n_samples=100):
        """
        Generate synthetic samples
        
        Parameters:
        -----------
        n_samples: int
            Number of samples to generate
            
        Returns:
        --------
        pandas DataFrame with synthetic data
        """
        if self.generator is None:
            raise ValueError("Model hasn't been trained yet. Call fit() first.")
        
        # Generate noise
        noise = torch.randn(n_samples, self.latent_dim).to(self.device)
        
        # Generate synthetic data
        with torch.no_grad():
            synthetic_data = self.generator(noise).cpu().numpy()
        
        # Inverse transform to get original scale
        synthetic_data = self.scaler.inverse_transform(synthetic_data)
        
        # Convert to DataFrame
        synthetic_df = pd.DataFrame(synthetic_data, columns=self.continuous_columns.copy())
        
        # We need to post-process the data to get back the categorical variables
        for column in self.categorical_columns:
            # Get the one-hot encoded columns for this category
            category_columns = self.categorical_dimensions[column]
            
            # Get the indices of these columns in the DataFrame
            column_indices = [self.continuous_columns.index(cat_col) for cat_col in category_columns 
                             if cat_col in self.continuous_columns]
            
            if column_indices:
                # Extract the values
                category_values = synthetic_df.iloc[:, column_indices].values
                
                # Get the index of the max value for each row (one-hot encoding)
                max_indices = np.argmax(category_values, axis=1)
                
                # Map back to the original categories
                original_categories = [category_columns[idx].split('_')[-1] for idx in max_indices]
                
                # Add the column back to the DataFrame
                synthetic_df[column] = original_categories
                
                # Remove the one-hot encoded columns
                synthetic_df = synthetic_df.drop(columns=[category_columns[idx] for idx in column_indices])
        
        return synthetic_df
    
    def plot_losses(self):
        """Plot generator and discriminator losses"""
        plt.figure(figsize=(10, 5))
        plt.plot(self.g_losses, label='Generator Loss')
        plt.plot(self.d_losses, label='Discriminator Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title('Training Losses')
        plt.legend()
        plt.grid(True)
        plt.show()

# Example usage
def example_usage():
    # Create some sample data
    np.random.seed(42)
    n_samples = 1000
    
    # Continuous features
    age = np.random.normal(loc=45, scale=15, size=n_samples)
    income = np.random.lognormal(mean=10.5, sigma=0.5, size=n_samples)
    
    # Categorical features
    gender = np.random.choice(['Male', 'Female'], size=n_samples)
    education = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], 
                               p=[0.4, 0.3, 0.2, 0.1], 
                               size=n_samples)
    
    # Create DataFrame
    data = pd.DataFrame({
        'Age': age,
        'Income': income,
        'Gender': gender,
        'Education': education
    })
    
    # Train the model
    ctgan = SimpleCTGAN(latent_dim=64, hidden_dims_gen=[128, 128], hidden_dims_disc=[128, 64])
    continuous_cols = ['Age', 'Income']
    categorical_cols = ['Gender', 'Education']
    
    print("Training CTGAN model...")
    ctgan.fit(data, continuous_columns=continuous_cols, categorical_columns=categorical_cols, 
             epochs=300, batch_size=64)
    
    # Generate synthetic data
    print("Generating synthetic data...")
    synthetic_data = ctgan.generate(n_samples=500)
    
    # Display some statistics
    print("\nOriginal Data Statistics:")
    print(data.describe())
    
    print("\nSynthetic Data Statistics:")
    print(synthetic_data.describe())
    
    # Plot distributions
    fig, axs = plt.subplots(2, 2, figsize=(15, 10))
    
    # Age distribution
    axs[0, 0].hist(data['Age'], bins=20, alpha=0.5, label='Original')
    axs[0, 0].hist(synthetic_data['Age'], bins=20, alpha=0.5, label='Synthetic')
    axs[0, 0].set_title('Age Distribution')
    axs[0, 0].legend()
    
    # Income distribution
    axs[0, 1].hist(data['Income'], bins=20, alpha=0.5, label='Original')
    axs[0, 1].hist(synthetic_data['Income'], bins=20, alpha=0.5, label='Synthetic')
    axs[0, 1].set_title('Income Distribution')
    axs[0, 1].legend()
    
    # Gender distribution
    orig_gender_counts = data['Gender'].value_counts(normalize=True)
    syn_gender_counts = synthetic_data['Gender'].value_counts(normalize=True)
    
    axs[1, 0].bar(orig_gender_counts.index, orig_gender_counts.values, alpha=0.5, label='Original')
    axs[1, 0].bar(syn_gender_counts.index, syn_gender_counts.values, alpha=0.5, label='Synthetic')
    axs[1, 0].set_title('Gender Distribution')
    axs[1, 0].legend()
    
    # Education distribution
    orig_edu_counts = data['Education'].value_counts(normalize=True)
    syn_edu_counts = synthetic_data['Education'].value_counts(normalize=True)
    
    axs[1, 1].bar(orig_edu_counts.index, orig_edu_counts.values, alpha=0.5, label='Original')
    axs[1, 1].bar(syn_edu_counts.index, syn_edu_counts.values, alpha=0.5, label='Synthetic')
    axs[1, 1].set_title('Education Distribution')
    axs[1, 1].set_xticklabels(orig_edu_counts.index, rotation=45)
    axs[1, 1].legend()
    
    plt.tight_layout()
    plt.show()
    
    # Plot training losses
    ctgan.plot_losses()
    
    return ctgan, data, synthetic_data

if __name__ == "__main__":
    example_usage()

Training CTGAN model...
Epoch [1/300], G Loss: 0.7274, D Loss: 1.4008
Epoch [50/300], G Loss: 0.8291, D Loss: 1.2470
Epoch [100/300], G Loss: 0.8121, D Loss: 1.2833
Epoch [150/300], G Loss: 0.8120, D Loss: 1.2916
Epoch [200/300], G Loss: 0.7933, D Loss: 1.2990
Epoch [250/300], G Loss: 0.7746, D Loss: 1.3033
Epoch [300/300], G Loss: 0.7929, D Loss: 1.3015
Generating synthetic data...


ValueError: Shape of passed values is (500, 8), indices imply (500, 2)