In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import os

from torch.optim.lr_scheduler import StepLR, MultiStepLR, ExponentialLR, ReduceLROnPlateau, CosineAnnealingLR, LambdaLR
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")
import wandb
import transformer
from transformer import CondVec, CTGANDataset, DataTransformer, TransformedCTGANDataset
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

data = pd.DataFrame({
    'fruits': ['pomme', 'banane', 'pomme', 'orange', 'banane']
})

# Catégoriser la colonne 'fruits' avec des catégories prédéfinies
data['fruits_cat'] = pd.Categorical(data['fruits'], categories=['pomme', 'banane', 'orange', 'fraise'])
pd.get_dummies(data)


Unnamed: 0,fruits_banane,fruits_orange,fruits_pomme,fruits_cat_pomme,fruits_cat_banane,fruits_cat_orange,fruits_cat_fraise
0,False,False,True,True,False,False,False
1,True,False,False,False,True,False,False
2,False,False,True,True,False,False,False
3,False,True,False,False,False,True,False
4,True,False,False,False,True,False,False


In [38]:
class SmallRes(nn.Module):
    def __init__(self, d_input, d_output):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(d_input,d_output),
            nn.BatchNorm1d(d_output),
            nn.LeakyReLU(0.2)
        )
    def forward(self, x):
        out = self.layers(x)
        return torch.cat([x, out], dim = 1)
class Generator(nn.Module):
    def __init__(self, input_dim, output_dim, n_categories=0, hidden_dims=[256, 256]):
        super(Generator, self).__init__()
        seq = []
        dim = input_dim + n_categories
        for index in hidden_dims:
            seq.append(SmallRes(dim,index))
            dim += index
        seq.append(nn.Dropout(0.3))
        seq.append(nn.Linear(dim, output_dim))
        self.layers = nn.Sequential(*seq)
                
    def forward(self, noise, cond_vec=None):
        if cond_vec is not None:
            x = torch.cat([noise, cond_vec], dim=1)

        else:
            x = noise
            
        for layer in self.layers:
            x = layer(x)
        return x
    
class Discriminator(nn.Module):
    def __init__(self, input_dim, n_categories=0, hidden_dims=[256, 128]):
        super(Discriminator, self).__init__()
        
        self.input_dim = input_dim
        self.pac = 1  # Default no pac
        
        # Placeholder for main layers - will be initialized in set_pac
        self.main_layers = None
        self.output_layer = None
        self.sigmoid = nn.Sigmoid()
        
        # Store parameters for layer initialization
        self.hidden_dims = hidden_dims
        self.n_categories = n_categories
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Initialize layers with default pac=1
        self._init_layers()
            
    def _init_layers(self):
        """Initialize network layers based on current pac value"""
        pac_input_dim = self.input_dim * self.pac
        
        self.main_layers = nn.ModuleList()
        for i in range(len(self.hidden_dims)):
            if i == 0:
                self.main_layers.append(nn.Linear(pac_input_dim, self.hidden_dims[i]))
            else:
                self.main_layers.append(nn.Linear(self.hidden_dims[i-1], self.hidden_dims[i]))
            self.main_layers.append(nn.LeakyReLU(0.2))
            self.main_layers.append(nn.Dropout(0.2))
        self.main_layers.to(self.device)
        self.output_layer = nn.Linear(self.hidden_dims[-1], 1, device= self.device)
        
        # Conditional embedding layers
        self.cond_layers = None
        if self.n_categories > 0:
            
            self.cond_layers = nn.Sequential(
                nn.Linear(self.n_categories, pac_input_dim),
                nn.ReLU()
            ).to(self.device)



    """ def calc_gradient_penalty(self, real_data, fake_data, lambda_=10):
        //Compute the gradient penalty.
        alpha = torch.rand(real_data.size(0) // self.pac, 1, 1, device=self.device)
        alpha = alpha.repeat(1, self.pac, real_data.size(1))
        alpha = alpha.view(-1, real_data.size(1))

        interpolates = alpha * real_data + ((1 - alpha) * fake_data)

        disc_interpolates = self(interpolates)

        gradients = torch.autograd.grad(
            outputs=disc_interpolates,
            inputs=interpolates,
            grad_outputs=torch.ones(disc_interpolates.size(), device=device),
            create_graph=True,
            retain_graph=True,
            only_inputs=True,
        )[0]

        gradients_view = gradients.view(-1, self.pac * real_data.size(1)).norm(2, dim=1) - 1
        gradient_penalty = ((gradients_view) ** 2).mean() * lambda_

        return gradient_penalty"""    
    def set_pac(self, pac):
        """Update the model to handle pac-sized inputs"""
        self.pac = pac
        self._init_layers()
        
    def forward(self, x, cond_vec=None):
        if cond_vec is not None and self.cond_layers is not None:
            cond = self.cond_layers(cond_vec)
            x = x + cond
            
        for layer in self.main_layers:
            x = layer(x)
            
        x = self.output_layer(x)
        return self.sigmoid(x)

class PacGan(nn.Module):
    """PacGAN discriminator for improved GAN training stability."""
    def __init__(self, discriminator, pac=10):
        super(PacGan, self).__init__()
        self.discriminator = discriminator
        self.pac = pac
        # Tell the discriminator about pac to handle dimensions
        if hasattr(self.discriminator, 'set_pac'):
            self.discriminator.set_pac(pac)
        
    def forward(self, x, cond_vec=None):
        batch_size = x.size(0)
        if batch_size % self.pac != 0:
            # Padding to make divisible by pac
            pad_size = self.pac - (batch_size % self.pac)
            indices = np.random.choice(batch_size, pad_size)
            x = torch.cat([x, x[indices]], dim=0)
            if cond_vec is not None:
                cond_vec = torch.cat([cond_vec, cond_vec[indices]], dim=0)
                
        # Reshape x for PacGAN structure
        new_batch_size = x.size(0) // self.pac
        x_reshaped = x.view(new_batch_size, self.pac * x.size(1))
        
        # For conditional vectors, we need to have one per batch
        if cond_vec is not None:
            # Take one conditional vector per pac group
            cond_vec_reshaped = cond_vec.view(new_batch_size, self.pac, cond_vec.size(1))
            cond_vec_flat = cond_vec_reshaped[:, 0, :]  # Just take the first one
            return self.discriminator(x_reshaped, cond_vec_flat)
        else:
            return self.discriminator(x_reshaped, None)

class CTGAN:
    def __init__(self, categorical_columns=None, noise_dim=100, batch_size=500, 
                 generator_lr=2e-4, discriminator_lr=2e-4, pac=10):
        self.categorical_columns = categorical_columns if categorical_columns else []
        self.noise_dim = noise_dim
        self.batch_size = batch_size
        self.generator_lr = generator_lr
        self.discriminator_lr = discriminator_lr
        self.pac = pac
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        self.transformer = None
        self.dataset = None
        self.train = None
        self.val = None
        self.generator = None
        self.discriminator = None
        self.generator_optimizer = None
        self.discriminator_optimizer = None
    def fit(self, data, val_size= 0.2, epochs=300):
        """Fit CTGAN to the data with train and validation datasets and learning rate scheduling"""
        # Create datasets
        self.dataset = CTGANDataset(data, categorical_columns=self.categorical_columns)
        categorical_dims = self.dataset.get_categorical_dims()
        #########################################################################################################################################
        train, val = self.dataset.train_val_split(val_ratio= 0.2)
        
        self.train = TransformedCTGANDataset(train, categorical_columns=self.categorical_columns, categorical_dims=categorical_dims)
        self.val = TransformedCTGANDataset(val, categorical_columns=self.categorical_columns, categorical_dims=categorical_dims)
        
        # Use the same transformer for both datasets
        self.transformer = self.dataset.transformer
        
        # Calculate dimensions
        data_dim = self.dataset.transformed_data.shape[1]
        n_categories = 0
        if self.train.cond_vec:
            n_categories = self.dataset.cond_vec.n_categories
        
        # Initialize models
        self.generator = Generator(
            input_dim=self.noise_dim, 
            output_dim=data_dim,
            n_categories=n_categories
        ).to(self.device)
        
        discriminator = Discriminator(
            input_dim=data_dim,
            n_categories=n_categories
        ).to(self.device)
        self.discriminator = PacGan(discriminator, pac=self.pac)
        
        # Setup optimizers
        self.generator_optimizer = optim.Adam(
            self.generator.parameters(), 
            lr=self.generator_lr, 
            betas=(0.5, 0.9)
        )
        
        self.discriminator_optimizer = optim.Adam(
            self.discriminator.parameters(), 
            lr=self.discriminator_lr, 
            betas=(0.5, 0.9)
        )
        
        # Setup learning rate schedulers with proper optimizer references
        generator_scheduler = ReduceLROnPlateau(
            self.generator_optimizer, 
            'min', 
            patience=10, 
            factor=0.2
        )
        
        discriminator_scheduler = ReduceLROnPlateau(
            self.discriminator_optimizer, 
            'min', 
            patience=10, 
            factor=0.2
        )
        
        # Loss function
        criterion = nn.BCELoss()
        
        # Ensure batch size is a multiple of pac for PacGAN
        batch_size = (self.batch_size // self.pac) * self.pac
        if batch_size == 0:
            batch_size = self.pac
            
        run = wandb.init(project="Data Augmentation - CTGAN", name="ctgan", config={
        "epochs": epochs,
        "batch_size": self.batch_size,
        "generator_lr": self.generator_lr,
        "discriminator_lr": self.discriminator_lr,
        "pac": self.pac,
        "noise_dim": self.noise_dim,
        "categorical_columns": self.categorical_columns,
        "data_dim": data_dim,
        "n_categories": n_categories
    })
        # Training loop
        print("Training CTGAN model...")
        for epoch in range(epochs):
            self.generator.train()
            self.discriminator.train()
            
            g_losses = []
            d_losses = []
            
            for _ in range(max(1, len(self.train) // batch_size)):
                # Get real data and conditional vectors from training set
                real_data, cond_vec, mask = self.train.sample(batch_size)
                if _ == 1:
                    print(real_data)
                    print(cond_vec)
                    print(mask)
                real_data = torch.from_numpy(real_data).to(self.device)
                if cond_vec is not None:
                    
                    #cond_vec = torch.from_numpy(cond_vec).to(self.device)
                    cond_vec = cond_vec.to(self.device)
                    #mask = torch.from_numpy(mask).to(self.device)
                    mask = mask.to(self.device)
                
                # Labels for real and fake data
                real_labels = torch.ones(batch_size // self.pac, 1).to(self.device)
                fake_labels = torch.zeros(batch_size // self.pac, 1).to(self.device)
                
                # Train Discriminator
                self.discriminator_optimizer.zero_grad()
                
                # Real data loss
                outputs = self.discriminator(real_data, cond_vec)
                d_real_loss = criterion(outputs, real_labels)
                
                # Generate fake data
                noise = torch.randn(batch_size, self.noise_dim).to(self.device)
                fake_data = self.generator(noise, cond_vec)
                
                outputs = self.discriminator(fake_data.detach(), cond_vec)
                d_fake_loss = criterion(outputs, fake_labels)
                
                d_loss = d_real_loss + d_fake_loss
                d_loss.backward()
                self.discriminator_optimizer.step()
                
                # Train Generator
                self.generator_optimizer.zero_grad()
                
                outputs = self.discriminator(fake_data, cond_vec)
                g_loss = criterion(outputs, real_labels)
                
                g_loss.backward()
                self.generator_optimizer.step()
                
                g_losses.append(g_loss.item())
                d_losses.append(d_loss.item())
            
            # Validation phase
            self.generator.eval()
            self.discriminator.eval()
            
            g_val_losses = []
            d_val_losses = []
            d_val_real_losses = []
            d_val_fake_losses = []
            
            with torch.no_grad():
                for _ in range(max(1, len(self.val) // batch_size)):
                    # Get real data and conditional vectors from validation set
                    real_val_data, val_cond_vec, val_mask = self.val.sample(batch_size)
                    real_val_data = torch.from_numpy(real_val_data).to(self.device)
                    
                    if val_cond_vec is not None:
                        #val_cond_vec = torch.from_numpy(val_cond_vec).to(self.device)
                        #val_mask = torch.from_numpy(val_mask).to(self.device)
                        val_cond_vec = val_cond_vec.to(self.device)
                        val_mask = val_mask.to(self.device)
                        
                    
                    # Labels for real and fake validation data
                    real_labels = torch.ones(batch_size // self.pac, 1).to(self.device)
                    fake_labels = torch.zeros(batch_size // self.pac, 1).to(self.device)
                    
                    # Validation - Discriminator
                    val_outputs = self.discriminator(real_val_data, val_cond_vec)
                    d_val_real_loss = criterion(val_outputs, real_labels)
                    d_val_real_losses.append(d_val_real_loss.item())
                    
                    # Generate fake validation data
                    val_noise = torch.randn(batch_size, self.noise_dim).to(self.device)
                    val_fake_data = self.generator(val_noise, val_cond_vec)
                    
                    val_outputs = self.discriminator(val_fake_data, val_cond_vec)
                    d_val_fake_loss = criterion(val_outputs, fake_labels)
                    d_val_fake_losses.append(d_val_fake_loss.item())
                    
                    d_val_loss = d_val_real_loss + d_val_fake_loss
                    
                    # Validation - Generator
                    g_val_loss = criterion(val_outputs, real_labels)
                    
                    g_val_losses.append(g_val_loss.item())
                    d_val_losses.append(d_val_loss.item())
            
            # Calculate average losses
            avg_g_loss = np.mean(g_losses)
            avg_d_loss = np.mean(d_losses)
            avg_g_val_loss = np.mean(g_val_losses)
            avg_d_val_loss = np.mean(d_val_losses)
            avg_d_val_real_loss = np.mean(d_val_real_losses)
            avg_d_val_fake_loss = np.mean(d_val_fake_losses)
            
            # Update learning rates based on validation losses
            generator_scheduler.step(avg_g_val_loss)
            discriminator_scheduler.step(avg_d_val_loss)
            
            # Get current learning rates
            g_lr = self.generator_optimizer.param_groups[0]['lr']
            d_lr = self.discriminator_optimizer.param_groups[0]['lr']
            wandb.log({
            "epoch": epoch,
            "g_loss": avg_g_loss,
            "d_loss": avg_d_loss,
            "g_val_loss": avg_g_val_loss,
            "d_val_loss": avg_d_val_loss,
            "d_val_real_loss": avg_d_val_real_loss,
            "d_val_fake_loss": avg_d_val_fake_loss,
            "generator_lr": g_lr,
            "discriminator_lr": d_lr
        })
            if (epoch + 1) % 10 == 0 or epoch == 0:
                print(f"Epoch [{epoch+1}/{epochs}], "
                      f"Train - G Loss: {avg_g_loss:.4f}, D Loss: {avg_d_loss:.4f}, "
                      f"Val - G Loss: {avg_g_val_loss:.4f}, D Loss: {avg_d_val_loss:.4f}, "
                      f"LR - Generator: {g_lr:.6f}, Discriminator: {d_lr:.6f}")
        wandb.finish()
        save = int(input("Save? 0 is not save, other is save"))
        is_saved = False
        if save:
            file_name = input("Tap model's name:" )
            while not is_saved:
                if os.path.exists(f"models/{file_name}.pth"):
                    print("Model's name existes!!!!")
                    file_name = input("Tap model's name:" )
                else:
                    self.save(f"models/{file_name}.pth")
                    is_saved = True
                    print("Done!!!!")
            return 1
    def generate(self, n_samples, conditions= None):
        """Generate synthetic samples with optional conditioning."""
        if self.generator is None:
            raise RuntimeError("Model not trained. Call fit() first.")
        
        self.generator.eval()
        
        steps = n_samples // self.batch_size + 1
        data = []
        
        for i in range(steps):
            n_batch = min(self.batch_size, n_samples - i * self.batch_size)
            if n_batch <= 0:
                break
                
            # Generate noise
            noise = torch.randn(n_batch, self.noise_dim).to(self.device)
            
            # Generate conditional vector if necessary
            cond_vec = None
            if self.dataset.cond_vec and conditions:
                cond_vec = self.dataset.cond_vec.generate_conditional_vector(conditions, n_batch)
                print(cond_vec)
                cond_vec = cond_vec.to(self.device)
                
            # Generate data
            with torch.no_grad():
                
                fake = self.generator(noise, cond_vec)
            data.append(fake.cpu().numpy())
            
        data = np.concatenate(data, axis=0)
        
        # Convert to the original data format
        synthetic_data = self.transformer.inverse_transform(data[:n_samples])
        return synthetic_data
    
    def save(self, path):
        """Save the model."""
        if self.generator is None or self.discriminator is None:
            raise RuntimeError("Model not trained. Call fit() first.")
        
        """state = {
            'generator': self.generator.state_dict(),
            'discriminator': self.discriminator.state_dict(),
            'noise_dim': self.noise_dim,
            'categorical_columns': self.categorical_columns,
            'transformer': self.transformer
        }
        
        torch.save(state, path)"""
        checkpoint = {
            'generator': self.generator.state_dict(),
            'discriminator': self.discriminator.state_dict(),
            'generator_optimizer': self.generator_optimizer.state_dict(),
            'discriminator_optimizer': self.discriminator_optimizer.state_dict(),
            'categorical_columns': self.categorical_columns,
            'noise_dim': self.noise_dim,
            'transformer': self.transformer  
        }
        torch.save(checkpoint, path)
    
    def load(self, path):
        """Load the model."""
        state = torch.load(path, map_location=self.device, weights_only= False)
        self.generator_optimizer = state['generator_optimizer']
        self.discriminator_optimizer = state['discriminator_optimizer']
        self.noise_dim = state['noise_dim']
        self.categorical_columns = state['categorical_columns']
        self.transformer = state['transformer']
        
        # Recreate the dataset and models
        n_categories = 0
        data_dim = 0
        
        if hasattr(self.transformer, 'categorical_dims'):
            n_categories = sum(self.transformer.categorical_dims.values())
            if hasattr(self.transformer, 'continuous_gmms'):
                continuous_dims = sum([gmm.n_components + 1 for gmm in self.transformer.continuous_gmms.values()])
                data_dim = n_categories + continuous_dims
        
        self.generator = Generator(
            input_dim=self.noise_dim,
            output_dim=data_dim,
            n_categories=n_categories
        ).to(self.device)
        
        discriminator = Discriminator(
            input_dim=data_dim,
            n_categories=n_categories
        ).to(self.device)
        
        self.discriminator = PacGan(discriminator, pac=self.pac)
        
        self.generator.load_state_dict(state['generator'])
        self.discriminator.load_state_dict(state['discriminator'])
        
        self.generator.eval()
        self.discriminator.eval()


# Test


In [5]:
# Example usage
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing(as_frame=True).frame
data['HouseAge_Cat'] = pd.cut(data['HouseAge'], bins=5, labels=False)
categorical_columns = ['HouseAge_Cat']
ctgan = CTGAN(categorical_columns=categorical_columns, pac=5, generator_lr=0.005, discriminator_lr=0.002, batch_size= 500)  # Ajusté le pac à 5 pour éviter des problèmes de dimensionnalité
ctgan.fit(data , epochs=50)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtuanhquanle[0m ([33mtuanhquanle-insa-toulouse[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Training CTGAN model...
-------R D------------
tensor([[0.0000e+00, 0.0000e+00, 1.0000e+00,  ..., 0.0000e+00, 2.0355e-01,
         1.2932e-13],
        [0.0000e+00, 0.0000e+00, 1.0000e+00,  ..., 0.0000e+00, 7.7352e-02,
         8.0182e-06],
        [0.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 5.2669e-13,
         8.3012e-31],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 5.6294e-06,
         3.8154e-22],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 3.2678e-01,
         5.7652e-13],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 8.5811e-01,
         2.7577e-08]], device='cuda:0')
--------------C V --------------
tensor([[0., 0., 0., 0., 1.],
        [0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0.],
        ...,
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.],
        [0., 0., 1., 0., 0.]])
--------------------------
-------R D------------
tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 

KeyboardInterrupt: 

In [None]:
file = "models\ctgan1.pth"
ctgan_new = CTGAN()
ctgan_new.load(file)
conditioned_data = ctgan_new.generate(
        n_samples=7,
        conditions={'HouseAge_Cat': 2}
    )
print("\nReal data :")
print(data)
print("\nReal data statistics:")
print(data.describe())
print("\nConditioned data (HouseAge_Cat = 2) :")
print(conditioned_data)
print("\nConditioned data (HouseAge_Cat = 2) statistics:")
print(conditioned_data.describe())

# Test1

In [None]:
a['C'].values

In [40]:
a = pd.DataFrame({
    'A': [1,2,1,2,1,2,1,3,3,2,2,2,1,1,1], 
    'B': ['z','z','zz', 'z', 'z','z','z','zz', 'z', 'z','z','z','zz', 'z', 'z'], 
    'C': [0.99404096, 0.58273721, 0.21701061, 0.1175965,  0.68291119, 0.62865904, 
          0.68754258, 0.51539969, 0.70036077, 0.94512348, 0.13780938, 0.04576671,
          0.0784216, 0.19138225, 0.78545446]
})
categorical_columns = ['A', 'B']
ctgan = CTGAN(categorical_columns=categorical_columns, pac=5, generator_lr=0.005, discriminator_lr=0.002, batch_size= 5)
ctgan.fit(data= a, epochs=1)


Training CTGAN model...
[[ 0.0000000e+00  1.0000000e+00  0.0000000e+00  1.0000000e+00
   0.0000000e+00 -2.3506643e-01  8.6826862e-13  1.0000000e+00]
 [ 0.0000000e+00  1.0000000e+00  0.0000000e+00  1.0000000e+00
   0.0000000e+00 -2.3506643e-01  8.6826862e-13  1.0000000e+00]
 [ 1.0000000e+00  0.0000000e+00  0.0000000e+00  1.0000000e+00
   0.0000000e+00  4.4829467e-01  2.8025969e-45  1.0000000e+00]
 [ 1.0000000e+00  0.0000000e+00  0.0000000e+00  1.0000000e+00
   0.0000000e+00 -6.0937427e-02  1.9675944e-19  1.0000000e+00]
 [ 0.0000000e+00  1.0000000e+00  0.0000000e+00  1.0000000e+00
   0.0000000e+00 -2.3506643e-01  8.6826862e-13  1.0000000e+00]]
tensor([[0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1.]])
tensor([[1., 0.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [0., 1.]])
Epoch [1/100], Train - G Loss: 0.7655, D Loss: 1.3928, Val - G Loss: 0.7300, D Loss: 1.3092, LR - Generator:

0,1
d_loss,▅▅▅▅▅▅▃▄▃▃▃▂▄▃▂▃▄▇█▁▄▅▅▄▄▄▄▄▄▄▃▂▃▃▇▁▃▄▃▄
d_val_fake_loss,▆▅▅▃▄▄▂▃▄▄▂▁▃▂▃▃█▃▂▅▂▂▃▆▂▁▃▄▆▆▃▂▁▄▃▇▃▆▄▁
d_val_loss,▅▆▇▄▄▄▅▃▄▃▃█▄▃▄▃▂▄▃▄▁▃▆▃▁▂▅▅▃█▂▁▄▃▃▅▂▄▅▅
d_val_real_loss,▄██▅▅▆▇▇▇▅▅▃▂▃▂▃▃▁▁▂▃▃▄▂▃▂▂▇▃▂▂▄▅▂▅▅▃▂▂▆
discriminator_lr,██████████▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇██
g_loss,▃▂▂▃▅█▇▆▇█▇▇▆▅▂▆▄▃█▄▅▃▁▃▅▅▂▄▃▄▃▆▃▆▄▁▆▅▃▆
g_val_loss,▂▂▂▂▃▃▄▄▅▃▃▂▂▁▃▃▃▂▂█▂▄▄▃▂▄▃▆▃▃▄▁▅▆▃▂▄▁▂▆
generator_lr,██████████▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
d_loss,1.20086
d_val_fake_loss,0.10369
d_val_loss,0.58148
d_val_real_loss,0.47779
discriminator_lr,0.0
epoch,99.0
g_loss,1.43494
g_val_loss,2.31774
generator_lr,0.0


Save? 0 is not save, other is save 000


In [82]:
ctgan.generate(conditions={'A': 0}, n_samples=7)

Unnamed: 0,A,B,C
0,0,0,-0.093204
1,0,0,0.092794
2,0,0,-0.158658
3,0,0,-0.02923
4,0,0,0.417205
5,0,0,0.09215
6,0,1,1.615696


In [71]:
import pandas as pd
import numpy as np
from ctgan import CTGAN
from sklearn.preprocessing import LabelEncoder

# Create the original DataFrame
a = pd.DataFrame({
    'A': [1,2,1,2,1,2,1,3,3,2,2,2,1,1,1], 
    'B': ['z','z','zz', 'z', 'z','z','z','zz', 'z', 'z','z','z','zz', 'z', 'z'], 
    'C': [0.99404096, 0.58273721, 0.21701061, 0.1175965,  0.68291119, 0.62865904, 
          0.68754258, 0.51539969, 0.70036077, 0.94512348, 0.13780938, 0.04576671,
          0.0784216, 0.19138225, 0.78545446]
})

# Create a copy of the dataframe to preserve original
df = a.copy()

# Identify categorical columns
categorical_columns = ['A', 'B']
numeric_columns = ['C']

# Preprocessing: Label Encoding for categorical columns
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Initialize the CTGAN model
ctgan = CTGAN(
    epochs=100,  # Number of training epochs
    batch_size=500,  # Batch size for training
    verbose=True  # Show training progress
)

# Fit the model to the preprocessed data
ctgan.fit(df)

# Generate synthetic data
synthetic_df = ctgan.sample(len(df))

# Safe inverse transform function
def safe_inverse_transform(le, y):
    # Clip values to the range of known labels
    max_label = len(le.classes_) - 1
    y_clipped = np.clip(y, 0, max_label)
    return le.inverse_transform(y_clipped.astype(int))

# Reverse the label encoding for categorical columns
for col, le in label_encoders.items():
    synthetic_df[col] = safe_inverse_transform(le, synthetic_df[col])

# Print comparisons
print("Original Data:")
print(a)
print("\nSynthetic Data:")
print(synthetic_df)

# Validate distributions
for col in categorical_columns:
    print(f"\nOriginal {col} Distribution:")
    print(a[col].value_counts(normalize=True))
    print(f"\nSynthetic {col} Distribution:")
    print(synthetic_df[col].value_counts(normalize=True))

# Optional: Save synthetic data
synthetic_df.to_csv('synthetic_data.csv', index=False)

# Additional validation
print("\nValidation:")
for col in categorical_columns:
    print(f"\nUnique values in original {col}: {sorted(a[col].unique())}")
    print(f"Unique values in synthetic {col}: {sorted(synthetic_df[col].unique())}")

Gen. (1.09) | Discrim. (0.15): 100%|█████████████████████████████████████████████████| 100/100 [00:01<00:00, 50.66it/s]


Original Data:
    A   B         C
0   1   z  0.994041
1   2   z  0.582737
2   1  zz  0.217011
3   2   z  0.117597
4   1   z  0.682911
5   2   z  0.628659
6   1   z  0.687543
7   3  zz  0.515400
8   3   z  0.700361
9   2   z  0.945123
10  2   z  0.137809
11  2   z  0.045767
12  1  zz  0.078422
13  1   z  0.191382
14  1   z  0.785454

Synthetic Data:
    A   B         C
0   1   z  0.306026
1   1   z  1.183757
2   1   z  0.802667
3   1  zz  0.637552
4   1   z  0.913374
5   1  zz  0.692626
6   2   z -0.080936
7   1   z  0.558993
8   3   z  1.058552
9   1   z  0.472356
10  1   z  0.978346
11  1   z  0.741702
12  1  zz  0.143329
13  1   z  0.782236
14  1   z  0.615643

Original A Distribution:
A
1    0.466667
2    0.400000
3    0.133333
Name: proportion, dtype: float64

Synthetic A Distribution:
A
1    0.866667
2    0.066667
3    0.066667
Name: proportion, dtype: float64

Original B Distribution:
B
z     0.8
zz    0.2
Name: proportion, dtype: float64

Synthetic B Distribution:
B
z     0.8
z

In [67]:
class SmallRes(nn.Module):
    def __init__(self, d_input, d_output):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(d_input,d_output),
            nn.BatchNorm1d(d_output),
            nn.LeakyReLU(0.2)
        )
    def forward(self, x):
        out = self.layers(x)
        return torch.cat([x, out], dim = 1)
class Generator(nn.Module):
    def __init__(self, input_dim, output_dim, n_categories=0, hidden_dims=[256, 256]):
        super(Generator, self).__init__()
        seq = []
        dim = input_dim + n_categories
        for index in hidden_dims:
            seq.append(SmallRes(dim,index))
            dim += index
        seq.append(nn.Linear(dim, output_dim))
        self.layers = nn.Sequential(*seq)
                
    def forward(self, noise, cond_vec=None):
        if cond_vec is not None:
            x = torch.cat([noise, cond_vec], dim=1)

        else:
            x = noise
            
        for layer in self.layers:
            x = layer(x)
        return x
    

Unnamed: 0,A,B,C
0,1,z,0.994041
2,1,zz,0.217011
4,1,z,0.682911
6,1,z,0.687543
12,1,zz,0.078422
13,1,z,0.191382
14,1,z,0.785454
