In [24]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import os
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.mixture import GaussianMixture
from torch.optim.lr_scheduler import StepLR, MultiStepLR, ExponentialLR, ReduceLROnPlateau, CosineAnnealingLR, LambdaLR
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")
import wandb


In [8]:
import pandas as pd

# Toy frame: a handful of fruit labels stored as plain strings.
data = pd.DataFrame({'fruits': ['pomme', 'banane', 'pomme', 'orange', 'banane']})

# Categorize the 'fruits' column with a predefined category list. 'fraise'
# never occurs in the data but still gets its own dummy column below.
data['fruits_cat'] = data['fruits'].astype(
    pd.CategoricalDtype(categories=['pomme', 'banane', 'orange', 'fraise'])
)
pd.get_dummies(data)


Unnamed: 0,fruits_banane,fruits_orange,fruits_pomme,fruits_cat_pomme,fruits_cat_banane,fruits_cat_orange,fruits_cat_fraise
0,False,False,True,True,False,False,False
1,True,False,False,False,True,False,False
2,False,False,True,True,False,False,False
3,False,True,False,False,False,True,False
4,True,False,False,False,True,False,False


In [50]:
class CondVec:
    """Builds one-hot conditional vectors over the categorical columns.

    A conditional vector has ``n_categories`` slots: one contiguous block per
    categorical column, laid out in the order of ``categorical_columns``.
    Exactly one slot is set per sampled row.

    Args:
        data: Stored on the instance as ``self.data``; not read by this class.
        categorical_columns: Ordered list of categorical column names.
        categorical_dims: Mapping ``column name -> number of categories``.
    """

    def __init__(self, data, categorical_columns, categorical_dims):
        self.categorical_columns = categorical_columns
        self.categorical_dims = categorical_dims
        self.n_categories = sum(categorical_dims.values())
        self.n_features = len(categorical_columns)
        self.data = data

        # Start slot of every column's block, following the order of
        # `categorical_columns` (not the dict's insertion order), computed
        # once instead of once per sampled row.
        self._offsets = []
        running = 0
        for column in categorical_columns:
            self._offsets.append(running)
            running += categorical_dims[column]

    def sample_conditional_vector(self, batch_size):
        """Sample conditional vectors for training.

        For every row one categorical column is drawn uniformly, then one of
        its categories is drawn uniformly.

        Returns:
            ``(vec, mask)`` float32 tensors of shape
            ``(batch_size, n_categories)`` and ``(batch_size, n_features)``,
            or ``(None, None)`` when there are no categorical columns.
        """
        if self.n_features == 0:
            return None, None

        vec = np.zeros((batch_size, self.n_categories), dtype='float32')
        mask = np.zeros((batch_size, self.n_features), dtype='float32')

        for i in range(batch_size):
            # Choose a random discrete column, then a random category in it
            feature_idx = np.random.choice(self.n_features)
            feature_dim = self.categorical_dims[self.categorical_columns[feature_idx]]
            category_idx = np.random.choice(feature_dim)

            mask[i, feature_idx] = 1
            vec[i, self._offsets[feature_idx] + category_idx] = 1

        return torch.from_numpy(vec), torch.from_numpy(mask)

    def generate_conditional_vector(self, conditions, batch_size):
        """Generate a conditional vector from explicit conditions.

        Args:
            conditions: Mapping ``column name -> category index``. Keys that
                are not categorical columns are ignored.
            batch_size: Number of identical rows to produce.

        Returns:
            A float32 tensor of shape ``(batch_size, n_categories)``, or
            ``None`` when there are no categorical columns.

        Raises:
            ValueError: If a category index falls outside its column's range
                (it would otherwise light up a slot of another column).
        """
        if self.n_features == 0:
            return None

        vec = np.zeros((batch_size, self.n_categories), dtype='float32')
        for feature, category in conditions.items():
            if feature not in self.categorical_columns:
                continue
            feature_idx = self.categorical_columns.index(feature)
            category_idx = int(category)  # category is given as an index
            feature_dim = self.categorical_dims[feature]
            if not 0 <= category_idx < feature_dim:
                raise ValueError(
                    f"Category index {category_idx} is out of range for "
                    f"column '{feature}' with {feature_dim} categories."
                )
            vec[:, self._offsets[feature_idx] + category_idx] = 1

        return torch.from_numpy(vec)
    
class CTGANDataset(Dataset):
    """Dataset over a raw frame, transformed once at construction time.

    A ``DataTransformer`` is fitted on ``data`` and its output is kept in
    ``transformed_data``. A ``CondVec`` sampler is attached only when
    categorical columns were given.
    """

    def __init__(self, data, categorical_columns=None):
        self.data = data
        self.categorical_columns = categorical_columns or []
        self.continuous_columns = [
            name for name in data.columns if name not in self.categorical_columns
        ]

        # Fit the transformer, then encode the whole frame up front
        transformer = DataTransformer(self.categorical_columns)
        transformer.fit(data)
        self.transformer = transformer
        self.transformed_data = transformer.transform(data)

        # Conditional-vector sampler exists only with categorical columns
        self.cond_vec = None
        if self.categorical_columns:
            self.cond_vec = CondVec(
                data,
                categorical_columns=self.categorical_columns,
                categorical_dims=transformer.categorical_dims,
            )

    def get_categorical_dims(self):
        """Return the transformer's per-column category counts, or 0 if none."""
        if not self.categorical_columns:
            return 0
        return self.transformer.categorical_dims

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.transformed_data[idx]

    def sample(self, batch_size):
        """Sample data and conditional vectors for training.

        Rows are drawn with replacement. Returns ``(rows, cond_vec, mask)``;
        the last two are ``None`` when there is no conditional sampler.
        """
        picked = np.random.choice(range(len(self)), batch_size)
        rows = self.transformed_data[picked]

        if not self.cond_vec:
            return rows, None, None

        cond_vec, mask = self.cond_vec.sample_conditional_vector(batch_size)
        return rows, cond_vec, mask

    def train_val_split(self, val_ratio = 0.2):
        """Split the transformed rows into train and validation arrays."""
        return train_test_split(self.transformed_data, test_size=val_ratio)


class TransformedCTGANDataset(Dataset):
    """Dataset over rows that are already in the transformed (encoded) space.

    Args:
        trans_data: Indexable collection of transformed rows (the training
            loop passes a NumPy array).
        categorical_columns: Names of the categorical columns, or ``None``.
        categorical_dims: Mapping ``column name -> number of categories``;
            only read when ``categorical_columns`` is non-empty.
    """

    def __init__(self, trans_data, categorical_columns=None, categorical_dims = 0 ):
        self.transformed_data = trans_data
        self.categorical_columns = categorical_columns if categorical_columns else []
        self.cond_vec = None
        if len(self.categorical_columns) > 0:
            # Hand over this dataset's own rows. The previous code referenced
            # an undefined name `data`, which only resolved when a notebook
            # global of that name happened to exist.
            self.cond_vec = CondVec(
                trans_data,
                categorical_columns=self.categorical_columns,
                categorical_dims=categorical_dims
            )

    def __len__(self):
        return len(self.transformed_data)

    def __getitem__(self, idx):
        return self.transformed_data[idx]

    def sample(self, batch_size):
        """Sample data and conditional vectors for training.

        Rows are drawn uniformly with replacement.

        Returns:
            ``(rows, cond_vec, mask)``; the last two are ``None`` when there
            are no categorical columns.
        """
        # NOTE(review): rows and conditional vectors are drawn independently,
        # so a row's one-hot block need not agree with its conditional
        # vector. Confirm whether training-by-sampling is intended here.
        idx = np.random.choice(len(self), batch_size)
        data = self.transformed_data[idx]

        # Sample conditional vectors if categorical columns exist
        if self.cond_vec:
            cond_vec, mask = self.cond_vec.sample_conditional_vector(batch_size)
            return data, cond_vec, mask

        return data, None, None
        
class DataTransformer:
    """Transforms data between original space and CTGAN transformed space.

    Categorical columns are one-hot encoded against the categories seen in
    ``fit``. Every other column gets a Gaussian mixture and is encoded as
    ``[normalized value, prob_1, ..., prob_k]`` with ``k = n_clusters``.
    In the encoded layout all categorical blocks come first.

    Args:
        categorical_columns: Names of the categorical columns (or ``None``).
        n_clusters: Number of mixture components per continuous column.
        random_state: Forwarded to ``GaussianMixture`` for reproducible fits.
    """

    def __init__(self, categorical_columns, n_clusters=10, random_state=None):
        self.categorical_columns = categorical_columns if categorical_columns else []
        self.categorical_dims = {}
        # Fitted category labels per column; needed so transform() always
        # emits the fitted width and inverse_transform() can restore labels.
        self.categories_ = {}
        self.continuous_gmms = {}
        self.n_clusters = n_clusters  # Number of modes for GMM
        self.random_state = random_state

    def fit(self, data):
        """Fit the data transformer on a DataFrame."""
        # Process categorical columns
        for column in self.categorical_columns:
            categories = pd.Categorical(data[column]).categories
            self.categories_[column] = categories
            self.categorical_dims[column] = len(categories)

        # Process continuous columns by fitting GMMs
        for column in data.columns:
            if column in self.categorical_columns:
                continue
            col_data = data[column].values.reshape(-1, 1)
            gmm = GaussianMixture(n_components=self.n_clusters,
                                  random_state=self.random_state)
            gmm.fit(col_data)
            self.continuous_gmms[column] = gmm

    def transform(self, data):
        """Transform data to CTGAN format.

        Returns:
            A float32 array with one row per input row.
        """
        result = []
        fitted = getattr(self, 'categories_', {})

        # One-hot encode against the fitted categories so the block width
        # always equals categorical_dims[column], even if `data` lacks some
        # category. Values not seen in fit() become an all-zero row.
        for column in self.categorical_columns:
            values = data[column]
            categories = fitted.get(column)
            if categories is not None:
                values = pd.Series(
                    pd.Categorical(values, categories=categories),
                    index=values.index,
                )
            one_hot = pd.get_dummies(values, prefix=column)
            result.append(one_hot.values)

        # Transform continuous columns with mode-specific normalization
        for column in data.columns:
            if column in self.categorical_columns:
                continue
            col_data = data[column].values.reshape(-1, 1)
            gmm = self.continuous_gmms[column]

            # Get cluster assignments and probabilities
            clusters = gmm.predict(col_data)
            probs = gmm.predict_proba(col_data)

            # Normalize each value by the mean/std of its assigned component.
            # Computed in float64 so integer columns are not truncated.
            means = gmm.means_[clusters, 0]
            stds = np.sqrt(gmm.covariances_[clusters, 0, 0])

            # Encoded data: [normalized value, cluster_1_prob, ..., cluster_k_prob]
            encoded = np.zeros((len(col_data), self.n_clusters + 1))
            encoded[:, 0] = (col_data[:, 0].astype('float64') - means) / (4 * stds)
            encoded[:, 1:] = probs

            result.append(encoded)

        # Combine all transformed columns
        if result:
            return np.concatenate(result, axis=1).astype('float32')
        return np.zeros((len(data), 0), dtype='float32')

    def inverse_transform(self, transformed_data):
        """Convert transformed data back to original format.

        Returns:
            A DataFrame with the categorical columns first, then the
            continuous columns in the order they were fitted.
        """
        transformed_data = np.asarray(transformed_data)
        result = pd.DataFrame()
        column_idx = 0
        fitted = getattr(self, 'categories_', {})

        # Inverse transform categorical columns
        for column in self.categorical_columns:
            dim = self.categorical_dims[column]
            one_hot = transformed_data[:, column_idx:column_idx + dim]

            # Convert one-hot back to categorical via the strongest slot
            indices = np.argmax(one_hot, axis=1)
            categories = fitted.get(column)
            if categories is not None and len(categories) == dim:
                # Restore the original category labels
                result[column] = pd.Categorical.from_codes(indices, categories=categories)
            else:
                # No fitted labels available (e.g. an object restored from an
                # older pickle): fall back to the integer codes.
                result[column] = indices

            column_idx += dim

        # Inverse transform continuous columns
        for column, gmm in self.continuous_gmms.items():
            # Extract normalized value and cluster probabilities
            normalized = transformed_data[:, column_idx]
            probs = transformed_data[:, column_idx + 1:column_idx + 1 + self.n_clusters]

            # Undo the normalization with the most probable component
            cluster_idx = np.argmax(probs, axis=1)
            means = gmm.means_[cluster_idx, 0]
            stds = np.sqrt(gmm.covariances_[cluster_idx, 0, 0])

            result[column] = normalized * (4 * stds) + means
            column_idx += self.n_clusters + 1

        return result
        
class Generator(nn.Module):
    """Fully connected generator.

    The input is the noise vector, optionally concatenated with a
    conditional vector. Every hidden Linear layer is followed by
    BatchNorm1d, LeakyReLU(0.2) and Dropout(0.2); the final Linear layer has
    no activation.
    """

    def __init__(self, input_dim, output_dim, n_categories=0, hidden_dims=[256, 256]):
        super().__init__()

        widths = [input_dim + n_categories] + hidden_dims + [output_dim]
        final_pos = len(widths) - 2

        stack = []
        for pos, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            stack.append(nn.Linear(fan_in, fan_out))
            if pos != final_pos:
                # Hidden layers only: normalisation, non-linearity, dropout
                stack.extend([
                    nn.BatchNorm1d(fan_out),
                    nn.LeakyReLU(0.2),
                    nn.Dropout(0.2),
                ])
        self.layers = nn.ModuleList(stack)

    def forward(self, noise, cond_vec=None):
        """Run ``noise`` (joined with ``cond_vec`` when given) through the stack."""
        x = noise if cond_vec is None else torch.cat([noise, cond_vec], dim=1)
        for module in self.layers:
            x = module(x)
        return x
    
class Discriminator(nn.Module):
    """MLP discriminator whose input width scales with the pac size.

    Layers are built for ``input_dim * pac`` inputs. ``set_pac`` rebuilds
    them from scratch, so previously held weights are replaced. With
    ``n_categories > 0`` a Linear+ReLU embedding of the conditional vector
    is added to the input before the main stack.
    """

    def __init__(self, input_dim, n_categories=0, hidden_dims=[256, 128]):
        super().__init__()

        self.input_dim = input_dim
        self.pac = 1  # no packing until set_pac() is called

        # Filled in by _init_layers()
        self.main_layers = None
        self.output_layer = None
        self.sigmoid = nn.Sigmoid()

        # Kept so the layers can be rebuilt for another pac value
        self.hidden_dims = hidden_dims
        self.n_categories = n_categories
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self._init_layers()

    def _init_layers(self):
        """Initialize network layers based on current pac value"""
        packed_width = self.input_dim * self.pac

        # Each hidden layer reads the previous hidden width; the first one
        # reads the packed input width.
        fan_ins = [packed_width] + list(self.hidden_dims[:-1])
        stack = []
        for fan_in, fan_out in zip(fan_ins, self.hidden_dims):
            stack += [nn.Linear(fan_in, fan_out), nn.LeakyReLU(0.2), nn.Dropout(0.2)]
        self.main_layers = nn.ModuleList(stack)
        self.main_layers.to(self.device)
        self.output_layer = nn.Linear(self.hidden_dims[-1], 1, device= self.device)

        # Conditional embedding, projected to the packed input width
        self.cond_layers = None
        if self.n_categories > 0:
            embedding = nn.Sequential(
                nn.Linear(self.n_categories, packed_width),
                nn.ReLU()
            )
            self.cond_layers = embedding.to(self.device)

    def set_pac(self, pac):
        """Update the model to handle pac-sized inputs"""
        self.pac = pac
        self._init_layers()

    def forward(self, x, cond_vec=None):
        """Return the sigmoid score for ``x``; ``cond_vec`` is optional."""
        has_condition = cond_vec is not None and self.cond_layers is not None
        if has_condition:
            x = x + self.cond_layers(cond_vec)

        for module in self.main_layers:
            x = module(x)

        return self.sigmoid(self.output_layer(x))

class PacGan(nn.Module):
    """PacGAN wrapper: scores groups of ``pac`` samples as a single input."""

    def __init__(self, discriminator, pac=10):
        super().__init__()
        self.discriminator = discriminator
        self.pac = pac
        # Let the wrapped discriminator resize itself for packed inputs
        if hasattr(self.discriminator, 'set_pac'):
            self.discriminator.set_pac(pac)

    def forward(self, x, cond_vec=None):
        """Pack ``x`` into groups of ``pac`` rows and score each group.

        A batch that is not a multiple of ``pac`` is topped up with randomly
        re-drawn rows of itself. Only the first conditional vector of each
        group is passed on to the discriminator.
        """
        n_rows = x.size(0)
        leftover = n_rows % self.pac
        if leftover != 0:
            # Pad with randomly repeated rows until divisible by pac
            filler = np.random.choice(n_rows, self.pac - leftover)
            x = torch.cat([x, x[filler]], dim=0)
            if cond_vec is not None:
                cond_vec = torch.cat([cond_vec, cond_vec[filler]], dim=0)

        n_groups = x.size(0) // self.pac
        packed = x.view(n_groups, self.pac * x.size(1))

        if cond_vec is None:
            return self.discriminator(packed, None)

        # One conditional vector per group: the first member's
        grouped_cond = cond_vec.view(n_groups, self.pac, cond_vec.size(1))
        return self.discriminator(packed, grouped_cond[:, 0, :])

class CTGAN:
    def __init__(self, categorical_columns=None, noise_dim=100, batch_size=500, 
                 generator_lr=2e-4, discriminator_lr=2e-4, pac=10):
        self.categorical_columns = categorical_columns if categorical_columns else []
        self.noise_dim = noise_dim
        self.batch_size = batch_size
        self.generator_lr = generator_lr
        self.discriminator_lr = discriminator_lr
        self.pac = pac
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        self.transformer = None
        self.dataset = None
        self.train = None
        self.val = None
        self.generator = None
        self.discriminator = None
        self.generator_optimizer = None
        self.discriminator_optimizer = None
    def fit(self, data, val_size= 0.2, epochs=300):
        """Fit CTGAN to the data with train and validation datasets and learning rate scheduling.

        The frame is transformed once, split into train/validation rows, and
        the generator and PacGAN discriminator are trained with BCE loss.
        Both learning rates are driven by ``ReduceLROnPlateau`` on the
        validation losses. Per-epoch metrics are logged to Weights & Biases.
        After training the user is asked on stdin whether to save the model
        under ``models/``.

        Args:
            data: DataFrame to learn from.
            val_size: Fraction of rows held out for validation.
            epochs: Number of training epochs.

        Returns:
            ``1`` when a checkpoint was written, ``None`` otherwise.
        """
        # Create datasets
        self.dataset = CTGANDataset(data, categorical_columns=self.categorical_columns)
        categorical_dims = self.dataset.get_categorical_dims()
        # Honour the caller's val_size (the split used to be hard-coded to 0.2)
        train, val = self.dataset.train_val_split(val_ratio=val_size)

        self.train = TransformedCTGANDataset(train, categorical_columns=self.categorical_columns, categorical_dims=categorical_dims)
        self.val = TransformedCTGANDataset(val, categorical_columns=self.categorical_columns, categorical_dims=categorical_dims)

        # Use the same transformer for both datasets
        self.transformer = self.dataset.transformer

        # Calculate dimensions
        data_dim = self.dataset.transformed_data.shape[1]
        n_categories = 0
        if self.train.cond_vec:
            n_categories = self.dataset.cond_vec.n_categories

        # Initialize models
        self.generator = Generator(
            input_dim=self.noise_dim,
            output_dim=data_dim,
            n_categories=n_categories
        ).to(self.device)

        discriminator = Discriminator(
            input_dim=data_dim,
            n_categories=n_categories
        ).to(self.device)
        self.discriminator = PacGan(discriminator, pac=self.pac)

        # Setup optimizers
        self.generator_optimizer = optim.Adam(
            self.generator.parameters(),
            lr=self.generator_lr,
            betas=(0.5, 0.9)
        )
        self.discriminator_optimizer = optim.Adam(
            self.discriminator.parameters(),
            lr=self.discriminator_lr,
            betas=(0.5, 0.9)
        )

        # Learning-rate schedulers, stepped once per epoch on validation loss
        generator_scheduler = ReduceLROnPlateau(
            self.generator_optimizer,
            'min',
            patience=10,
            factor=0.2
        )
        discriminator_scheduler = ReduceLROnPlateau(
            self.discriminator_optimizer,
            'min',
            patience=10,
            factor=0.2
        )

        # Loss function
        criterion = nn.BCELoss()

        # Ensure batch size is a multiple of pac for PacGAN
        batch_size = (self.batch_size // self.pac) * self.pac
        if batch_size == 0:
            batch_size = self.pac

        # One label per pac group; constant, so built once up front
        real_labels = torch.ones(batch_size // self.pac, 1).to(self.device)
        fake_labels = torch.zeros(batch_size // self.pac, 1).to(self.device)

        wandb.init(project="Data Augmentation - CTGAN", name="ctgan", config={
            "epochs": epochs,
            "batch_size": self.batch_size,
            "generator_lr": self.generator_lr,
            "discriminator_lr": self.discriminator_lr,
            "pac": self.pac,
            "noise_dim": self.noise_dim,
            "categorical_columns": self.categorical_columns,
            "data_dim": data_dim,
            "n_categories": n_categories
        })
        try:
            # Training loop
            for epoch in range(epochs):
                self.generator.train()
                self.discriminator.train()

                g_losses = []
                d_losses = []

                for _ in range(max(1, len(self.train) // batch_size)):
                    # Get real data and conditional vectors from training set
                    real_data, cond_vec, mask = self.train.sample(batch_size)
                    real_data = torch.from_numpy(real_data).to(self.device)

                    if cond_vec is not None:
                        cond_vec = cond_vec.to(self.device)
                        mask = mask.to(self.device)

                    # Train Discriminator
                    self.discriminator_optimizer.zero_grad()

                    # Real data loss
                    outputs = self.discriminator(real_data, cond_vec)
                    d_real_loss = criterion(outputs, real_labels)

                    # Generate fake data
                    noise = torch.randn(batch_size, self.noise_dim).to(self.device)
                    fake_data = self.generator(noise, cond_vec)

                    # detach(): the discriminator step must not reach the generator
                    outputs = self.discriminator(fake_data.detach(), cond_vec)
                    d_fake_loss = criterion(outputs, fake_labels)

                    d_loss = d_real_loss + d_fake_loss
                    d_loss.backward()
                    self.discriminator_optimizer.step()

                    # Train Generator
                    self.generator_optimizer.zero_grad()

                    outputs = self.discriminator(fake_data, cond_vec)
                    g_loss = criterion(outputs, real_labels)

                    g_loss.backward()
                    self.generator_optimizer.step()

                    g_losses.append(g_loss.item())
                    d_losses.append(d_loss.item())

                # Validation phase
                self.generator.eval()
                self.discriminator.eval()

                g_val_losses = []
                d_val_losses = []
                d_val_real_losses = []
                d_val_fake_losses = []

                with torch.no_grad():
                    for _ in range(max(1, len(self.val) // batch_size)):
                        # Get real data and conditional vectors from validation set
                        real_val_data, val_cond_vec, val_mask = self.val.sample(batch_size)
                        real_val_data = torch.from_numpy(real_val_data).to(self.device)

                        if val_cond_vec is not None:
                            val_cond_vec = val_cond_vec.to(self.device)
                            val_mask = val_mask.to(self.device)

                        # Validation - Discriminator
                        val_outputs = self.discriminator(real_val_data, val_cond_vec)
                        d_val_real_loss = criterion(val_outputs, real_labels)
                        d_val_real_losses.append(d_val_real_loss.item())

                        # Generate fake validation data
                        val_noise = torch.randn(batch_size, self.noise_dim).to(self.device)
                        val_fake_data = self.generator(val_noise, val_cond_vec)

                        val_outputs = self.discriminator(val_fake_data, val_cond_vec)
                        d_val_fake_loss = criterion(val_outputs, fake_labels)
                        d_val_fake_losses.append(d_val_fake_loss.item())

                        d_val_loss = d_val_real_loss + d_val_fake_loss

                        # Validation - Generator
                        g_val_loss = criterion(val_outputs, real_labels)

                        g_val_losses.append(g_val_loss.item())
                        d_val_losses.append(d_val_loss.item())

                # Calculate average losses
                avg_g_loss = np.mean(g_losses)
                avg_d_loss = np.mean(d_losses)
                avg_g_val_loss = np.mean(g_val_losses)
                avg_d_val_loss = np.mean(d_val_losses)
                avg_d_val_real_loss = np.mean(d_val_real_losses)
                avg_d_val_fake_loss = np.mean(d_val_fake_losses)

                # Update learning rates based on validation losses
                generator_scheduler.step(avg_g_val_loss)
                discriminator_scheduler.step(avg_d_val_loss)

                # Get current learning rates
                g_lr = self.generator_optimizer.param_groups[0]['lr']
                d_lr = self.discriminator_optimizer.param_groups[0]['lr']
                wandb.log({
                    "epoch": epoch,
                    "g_loss": avg_g_loss,
                    "d_loss": avg_d_loss,
                    "g_val_loss": avg_g_val_loss,
                    "d_val_loss": avg_d_val_loss,
                    "d_val_real_loss": avg_d_val_real_loss,
                    "d_val_fake_loss": avg_d_val_fake_loss,
                    "generator_lr": g_lr,
                    "discriminator_lr": d_lr
                })
                if (epoch + 1) % 10 == 0 or epoch == 0:
                    print(f"Epoch [{epoch+1}/{epochs}], "
                          f"Train - G Loss: {avg_g_loss:.4f}, D Loss: {avg_d_loss:.4f}, "
                          f"Val - G Loss: {avg_g_val_loss:.4f}, D Loss: {avg_d_val_loss:.4f}, "
                          f"LR - Generator: {g_lr:.6f}, Discriminator: {d_lr:.6f}")
        finally:
            # Close the wandb run even when training is interrupted or fails
            wandb.finish()

        return self._prompt_and_save()

    def _prompt_and_save(self):
        """Ask on stdin whether to save the model; return 1 if saved, else None."""
        save = input("Save? 0 is not save, other is save")
        # input() returns a string, so "0" is truthy; compare explicitly so
        # that answering 0 really skips saving, as the prompt promises.
        if not save or save.strip() == "0":
            return None

        os.makedirs("models", exist_ok=True)
        file_name = input("Tap model's name:" )
        # Never overwrite an existing checkpoint: keep asking for a free name
        while os.path.exists(f"models/{file_name}.pth"):
            print("Model's name existes!!!!")
            file_name = input("Tap model's name:" )
        self.save(f"models/{file_name}.pth")
        print("Done!!!!")
        return 1
    def generate(self, n_samples, conditions= None):
        """Generate synthetic samples with optional conditioning."""
        if self.generator is None:
            raise RuntimeError("Model not trained. Call fit() first.")
        
        self.generator.eval()
        
        steps = n_samples // self.batch_size + 1
        data = []
        
        for i in range(steps):
            n_batch = min(self.batch_size, n_samples - i * self.batch_size)
            if n_batch <= 0:
                break
                
            # Generate noise
            noise = torch.randn(n_batch, self.noise_dim).to(self.device)
            
            # Generate conditional vector if necessary
            cond_vec = None
            if self.dataset.cond_vec and conditions:
                cond_vec = self.dataset.cond_vec.generate_conditional_vector(conditions, n_batch)
                cond_vec = cond_vec.to(self.device)
                
            # Generate data
            with torch.no_grad():
                
                fake = self.generator(noise, cond_vec)
            data.append(fake.cpu().numpy())
            
        data = np.concatenate(data, axis=0)
        
        # Convert to the original data format
        synthetic_data = self.transformer.inverse_transform(data[:n_samples])
        return synthetic_data
    
    def save(self, path):
        """Save the model."""
        if self.generator is None or self.discriminator is None:
            raise RuntimeError("Model not trained. Call fit() first.")
        
        """state = {
            'generator': self.generator.state_dict(),
            'discriminator': self.discriminator.state_dict(),
            'noise_dim': self.noise_dim,
            'categorical_columns': self.categorical_columns,
            'transformer': self.transformer
        }
        
        torch.save(state, path)"""
        checkpoint = {
            'generator': self.generator.state_dict(),
            'discriminator': self.discriminator.state_dict(),
            'generator_optimizer': self.generator_optimizer.state_dict(),
            'discriminator_optimizer': self.discriminator_optimizer.state_dict(),
            'categorical_columns': self.categorical_columns,
            'noise_dim': self.noise_dim,
            'transformer': self.transformer  
        }
        torch.save(checkpoint, path)
    
    def load(self, path):
        """Load the model."""
        state = torch.load(path, map_location=self.device)
        self.generator_optimizer = state['generator_optimizer']
        self.discriminator_optimizer = state['discriminator_optimizer']
        self.noise_dim = state['noise_dim']
        self.categorical_columns = state['categorical_columns']
        self.transformer = state['transformer']
        
        # Recreate the dataset and models
        n_categories = 0
        data_dim = 0
        
        if hasattr(self.transformer, 'categorical_dims'):
            n_categories = sum(self.transformer.categorical_dims.values())
            if hasattr(self.transformer, 'continuous_gmms'):
                continuous_dims = sum([gmm.n_components + 1 for gmm in self.transformer.continuous_gmms.values()])
                data_dim = n_categories + continuous_dims
        
        self.generator = Generator(
            input_dim=self.noise_dim,
            output_dim=data_dim,
            n_categories=n_categories
        ).to(self.device)
        
        discriminator = Discriminator(
            input_dim=data_dim,
            n_categories=n_categories
        ).to(self.device)
        
        self.discriminator = PacGan(discriminator, pac=self.pac)
        
        self.generator.load_state_dict(state['generator'])
        self.discriminator.load_state_dict(state['discriminator'])
        
        self.generator.eval()
        self.discriminator.eval()


# Test


In [52]:
# Example usage
from sklearn.datasets import fetch_california_housing
def example_usage():
    """Train CTGAN on California housing and print conditioned-sample statistics.

    ``HouseAge`` is binned into five integer-coded categories and used as
    the single categorical column. After training, seven rows are generated
    conditioned on ``HouseAge_Cat == 2``.
    """
    # Load data
    data = fetch_california_housing(as_frame=True).frame

    # Turn HouseAge into a categorical by cutting it into 5 integer-coded bins
    data['HouseAge_Cat'] = pd.cut(data['HouseAge'], bins=5, labels=False)

    # Initialize CTGAN (pac set to 5 to avoid dimensionality problems)
    ctgan = CTGAN(
        categorical_columns=['HouseAge_Cat'],
        pac=5,
        generator_lr=0.005,
        discriminator_lr=0.002,
        batch_size=500,
    )

    # Fit model
    print("Training CTGAN model...")
    ctgan.fit(data, epochs=50)

    # Generate synthetic data conditioned on HouseAge_Cat = 2
    conditioned_data = ctgan.generate(n_samples=7, conditions={'HouseAge_Cat': 2})

    # Compare statistics
    print("\nReal data statistics:")
    print(data.describe())

    print("\nConditioned data (HouseAge_Cat = 2) statistics:")
    print(conditioned_data.describe())


if __name__ == "__main__":
    example_usage()

Training CTGAN model...


Epoch [1/50], Train - G Loss: 2.1219, D Loss: 1.2687, Val - G Loss: 1.6256, D Loss: 0.8865, LR - Generator: 0.005000, Discriminator: 0.002000
Epoch [10/50], Train - G Loss: 1.0850, D Loss: 1.2874, Val - G Loss: 0.8279, D Loss: 1.3237, LR - Generator: 0.005000, Discriminator: 0.002000
Epoch [20/50], Train - G Loss: 0.8656, D Loss: 1.2908, Val - G Loss: 0.6986, D Loss: 1.3241, LR - Generator: 0.001000, Discriminator: 0.000400
Epoch [30/50], Train - G Loss: 0.7193, D Loss: 1.4109, Val - G Loss: 0.6391, D Loss: 1.4465, LR - Generator: 0.000200, Discriminator: 0.000080
Epoch [40/50], Train - G Loss: 0.6762, D Loss: 1.4366, Val - G Loss: 0.6359, D Loss: 1.4285, LR - Generator: 0.000040, Discriminator: 0.000016
Epoch [50/50], Train - G Loss: 0.6883, D Loss: 1.4159, Val - G Loss: 0.6445, D Loss: 1.4157, LR - Generator: 0.000008, Discriminator: 0.000003


0,1
d_loss,▄▁▅█▇▆▅▅▅▄▃▃▃▆▅▅▄▄▅▄█▆▅▇▇▆▆▆▇██▇▇▇▇▇▇▇▇▇
d_val_fake_loss,▁▁█▆▄▄▃▅▄▅▅▃▅▄▃▅▅▄▄▅▅▄▅▅▅▅▄▅▅▅▅▅▅▅▅▅▅▅▅▅
d_val_loss,▂▁█▆▅▅▄▄▅▄▃▅▄▄▄▄▄▄▄▆▄▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅
d_val_real_loss,▅▁▇▄▇█▆▆▆▂▂▄▅▄▂▄▂▂▅▄▄▅▅▆▅▅▅▅▅▆▅▅▅▅▅▅▅▅▅▅
discriminator_lr,█████████▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
g_loss,▅█▃▂▁▁▂▂▂▂▃▂▂▁▂▁▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
g_val_loss,▇█▁▂▂▃▅▂▃▃▃▂▃▄▃▃▃▃▃▂▃▂▂▂▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂
generator_lr,██████████▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
d_loss,1.41585
d_val_fake_loss,0.75154
d_val_loss,1.41568
d_val_real_loss,0.66414
discriminator_lr,0.0
epoch,49.0
g_loss,0.68826
g_val_loss,0.64454
generator_lr,1e-05


Save? 0 is not save, other is save 1
Tap model's name: ctgan0


Done!!!!

Real data statistics:
             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
std        1.899822     12.585558      2.474173      0.473911   1132.462122   
min        0.499900      1.000000      0.846154      0.333333      3.000000   
25%        2.563400     18.000000      4.440716      1.006079    787.000000   
50%        3.534800     29.000000      5.229129      1.048780   1166.000000   
75%        4.743250     37.000000      6.052381      1.099526   1725.000000   
max       15.000100     52.000000    141.909091     34.066667  35682.000000   

           AveOccup      Latitude     Longitude   MedHouseVal  HouseAge_Cat  
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000  
mean       3.070655     35.631861   -119.569704      2.068558      2.196560  
std       10.386050   

In [45]:
# Reload the California housing frame (without the HouseAge_Cat column that
# example_usage adds to its own copy) and display its (rows, columns) shape.
# Relies on fetch_california_housing being imported by the example-usage cell.
data = fetch_california_housing(as_frame=True).frame
data.shape

(20640, 9)