In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.mixture import GaussianMixture
import warnings
warnings.filterwarnings("ignore")



In [3]:
import pandas as pd

data = pd.DataFrame({
    'fruits': ['pomme', 'banane', 'pomme', 'orange', 'banane']
})

# Catégoriser la colonne 'fruits' avec des catégories prédéfinies
data['fruits_cat'] = pd.Categorical(data['fruits'], categories=['pomme', 'banane', 'orange', 'fraise'])
pd.get_dummies(data)


Unnamed: 0,fruits_banane,fruits_orange,fruits_pomme,fruits_cat_pomme,fruits_cat_banane,fruits_cat_orange,fruits_cat_fraise
0,False,False,True,True,False,False,False
1,True,False,False,False,True,False,False
2,False,False,True,True,False,False,False
3,False,True,False,False,False,True,False
4,True,False,False,False,True,False,False


In [154]:
class CondVec:
    def __init__(self, data, categorical_columns, categorical_dims):
        self.categorical_columns = categorical_columns
        self.categorical_dims = categorical_dims
        self.n_categories = sum(categorical_dims.values())
        self.n_features = len(categorical_columns)
        self.data = data
        
    def sample_conditional_vector(self, batch_size):
        """Sample conditional vectors for training."""
        if self.n_features == 0:
            return None, None
        
        vec = np.zeros((batch_size, self.n_categories), dtype='float32')
        mask = np.zeros((batch_size, self.n_features), dtype='float32')
        
        for i in range(batch_size):
            # Choose a random discrete column
            feature_idx = np.random.choice(range(self.n_features))
            feature = self.categorical_columns[feature_idx]
            
            # Choose a random category from that column
            feature_dim = self.categorical_dims[feature]
            category_idx = np.random.choice(range(feature_dim))
            
            # Set mask and vec values
            mask[i, feature_idx] = 1
            vec[i, sum(list(self.categorical_dims.values())[:feature_idx]) + category_idx] = 1
            
        return torch.from_numpy(vec), torch.from_numpy(mask)
    
    def generate_conditional_vector(self, conditions, batch_size):
        """Generate conditional vector based on conditions."""
        if self.n_features == 0:
            return None
            
        vec = np.zeros((batch_size, self.n_categories), dtype='float32')
        for feature, category in conditions.items():
            if feature in self.categorical_columns:
                feature_idx = self.categorical_columns.index(feature)
                category_idx = int(category)  # Assuming category is an index
                
                vec[:, sum(list(self.categorical_dims.values())[:feature_idx]) + category_idx] = 1
        
        return torch.from_numpy(vec)
    
class CTGANDataset(Dataset):
    def __init__(self, data, categorical_columns=None):
        self.data = data
        self.categorical_columns = categorical_columns if categorical_columns else []
        self.continuous_columns = [col for col in data.columns if col not in self.categorical_columns]
        
        # Create encoders for categorical columns and fit GMMs for continuous columns
        self.cond_vec = None
        self.transformer = DataTransformer(self.categorical_columns)
        self.transformer.fit(data)
        self.transformed_data = self.transformer.transform(data)
        
        if len(self.categorical_columns) > 0:
            self.cond_vec = CondVec(
                data, 
                categorical_columns=self.categorical_columns,
                categorical_dims=self.transformer.categorical_dims
            )
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.transformed_data[idx]
    
    def sample(self, batch_size):
        """Sample data and conditional vectors for training."""
        # Sample data
        idx = np.random.choice(range(len(self)), batch_size)
        data = self.transformed_data[idx]
        
        # Sample conditional vectors if categorical columns exist
        if self.cond_vec:
            cond_vec, mask = self.cond_vec.sample_conditional_vector(batch_size)
            return data, cond_vec, mask
        
        return data, None, None
        
class DataTransformer:
    """Transforms data between original space and CTGAN transformed space."""
    
    def __init__(self, categorical_columns):
        self.categorical_columns = categorical_columns if categorical_columns else []
        self.categorical_dims = {}
        self.continuous_gmms = {}
        self.n_clusters = 10  # Number of modes for GMM
        
    def fit(self, data):
        """Fit the data transformer."""
        # Process categorical columns
        for column in self.categorical_columns:
            categories = pd.Categorical(data[column]).categories
            self.categorical_dims[column] = len(categories)
            
        # Process continuous columns by fitting GMMs
        continuous_columns = [c for c in data.columns if c not in self.categorical_columns]
        for column in continuous_columns:
            col_data = data[column].values.reshape(-1, 1)
            gmm = GaussianMixture(n_components=self.n_clusters)
            gmm.fit(col_data)
            self.continuous_gmms[column] = gmm
            
    def transform(self, data):
        """Transform data to CTGAN format."""
        result = []
        
        # Transform categorical columns to one-hot encoding
        for column in self.categorical_columns:
            one_hot = pd.get_dummies(data[column], prefix=column)
            result.append(one_hot.values)
            
        # Transform continuous columns with mode-specific normalization
        for column in data.columns:
            if column not in self.categorical_columns:
                col_data = data[column].values.reshape(-1, 1)
                gmm = self.continuous_gmms[column]
                
                # Get cluster assignments and probabilities
                clusters = gmm.predict(col_data)
                probs = gmm.predict_proba(col_data)
                
                # Normalize data based on Gaussian parameters
                normalized = np.zeros_like(col_data)
                for i in range(len(col_data)):
                    cluster = clusters[i]
                    mean = gmm.means_[cluster][0]
                    std = np.sqrt(gmm.covariances_[cluster][0][0])
                    normalized[i] = (col_data[i] - mean) / (4 * std)
                
                # Create encoded data: [normalized value, cluster_1_prob, ..., cluster_k_prob]
                encoded = np.zeros((len(col_data), self.n_clusters + 1))
                encoded[:, 0] = normalized.flatten()
                encoded[:, 1:] = probs
                
                result.append(encoded)
                
        # Combine all transformed columns
        if result:
            return np.concatenate(result, axis=1).astype('float32')
        return np.zeros((len(data), 0))
        
    def inverse_transform(self, transformed_data):
        """Convert transformed data back to original format."""
        # Create a DataFrame for the result
        result = pd.DataFrame()
        column_idx = 0
        
        # Inverse transform categorical columns
        for column in self.categorical_columns:
            dim = self.categorical_dims[column]
            one_hot = transformed_data[:, column_idx:column_idx + dim]
            
            # Convert one-hot back to categorical
            indices = np.argmax(one_hot, axis=1)
            # Récupérer les catégories originales
            try:
                categories = pd.Categorical(self.data[column]).categories
                result[column] = pd.Categorical.from_codes(indices, categories=categories)
            except:
                # Fallback en cas d'erreur
                result[column] = indices
            
            column_idx += dim
            
        # Inverse transform continuous columns
        for column in self.continuous_gmms:
            gmm = self.continuous_gmms[column]
            
            # Extract normalized value and cluster probabilities
            normalized = transformed_data[:, column_idx]
            probs = transformed_data[:, column_idx + 1:column_idx + 1 + self.n_clusters]
            
            # Convert back to original space
            cluster_idx = np.argmax(probs, axis=1)
            values = np.zeros(len(normalized))
            
            for i in range(len(normalized)):
                cluster = cluster_idx[i]
                mean = gmm.means_[cluster][0]
                std = np.sqrt(gmm.covariances_[cluster][0][0])
                values[i] = normalized[i] * (4 * std) + mean
                
            result[column] = values
            column_idx += self.n_clusters + 1
            
        return result
        
class Generator(nn.Module):
    def __init__(self, input_dim, output_dim, n_categories=0, hidden_dims=[256, 256]):
        super(Generator, self).__init__()
        
        dims = [input_dim + n_categories] + hidden_dims + [output_dim]
        
        self.layers = nn.ModuleList()
        for i in range(len(dims) - 1):
            self.layers.append(nn.Linear(dims[i], dims[i+1]))
            if i < len(dims) - 2:
                self.layers.append(nn.BatchNorm1d(dims[i+1]))
                self.layers.append(nn.ReLU())
        
    def forward(self, noise, cond_vec=None):
        if cond_vec is not None:
            x = torch.cat([noise, cond_vec], dim=1)

        else:
            x = noise
            
        for layer in self.layers:
            x = layer(x)
        return x
    
class Discriminator(nn.Module):
    def __init__(self, input_dim, n_categories=0, hidden_dims=[256, 128]):
        super(Discriminator, self).__init__()
        
        self.input_dim = input_dim
        self.pac = 1  # Default no pac
        
        # Placeholder for main layers - will be initialized in set_pac
        self.main_layers = None
        self.output_layer = None
        self.sigmoid = nn.Sigmoid()
        
        # Store parameters for layer initialization
        self.hidden_dims = hidden_dims
        self.n_categories = n_categories
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Initialize layers with default pac=1
        self._init_layers()
            
    def _init_layers(self):
        """Initialize network layers based on current pac value"""
        pac_input_dim = self.input_dim * self.pac
        
        self.main_layers = nn.ModuleList()
        for i in range(len(self.hidden_dims)):
            if i == 0:
                self.main_layers.append(nn.Linear(pac_input_dim, self.hidden_dims[i]))
            else:
                self.main_layers.append(nn.Linear(self.hidden_dims[i-1], self.hidden_dims[i]))
            self.main_layers.append(nn.LeakyReLU(0.2))
        self.main_layers.to(self.device)
        self.output_layer = nn.Linear(self.hidden_dims[-1], 1, device= self.device)
        
        # Conditional embedding layers
        self.cond_layers = None
        if self.n_categories > 0:
            
            self.cond_layers = nn.Sequential(
                nn.Linear(self.n_categories, pac_input_dim),
                nn.ReLU()
            ).to(self.device)
    
    def set_pac(self, pac):
        """Update the model to handle pac-sized inputs"""
        self.pac = pac
        self._init_layers()
        
    def forward(self, x, cond_vec=None):
        if cond_vec is not None and self.cond_layers is not None:
            cond = self.cond_layers(cond_vec)
            x = x + cond
            
        for layer in self.main_layers:
            x = layer(x)
            
        x = self.output_layer(x)
        return self.sigmoid(x)

class PacGan(nn.Module):
    """PacGAN discriminator for improved GAN training stability."""
    def __init__(self, discriminator, pac=10):
        super(PacGan, self).__init__()
        self.discriminator = discriminator
        self.pac = pac
        # Tell the discriminator about pac to handle dimensions
        if hasattr(self.discriminator, 'set_pac'):
            self.discriminator.set_pac(pac)
        
    def forward(self, x, cond_vec=None):
        batch_size = x.size(0)
        if batch_size % self.pac != 0:
            # Padding to make divisible by pac
            pad_size = self.pac - (batch_size % self.pac)
            indices = np.random.choice(batch_size, pad_size)
            x = torch.cat([x, x[indices]], dim=0)
            if cond_vec is not None:
                cond_vec = torch.cat([cond_vec, cond_vec[indices]], dim=0)
                
        # Reshape x for PacGAN structure
        new_batch_size = x.size(0) // self.pac
        x_reshaped = x.view(new_batch_size, self.pac * x.size(1))
        
        # For conditional vectors, we need to have one per batch
        if cond_vec is not None:
            # Take one conditional vector per pac group
            cond_vec_reshaped = cond_vec.view(new_batch_size, self.pac, cond_vec.size(1))
            cond_vec_flat = cond_vec_reshaped[:, 0, :]  # Just take the first one
            return self.discriminator(x_reshaped, cond_vec_flat)
        else:
            return self.discriminator(x_reshaped, None)

class CTGAN:
    def __init__(self, categorical_columns=None, noise_dim=100, batch_size=500, 
                 generator_lr=2e-4, discriminator_lr=2e-4, pac=10):
        self.categorical_columns = categorical_columns if categorical_columns else []
        self.noise_dim = noise_dim
        self.batch_size = batch_size
        self.generator_lr = generator_lr
        self.discriminator_lr = discriminator_lr
        self.pac = pac
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        self.transformer = None
        self.dataset = None
        self.generator = None
        self.discriminator = None
        
    def fit(self, data, epochs=300):
        """Fit CTGAN to the data"""
        # Create dataset
        self.dataset = CTGANDataset(data, categorical_columns=self.categorical_columns)
        self.transformer = self.dataset.transformer
        
        # Calculate dimensions
        data_dim = self.dataset.transformed_data.shape[1]
        n_categories = 0
        if self.dataset.cond_vec:
            n_categories = self.dataset.cond_vec.n_categories
        
        # Initialize models
        self.generator = Generator(
            input_dim=self.noise_dim, 
            output_dim=data_dim,
            n_categories=n_categories
        ).to(self.device)
        
        discriminator = Discriminator(
            input_dim=data_dim,
            n_categories=n_categories
        ).to(self.device)
        self.discriminator = PacGan(discriminator, pac=self.pac)
        
        # Setup optimizers
        generator_optimizer = optim.Adam(
            self.generator.parameters(), 
            lr=self.generator_lr, 
            betas=(0.5, 0.9)
        )
        
        discriminator_optimizer = optim.Adam(
            self.discriminator.parameters(), 
            lr=self.discriminator_lr, 
            betas=(0.5, 0.9)
        )
        
        # Loss function
        criterion = nn.BCELoss()
        
        # Ensure batch size is a multiple of pac for PacGAN
        batch_size = (self.batch_size // self.pac) * self.pac
        if batch_size == 0:
            batch_size = self.pac
        
        # Training loop
        for epoch in range(epochs):
            g_losses = []
            d_losses = []
            
            for _ in range(max(1, len(self.dataset) // batch_size)):
                # Get real data and conditional vectors
                real_data, cond_vec, mask = self.dataset.sample(batch_size)
                real_data = torch.from_numpy(real_data).to(self.device)
                
                if cond_vec is not None:
                    cond_vec = cond_vec.to(self.device)
                    mask = mask.to(self.device)
                
                # Labels for real and fake data
                real_labels = torch.ones(batch_size // self.pac, 1).to(self.device)
                fake_labels = torch.zeros(batch_size // self.pac, 1).to(self.device)
                
                # Train Discriminator
                discriminator_optimizer.zero_grad()
                
                # Real data loss
                outputs = self.discriminator(real_data, cond_vec)
                d_real_loss = criterion(outputs, real_labels)
                
                # Generate fake data
                noise = torch.randn(batch_size, self.noise_dim).to(self.device)
                fake_data = self.generator(noise, cond_vec)
                
                outputs = self.discriminator(fake_data.detach(), cond_vec)
                d_fake_loss = criterion(outputs, fake_labels)
                
                d_loss = d_real_loss + d_fake_loss
                d_loss.backward()
                discriminator_optimizer.step()
                
                # Train Generator
                generator_optimizer.zero_grad()
                
                outputs = self.discriminator(fake_data, cond_vec)
                g_loss = criterion(outputs, real_labels)
                
                g_loss.backward()
                generator_optimizer.step()
                
                g_losses.append(g_loss.item())
                d_losses.append(d_loss.item())
            
            if (epoch + 1) % 50 == 0:
                print(f"Epoch [{epoch+1}/{epochs}], G Loss: {np.mean(g_losses):.4f}, D Loss: {np.mean(d_losses):.4f}, Learning rate")
    
    def generate(self, n_samples, conditions= None):
        """Generate synthetic samples with optional conditioning."""
        if self.generator is None:
            raise RuntimeError("Model not trained. Call fit() first.")
        
        self.generator.eval()
        
        steps = n_samples // self.batch_size + 1
        data = []
        
        for i in range(steps):
            n_batch = min(self.batch_size, n_samples - i * self.batch_size)
            if n_batch <= 0:
                break
                
            # Generate noise
            noise = torch.randn(n_batch, self.noise_dim).to(self.device)
            
            # Generate conditional vector if necessary
            cond_vec = None
            if self.dataset.cond_vec and conditions:
                print(n_batch)
                cond_vec = self.dataset.cond_vec.generate_conditional_vector(conditions, n_batch)
                cond_vec = cond_vec.to(self.device)
                
            # Generate data
            with torch.no_grad():
                
                fake = self.generator(noise, cond_vec)
            data.append(fake.cpu().numpy())
            
        data = np.concatenate(data, axis=0)
        
        # Convert to the original data format
        synthetic_data = self.transformer.inverse_transform(data[:n_samples])
        print(synthetic_data.shape)
        return synthetic_data
    
    def save(self, path):
        """Save the model."""
        if self.generator is None or self.discriminator is None:
            raise RuntimeError("Model not trained. Call fit() first.")
        
        state = {
            'generator': self.generator.state_dict(),
            'discriminator': self.discriminator.state_dict(),
            'noise_dim': self.noise_dim,
            'categorical_columns': self.categorical_columns,
            'transformer': self.transformer
        }
        
        torch.save(state, path)
    
    def load(self, path):
        """Load the model."""
        state = torch.load(path, map_location=self.device)
        
        self.noise_dim = state['noise_dim']
        self.categorical_columns = state['categorical_columns']
        self.transformer = state['transformer']
        
        # Recreate the dataset and models
        n_categories = 0
        data_dim = 0
        
        if hasattr(self.transformer, 'categorical_dims'):
            n_categories = sum(self.transformer.categorical_dims.values())
            if hasattr(self.transformer, 'continuous_gmms'):
                continuous_dims = sum([gmm.n_components + 1 for gmm in self.transformer.continuous_gmms.values()])
                data_dim = n_categories + continuous_dims
        
        self.generator = Generator(
            input_dim=self.noise_dim,
            output_dim=data_dim,
            n_categories=n_categories
        ).to(self.device)
        
        discriminator = Discriminator(
            input_dim=data_dim,
            n_categories=n_categories
        ).to(self.device)
        
        self.discriminator = PacGan(discriminator, pac=self.pac)
        
        self.generator.load_state_dict(state['generator'])
        self.discriminator.load_state_dict(state['discriminator'])
        
        self.generator.eval()
        self.discriminator.eval()


# Hide

In [156]:
# Example usage
from sklearn.datasets import fetch_california_housing
def example_usage():
    # Sample tabular data

    
    # Load data
    data = fetch_california_housing(as_frame=True).frame
    
    # For this example, let's convert HouseAge to categorical by binning
    data['HouseAge_Cat'] = pd.cut(data['HouseAge'], bins=5, labels=False)
    categorical_columns = ['HouseAge_Cat']
    # Initialize CTGAN
    ctgan = CTGAN(categorical_columns=categorical_columns, pac=5)  # Ajusté le pac à 5 pour éviter des problèmes de dimensionnalité
    
    # Fit model
    print("Training CTGAN model...")
    ctgan.fit(data, epochs=1)
    
    # Generate synthetic data (unconditional)
    #print("Generating synthetic data...")
    #synthetic_data = ctgan.generate(n_samples=1000)
    
    # Generate synthetic data with conditions
    # Example condition: HouseAge_Cat = 2"""
    conditioned_data = ctgan.generate(
        n_samples=100,
        conditions={'HouseAge_Cat': 2}
    )
    
    # Compare statistics
    print("\nReal data statistics:")
    print(data.describe())
    
    #print("\nSynthetic data statistics:")
    #print(synthetic_data.describe())
    
    print("\nConditioned data (HouseAge_Cat = 2) statistics:")
    print(conditioned_data.describe())
    
    # Save model
    ctgan.save("ctgan_model.pt")
    
    # Load model
    new_ctgan = CTGAN()
    new_ctgan.load("ctgan_model.pt")
    
    return synthetic_data

if __name__ == "__main__":
    example_usage()

Training CTGAN model...
100
(100, 10)

Real data statistics:
             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
std        1.899822     12.585558      2.474173      0.473911   1132.462122   
min        0.499900      1.000000      0.846154      0.333333      3.000000   
25%        2.563400     18.000000      4.440716      1.006079    787.000000   
50%        3.534800     29.000000      5.229129      1.048780   1166.000000   
75%        4.743250     37.000000      6.052381      1.099526   1725.000000   
max       15.000100     52.000000    141.909091     34.066667  35682.000000   

           AveOccup      Latitude     Longitude   MedHouseVal  HouseAge_Cat  
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000  
mean       3.070655     35.631861   -119.569704      2.068558      2.19

UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL __main__.DataTransformer was not an allowed global by default. Please use `torch.serialization.add_safe_globals([DataTransformer])` or the `torch.serialization.safe_globals([DataTransformer])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

In [45]:
data = fetch_california_housing(as_frame=True).frame
data.shape

(20640, 9)

# Test

In [90]:
# Example usage
from sklearn.datasets import fetch_california_housing
def example_usage():
    # Sample tabular data

    
    # Load data
    data = pd.DataFrame({'a': [1,2,3,4,5,6,7,8,9,10], 'b': [1,-1,-1,1,1,1,1,1,1,-1 ]})
    data['b'] = pd.Categorical(data['b'])
    pd.Categorical(data['b']).categories
    categorical_columns = ['b']
    # Initialize CTGAN
    ctgan = CTGAN(categorical_columns=categorical_columns, pac=5, noise_dim=1, batch_size=1)  # Ajusté le pac à 5 pour éviter des problèmes de dimensionnalité
    
    # Fit model
    print("Training CTGAN model...")
    ctgan.fit(data, epochs=1)
    
    # Generate synthetic data (unconditional)
    print("Generating synthetic data...")
    synthetic_data = ctgan.generate(n_samples=1000)
    
    # Generate synthetic data with conditions
    # Example condition: HouseAge_Cat = 2
    conditioned_data = ctgan.generate( n_samples=1000, conditions = {'b': 1} )
    
    # Compare statistics
    print("\nReal data statistics:")
    print(data.describe())
    
    print("\nSynthetic data statistics:")
    print(synthetic_data.describe())
    
    print("\nConditioned data (HouseAge_Cat = 2) statistics:")
    print(conditioned_data.describe())
    
    # Save model
    ctgan.save("ctgan_model.pt")
    
    # Load model
    new_ctgan = CTGAN()
    new_ctgan.load("ctgan_model.pt")
    
    return synthetic_data

if __name__ == "__main__":
    example_usage()

Training CTGAN model...
errr
['b']
{'b': 2}
errr
torch.Size([5, 1])
torch.Size([5, 2])
torch.Size([5, 1])
torch.Size([5, 2])
Generating synthetic data...


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x1 and 3x256)

In [69]:
df = CTGANDataset(data, categorical_columns)

In [15]:
a = pd.DataFrame({'a': [1,2,3,4,5,6,7,8,9,10], 'b': ['z','zz','zz','z','zzz','z','zzz','zzz','z','zzz' ]})
a['b'] = pd.Categorical(a['b'])
pd.Categorical(a['b']).categories

Index(['z', 'zz', 'zzz'], dtype='object')

In [17]:
aa = CTGANDataset(a, categorical_columns= ['b'])
aa.sample(1)

(array([[1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       dtype=float32),
 tensor([[0., 1., 0.]]),
 tensor([[1.]]))

In [79]:
df.transformed_data

array([[0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 1.1629847e-11,
        9.9996722e-01, 0.0000000e+00],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, ..., 3.2581773e-02,
        6.3743941e-02, 1.0747959e-42],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 6.6141941e-02,
        4.1500479e-02, 3.8707367e-40],
       ...,
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, ..., 8.7430551e-26,
        2.6141222e-21, 5.6873501e-04],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, ..., 1.5613448e-27,
        3.4088894e-22, 6.9994559e-05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, ..., 1.9312942e-26,
        1.2193060e-21, 2.6615727e-04]], dtype=float32)

In [None]:
data_dim = self.dataset.transformed_data.shape[1]
        n_categories = 0
        if self.dataset.cond_vec:
            n_categories = self.dataset.cond_vec.n_categories

In [125]:
dt = DataTransformer(['b'])

In [127]:
dt.transform(a)

KeyError: 'a'