In [40]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.nn import functional as F
from torch.utils.data import Dataset, random_split, DataLoader
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
import time 
from datetime import datetime, timedelta


In [44]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
data = pd.read_csv('data/Physical experiments.csv',sep= ';')
num = data.select_dtypes(np.number)
cat= data.select_dtypes(exclude=np.number)
data.head()

GPU Name: NVIDIA GeForce RTX 3060 Laptop GPU
GPU Memory: 6.44 GB


Unnamed: 0,X Axis,Y Axis,Z Axis,speed,std speed,flashes,height,flash time,number of flashes,time,direction
0,0,0,0,7732100079,2127411752,836535186,893909107,375,12.0,1940939338,V
1,0,1,0,893297619,2414391967,997566682,981108211,4178899083,327.0,1999695986,U
2,0,2,0,14209136,2514055993,992084814,964873731,4241145833,576.0,2002562567,U
3,0,3,0,-7024316029,2348523655,989346321,951449147,4436641221,655.0,1995115159,U
4,0,4,0,-1405511477,2516917632,987477067,930151443,4610802139,935.0,1996782501,U


## CTGAN

In [16]:
class TabularDataset(Dataset):
    def __init__(self, data):
        self.data = torch.tensor(data,dtype=torch.float32)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]
        
class Conditioner(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim)  
        )
    
    def forward(self, x):
        return torch.sigmoid(self.net(x)) 

class CTGANGenerator(nn.Module):
    def __init__(self, latent_dim, output_dim, hidden_dim=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(latent_dim + output_dim, hidden_dim), 
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
            nn.Tanh()  
        )
        self.conditioner = Conditioner(output_dim, hidden_dim)
        
    def forward(self, z, c):
        condition = self.conditioner(c)
        inputs = torch.cat([z, condition], dim=1) 
        x = self.net(inputs)
        return x

class CTGANDiscriminator(nn.Module):
    def __init__(self, input_dim, hidden_dim=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim * 2, hidden_dim), 
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),  
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
        
    def forward(self, x, c):
        inputs = torch.cat([x, c], dim=1)  
        return self.net(inputs)

class CTGAN:
    def __init__(self, input_dim, latent_dim=100, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.latent_dim = latent_dim
        self.device = device
        self.generator = CTGANGenerator(latent_dim, input_dim).to(device)
        self.discriminator = CTGANDiscriminator(input_dim).to(device)
        self.g_optimizer = torch.optim.Adam(self.generator.parameters(), lr=2e-4, betas=(0.5, 0.999))
        self.d_optimizer = torch.optim.Adam(self.discriminator.parameters(), lr=2e-4, betas=(0.5, 0.999))
        
    def train_step(self, real_data, conditions):
        batch_size = real_data.shape[0]
        real_data = real_data.to(self.device)
        conditions = conditions.to(self.device)
        

        self.d_optimizer.zero_grad()
        
        z = torch.randn(batch_size, self.latent_dim, device=self.device)
        fake_data = self.generator(z, conditions)
        
        real_pred = self.discriminator(real_data, conditions)
        fake_pred = self.discriminator(fake_data.detach(), conditions)
        
        d_loss_real = -torch.mean(torch.log(real_pred + 1e-8))
        d_loss_fake = -torch.mean(torch.log(1 - fake_pred + 1e-8))
        d_loss = d_loss_real + d_loss_fake
        
        d_loss.backward()
        self.d_optimizer.step()
        

        self.g_optimizer.zero_grad()
        
        z = torch.randn(batch_size, self.latent_dim, device=self.device)
        fake_data = self.generator(z, conditions)
        fake_pred = self.discriminator(fake_data, conditions)
        
        g_loss = -torch.mean(torch.log(fake_pred + 1e-8))
        
        g_loss.backward()
        self.g_optimizer.step()
        
        return d_loss.item(), g_loss.item()

    def train(self, data, conditions, epochs=200, batch_size=500):
        data = torch.FloatTensor(data)
        conditions = torch.FloatTensor(conditions)
        
        for epoch in range(epochs):
            d_losses = []
            g_losses = []
            
            idx = torch.randperm(len(data))
            data = data[idx]
            conditions = conditions[idx]
            
            for i in range(0, len(data), batch_size):
                batch_data = data[i:i+batch_size]
                batch_conditions = conditions[i:i+batch_size]
                d_loss, g_loss = self.train_step(batch_data, batch_conditions)
                d_losses.append(d_loss)
                g_losses.append(g_loss)
            
            if epoch % 10 == 0:
                print(f'Epoch {epoch}: D_loss={np.mean(d_losses):.4f}, G_loss={np.mean(g_losses):.4f}')
    
    def generate(self, num_samples, conditions):
        self.generator.eval()
        conditions = torch.FloatTensor(conditions).to(self.device)
        with torch.no_grad():
            z = torch.randn(num_samples, self.latent_dim, device=self.device)
            samples = self.generator(z, conditions)
        return samples.cpu().numpy()

## Entrainement et test

In [36]:
def train_ctgan(data, conditions=None, test_size=0.2, epochs=200, batch_size=500):
    train_data, test_data = train_test_split(data, test_size=test_size, random_state=42)
    
    if conditions is None:
        conditions = data  
    train_conditions, test_conditions = train_test_split(conditions, test_size=test_size, random_state=42)
    
    # Initialize CTGAN
    input_dim = data.shape[1]
    ctgan = CTGAN(input_dim)
    
    # Training history
    history = {
        'd_losses': [],
        'g_losses': []
    }
    
    # Train the model
    for epoch in range(epochs):
        d_losses = []
        g_losses = []
        
        # Shuffle data
        idx = np.random.permutation(len(train_data))
        epoch_data = train_data[idx]
        epoch_conditions = train_conditions[idx]
        
        for i in range(0, len(epoch_data), batch_size):
            batch_data = torch.FloatTensor(epoch_data[i:i+batch_size])
            batch_conditions = torch.FloatTensor(epoch_conditions[i:i+batch_size])
            
            d_loss, g_loss = ctgan.train_step(batch_data, batch_conditions)
            d_losses.append(d_loss)
            g_losses.append(g_loss)
        
        # Record losses
        history['d_losses'].append(np.mean(d_losses))
        history['g_losses'].append(np.mean(g_losses))
        
        if epoch % 10 == 0:
            print(f'Epoch {epoch}: D_loss={np.mean(d_losses):.4f}, G_loss={np.mean(g_losses):.4f}')
    
    return ctgan, history, test_data, test_conditions

def evaluate_generated_data(real_data, generated_data, continuous_columns, categorical_columns=None):

    results = {}
    
    ks_tests = {}
    for idx, col in enumerate(continuous_columns):
        statistic, pvalue = ks_2samp(real_data[:, idx], generated_data[:, idx])
        ks_tests[col] = {'statistic': statistic, 'pvalue': pvalue}
    
    results['ks_tests'] = ks_tests
    
    # Calculate means and stds
    real_means = np.mean(real_data, axis=0)
    real_stds = np.std(real_data, axis=0)
    gen_means = np.mean(generated_data, axis=0)
    gen_stds = np.std(generated_data, axis=0)
    
    results['statistics'] = {
        'real_means': real_means,
        'real_stds': real_stds,
        'gen_means': gen_means,
        'gen_stds': gen_stds
    }
    
    return results

def plot_training_history(history):
    """
    Plot training history
    """
    plt.figure(figsize=(10, 5))
    plt.plot(history['d_losses'], label='Discriminator Loss')
    plt.plot(history['g_losses'], label='Generator Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training History')
    plt.legend()
    plt.show()

def plot_distributions(real_data, generated_data, continuous_columns, num_cols=3):
    """
    Plot distributions of real and generated data
    """
    num_features = len(continuous_columns)
    num_rows = (num_features + num_cols - 1) // num_cols
    
    plt.figure(figsize=(15, 5 * num_rows))
    
    for idx, col in enumerate(continuous_columns):
        plt.subplot(num_rows, num_cols, idx + 1)
        sns.kdeplot(real_data[:, idx], label='Real', color='blue')
        sns.kdeplot(generated_data[:, idx], label='Generated', color='red')
        plt.title(f'Distribution of {col}')
        plt.legend()
    
    plt.tight_layout()
    plt.show()

In [50]:
np.random.seed(42)
    

n_samples = 1000
n_features = 5


continuous_columns = num.columns
    

ctgan_model, history, test_data, test_conditions = train_ctgan(
        num, 
        conditions=None,
        epochs=200,
        batch_size=100
    )
    
    # Generate synthetic data
n_synthetic = len(test_data)
synthetic_data = ctgan_model.generate(n_synthetic, test_conditions)
    
    # Evaluate results
evaluation = evaluate_generated_data(test_data, synthetic_data, continuous_columns)
    
    # Plot results
plot_training_history(history)
plot_distributions(test_data, synthetic_data, continuous_columns)
    
    # Print evaluation metrics
print("\nKolmogorov-Smirnov test results:")
for col, results in evaluation['ks_tests'].items():
    print(f"{col}:")
    print(f"  Statistic: {results['statistic']:.4f}")
    print(f"  P-value: {results['pvalue']:.4f}")

KeyError: "None of [Index([ 78,  16,  65, 114,  76,  19, 122,  24,  66, 152,\n       ...\n        87,  74, 121, 178,  20,  71, 106,  14,  92, 102],\n      dtype='int32', length=179)] are in the [columns]"