In [None]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from multiprocessing import Pool, cpu_count
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from joblib import Parallel, delayed
import time

In [None]:
# Record the wall-clock start time (seconds since the epoch) and display it.
start_time = time.time()
start_time

In [None]:
# Load the source population; the first CSV column becomes the index and
# malformed lines only trigger a warning instead of aborting the load.
data = pd.read_csv(
    'PoblacionSintetica.csv',
    index_col=0,
    sep=',',
    on_bad_lines='warn',
)
data.head()

In [None]:
# A quarter of the row count (rounded down); used later as the number of
# synthetic rows requested from each trained model.
divisor = len(data) // 4
divisor

In [None]:
# Shuffle all rows with a fixed seed.
# NOTE: this rebinds `data`, so re-running this cell on its own shuffles the
# already-shuffled frame again; run the notebook top to bottom.
data = data.sample(frac=1, random_state=42)

In [None]:
# Data preprocessing: map every column into [0, 1], the range the generator's
# final Sigmoid layer is able to produce.
label_encoders = {}
scalers = {}
data_cleaned = data.copy()

categorical_columns = data.select_dtypes(include=['object']).columns
numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns

# Encode categorical variables as integer codes 0..K-1.
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data_cleaned[column] = label_encoders[column].fit_transform(data[column])

# Normalize numeric values to [0, 1].
for column in numeric_columns:
    scalers[column] = MinMaxScaler()
    data_cleaned[column] = scalers[column].fit_transform(data[[column]]).ravel()

# Normalize the integer category codes as well. Left unscaled, codes >= 2 can
# never be emitted by a Sigmoid output, so those categories would vanish from
# the synthetic data. The scaler is stored in `scalers` so that the inverse
# step restores codes in 0..K-1 before the label encoder decodes them.
for column in categorical_columns:
    scalers[column] = MinMaxScaler()
    data_cleaned[column] = scalers[column].fit_transform(data_cleaned[[column]]).ravel()

In [None]:
# Fix the random seeds for reproducibility
def set_seed(seed=42):
    """Seed the PyTorch (CPU and CUDA) and NumPy global random generators.

    Also switches cuDNN to deterministic mode and turns off its benchmark
    auto-tuner.

    Args:
        seed: Integer seed passed to every generator.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers multi-GPU setups
        np.random.seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(42)  # Any other integer can be used as the seed

# Weight initialization function
def weights_init(m):
    """Initialize a single module in place; meant to be used with ``Module.apply``.

    ``nn.Linear`` layers get weights drawn from N(0, 0.02) and a zero bias
    (when a bias exists). Every other module type is left untouched.

    Args:
        m: The module to initialize.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.normal_(m.weight, mean=0.0, std=0.02)
    if m.bias is None:
        return
    nn.init.constant_(m.bias, 0)

# Generator
class Generator(nn.Module):
    """MLP that maps a latent noise vector to one synthetic data row.

    Layer widths are input_dim -> 128 -> 256 -> 512 -> output_dim with ReLU
    activations in between and a final Sigmoid, so outputs lie in (0, 1).

    Args:
        input_dim: Size of the latent noise vector.
        output_dim: Number of features in a generated row.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_widths = (128, 256, 512)

        layers = []
        fan_in = input_dim
        for width in hidden_widths:
            layers.extend((nn.Linear(fan_in, width), nn.ReLU()))
            fan_in = width
        layers.extend((nn.Linear(fan_in, output_dim), nn.Sigmoid()))

        self.model = nn.Sequential(*layers)
        # Re-initialize every Linear layer with the custom scheme.
        self.apply(weights_init)

    def forward(self, z):
        """Return the generated rows for the noise batch ``z``."""
        return self.model(z)

# Discriminator
class Discriminator(nn.Module):
    """MLP that scores a data row with a value in (0, 1).

    Layer widths are input_dim -> 512 -> 256 -> 128 -> 1 with ReLU activations
    in between and a final Sigmoid.

    Args:
        input_dim: Number of features in a data row.
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = [input_dim, 512, 256, 128]

        blocks = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            blocks.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        blocks.extend((nn.Linear(widths[-1], 1), nn.Sigmoid()))

        self.model = nn.Sequential(*blocks)
        # Re-initialize every Linear layer with the custom scheme.
        self.apply(weights_init)

    def forward(self, x):
        """Return the discriminator score for each row of ``x``."""
        return self.model(x)

# GAN wrapper used to synthesize tabular rows
class CTGAN:
    """Fully connected GAN trained on a 2-D table of numeric features.

    NOTE(review): despite its name, the code below is a plain
    generator/discriminator pair trained with binary cross-entropy; it has no
    conditional sampling and no per-column handling.

    The generator ends in a Sigmoid, so every generated value lies in (0, 1);
    the training data should be scaled to that range.

    Args:
        data: 2-D table (rows x features) exposing ``.shape``, e.g. a NumPy
            array or an all-numeric DataFrame.
        latent_dim: Size of the noise vector fed to the generator.
        epochs: Number of passes over the data during training.
        batch_size: Mini-batch size.
        lr: Learning rate of both Adam optimizers.
    """

    def __init__(self, data, latent_dim=100, epochs=100, batch_size=64, lr=0.0002):
        self.data = data
        self.latent_dim = latent_dim
        self.epochs = epochs
        self.batch_size = batch_size
        self.lr = lr

        # Dimension of one input row
        self.data_dim = data.shape[1]

        # Network instances
        self.generator = Generator(latent_dim, self.data_dim)
        self.discriminator = Discriminator(self.data_dim)

        # Optimizer configuration
        self.opt_gen = optim.Adam(self.generator.parameters(), lr=self.lr)
        self.opt_disc = optim.Adam(self.discriminator.parameters(), lr=self.lr)

        # Binary cross-entropy loss
        self.criterion = nn.BCELoss()

    def train(self):
        """Train generator and discriminator, printing the last-batch losses per epoch.

        Raises:
            ValueError: If the data contains no rows.
        """
        # np.asarray is a no-op for ndarrays and lets DataFrames through as well.
        tensor_data = torch.tensor(np.asarray(self.data), dtype=torch.float32)
        if tensor_data.shape[0] == 0:
            # Without this check the loop body never runs and the epoch report
            # below fails on undefined loss variables.
            raise ValueError("Cannot train on an empty dataset")
        dataloader = DataLoader(TensorDataset(tensor_data), batch_size=self.batch_size, shuffle=True)

        for epoch in range(self.epochs):
            for batch in dataloader:
                real_data = batch[0]

                # Target labels, sized to the current (possibly smaller, final) batch
                real_labels = torch.ones(real_data.size(0), 1)
                fake_labels = torch.zeros(real_data.size(0), 1)

                # Draw noise and generate fake rows
                z = torch.randn(real_data.size(0), self.latent_dim)
                fake_data = self.generator(z)

                # Discriminator step; detach() keeps its loss from
                # back-propagating into the generator.
                self.opt_disc.zero_grad()
                real_loss = self.criterion(self.discriminator(real_data), real_labels)
                fake_loss = self.criterion(self.discriminator(fake_data.detach()), fake_labels)
                disc_loss = real_loss + fake_loss
                disc_loss.backward()
                self.opt_disc.step()

                # Generator step: fake rows should be scored as real.
                self.opt_gen.zero_grad()
                gen_loss = self.criterion(self.discriminator(fake_data), real_labels)
                gen_loss.backward()
                self.opt_gen.step()

            print(f"Epoch {epoch+1}/{self.epochs}, Disc Loss: {disc_loss.item()}, Gen Loss: {gen_loss.item()}")

    def sample(self, n_samples):
        """Generate ``n_samples`` synthetic rows.

        Returns:
            NumPy array of shape (n_samples, data_dim) with values in (0, 1).
        """
        # Inference only: skip autograd bookkeeping so that large sample
        # counts do not keep every intermediate activation alive.
        with torch.no_grad():
            z = torch.randn(n_samples, self.latent_dim)
            generated_data = self.generator(z)
        return generated_data.numpy()

# Helper that builds and trains one CTGAN; used as the unit of parallel work
def train_ctgan(data_chunk, latent_dim=100, epochs=100, batch_size=64, lr=0.0002):
    """Create a CTGAN for ``data_chunk``, train it and return the trained model.

    Args:
        data_chunk: Training rows for this model.
        latent_dim: Size of the generator's noise vector.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        lr: Learning rate of both optimizers.

    Returns:
        The trained CTGAN instance.
    """
    hyperparams = dict(latent_dim=latent_dim, epochs=epochs, batch_size=batch_size, lr=lr)
    model = CTGAN(data_chunk, **hyperparams)
    model.train()
    return model


In [None]:
# Split the data into chunks for parallelization
data_chunks = np.array_split(data_cleaned.to_numpy(), 4)  # Split into 4 parts


def _train_seeded_chunk(chunk, seed, **train_kwargs):
    """Seed the current worker process, then train one model on ``chunk``."""
    # The set_seed(42) call made earlier only seeds the notebook process;
    # joblib runs each task in a separate worker process whose random
    # generators would otherwise start unseeded.
    set_seed(seed)
    return train_ctgan(chunk, **train_kwargs)


# Train the CTGAN models in parallel, one per chunk. Each chunk gets its own
# seed so the models do not share identical initial weights and noise streams.
models = Parallel(n_jobs=4)(
    delayed(_train_seeded_chunk)(chunk, 42 + chunk_idx, epochs=20)
    for chunk_idx, chunk in enumerate(data_chunks)
)

In [None]:
# Ask each model for as many synthetic rows as its training chunk contained,
# so the combined output has exactly as many rows as the original data.
# (Requesting `divisor` rows from each of the 4 models loses up to 3 rows
# whenever the row count is not a multiple of 4.)
synthetic_data_parts = [
    model.sample(len(chunk)) for model, chunk in zip(models, data_chunks)
]

synthetic_data = np.concatenate(synthetic_data_parts, axis=0)

In [None]:
# Convert the generated synthetic data back to the original format
def convert_synthetic_data(synthetic_data, original_columns, label_encoders, scalers):
    """Turn a raw generator output array into a DataFrame in the original units.

    Args:
        synthetic_data: 2-D array with one column per entry of ``original_columns``.
        original_columns: Column names, in the order used for training.
        label_encoders: Mapping of column name to a fitted encoder exposing
            ``classes_`` and ``inverse_transform``.
        scalers: Mapping of column name to a fitted scaler exposing
            ``inverse_transform``.

    Returns:
        A new DataFrame with scaled columns de-normalized and encoded columns
        decoded back to their labels.
    """
    # Convert the NumPy array to a DataFrame
    synthetic_df = pd.DataFrame(synthetic_data, columns=original_columns)

    # De-normalize scaled columns. This must run before decoding so that any
    # scaled category codes are back in code units first.
    for column, scaler in scalers.items():
        restored = scaler.inverse_transform(synthetic_df[[column]])
        synthetic_df[column] = np.asarray(restored).ravel()

    # Decode categorical columns
    for column, encoder in label_encoders.items():
        codes = synthetic_df[column].round().astype(int)
        # Rounding can land outside the codes the encoder knows, which would
        # make inverse_transform raise; clamp to the valid range instead.
        codes = codes.clip(0, len(encoder.classes_) - 1)
        synthetic_df[column] = encoder.inverse_transform(codes)

    return synthetic_df

# Apply the conversion to bring the synthetic rows back to the original format
original_columns = data.columns  # Original column names of the dataset
synthetic_df = convert_synthetic_data(
    synthetic_data=synthetic_data,
    original_columns=original_columns,
    label_encoders=label_encoders,
    scalers=scalers,
)

In [None]:
# Print the shape and show the first records of the synthetic data in the original format
print(synthetic_df.shape)
synthetic_df.head()

In [None]:
# Count how often each value of the 'Género' column appears in the synthetic data
synthetic_df['Género'].value_counts()

In [None]:
# Disabled alternative: subir_csv_S3(df, ruta_s3, nombre_archivo)
# Write the synthetic data to a local CSV file. `index` is left at its default,
# so the DataFrame index is written as the first column.
synthetic_df.to_csv('PoblacionSinteticaGenerada2.csv',
                   sep=',',
                   decimal='.')