In [None]:
import warnings
warnings.filterwarnings('ignore')
! pip install torch
! pip install tensorflow

In [None]:
import warnings
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from multiprocessing import Pool, cpu_count
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from joblib import Parallel, delayed
import tensorflow as tf
import time
from scipy.stats import entropy, chi2_contingency
import os

In [4]:
# Load the source population. on_bad_lines='warn' skips malformed rows and only
# reports them through a warning, so ParserWarning is forced on while reading;
# otherwise the notebook-wide "ignore" filter can hide dropped rows.
with warnings.catch_warnings():
    warnings.simplefilter('always', category=pd.errors.ParserWarning)
    data = pd.read_csv('PoblacionSintetica.csv',
                       sep=',',
                       on_bad_lines='warn',
                       index_col=0)
# Keep the blanket suppression this cell originally installed for later cells.
warnings.filterwarnings('ignore')

In [5]:
# Number of rows in the loaded data; reused later as the number of synthetic rows to draw
cantidad_muestras = data.shape[0]

In [None]:
# Replace the placeholder salary values with zero
data.loc[data['Salario'].isin(['No especificado', 'No Aplica']), 'Salario'] = 0
data.head()

In [7]:
# Parse the salary as a number, then store it rounded to a whole integer
data['Salario'] = data['Salario'].astype(float).round(0).astype(int)

In [8]:
# Every remaining missing value becomes the explicit category 'No Aplica'
data = data.fillna(value='No Aplica')

# Reducción

In [None]:
# Sampling 100% of the rows without replacement only shuffles the data (fixed seed)
data_sampled = data.sample(frac=1.0, replace=False, random_state=42)
data_sampled.shape

In [None]:
# Preview the first rows of the shuffled data
data_sampled.head()

In [None]:
# Mini-batch size; it is bound as the default `batch_size` of ensemble_wgan_gp
# at the moment that function is defined, so this cell must run first
batch = 64
batch

In [None]:
# List the columns available in the shuffled data
data_sampled.columns

# Embedding

In [104]:
# Embedding width (number of output dimensions) for each categorical column.
# The insertion order defines the column order of the embedded data set.
embedding_dims = {
    'Región': 6,
    'Canton': 9,  # Cantón 81
    'Género': 1,
    'Rango edad': 6,
    'Estado Laboral': 3,
    'Zona': 2,
    'Sector': 5,
    'Rama': 15,  # Rama 15
    'Nivel calificacion': 5,
    'Sector Institucional': 3,
    'Posición': 7,
    'Formalidad': 3,
}

In [105]:
# Containers filled by the embedding loop: the fitted label encoder and the
# embedding layer of each column, the per-column embedded frames, and a frame
# that receives the integer-encoded columns
label_encoders = {}
embedding_layers = {}
embedded_dfs = []
df = pd.DataFrame()

In [None]:
# Encode every categorical column as integer ids and map the ids through a
# randomly initialised (never trained) Keras embedding table.
warnings.filterwarnings('ignore')
# The tables are random: seed Python, NumPy and TensorFlow so that a
# Restart & Run All reproduces the same embeddings (and the same training data).
tf.keras.utils.set_random_seed(42)
for col, dim in embedding_dims.items():
    label_encoders[col] = LabelEncoder()
    df[f'{col}_idx'] = label_encoders[col].fit_transform(data_sampled[col])

    num_categories = df[f'{col}_idx'].nunique()
    # `input_length` is deprecated in recent Keras releases and is not needed
    # to call the layer directly on a tensor of ids, so it is not passed.
    embedding_layers[col] = tf.keras.layers.Embedding(input_dim=num_categories, output_dim=dim)

    input_data = tf.convert_to_tensor(df[f'{col}_idx'].values)
    embeddings = embedding_layers[col](input_data)

    embedded_df = pd.DataFrame(embeddings.numpy(), columns=[f'{col}_embed_{i+1}' for i in range(dim)])
    embedded_dfs.append(embedded_df)

# Concatenate the embedded columns of every variable side by side
final_embedded_df = pd.concat(embedded_dfs, axis=1)
final_embedded_df.head()

In [107]:
# Reset the index of both DataFrames to 0..n-1 so that they share the same row labels
final_embedded_df = final_embedded_df.reset_index(drop=True)
data_sampled = data_sampled.reset_index(drop=True)

In [108]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale 'Salario'; the fitted `scaler` is reused later to map the
# generated salaries back to the original scale
scaler = MinMaxScaler()
data_sampled['Salario'] = scaler.fit_transform(data_sampled[['Salario']])

In [109]:
# Append the scaled salary as the last column (pandas aligns the assignment on the row index)
final_embedded_df['Salario'] = data_sampled['Salario']

# TGAN: WGAN-GP sobre los embeddings tabulares

In [110]:
# Fix the random seeds for reproducibility
def set_seed(seed=42):
    """Seed NumPy and PyTorch (CPU and every CUDA device) and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to all of the generators.
    """
    np.random.seed(seed)
    # CPU generator first, then the current CUDA device, then all CUDA devices.
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    # Disable cuDNN auto-tuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed(42)  # Any other integer can be used as the seed

# Weight initialisation function
def weights_init(m):
    """Initialise an ``nn.Linear`` module in place; any other module is left untouched.

    Weights are drawn from a normal distribution (mean 0, std 0.02) and the
    bias, when the layer has one, is set to zero.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.normal_(m.weight, mean=0.0, std=0.02)
    if m.bias is None:
        return
    nn.init.constant_(m.bias, 0)

# Generator
class Generator(nn.Module):
    """Fully connected network that maps latent noise to rows of the data space.

    Three hidden blocks (Linear -> ELU -> Dropout(0.5)) of width 512, 1024 and
    2048 are followed by a linear output layer with no activation.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        widths = [input_dim, 512, 1024, 2048]
        layers = []
        for in_features, out_features in zip(widths[:-1], widths[1:]):
            layers += [nn.Linear(in_features, out_features), nn.ELU(), nn.Dropout(0.5)]
        layers.append(nn.Linear(widths[-1], output_dim))
        # Kept under the attribute name `model` so the state-dict keys stay `model.<index>.*`.
        self.model = nn.Sequential(*layers)
        self.apply(weights_init)

    def forward(self, z):
        """Return the generated rows for the batch of latent vectors `z`."""
        return self.model(z)

# Critic
class Critic(nn.Module):
    """Fully connected network that assigns one unbounded score to each input row.

    Three hidden blocks (Linear -> ELU -> Dropout(0.5)) of width 2048, 1024 and
    512 are followed by a single-unit linear output layer.
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = [input_dim, 2048, 1024, 512]
        layers = []
        for in_features, out_features in zip(widths[:-1], widths[1:]):
            layers += [nn.Linear(in_features, out_features), nn.ELU(), nn.Dropout(0.5)]
        layers.append(nn.Linear(widths[-1], 1))
        # Kept under the attribute name `model` so the state-dict keys stay `model.<index>.*`.
        self.model = nn.Sequential(*layers)
        self.apply(weights_init)

    def forward(self, x):
        """Return the critic score, one value per row of `x`."""
        return self.model(x)

# WGAN-GP: networks, optimizers, training loop and sampling
class WGAN_GP:
    """Wasserstein GAN with gradient penalty trained on rows of numeric features.

    Args:
        data: 2-D array-like with a ``shape`` attribute; ``data.shape[1]`` sets
            the generator output width and the critic input width.
        latent_dim: Size of the noise vector fed to the generator.
        epochs: Maximum number of passes over the data.
        batch_size: Mini-batch size of the DataLoader.
        lr: Learning rate shared by both Adam optimizers.
        std_thresholds: Optional per-dimension values stored as a tensor
            (0.01 for every dimension by default). No method of this class reads them.
        lambda_gp: Weight of the gradient penalty in the critic loss.
        patience: Value of the early-stopping counter that stops training.
    """

    def __init__(self, data, latent_dim=100, epochs=100, batch_size=64, lr=0.0001, std_thresholds=None, lambda_gp=5, patience=10):
        self.data = data
        self.latent_dim = latent_dim
        self.epochs = epochs
        self.batch_size = batch_size
        self.lr = lr
        self.lambda_gp = lambda_gp
        self.patience = patience

        # Width of one data row
        self.data_dim = data.shape[1]

        # Networks
        self.generator = Generator(latent_dim, self.data_dim)
        self.critic = Critic(self.data_dim)

        # Optimizers
        self.opt_gen = optim.Adam(self.generator.parameters(), lr=self.lr, betas=(0.5, 0.9))
        self.opt_critic = optim.Adam(self.critic.parameters(), lr=self.lr, betas=(0.5, 0.9))

        # Per-dimension loss weights (stored only; not read by any method of this class)
        self.dimension_weights = torch.ones(self.data_dim)

        # Per-dimension variability thresholds (stored only; not read by any method of this class)
        if std_thresholds is None:
            self.std_thresholds = torch.full((self.data_dim,), 0.01)
        else:
            self.std_thresholds = torch.tensor(std_thresholds)

        # Best-checkpoint bookkeeping and early-stopping counter
        self.best_gen_loss = float('inf')
        self.best_epoch = 0
        self.early_stop_counter = 0

        # Histories used for plotting: the losses get one entry per epoch,
        # the embedding metric one entry per batch.
        self.critic_losses = []
        self.gen_losses = []
        self.embedding_metrics = []

    def gradient_penalty(self, real_data, fake_data):
        """Return ``lambda_gp * mean((||grad critic(x_hat)||_2 - 1)^2)``.

        ``x_hat`` is a random per-row interpolation between `real_data` and
        `fake_data`. Both inputs are detached first: the penalty only has to
        train the critic, so nothing is backpropagated into the generator.
        """
        batch_size = real_data.size(0)
        real_data = real_data.detach()
        fake_data = fake_data.detach()

        # One mixing coefficient per row, broadcast over the feature dimension.
        epsilon = torch.rand(batch_size, 1, device=real_data.device)
        epsilon = epsilon.expand_as(real_data)

        interpolated = (epsilon * real_data + (1 - epsilon) * fake_data).requires_grad_(True)

        critic_interpolated = self.critic(interpolated)

        gradients = torch.autograd.grad(
            outputs=critic_interpolated,
            inputs=interpolated,
            grad_outputs=torch.ones_like(critic_interpolated),
            create_graph=True,  # the penalty itself must be differentiable w.r.t. the critic
            retain_graph=True
        )[0]

        gradients = gradients.view(batch_size, -1)
        gradient_norm = gradients.norm(2, dim=1)
        return self.lambda_gp * ((gradient_norm - 1) ** 2).mean()

    def embedding_metric(self, real_data, fake_data):
        """Return the mean row-wise L2 distance between `real_data` and `fake_data` as a float.

        Rows are compared position by position, so both tensors need the same shape.
        """
        return torch.mean(torch.norm(real_data - fake_data, dim=1)).item()

    def distribution_metric(self, real_data, fake_data):
        """Return the Jensen-Shannon divergence between value histograms, as a float.

        All values of each tensor are pooled into one 100-bin histogram over
        [0, 1]; ``torch.histc`` ignores values outside that range.
        """
        # Small constant that avoids log(0) and division by zero
        epsilon = 1e-10

        real_dist = torch.histc(real_data, bins=100, min=0, max=1) + epsilon
        fake_dist = torch.histc(fake_data, bins=100, min=0, max=1) + epsilon

        # Normalise the counts into probability vectors
        real_dist /= real_dist.sum()
        fake_dist /= fake_dist.sum()

        mean_dist = 0.5 * (real_dist + fake_dist)

        # KL divergence of each distribution against the mixture
        kl_real = torch.sum(real_dist * (real_dist / mean_dist).log())
        kl_fake = torch.sum(fake_dist * (fake_dist / mean_dist).log())

        js_divergence = 0.5 * (kl_real + kl_fake)

        return js_divergence.item()

    def train(self):
        """Train the critic and the generator, with checkpointing and early stopping.

        For every batch the critic takes 5 optimizer steps and the generator 1.
        After each epoch the losses of the last batch are recorded and printed.
        When the generator loss improves and the distribution metric is below
        0.5, both networks are saved to ``best_generator_{id(self)}.pth`` and
        ``best_critic_{id(self)}.pth`` in the working directory.

        Raises:
            ValueError: If the training data is empty.
        """
        tensor_data = torch.tensor(self.data, dtype=torch.float32)
        if tensor_data.numel() == 0:
            # Without this check the epoch summary below would fail on variables
            # that are only assigned inside the batch loop.
            raise ValueError("WGAN_GP.train() needs at least one training sample")
        dataloader = DataLoader(TensorDataset(tensor_data), batch_size=self.batch_size, shuffle=True)

        for epoch in range(self.epochs):
            for batch in dataloader:
                real_data = batch[0]

                # Noise and fake rows for the critic steps. No autograd graph is
                # needed here because the critic never updates the generator.
                z = torch.randn(real_data.size(0), self.latent_dim) * 1.0
                with torch.no_grad():
                    fake_data = self.generator(z)

                # Critic: 5 steps on the same real/fake batch
                for _ in range(5):
                    self.opt_critic.zero_grad()

                    critic_real = self.critic(real_data)
                    critic_fake = self.critic(fake_data)

                    gp = self.gradient_penalty(real_data, fake_data)

                    critic_loss = -torch.mean(critic_real) + torch.mean(critic_fake) + gp
                    # The graph is rebuilt on every iteration, so it is not retained.
                    critic_loss.backward()
                    self.opt_critic.step()

                # Generator: 1 step, reusing the same noise
                self.opt_gen.zero_grad()

                fake_data = self.generator(z)
                critic_fake = self.critic(fake_data)

                gen_loss = -torch.mean(critic_fake)
                gen_loss.backward()
                self.opt_gen.step()

                # Monitoring metrics on plain values (no autograd tracking)
                generated = fake_data.detach()
                embedding_metric = self.embedding_metric(real_data, generated)
                distribution_metric = self.distribution_metric(real_data, generated)

                self.embedding_metrics.append(embedding_metric)

            # Per-epoch history: losses of the last batch of the epoch
            self.critic_losses.append(critic_loss.item())
            self.gen_losses.append(gen_loss.item())

            print(f"Epoch {epoch+1}/{self.epochs}, Critic Loss: {critic_loss.item():.4f}, Gen Loss: {gen_loss.item():.4f}, Embedding Metric: {embedding_metric:.4f}, Distribution Metric: {distribution_metric:.4f}")

            # Checkpoint and early stopping.
            # NOTE(review): the counter only grows when the generator loss is
            # not a finite improvement; an epoch that improves the loss but fails
            # the distribution threshold neither saves nor counts. Confirm that
            # this is the intended patience rule.
            if not torch.isinf(gen_loss) and not torch.isnan(gen_loss) and gen_loss.item() < self.best_gen_loss:
                if distribution_metric < 0.5:  # Adjust the threshold if necessary
                    self.best_gen_loss = gen_loss.item()
                    self.best_epoch = epoch
                    self.early_stop_counter = 0
                    torch.save(self.generator.state_dict(), f'best_generator_{id(self)}.pth')
                    torch.save(self.critic.state_dict(), f'best_critic_{id(self)}.pth')
                    print(f"Model saved at epoch {epoch+1}. Gen Loss: {gen_loss.item():.4f}, Distribution Metric: {distribution_metric:.4f}")
            else:
                self.early_stop_counter += 1

            if self.early_stop_counter >= self.patience:
                print(f"Early stopping at epoch {epoch+1}. Best epoch was {self.best_epoch+1} with Gen Loss: {self.best_gen_loss:.4f}")
                break

    def sample(self, n_samples):
        """Generate `n_samples` rows and return them as a NumPy array.

        The noise is scaled by 5 (training uses unit-variance noise) to widen
        the diversity of the output. The generator is called in whatever mode
        it is currently in, so its dropout layers stay active unless the caller
        switches it to eval mode.
        """
        z = torch.randn(n_samples, self.latent_dim) * 5
        # Inference only: skip the autograd graph, which would otherwise hold
        # every activation of the whole sample in memory.
        with torch.no_grad():
            generated_data = self.generator(z)
        return generated_data.detach().numpy()

# Train one WGAN-GP, restore its best checkpoint and plot its training curves
def train_wgan_gp(data_chunk, latent_dim=500, epochs=100, batch_size=64, lr=0.0001, std_thresholds=None, lambda_gp=10, patience=10):
    """Train a WGAN_GP on `data_chunk` and return the trained model.

    After training, the weights of the best checkpoint written during this run
    are loaded back, and the critic loss, generator loss and embedding metric
    are plotted. All keyword parameters are forwarded unchanged to WGAN_GP.

    Returns:
        The trained WGAN_GP instance.
    """
    wgan_gp = WGAN_GP(data_chunk, latent_dim=latent_dim, epochs=epochs, batch_size=batch_size, lr=lr, std_thresholds=std_thresholds, lambda_gp=lambda_gp, patience=patience)
    wgan_gp.train()

    gen_path = f'best_generator_{id(wgan_gp)}.pth'
    critic_path = f'best_critic_{id(wgan_gp)}.pth'

    # The file names are built from id(), which can repeat across objects and
    # runs. best_gen_loss only leaves +inf when train() saves a checkpoint, so
    # it tells whether the files on disk belong to this run.
    if wgan_gp.best_gen_loss == float('inf'):
        print("Warning: no checkpoint was saved during this run; keeping the final weights")
    else:
        if os.path.exists(gen_path):
            wgan_gp.generator.load_state_dict(torch.load(gen_path))
        else:
            print(f"Warning: No se encontró el archivo '{gen_path}'")

        if os.path.exists(critic_path):
            wgan_gp.critic.load_state_dict(torch.load(critic_path))
        else:
            print(f"Warning: No se encontró el archivo '{critic_path}'")

    # One subplot per curve: (values, legend label, title, colour, x-axis label).
    # The embedding metric has one point per batch, not per epoch.
    panels = [
        (wgan_gp.critic_losses, "Critic Loss", 'Critic Loss', 'blue', "Epochs"),
        (wgan_gp.gen_losses, "Gen Loss", 'Generator Loss', 'green', "Epochs"),
        (wgan_gp.embedding_metrics, "Embedding Metric", 'Embedding Metric', 'red', "Batches"),
    ]
    fig, axs = plt.subplots(3, 1, figsize=(10, 15))  # 3 rows, 1 column
    for ax, (values, label, title, color, xlabel) in zip(axs, panels):
        ax.plot(values, label=label, color=color)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel("Value")
        ax.legend()

    # Adjust the spacing between subplots
    plt.tight_layout()
    plt.show()

    return wgan_gp

# Train several WGAN-GP models on the same data, each with its own seed
def ensemble_wgan_gp(data_chunk, num_models=3, latent_dim=1000, epochs=100, batch_size=batch, lr=0.00005, std_thresholds=None, lambda_gp=10, patience=10):
    """Train `num_models` WGAN-GP models on `data_chunk` and return them as a list.

    Model number i (counting from 0) is trained after calling set_seed(42 + i);
    every other argument is passed through to train_wgan_gp.
    """
    base_seed = 42
    trained_models = []
    for i in range(num_models):
        set_seed(base_seed + i)  # a different seed for each ensemble member
        print(f"Training model {i+1}/{num_models}")
        trained_models.append(
            train_wgan_gp(
                data_chunk,
                latent_dim=latent_dim,
                epochs=epochs,
                batch_size=batch_size,
                lr=lr,
                std_thresholds=std_thresholds,
                lambda_gp=lambda_gp,
                patience=patience,
            )
        )
    return trained_models

# Combine the output of several generators into one sample
def sample_from_ensemble(models, n_samples, combine="mean"):
    """Draw `n_samples` rows from an ensemble of models.

    Args:
        models: Iterable of objects exposing ``sample(n)`` that returns an array
            with ``n`` rows.
        n_samples: Number of rows to return.
        combine: ``"mean"`` (default, the original behaviour) draws `n_samples`
            rows from every model and averages them element-wise. The draws are
            independent, so averaging reduces the variance of the result.
            ``"mixture"`` splits `n_samples` across the models as evenly as
            possible and concatenates the rows, leaving each row as produced
            by a single generator.

    Returns:
        Array with `n_samples` rows.

    Raises:
        ValueError: If `models` is empty or `combine` is not a known method.
    """
    models = list(models)
    if not models:
        # np.mean over an empty list would silently return NaN.
        raise ValueError("sample_from_ensemble() requires at least one model")

    if combine == "mean":
        samples = [model.sample(n_samples) for model in models]
        return np.mean(samples, axis=0)

    if combine == "mixture":
        base, extra = divmod(n_samples, len(models))
        # The first `extra` models contribute one additional row each.
        parts = [
            model.sample(base + (1 if position < extra else 0))
            for position, model in enumerate(models)
        ]
        return np.concatenate(parts, axis=0)

    raise ValueError(f"Unknown combine method: {combine!r} (expected 'mean' or 'mixture')")

# Train

In [None]:
# Wall-clock timestamp taken before training; used later to report the elapsed time
start_time = time.time()
start_time

In [112]:
# Convertir el DataFrame a numpy array para entrenamiento
train_tensor = final_embedded_df.to_numpy()

In [None]:
ensemble_models = ensemble_wgan_gp(train_tensor, num_models=2)  # Train an ensemble of 2 models

In [None]:
# Time elapsed since start_time, converted from seconds to minutes
end_time = time.time()
elapsed_time = (end_time - start_time)/60
print(f"Tiempo de ejecución total: {elapsed_time} minutos")

In [None]:
synthetic_data = sample_from_ensemble(ensemble_models, n_samples=cantidad_muestras)  # Combined samples from the ensemble, one row per original row
# Convert the generated data to a DataFrame with the same columns as the training data
synthetic_embedded_df = pd.DataFrame(synthetic_data, columns=final_embedded_df.columns)
synthetic_embedded_df.head()

In [126]:
# Generated salaries, still in the scaled space
synthetic_salaries = synthetic_embedded_df['Salario']
# The scaler was fitted on a one-column DataFrame, so pass the Series as a
# DataFrame to map the values back to the original scale
synthetic_salaries_df = synthetic_salaries.to_frame()
salarios_revertidos = scaler.inverse_transform(synthetic_salaries_df)
# Drop the salary by name, not by position: re-running this cell then fails on
# the missing 'Salario' column instead of silently removing an embedding column.
synthetic_embedded_df = synthetic_embedded_df.drop(columns=['Salario'])

In [127]:
# Rebuild the original categories from the generated embeddings: every generated
# vector is assigned the category whose embedding is closest in Euclidean distance
from sklearn.metrics.pairwise import euclidean_distances
reconstructed_data = {}

for col, dim in embedding_dims.items():
    # Embedding vector of every category id of this column, in id order
    original_embeddings = embedding_layers[col](np.arange(df[f'{col}_idx'].nunique())).numpy()
    # Generated columns that belong to this variable
    synthetic_col_embeds = synthetic_embedded_df[[f'{col}_embed_{i+1}' for i in range(dim)]].to_numpy()

    # Distance from each generated row to each category embedding; keep the nearest id
    distances = euclidean_distances(synthetic_col_embeds, original_embeddings)
    closest_indices = np.argmin(distances, axis=1)

    # Map the ids back to the original labels
    reconstructed_data[col] = label_encoders[col].inverse_transform(closest_indices)
    
# Convert the reconstructed columns to a DataFrame
reconstructed_df = pd.DataFrame(reconstructed_data)

In [128]:
# Attach the salaries in their original scale, rounded to whole units
reconstructed_df['Salario'] = salarios_revertidos
reconstructed_df['Salario'] = reconstructed_df['Salario'].astype(float).round(0)

In [None]:
# Preview the reconstructed synthetic data
reconstructed_df.head()

In [None]:
# Relative frequency of each category in the column at position 2 of the synthetic data
reconstructed_df.iloc[:,2].value_counts(normalize=True).round(2)

In [None]:
# Relative frequency of each category in the column at position 3 of the original data
# NOTE(review): the column is picked by position; confirm it is the variable meant to be compared with the synthetic data
data.iloc[:,3].value_counts(normalize=True).round(2)

In [None]:
# Rows and columns of the reconstructed synthetic data
reconstructed_df.shape

In [132]:
#subir_csv_S3(df, ruta_s3, nombre_archivo)
# Write the synthetic population to a CSV file in the working directory (the row index is written as well)
reconstructed_df.to_csv('PoblacionSinteticaGenerada.csv',
                   sep=',',
                   decimal='.')