In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from dataclasses import dataclass
from datetime import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from torch.utils.tensorboard import SummaryWriter

torch.manual_seed(42)
np.random.seed(42)

In [2]:
# Set device and hyperparameters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
hidden_dim = 512
latent_dim = 10  # Adjust based on desired complexity
num_epochs = 10
learning_rate = 1e-3
weight_decay = 1e-5

In [3]:
def load_adult_data(filepath):
    """
    Loads the Adult Census dataset from a CSV file.

    Args:
        filepath (str): Path to the CSV file.

    Returns:
        pd.DataFrame: Loaded dataset.
    """
    adult_data = pd.read_csv(
        filepath,
        na_values='?',
    )
    # Rename 'class' column to 'income'
    adult_data.rename(columns={'class': 'income'}, inplace=True)
    return adult_data


In [4]:
def split_data(adult_data):
    """
    Splits the data into training and test sets.

    Args:
        adult_data (pd.DataFrame): The raw dataset.

    Returns:
        pd.DataFrame, pd.DataFrame, pd.Series, pd.Series: Training and test features and labels.
    """
    # Drop rows with missing values
    adult_data.dropna(inplace=True)

    # Encode 'income' column using LabelEncoder
    label_encoder_income = LabelEncoder()
    adult_data['income_encoded'] = label_encoder_income.fit_transform(adult_data['income'])
    num_classes = len(label_encoder_income.classes_)

    # Features and labels
    X = adult_data.drop(columns=['income', 'income_encoded'])  # Drop 'income' from features
    y = adult_data['income_encoded']  # Use encoded 'income' as labels

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        stratify=y,
        random_state=42
    )

    return X_train, X_test, y_train, y_test, num_classes, label_encoder_income


In [5]:
def create_preprocessing_pipeline(X_train):
    """
    Creates preprocessing pipelines for numerical and categorical data.

    Args:
        X_train (pd.DataFrame): Training features.

    Returns:
        ColumnTransformer: Preprocessing pipeline.
    """
    # Identify numerical and categorical columns
    numerical_columns = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_columns = X_train.select_dtypes(include=['object']).columns.tolist()

    # Remove 'fnlwgt' if desired (it's a weighting factor that may not be useful)
    if 'fnlwgt' in numerical_columns:
        numerical_columns.remove('fnlwgt')

    # Define numerical and categorical pipelines
    numerical_pipeline = Pipeline(steps=[
        ('scaler', MinMaxScaler())
    ])

    categorical_pipeline = Pipeline(steps=[
        ('onehot', OneHotEncoder())
    ])

    # Combine pipelines using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_pipeline, numerical_columns),
            ('cat', categorical_pipeline, categorical_columns)
        ]
    )

    return preprocessor, numerical_columns, categorical_columns

In [6]:
class AdultDataset(Dataset):
    def __init__(self, features, income_labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.income_labels = torch.tensor(income_labels.values, dtype=torch.long)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.income_labels[idx]

In [7]:
@dataclass
class VAEOutput:
    """
    Dataclass for VAE output.
    """
    z_sample: torch.Tensor
    x_recon: torch.Tensor
    loss: torch.Tensor
    loss_recon: torch.Tensor
    loss_kl: torch.Tensor

class VAE(nn.Module):
    """
    Variational Autoencoder (VAE) class.
    """

    def __init__(self, input_dim, hidden_dim, latent_dim, num_classes):
        super(VAE, self).__init__()
        self.num_classes = num_classes

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim + num_classes, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, hidden_dim // 4),
            nn.ReLU(),
            nn.Linear(hidden_dim // 4, 2 * latent_dim),  # Mean and log variance
        )

        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim + num_classes, hidden_dim // 4),
            nn.ReLU(),
            nn.Linear(hidden_dim // 4, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid(),
        )

    def encode(self, x, y, eps: float = 1e-8):
        y_onehot = F.one_hot(y, num_classes=self.num_classes).float()
        x = torch.cat([x, y_onehot], dim=-1)
        h = self.encoder(x)
        mu, logvar = torch.chunk(h, 2, dim=-1)
        std = torch.exp(0.5 * logvar) + eps
        return mu, std

    def reparameterize(self, mu, std):
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z, y):
        y_onehot = F.one_hot(y, num_classes=self.num_classes).float()
        z = torch.cat([z, y_onehot], dim=-1)
        return self.decoder(z)

    def forward(self, x, y, compute_loss: bool = True):
        mu, std = self.encode(x, y)
        z = self.reparameterize(mu, std)
        recon_x = self.decode(z, y)

        if not compute_loss:
            return VAEOutput(
                z_sample=z,
                x_recon=recon_x,
                loss=None,
                loss_recon=None,
                loss_kl=None,
            )

        # Reconstruction loss
        loss_recon = F.mse_loss(recon_x, x, reduction='mean')

        # KL divergence
        loss_kl = -0.5 * torch.mean(1 + torch.log(std.pow(2)) - mu.pow(2) - std.pow(2))

        loss = loss_recon + loss_kl

        return VAEOutput(
            z_sample=z,
            x_recon=recon_x,
            loss=loss,
            loss_recon=loss_recon,
            loss_kl=loss_kl,
        )


In [8]:
def train(model, dataloader, optimizer, epoch, device, writer):
    model.train()
    train_loss = 0

    for batch_idx, (data, target) in enumerate(tqdm(dataloader, desc=f"Training Epoch {epoch+1}")):
        data = data.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        output = model(data, target)
        loss = output.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # Log training loss every log_interval batches
        log_interval = 100
        if batch_idx % log_interval == 0:
            step = epoch * len(dataloader) + batch_idx
            writer.add_scalar('Loss/Train', loss.item(), step)
            writer.add_scalar('Loss/Train_Reconstruction', output.loss_recon.item(), step)
            writer.add_scalar('Loss/Train_KL', output.loss_kl.item(), step)

    average_loss = train_loss / len(dataloader)
    print(f"====> Epoch: {epoch+1} Average training loss: {average_loss:.4f}")
    writer.add_scalar('Loss/Train_Epoch', average_loss, epoch + 1)

def test(model, dataloader, epoch, device, writer):
    model.eval()
    test_loss = 0
    loss_recon_total = 0
    loss_kl_total = 0

    with torch.no_grad():
        for data, target in tqdm(dataloader, desc='Testing'):
            data = data.to(device)
            target = target.to(device)
            output = model(data, target)
            test_loss += output.loss.item()
            loss_recon_total += output.loss_recon.item()
            loss_kl_total += output.loss_kl.item()

    average_loss = test_loss / len(dataloader)
    average_loss_recon = loss_recon_total / len(dataloader)
    average_loss_kl = loss_kl_total / len(dataloader)

    print(f"====> Test set loss: {average_loss:.4f}")
    writer.add_scalar('Loss/Test', average_loss, epoch + 1)
    writer.add_scalar('Loss/Test_Reconstruction', average_loss_recon, epoch + 1)
    writer.add_scalar('Loss/Test_KL', average_loss_kl, epoch + 1)


In [9]:
def generate_samples(model, num_samples, desired_income, device, preprocessor, numerical_columns, categorical_columns):
    model.eval()
    with torch.no_grad():
        z = torch.randn(num_samples, model.encoder[-1].out_features // 2).to(device)
        y = torch.full((num_samples,), desired_income, dtype=torch.long, device=device)
        samples = model.decode(z, y)
    samples_np = samples.cpu().numpy()

    # Inverse transform the data
    # Get the number of numerical features
    num_numerical = len(numerical_columns)
    num_categorical = samples_np.shape[1] - num_numerical

    # Inverse transform numerical data
    numerical_data = samples_np[:, :num_numerical]
    numerical_data_inv = preprocessor.named_transformers_['num'].inverse_transform(numerical_data)

    # Inverse transform categorical data
    categorical_data = samples_np[:, num_numerical:]
    categorical_data_inv = preprocessor.named_transformers_['cat'].inverse_transform(categorical_data)

    # Combine numerical and categorical data
    samples_df_num = pd.DataFrame(numerical_data_inv, columns=numerical_columns)
    samples_df_cat = pd.DataFrame(categorical_data_inv, columns=categorical_columns)
    final_samples_df = pd.concat([samples_df_num, samples_df_cat], axis=1)

    return final_samples_df

def log_generated_samples(model, device, writer, epoch, preprocessor, numerical_columns, categorical_columns, label_encoder_income):
    num_samples = 5
    desired_income = 1  # '>50K'

    final_samples_df = generate_samples(
        model, num_samples, desired_income, device, preprocessor, numerical_columns, categorical_columns
    )

    # Log the generated samples as text to TensorBoard
    for i in range(num_samples):
        sample = final_samples_df.iloc[i]
        sample_text = sample.to_string()
        writer.add_text(f'Generated Samples/Epoch_{epoch+1}_Sample_{i+1}', sample_text, epoch + 1)

In [10]:
import os

# Check if 'adult.csv' exists
if not os.path.exists('adult.csv'):
    # Download the 'adult' census dataset
    adult = fetch_openml(name='adult', version=2, as_frame=True)
    df = adult.frame
    df.to_csv('adult.csv', index=False)

In [11]:
# Load the data
adult_data = load_adult_data('adult.csv')

# Split data
X_train_raw, X_test_raw, y_train, y_test, num_income_classes, label_encoder_income = split_data(adult_data)


In [12]:
# Create preprocessing pipeline
preprocessor, numerical_columns, categorical_columns = create_preprocessing_pipeline(X_train_raw)

# Fit the preprocessor on training data
preprocessor.fit(X_train_raw)

# Transform the data
X_train = preprocessor.transform(X_train_raw).todense()
X_test = preprocessor.transform(X_test_raw).todense()

In [13]:
# Create datasets
train_dataset = AdultDataset(X_train, y_train)
test_dataset = AdultDataset(X_test, y_test)

In [14]:
# Create data loaders
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [15]:
# Initialize model and optimizer
model = VAE(input_dim=X_train.shape[1], hidden_dim=hidden_dim, latent_dim=latent_dim, num_classes=num_income_classes).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

writer = SummaryWriter(log_dir='runs/conditional_vae_adult')

# Training loop
for epoch in range(num_epochs):
    train(model, train_loader, optimizer, epoch, device, writer)
    test(model, test_loader, epoch, device, writer)
    log_generated_samples(
        model, device, writer, epoch, preprocessor, numerical_columns, categorical_columns, label_encoder_income
    )
    
writer.close()

Training Epoch 1:   0%|          | 0/283 [00:00<?, ?it/s]

====> Epoch: 1 Average training loss: 0.0487


Testing:   0%|          | 0/71 [00:00<?, ?it/s]

====> Test set loss: 0.0420


Training Epoch 2:   0%|          | 0/283 [00:00<?, ?it/s]

====> Epoch: 2 Average training loss: 0.0419


Testing:   0%|          | 0/71 [00:00<?, ?it/s]

====> Test set loss: 0.0417


Training Epoch 3:   0%|          | 0/283 [00:00<?, ?it/s]

====> Epoch: 3 Average training loss: 0.0418


Testing:   0%|          | 0/71 [00:00<?, ?it/s]

====> Test set loss: 0.0416


Training Epoch 4:   0%|          | 0/283 [00:00<?, ?it/s]

====> Epoch: 4 Average training loss: 0.0418


Testing:   0%|          | 0/71 [00:00<?, ?it/s]

====> Test set loss: 0.0417


Training Epoch 5:   0%|          | 0/283 [00:00<?, ?it/s]

====> Epoch: 5 Average training loss: 0.0418


Testing:   0%|          | 0/71 [00:00<?, ?it/s]

====> Test set loss: 0.0417


Training Epoch 6:   0%|          | 0/283 [00:00<?, ?it/s]

====> Epoch: 6 Average training loss: 0.0418


Testing:   0%|          | 0/71 [00:00<?, ?it/s]

====> Test set loss: 0.0417


Training Epoch 7:   0%|          | 0/283 [00:00<?, ?it/s]

====> Epoch: 7 Average training loss: 0.0418


Testing:   0%|          | 0/71 [00:00<?, ?it/s]

====> Test set loss: 0.0418


Training Epoch 8:   0%|          | 0/283 [00:00<?, ?it/s]

====> Epoch: 8 Average training loss: 0.0418


Testing:   0%|          | 0/71 [00:00<?, ?it/s]

====> Test set loss: 0.0416


Training Epoch 9:   0%|          | 0/283 [00:00<?, ?it/s]

====> Epoch: 9 Average training loss: 0.0418


Testing:   0%|          | 0/71 [00:00<?, ?it/s]

====> Test set loss: 0.0417


Training Epoch 10:   0%|          | 0/283 [00:00<?, ?it/s]

====> Epoch: 10 Average training loss: 0.0418


Testing:   0%|          | 0/71 [00:00<?, ?it/s]

====> Test set loss: 0.0417


In [None]:
# Generate samples conditioned on a specific income class
# For 'income', the classes are encoded as 0: '<=50K', 1: '>50K'
desired_income = 0  # Adjust based on your encoding (1 corresponds to '>50K')
num_samples = 5

final_samples_df = generate_samples(
    model, num_samples, desired_income, device, preprocessor, numerical_columns, categorical_columns
)

# Print the generated samples
print("\nGenerated Samples:")
print(final_samples_df.head())

# Decode the income label
income_class = label_encoder_income.inverse_transform([desired_income])[0]
print(f"\nSamples conditioned on income: {income_class}")



Generated Samples:
         age  education-num  capital-gain  capital-loss  hours-per-week  \
0  37.818928       9.678803    928.586304     76.391281       38.970665   
1  37.906044       9.671959    958.353760     78.512070       39.039623   
2  37.697903       9.681820    897.575623     74.269279       38.879658   
3  37.837624       9.675223    938.164673     77.087257       38.984947   
4  38.060295       9.657116   1015.300476     82.471115       39.175804   

  workclass education marital-status     occupation relationship   race   sex  \
0   Private   HS-grad  Never-married  Other-service      Husband  White  Male   
1   Private   HS-grad  Never-married  Other-service      Husband  White  Male   
2   Private   HS-grad  Never-married  Other-service      Husband  White  Male   
3   Private   HS-grad  Never-married  Other-service      Husband  White  Male   
4   Private   HS-grad  Never-married  Other-service      Husband  White  Male   

  native-country  
0  United-States  
1  U

In [None]:
# Generate samples conditioned on a specific income class
# For 'income', the classes are encoded as 0: '<=50K', 1: '>50K'
desired_income = 1  # Adjust based on your encoding (1 corresponds to '>50K')
num_samples = 5

final_samples_df = generate_samples(
    model, num_samples, desired_income, device, preprocessor, numerical_columns, categorical_columns
)

# Print the generated samples
print("\nGenerated Samples:")
print(final_samples_df.head())

# Decode the income label
income_class = label_encoder_income.inverse_transform([desired_income])[0]
print(f"\nSamples conditioned on income: {income_class}")



Generated Samples:
         age  education-num  capital-gain  capital-loss  hours-per-week  \
0  43.407150      11.547520   2860.858643    171.151993       43.669777   
1  43.009159      11.521250   2576.706299    157.152084       43.380733   
2  43.569908      11.465553   3096.794189    184.053070       43.790455   
3  43.159351      11.500388   2719.616211    164.675110       43.503033   
4  43.051792      11.539227   2583.427490    157.238907       43.412880   

  workclass  education      marital-status       occupation relationship  \
0   Private  Bachelors  Married-civ-spouse  Exec-managerial      Husband   
1   Private  Bachelors  Married-civ-spouse  Exec-managerial      Husband   
2   Private  Bachelors  Married-civ-spouse  Exec-managerial      Husband   
3   Private  Bachelors  Married-civ-spouse  Exec-managerial      Husband   
4   Private  Bachelors  Married-civ-spouse  Exec-managerial      Husband   

    race   sex native-country  
0  White  Male  United-States  
1  White