In [1]:
import torch
import torch.nn as nn
import numpy as np

# Define the Autoencoder Model
class TabularAutoencoder(nn.Module):
    """Autoencoder for tabular rows mixing continuous and categorical columns.

    Each categorical column is mapped through its own ``nn.Embedding``; the
    embeddings are concatenated with the continuous features, encoded to a
    ``latent_dim`` vector, and decoded back to the same width. The decoded
    vector is then split into a continuous part and one slice per categorical
    column, each of which goes through its own linear output head.

    Args:
        num_continuous: Number of continuous input columns.
        num_categorical: Number of categorical columns. Accepted for interface
            compatibility; the layers are built from ``embedding_sizes``.
        embedding_sizes: Sequence of ``(num_categories, embedding_dim)`` pairs,
            one per categorical column, in column order.
        latent_dim: Width of the latent representation.
    """

    def __init__(self, num_continuous, num_categorical, embedding_sizes, latent_dim):
        super().__init__()

        # Remembered so forward() can split the decoder output without
        # needing ``x_continuous`` (e.g. when decoding from a given latent).
        self.num_continuous = num_continuous

        # One embedding table per categorical column.
        self.embeddings = nn.ModuleList(
            [nn.Embedding(categories, size) for categories, size in embedding_sizes]
        )
        embed_dim = sum(size for _, size in embedding_sizes)

        # Encoder input = continuous features + all concatenated embeddings.
        input_size = num_continuous + embed_dim

        self.encoder = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, latent_dim),
        )

        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, input_size),
        )

        # Output heads: continuous values, and per-column logits over the
        # categories of each categorical variable.
        self.output_continuous = nn.Linear(num_continuous, num_continuous)
        self.output_categorical = nn.ModuleList(
            [nn.Linear(size, categories) for categories, size in embedding_sizes]
        )

    def forward(self, x_continuous, x_categorical=None, latent=None):
        """Encode (unless ``latent`` is given) and decode a batch.

        Args:
            x_continuous: Continuous features, one row per sample. May be
                ``None`` when ``latent`` is supplied.
            x_categorical: Integer category indices, one column per
                categorical variable; column ``i`` is looked up in embedding
                ``i``, so its values must lie in ``[0, num_categories_i)``.
                May be ``None`` when ``latent`` is supplied.
            latent: Optional precomputed latent batch. When given, the encoder
                is skipped and only decoding is performed.

        Returns:
            Tuple ``(latent, reconstructed_continuous, reconstructed_categorical)``
            where the last item is a list with one tensor per categorical
            column.

        Raises:
            ValueError: If ``latent`` is ``None`` and a required input is
                missing.
        """
        if latent is None:
            if x_continuous is None:
                raise ValueError("x_continuous is required when latent is not provided")
            if len(self.embeddings) > 0 and x_categorical is None:
                raise ValueError("x_categorical is required when latent is not provided")

            # Continuous features first, then each column's embedding, in
            # column order. A single cat also works when there are no
            # categorical columns at all.
            parts = [x_continuous]
            parts.extend(
                embedding(x_categorical[:, i])
                for i, embedding in enumerate(self.embeddings)
            )
            latent = self.encoder(torch.cat(parts, dim=1))

        decoded = self.decoder(latent)

        # Split the decoded vector using the layout fixed at construction
        # time, so this also works when only ``latent`` was passed in.
        start_idx = self.num_continuous
        reconstructed_continuous = self.output_continuous(decoded[:, :start_idx])

        reconstructed_categorical = []
        for embedding, output_layer in zip(self.embeddings, self.output_categorical):
            end_idx = start_idx + embedding.embedding_dim
            reconstructed_categorical.append(output_layer(decoded[:, start_idx:end_idx]))
            start_idx = end_idx

        return latent, reconstructed_continuous, reconstructed_categorical

# Example usage
if __name__ == "__main__":
    # Define the dataset structure
    num_continuous = 5
    embedding_sizes = [(10, 4), (15, 6), (8, 3)]  # (num_categories, embedding_dim)
    num_categorical = len(embedding_sizes)
    latent_dim = 16

    # Create random example data
    batch_size = 32
    x_continuous = torch.rand(batch_size, num_continuous)
    # Sample each categorical column from its own range [0, num_categories).
    # Using one shared upper bound (e.g. 10) for every column produces indices
    # that are out of range for the smaller embedding tables and makes
    # nn.Embedding raise "IndexError: index out of range in self".
    x_categorical = torch.stack(
        [
            torch.randint(0, num_categories, (batch_size,))
            for num_categories, _ in embedding_sizes
        ],
        dim=1,
    )

    # Initialize and run the model
    model = TabularAutoencoder(num_continuous, num_categorical, embedding_sizes, latent_dim)
    latent, _, _ = model(x_continuous, x_categorical)

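IndexError: index out of range in self  (raised by nn.Embedding when an input index falls outside [0, num_embeddings) for that embedding table)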