In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch

# Load Dataset
# The path is relative to the notebook's working directory; the first CSV column
# is consumed as the row index and therefore is not part of the feature matrix.
# NOTE(review): the file name suggests a table with no missing values — confirm.
df = pd.read_csv('../data/physionet_wo_missing.csv', index_col=0)

# Convert dataset to numpy array (assuming all features are numeric)
data = df.to_numpy()

# Split Dataset
# Hold out 20% of the rows for testing; random_state pins the shuffle.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Normalize Data
# Fit the scaler on the training rows only and reuse those statistics for the
# test rows, so the test split does not influence the normalization.
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

# Convert to Torch Tensors
# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Both splits are kept fully resident on `device` as float32 tensors.
train_tensor = torch.tensor(train_data, dtype=torch.float32).to(device)
test_tensor = torch.tensor(test_data, dtype=torch.float32).to(device)

# Verify Shapes
print(f"Train Tensor Shape: {train_tensor.shape}")
print(f"Test Tensor Shape: {test_tensor.shape}")
print(f"Device Used: {device}")

Train Tensor Shape: torch.Size([1278, 39])
Test Tensor Shape: torch.Size([320, 39])
Device Used: cuda


In [2]:
import torch
import torch.nn as nn

class TabularTransformer(nn.Module):
    """Transformer encoder that treats every column of a tabular row as one token.

    Each scalar feature value is projected to ``d_model`` dimensions by a single
    shared linear layer, a learned per-column embedding is added so the encoder
    can tell the columns apart, and after self-attention across the columns a
    linear head maps every token back to one scalar.

    Args:
        num_features (int): Number of columns (tokens) per row.
        d_model (int): Width of the token embeddings.
        num_heads (int): Number of attention heads per encoder layer.
        num_layers (int): Number of stacked encoder layers.
        dropout (float): Dropout probability used inside the encoder layers.
        dim_feedforward (int): Hidden width of the feed-forward sub-block of
            each encoder layer (previously fixed at 128).
    """

    def __init__(self, num_features, d_model=64, num_heads=4, num_layers=2, dropout=0.1,
                 dim_feedforward=128):
        super().__init__()
        self.num_features = num_features

        # 1) Value embedding: the same Linear(1 -> d_model) is applied to every column
        self.embedding = nn.Linear(1, d_model)

        # 2) Learned column ("positional") embedding, one vector per column index
        self.column_embedding = nn.Embedding(num_features, d_model)

        # 3) Transformer encoder stack operating on [batch, column, d_model]
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,  # inputs are [batch, seq, feature]
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)

        # 4) Output head: one predicted value per column token
        self.output_layer = nn.Linear(d_model, 1)

    def forward(self, x, column_indices=None, mask=None):
        """Reconstruct every column of every row.

        Args:
            x (Tensor): Input values, shape [batch_size, num_features].
            column_indices (Tensor, optional): Integer column ids used to look up
                the column embedding, shape [num_features]. When omitted,
                ``arange(x.shape[1])`` on ``x``'s device is used.
            mask (Tensor, optional): Attention mask forwarded unchanged as the
                ``mask`` argument of ``nn.TransformerEncoder``. PyTorch documents
                this as shape [num_features, num_features] or
                [batch_size * num_heads, num_features, num_features]; a
                [batch_size, num_features, num_features] mask is not accepted.

        Returns:
            Tensor: Predicted values, shape [batch_size, num_features].
        """
        if column_indices is None:
            column_indices = torch.arange(x.size(1), device=x.device)

        # [batch, features] -> [batch, features, 1] -> [batch, features, d_model]
        tokens = self.embedding(x.unsqueeze(-1))

        # Out-of-place add (broadcast over the batch) keeps autograd free of
        # in-place modification of the Linear output.
        tokens = tokens + self.column_embedding(column_indices).unsqueeze(0)

        encoded = self.transformer_encoder(tokens, mask=mask)

        # [batch, features, 1] -> [batch, features]
        return self.output_layer(encoded).squeeze(-1)

# Model Initialization
num_features = train_tensor.shape[1]  # Number of features (columns in dataset)
model = TabularTransformer(num_features=num_features, d_model=64, num_heads=4, num_layers=2)

# Move model to GPU if available
model = model.to(device)

# Define Column Indices (allocated directly on the target device)
column_indices = torch.arange(num_features, device=device)

# Example Forward Pass
# This is only a shape smoke test. Running it under no_grad stops autograd from
# recording a graph over the whole training set, which the `output` global would
# otherwise keep alive for the rest of the session.
with torch.no_grad():
    output = model(train_tensor, column_indices)
print(f"Output shape: {output.shape}")  # Should match [batch_size, num_features]

Output shape: torch.Size([1278, 39])


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


In [3]:
def create_missing_mask(data, missing_fraction=0.2):
    """
    Creates a random binary mask marking entries to treat as missing.

    Every entry is masked independently with probability `missing_fraction`,
    so the realized share of masked entries only approximates that value.

    Args:
        data (Tensor): Input data (shape: [batch_size, num_features]); only its
            shape and device are used.
        missing_fraction (float): Per-entry probability of being masked.

    Returns:
        Tensor: int32 mask of the same shape and device as `data`
        (1 = missing, 0 = observed).
    """
    # Sample on data's own device; drawing on the CPU and copying afterwards
    # costs a host-to-device transfer on every call.
    draws = torch.rand(data.shape, device=data.device)
    return (draws < missing_fraction).int()

# Example usage: draw one mask over the full training tensor.
# `missing_fraction` stays a notebook global and is read again by the
# training-loop cell; `mask` is overwritten there on every epoch.
missing_fraction = 0.2  # Mask 20% of the data
mask = create_missing_mask(train_tensor, missing_fraction)
print(f"Mask Shape: {mask.shape}")

Mask Shape: torch.Size([1278, 39])


In [4]:
def compute_loss(predictions, ground_truth, mask):
    """
    Computes the mean squared error over the masked positions only.

    Args:
        predictions (Tensor): Model predictions (shape: [batch_size, num_features]).
        ground_truth (Tensor): Original data (shape: [batch_size, num_features]).
        mask (Tensor): Binary mask (1 = missing, 0 = observed); integer, bool
            or float dtypes are accepted.

    Returns:
        Tensor: Scalar loss value. When no position is masked the result is 0
        (still attached to the autograd graph) instead of NaN.
    """
    per_entry = nn.functional.mse_loss(predictions, ground_truth, reduction='none')
    weights = mask.to(per_entry.dtype)

    # Clamp the count so an all-zero mask yields 0 / 1 rather than 0 / 0 = NaN,
    # which would otherwise poison the optimizer step.
    n_masked = weights.sum().clamp(min=1)
    return (per_entry * weights).sum() / n_masked

In [5]:
# Adam over every trainable parameter of the model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Full-batch training: each epoch is one gradient step on the whole training
# tensor, with a freshly drawn random mask.
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # Hide a random subset of entries by overwriting them with 0
    mask = create_missing_mask(train_tensor, missing_fraction)
    input_with_mask = train_tensor.masked_fill(mask == 1, 0)

    # Reconstruct every entry, but score only the hidden ones
    predictions = model(input_with_mask, column_indices)
    loss = compute_loss(predictions, train_tensor, mask)

    # Gradient step
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")


Epoch 1/20, Loss: 1.2102
Epoch 2/20, Loss: 1.2609
Epoch 3/20, Loss: 1.1221
Epoch 4/20, Loss: 1.0523
Epoch 5/20, Loss: 1.0262
Epoch 6/20, Loss: 0.9926
Epoch 7/20, Loss: 0.9286
Epoch 8/20, Loss: 1.0039
Epoch 9/20, Loss: 1.0599
Epoch 10/20, Loss: 1.0303
Epoch 11/20, Loss: 0.9674
Epoch 12/20, Loss: 1.0186
Epoch 13/20, Loss: 1.0311
Epoch 14/20, Loss: 0.9629
Epoch 15/20, Loss: 1.1192
Epoch 16/20, Loss: 1.0817
Epoch 17/20, Loss: 1.0180
Epoch 18/20, Loss: 0.9703
Epoch 19/20, Loss: 0.9981
Epoch 20/20, Loss: 1.1161


In [6]:
def evaluate_model(model, test_data, column_indices, missing_fraction=0.2):
    """
    Scores reconstruction of randomly hidden entries with the masked MSE.

    Switches `model` to eval mode, draws a random mask over `test_data`,
    zeroes the masked entries, runs the model without gradient tracking and
    prints the loss on the masked positions.

    Args:
        model (nn.Module): Model to evaluate (left in eval mode afterwards).
        test_data (Tensor): Complete data (shape: [batch_size, num_features]).
        column_indices (Tensor): Column indices passed through to the model.
        missing_fraction (float): Fraction of values to mask.

    Returns:
        Tensor: Scalar loss on the masked positions.
    """
    model.eval()

    # Corrupt a copy of the data: masked entries become 0
    hidden = create_missing_mask(test_data, missing_fraction)
    corrupted = test_data.masked_fill(hidden == 1, 0)

    with torch.no_grad():
        reconstructed = model(corrupted, column_indices)

    loss = compute_loss(reconstructed, test_data, hidden)
    print(f"Test Loss: {loss.item():.4f}")
    return loss

# Evaluate the Model
# Uses the default missing_fraction of 0.2. The mask is drawn randomly inside
# evaluate_model and no torch seed is set in this notebook, so the printed loss
# varies between runs. As the cell's last expression, the returned loss tensor
# is also echoed as the cell output.
evaluate_model(model, test_tensor, column_indices)


Test Loss: 1.0148


tensor(1.0148, device='cuda:0')

In [7]:
import torch

def compute_nrmse(predictions, ground_truth, mask):
    """
    Range-normalized RMSE of the predictions at the masked positions.

    The RMSE is taken over entries where `mask == 1` and divided by the
    max-minus-min spread of the entire `ground_truth` tensor (all columns,
    masked and unmasked entries alike).

    Args:
        predictions (Tensor): Model predictions (shape: [batch_size, num_features]).
        ground_truth (Tensor): Original data (shape: [batch_size, num_features]).
        mask (Tensor): Binary mask (1 = missing, 0 = observed).

    Returns:
        float: NRMSE value for the masked positions.
    """
    selected = mask == 1

    # Residuals at the imputed positions only
    residuals = predictions[selected] - ground_truth[selected]
    rmse = torch.sqrt(torch.mean(torch.square(residuals)))

    # Normalizer: global value range of the reference data
    value_span = ground_truth.max() - ground_truth.min()

    return (rmse / value_span).item()

def evaluate_model_with_nrmse(model, test_data, column_indices, missing_fraction=0.2):
    """
    Scores reconstruction of randomly hidden entries with NRMSE.

    Puts `model` into eval mode, draws a random mask over `test_data`, zeroes
    the masked entries, runs the model without gradient tracking and prints
    the NRMSE measured on the masked positions.

    Args:
        model (nn.Module): Trained model (left in eval mode afterwards).
        test_data (Tensor): Test dataset (shape: [batch_size, num_features]).
        column_indices (Tensor): Column indices for embedding (shape: [num_features]).
        missing_fraction (float): Fraction of values to mask.

    Returns:
        float: NRMSE value for the test dataset.
    """
    model.eval()

    # Corrupt a copy of the data: masked entries become 0
    hidden = create_missing_mask(test_data, missing_fraction)
    corrupted = test_data.masked_fill(hidden == 1, 0)

    with torch.no_grad():
        reconstructed = model(corrupted, column_indices)

    nrmse = compute_nrmse(reconstructed, test_data, hidden)
    print(f"NRMSE: {nrmse:.4f}")
    return nrmse

# Evaluate the Model with NRMSE
# Uses the default missing_fraction of 0.2 and a freshly drawn random mask, so
# this score is computed on a different mask than any earlier evaluation call.
# The result is kept in the notebook global `nrmse`.
nrmse = evaluate_model_with_nrmse(model, test_tensor, column_indices)


NRMSE: 0.0271


In [13]:
from pyampute.ampute import MultivariateAmputation
import pandas as pd
import numpy as np
import torch

# Utility to create missing datasets (MCAR, MAR, MNAR)
def create_missing_dataset(data, missing_fraction=0.1, mechanism="MCAR", device=None):
    """
    Introduces missingness (MCAR, MAR, MNAR) into the dataset and generates a mask.

    A single pyampute pattern is built per call:

    * MCAR: every column may go missing; all weights are zero.
    * MAR:  a random half of the columns (at least one) may go missing; the
      remaining columns get random weights in [-1, 1], while the amputed
      columns get weight 0 so missingness depends on observed values only.
    * MNAR: every column may go missing; all columns get random weights in
      [0.5, 2].

    Random choices use NumPy's global RNG (`np.random`).

    Args:
        data (numpy.ndarray or pd.DataFrame): Original dataset (numerical).
        missing_fraction (float): Forwarded as `prop` to
            `MultivariateAmputation`.
            NOTE(review): pyampute documents `prop` as the proportion of
            incomplete *rows*, not cells, so the share of missing cells may
            differ from this value — confirm against the pyampute docs.
        mechanism (str): Missingness mechanism ("MCAR", "MAR", or "MNAR").
        device (torch.device or str, optional): Device for the returned
            tensors. Defaults to CUDA when available, otherwise CPU.

    Returns:
        Tensor: float32 data with missing values replaced by 0.0.
        Tensor: int32 binary mask where 1 = missing, 0 = observed.
    """
    if isinstance(data, np.ndarray):
        data = pd.DataFrame(data)
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    num_columns = len(data.columns)

    # Baseline pattern (MCAR): all columns incomplete, zero weights
    pattern = {
        "incomplete_vars": data.columns.tolist(),
        "weights": np.zeros(num_columns),
        "mechanism": mechanism,
        "score_to_probability_func": "sigmoid-right",
    }

    if mechanism == "MAR":
        # Ampute a random 50% of the columns (at least one)
        num_amputed_columns = max(1, int(num_columns * 0.5))
        amputed_positions = np.random.choice(num_columns, num_amputed_columns, replace=False)

        # Under MAR the missingness score may depend only on columns that stay
        # observed, so the amputed columns must not contribute to it.
        weights = np.random.uniform(-1, 1, num_columns)
        weights[amputed_positions] = 0.0

        pattern["incomplete_vars"] = data.columns[amputed_positions].to_numpy()
        pattern["weights"] = weights

    elif mechanism == "MNAR":
        pattern["weights"] = np.random.uniform(0.5, 2, num_columns)  # Skewed weights

    # Apply missingness with pyampute
    amputer = MultivariateAmputation(prop=missing_fraction, patterns=[pattern])
    amputed_data = amputer.fit_transform(data)

    # Build the mask before the NaNs are overwritten; np.asarray makes this work
    # whether pyampute hands back a DataFrame or an ndarray.
    mask = np.asarray(pd.isna(amputed_data)).astype(int)
    amputed_data = np.nan_to_num(np.asarray(amputed_data, dtype=float), nan=0.0)

    # Convert to PyTorch tensors
    amputed_data = torch.tensor(amputed_data, dtype=torch.float32).to(device)
    mask = torch.tensor(mask, dtype=torch.int32).to(device)

    # Ensure shapes match
    assert amputed_data.shape == mask.shape, "Data and mask shapes do not match!"
    return amputed_data, mask


In [14]:
# Step 2: Compute NRMSE
def compute_nrmse(predictions, ground_truth, mask):
    """
    Computes the range-normalized RMSE of the imputed (masked) values.

    NOTE: this cell re-declares `compute_nrmse`; a definition with the same
    body already exists in an earlier cell, and running this cell rebinds the
    name.

    Args:
        predictions (Tensor): Model predictions (shape: [batch_size, num_features]).
        ground_truth (Tensor): Original data (shape: [batch_size, num_features]).
        mask (Tensor): Binary mask (1 = missing, 0 = observed).

    Returns:
        float: RMSE over the positions where `mask == 1`, divided by the
        max-minus-min range of the whole `ground_truth` tensor.
    """
    is_missing = mask == 1

    # Squared error restricted to the masked entries
    gap = predictions[is_missing] - ground_truth[is_missing]
    rmse = (gap ** 2).mean().sqrt()

    # Global range of the reference data serves as the normalizer
    spread = ground_truth.max() - ground_truth.min()
    return (rmse / spread).item()

In [15]:
# Evaluate the model with missing data
def evaluate_model_with_missingness(model, original_data, column_indices, missing_fraction=0.1, mechanism="MCAR"):
    """
    Measures imputation NRMSE on data amputed with a given missingness mechanism.

    The complete data is copied to host memory, amputed via
    `create_missing_dataset` (missing entries come back as 0 together with a
    mask), passed through the model in eval mode without gradient tracking,
    and scored against `original_data` at the missing positions.

    Args:
        model (nn.Module): Trained model (left in eval mode afterwards).
        original_data (Tensor): Original dataset (shape: [batch_size, num_features]).
        column_indices (Tensor): Column indices for embedding (shape: [num_features]).
        missing_fraction (float): Missingness level handed to `create_missing_dataset`.
        mechanism (str): Missingness mechanism ("MCAR", "MAR", or "MNAR").

    Returns:
        float: NRMSE value for the dataset.
    """
    model.eval()

    # The amputation helper takes a host-side NumPy array
    complete_array = original_data.cpu().numpy()
    corrupted, missing_mask = create_missing_dataset(complete_array, missing_fraction, mechanism)

    with torch.no_grad():
        imputed = model(corrupted, column_indices)

    nrmse = compute_nrmse(imputed, original_data, missing_mask)
    print(f"{mechanism} NRMSE at {missing_fraction * 100:.0f}% Missing: {nrmse:.4f}")
    return nrmse


In [16]:
# Evaluate for all mechanisms
def evaluate_across_mechanisms(model, original_data, column_indices, missing_fractions):
    """
    Runs the missingness evaluation over a grid of mechanisms and fractions.

    Mechanisms are visited in the order MCAR, MAR, MNAR; within each, the
    fractions are visited in the order given.

    Args:
        model (nn.Module): Trained model.
        original_data (Tensor): Original dataset.
        column_indices (Tensor): Column indices for embedding.
        missing_fractions (list): List of missing fractions to evaluate.

    Returns:
        dict: `{mechanism: {fraction: nrmse}}` for every combination.
    """
    return {
        mechanism: {
            fraction: evaluate_model_with_missingness(
                model, original_data, column_indices, missing_fraction=fraction, mechanism=mechanism
            )
            for fraction in missing_fractions
        }
        for mechanism in ("MCAR", "MAR", "MNAR")
    }

In [19]:
# Define missing fractions to test
missing_fractions = [0.1, 0.2, 0.3, 0.4, 0.5]

# Run evaluations for MCAR, MAR, and MNAR on the held-out (already scaled) test
# tensor. Each call prints its own line, so every score appears twice in the
# cell output: once here and once in the summary below.
results = evaluate_across_mechanisms(model, test_tensor, column_indices, missing_fractions)

# Print or log results
# Note: the loop variable `nrmse` rebinds the notebook global of the same name
# that an earlier cell assigned.
for mechanism, nrmse_values in results.items():
    print(f"\n{mechanism} Results:")
    for frac, nrmse in nrmse_values.items():
        print(f"  Missing Fraction {frac * 100:.0f}%: NRMSE = {nrmse:.4f}")




MCAR NRMSE at 10% Missing: 0.0338
MCAR NRMSE at 20% Missing: 0.0305
MCAR NRMSE at 30% Missing: 0.0262
MCAR NRMSE at 40% Missing: 0.0288
MCAR NRMSE at 50% Missing: 0.0273
MAR NRMSE at 10% Missing: 0.0299
MAR NRMSE at 20% Missing: 0.0210
MAR NRMSE at 30% Missing: 0.0338
MAR NRMSE at 40% Missing: 0.0266
MAR NRMSE at 50% Missing: 0.0268
MNAR NRMSE at 10% Missing: 0.0306




MNAR NRMSE at 20% Missing: 0.0337
MNAR NRMSE at 30% Missing: 0.0326
MNAR NRMSE at 40% Missing: 0.0303
MNAR NRMSE at 50% Missing: 0.0307

MCAR Results:
  Missing Fraction 10%: NRMSE = 0.0338
  Missing Fraction 20%: NRMSE = 0.0305
  Missing Fraction 30%: NRMSE = 0.0262
  Missing Fraction 40%: NRMSE = 0.0288
  Missing Fraction 50%: NRMSE = 0.0273

MAR Results:
  Missing Fraction 10%: NRMSE = 0.0299
  Missing Fraction 20%: NRMSE = 0.0210
  Missing Fraction 30%: NRMSE = 0.0338
  Missing Fraction 40%: NRMSE = 0.0266
  Missing Fraction 50%: NRMSE = 0.0268

MNAR Results:
  Missing Fraction 10%: NRMSE = 0.0306
  Missing Fraction 20%: NRMSE = 0.0337
  Missing Fraction 30%: NRMSE = 0.0326
  Missing Fraction 40%: NRMSE = 0.0303
  Missing Fraction 50%: NRMSE = 0.0307


In [12]:
# Path to save the model (relative to the notebook's working directory)
model_path = "tabular_transformer_model.pth"

# Save the model's state dictionary (recommended)
# Only the state dict is written, not the TabularTransformer class or its
# constructor arguments, so loading requires rebuilding the model first and then
# calling load_state_dict on it.
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to tabular_transformer_model.pth
