In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

# ---------------------
# 0. CONFIGURATION
# ---------------------
# Seed the global NumPy and PyTorch RNGs. Everything random further down
# (column masking, weight initialisation, DataLoader shuffling) draws from
# these generators, so a fresh "Restart Kernel -> Run All" reproduces the same
# numbers. Re-running single cells out of order will still change the results.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------------------
# 1. LOAD YOUR DATA
# ---------------------
# index_col=0: the first CSV column is used as the row index and is therefore
# not part of the feature matrix. Adjust the filename / index handling as needed.
df = pd.read_csv("../data/physionet_wo_missing.csv", index_col=0)
# df.head()

# Feature matrix as a NumPy array (the index is not included).
data_np = df.to_numpy()  # shape: (num_samples, num_features)

# Standardise every column: after fit_transform each column has mean 0 and
# standard deviation 1 (this rescales the columns, it does not make them normal).
# The fitted scaler is kept so values can be mapped back with inverse_transform.
scaler = StandardScaler()
data_scaled_np = scaler.fit_transform(data_np)

print("Shape of scaled data:", data_scaled_np.shape)

print("Mean per column after scaling:", np.mean(data_scaled_np, axis=0))
print("Std per column after scaling:", np.std(data_scaled_np, axis=0))

Shape of scaled data: (1598, 39)
Mean per column after scaling: [ 8.89290032e-18  4.44645016e-17 -7.11432026e-17 -5.78038521e-17
  0.00000000e+00 -4.44645016e-17  4.89109518e-17 -1.11161254e-17
  1.35616730e-16  1.47844468e-16  3.11251511e-17 -2.53447659e-16
 -4.89109518e-17  8.89290032e-18  8.00361029e-17 -1.05769933e-15
 -1.22277379e-15  4.22412765e-17  4.00180515e-17  1.88974132e-16
 -6.73637199e-16  6.26393666e-16 -4.40198566e-16 -1.29502861e-16
 -5.33574019e-17 -4.40198566e-16  4.53537916e-16 -2.77903135e-17
 -2.62340560e-16  3.37930212e-16  3.94622452e-17 -1.40063180e-16
 -1.26723830e-15  1.70076719e-16  1.37839955e-16  3.21256024e-16
 -2.93465711e-16 -5.33574019e-17  0.00000000e+00]
Std per column after scaling: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [7]:
# Build the float32 training tensor from the scaled NumPy array, placing it
# directly on the selected device.
data_full = torch.tensor(data_scaled_np, dtype=torch.float32, device=device)

print("Final tensor shape:", data_full.shape)

Final tensor shape: torch.Size([1598, 39])


In [8]:
# Unpack the two dimensions of the data tensor: rows (samples) and columns (features).
num_samples, num_features = tuple(data_full.size())
print("Data shape:", data_full.size())  # (N, D)

Data shape: torch.Size([1598, 39])


In [9]:
class MaskedImputationDataset(torch.utils.data.Dataset):
    """Dataset that hides a fresh random subset of columns each time a row is read.

    Args:
        data_tensor: tensor whose first dimension indexes samples and whose
            second dimension indexes features (``shape[0]`` and ``shape[1]``
            are read at construction time).
        mask_ratio: per-column probability of being masked; each column is
            masked independently when a uniform draw in [0, 1) falls below it.

    Each item is a tuple ``(masked_row, mask, row)``:
        - ``masked_row``: copy of the row with the masked columns set to 0.0
        - ``mask``: float tensor, 1.0 where the column was masked, 0.0 otherwise
        - ``row``: the original, unmasked row (indexed from the stored tensor)
    """

    def __init__(self, data_tensor, mask_ratio=0.1):
        super().__init__()
        self.data = data_tensor
        self.mask_ratio = mask_ratio
        self.num_samples = data_tensor.shape[0]
        self.num_features = data_tensor.shape[1]

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        row = self.data[idx]  # shape: (num_features,)

        # Draw the mask on the same device as the data so that the mask, the
        # masked row and the target row are always co-located (no implicit
        # CPU<->GPU indexing when the data tensor lives on the GPU).
        hidden = torch.rand(self.num_features, device=row.device) < self.mask_ratio
        mask = hidden.float()

        # masked_fill returns a new tensor, so the stored data is never modified.
        # 0.0 is used as the sentinel for masked positions.
        masked_row = row.masked_fill(hidden, 0.0)

        return masked_row, mask, row


In [10]:
class PositionalEncoding(nn.Module):
    """
    Learnable positional encoding to differentiate column 'positions'.
    For tabular data, we typically have a fixed set of column positions [0..D-1].

    Args:
        d_model: size of each positional embedding vector.
        max_len: number of distinct column positions that can be embedded.
    """
    def __init__(self, d_model, max_len):
        super().__init__()
        # A learnable embedding for each column index
        self.col_embed = nn.Embedding(max_len, d_model)

    def forward(self, x):
        """Add the per-column embedding to ``x``.

        Args:
            x: tensor of shape (batch_size, num_features, d_model).

        Returns:
            Tensor of the same shape as ``x``.

        Raises:
            IndexError: if ``x`` has more columns than ``max_len``.
        """
        _, num_features, _ = x.size()

        # Fail with a readable message instead of an out-of-range embedding lookup.
        max_len = self.col_embed.num_embeddings
        if num_features > max_len:
            raise IndexError(
                f"input has {num_features} columns but only max_len={max_len} "
                "positions can be embedded"
            )

        # The column embedding does not depend on the batch entry, so look it
        # up once -> (num_features, d_model) and let broadcasting add it to
        # every row of the batch.
        positions = torch.arange(num_features, device=x.device)
        return x + self.col_embed(positions)

class TabTransformer(nn.Module):
    """Transformer encoder that treats every column of a row as one token.

    Each scalar cell is lifted to a ``d_model``-dimensional vector, a learnable
    per-column embedding is added, the tokens are passed through a stack of
    ``nn.TransformerEncoderLayer`` blocks, and each token is projected back to
    a single scalar, giving one reconstructed value per column.
    """

    def __init__(self, num_features, d_model=64, nhead=4, num_layers=2):
        """
        num_features: number of columns (tokens)
        d_model: embedding dimension
        nhead: number of attention heads
        num_layers: how many TransformerEncoderLayers
        """
        super().__init__()
        self.num_features = num_features
        self.d_model = d_model

        # scalar -> d_model
        self.value_embedding = nn.Linear(1, d_model)

        # one learnable embedding per column position
        self.pos_encoder = PositionalEncoding(d_model, max_len=num_features)

        # encoder stack (feed-forward width 256, dropout 0.1, ReLU)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=256,
                dropout=0.1,
                activation='relu',
            ),
            num_layers=num_layers,
        )

        # d_model -> scalar (reconstruct each column's value)
        self.output_layer = nn.Linear(d_model, 1)

    def forward(self, x, mask=None):
        """
        x: tensor of shape (batch_size, num_features)
        mask: accepted for interface compatibility; it is not read by this
              implementation, so the output does not depend on it.

        Returns a tensor of shape (batch_size, num_features).
        """
        # Unpacking into exactly two names rejects inputs that are not 2-D.
        n_rows, n_cols = x.shape

        # (B, D) -> (B, D, 1) -> (B, D, d_model)
        tokens = self.value_embedding(x[:, :, None])
        tokens = self.pos_encoder(tokens)

        # The encoder is built with the default sequence-first layout, so feed
        # it (D, B, d_model) and swap the first two axes back afterwards.
        encoded = self.transformer_encoder(tokens.permute(1, 0, 2))
        per_column = self.output_layer(encoded.permute(1, 0, 2))  # (B, D, 1)

        # Drop the trailing singleton dimension -> (B, D)
        return per_column[..., 0]


In [11]:
# Create dataset and dataloader. A new random mask is drawn every time a row is
# fetched, so each epoch sees different masked positions.
train_dataset = MaskedImputationDataset(data_full, mask_ratio=0.1)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

# Instantiate the Transformer model
model = TabTransformer(num_features=num_features, d_model=64, nhead=4, num_layers=2).to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 20
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    num_batches = 0  # batches that actually contributed a loss term
    for masked_row, mask, original_row in train_loader:
        # All three tensors have shape (batch_size, num_features). Move them to
        # the model's device explicitly instead of relying on where the
        # dataset happened to create them.
        masked_row = masked_row.to(device)
        original_row = original_row.to(device)
        hidden = mask.to(device) == 1  # True where the column was masked

        # With no masked entry the MSE over an empty selection is NaN, and
        # back-propagating it would corrupt the weights; skip such a batch.
        if not hidden.any():
            continue

        # Forward
        optimizer.zero_grad()
        outputs = model(masked_row)  # shape: (batch_size, num_features)

        # Compute loss only over masked elements
        loss = criterion(outputs[hidden], original_row[hidden])

        # Backprop
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Average over the batches that were used (identical to len(train_loader)
    # whenever no batch was skipped).
    avg_loss = total_loss / max(num_batches, 1)
    print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {avg_loss:.4f}")




Epoch [1/20] - Loss: 0.9974
Epoch [2/20] - Loss: 1.0720
Epoch [3/20] - Loss: 0.9508
Epoch [4/20] - Loss: 1.0113
Epoch [5/20] - Loss: 0.8638
Epoch [6/20] - Loss: 0.9166
Epoch [7/20] - Loss: 0.9316
Epoch [8/20] - Loss: 0.9349
Epoch [9/20] - Loss: 0.9896
Epoch [10/20] - Loss: 0.8833
Epoch [11/20] - Loss: 0.8619
Epoch [12/20] - Loss: 0.8633
Epoch [13/20] - Loss: 0.8517
Epoch [14/20] - Loss: 0.6757
Epoch [15/20] - Loss: 0.8002
Epoch [16/20] - Loss: 0.6454
Epoch [17/20] - Loss: 0.7259
Epoch [18/20] - Loss: 0.5558
Epoch [19/20] - Loss: 0.6014
Epoch [20/20] - Loss: 0.5791


In [12]:
def create_test_masked_data(data_tensor, mask_ratio=0.1):
    """
    Randomly hide entries of ``data_tensor`` for evaluation.

    Every entry is masked independently when a uniform draw in [0, 1) is
    below ``mask_ratio``.

    Returns ``(masked_data, mask, original_data)`` where
    - ``masked_data`` is a new tensor with the masked entries set to 0.0,
    - ``mask`` is a float tensor with 1 where the entry was masked, 0 elsewhere,
    - ``original_data`` is the ``data_tensor`` object that was passed in.
    """
    hidden = torch.rand_like(data_tensor).lt(mask_ratio)
    # masked_fill builds a new tensor; the input is left untouched.
    masked_data = data_tensor.masked_fill(hidden, 0.0)
    return masked_data, hidden.float(), data_tensor

# There is no held-out split in this notebook: for demonstration the training
# data itself is reused as the "test" set (swap in a real 'data_test' when one
# is available).
data_test = data_full.clone()

# Hide a random 10% of the entries of the test tensor
masked_test, test_mask, original_test = create_test_masked_data(data_test, mask_ratio=0.1)

# Evaluate the model: eval mode, no gradient tracking
model.eval()
with torch.no_grad():
    predictions = model(masked_test)  # shape: (num_samples, num_features)

# Only the entries that were masked are scored
masked_indices = test_mask.eq(1)
pred_vals = predictions[masked_indices]
true_vals = original_test[masked_indices]

# RMSE over the masked entries
mse = (pred_vals - true_vals).pow(2).mean()
rmse = mse.sqrt()

# NRMSE: RMSE divided by the range (max - min) of the true masked values
val_min = true_vals.min()
val_max = true_vals.max()
norm_factor = val_max - val_min

nrmse = (rmse / norm_factor).item()

print(f"Test RMSE: {rmse.item():.4f}")
print(f"Test NRMSE: {nrmse:.4f}")


Test RMSE: 0.7900
Test NRMSE: 0.0382
