In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch

df = pd.read_csv('../data/finalEDA/data/dataset_1_39_features.csv', index_col=0)
data = df.to_numpy()

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_tensor = torch.tensor(train_data, dtype=torch.float32).to(device)
test_tensor = torch.tensor(test_data, dtype=torch.float32).to(device)

# Verify Shapes
print(f"Train Tensor Shape: {train_tensor.shape}")
print(f"Test Tensor Shape: {test_tensor.shape}")
print(f"Device Used: {device}")

Train Tensor Shape: torch.Size([1606, 38])
Test Tensor Shape: torch.Size([402, 38])
Device Used: cuda


In [3]:
import torch
import torch.nn as nn

class TransformerModel(nn.Module):
    def __init__(self, num_features, d_model=64, num_heads=4, num_layers=2, dropout=0.1):
        super(TransformerModel, self).__init__()

        self.embedding = nn.Linear(1, d_model)  # Embed each feature (column) into d_model dimensions

        self.column_embedding = nn.Embedding(num_features, d_model)
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, 
            nhead=num_heads, 
            dim_feedforward=128, 
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)

        self.output_layer = nn.Linear(d_model, 1)  # Predict one value per feature

    def forward(self, x, column_indices, mask=None):

        x = x.unsqueeze(-1)

        x_embed = self.embedding(x)

        column_embed = self.column_embedding(column_indices)
        x_embed += column_embed.unsqueeze(0)

        x_encoded = self.transformer_encoder(x_embed, mask=mask)

        output = self.output_layer(x_encoded)
        return output.squeeze(-1)

num_features = train_tensor.shape[1]
model = TransformerModel(num_features=num_features, d_model=64, num_heads=4, num_layers=2)

model = model.to(device)

column_indices = torch.arange(num_features).to(device)

output = model(train_tensor, column_indices)
print(f"Output shape: {output.shape}")

Output shape: torch.Size([1606, 38])


In [4]:
def create_missing_mask(data, missing_fraction=0.2):
    mask = torch.rand(data.shape).to(data.device) < missing_fraction
    return mask.int()

missing_fraction = 0.2
mask = create_missing_mask(train_tensor, missing_fraction)
print(f"Mask Shape: {mask.shape}")

Mask Shape: torch.Size([1606, 38])


In [5]:
def compute_loss(predictions, ground_truth, mask):
    mse_loss = nn.MSELoss(reduction='none')
    loss = mse_loss(predictions, ground_truth)
    masked_loss = (loss * mask).sum() / mask.sum()  # Normalize by number of masked positions
    return masked_loss

In [6]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    mask = create_missing_mask(train_tensor, missing_fraction)
    input_with_mask = train_tensor.clone()
    input_with_mask[mask == 1] = 0

    predictions = model(input_with_mask, column_indices)

    loss = compute_loss(predictions, train_tensor, mask)

    loss.backward()
    optimizer.step()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")


Epoch 1/20, Loss: nan
Epoch 2/20, Loss: nan
Epoch 3/20, Loss: nan
Epoch 4/20, Loss: nan
Epoch 5/20, Loss: nan
Epoch 6/20, Loss: nan
Epoch 7/20, Loss: nan
Epoch 8/20, Loss: nan
Epoch 9/20, Loss: nan
Epoch 10/20, Loss: nan
Epoch 11/20, Loss: nan
Epoch 12/20, Loss: nan
Epoch 13/20, Loss: nan
Epoch 14/20, Loss: nan
Epoch 15/20, Loss: nan
Epoch 16/20, Loss: nan
Epoch 17/20, Loss: nan
Epoch 18/20, Loss: nan
Epoch 19/20, Loss: nan
Epoch 20/20, Loss: nan


In [7]:
def evaluate_model(model, test_data, column_indices, missing_fraction=0.2):
    model.eval()

    mask = create_missing_mask(test_data, missing_fraction)
    input_with_mask = test_data.clone()
    input_with_mask[mask == 1] = 0

    with torch.no_grad():
        predictions = model(input_with_mask, column_indices)

    loss = compute_loss(predictions, test_data, mask)
    print(f"Test Loss: {loss.item():.4f}")
    return loss

evaluate_model(model, test_tensor, column_indices)


Test Loss: nan


tensor(nan, device='cuda:0')

In [8]:
import torch

def compute_nrmse(predictions, ground_truth, mask):

    masked_predictions = predictions[mask == 1]
    masked_ground_truth = ground_truth[mask == 1]

    mse = torch.mean((masked_predictions - masked_ground_truth) ** 2)
    rmse = torch.sqrt(mse)

    data_range = ground_truth.max() - ground_truth.min()

    nrmse = rmse / data_range
    return nrmse.item()

def evaluate_model_with_nrmse(model, test_data, column_indices, missing_fraction=0.2):

    model.eval()

    mask = create_missing_mask(test_data, missing_fraction)
    input_with_mask = test_data.clone()
    input_with_mask[mask == 1] = 0

    with torch.no_grad():
        predictions = model(input_with_mask, column_indices)

    nrmse = compute_nrmse(predictions, test_data, mask)
    print(f"NRMSE: {nrmse:.4f}")
    return nrmse

nrmse = evaluate_model_with_nrmse(model, test_tensor, column_indices)


NRMSE: nan


In [9]:
from pyampute.ampute import MultivariateAmputation
import pandas as pd
import numpy as np
import torch

def create_missing_dataset(data, missing_fraction=0.1, mechanism="MCAR"):

    if isinstance(data, np.ndarray):
        data = pd.DataFrame(data)

    patterns = [{
        "incomplete_vars": data.columns.tolist(),
        "weights": np.zeros(len(data.columns)),  # Default for MCAR
        "mechanism": mechanism,
        "score_to_probability_func": "sigmoid-right"
    }]

    if mechanism == "MAR":
        num_columns = len(data.columns)
        num_amputed_columns = max(1, int(num_columns * 0.5))
        amputed_columns = np.random.choice(data.columns, num_amputed_columns, replace=False)
        patterns[0]["incomplete_vars"] = amputed_columns
        patterns[0]["weights"] = np.random.uniform(-1, 1, num_columns) 
    
    elif mechanism == "MNAR":
        patterns[0]["weights"] = np.random.uniform(0.5, 2, len(data.columns))

    amputer = MultivariateAmputation(prop=missing_fraction, patterns=patterns)
    amputed_data = amputer.fit_transform(data)

    mask = pd.isna(amputed_data).astype(int).to_numpy()
    amputed_data = np.nan_to_num(amputed_data, nan=0.0)

    amputed_data = torch.tensor(amputed_data, dtype=torch.float32).to(device)
    mask = torch.tensor(mask, dtype=torch.int32).to(device)

    assert amputed_data.shape == mask.shape, "Data and mask shapes dont match"
    return amputed_data, mask


In [10]:
def compute_nrmse(predictions, ground_truth, mask):

    masked_predictions = predictions[mask == 1]
    masked_ground_truth = ground_truth[mask == 1]

    mse = torch.mean((masked_predictions - masked_ground_truth) ** 2)
    rmse = torch.sqrt(mse)

    data_range = ground_truth.max() - ground_truth.min()

    nrmse = rmse / data_range
    return nrmse.item()

In [11]:
def evaluate_model_with_missingness(model, original_data, column_indices, missing_fraction=0.1, mechanism="MCAR"):
    model.eval()

    amputed_data, mask = create_missing_dataset(original_data.cpu().numpy(), missing_fraction, mechanism)

    with torch.no_grad():
        predictions = model(amputed_data, column_indices)

    nrmse = compute_nrmse(predictions, original_data, mask)
    print(f"{mechanism} NRMSE at {missing_fraction * 100:.0f}% Missing: {nrmse:.4f}")
    return nrmse


In [12]:
def evaluate_across_mechanisms(model, original_data, column_indices, missing_fractions):
    mechanisms = ["MCAR", "MAR", "MNAR"]
    results = {mechanism: {} for mechanism in mechanisms}
    
    for mechanism in mechanisms:
        for fraction in missing_fractions:
            results[mechanism][fraction] = evaluate_model_with_missingness(
                model, original_data, column_indices, missing_fraction=fraction, mechanism=mechanism
            )
    
    return results

In [13]:
missing_fractions = [0.1, 0.2, 0.3, 0.4, 0.5]

results = evaluate_across_mechanisms(model, test_tensor, column_indices, missing_fractions)

for mechanism, nrmse_values in results.items():
    print(f"\n{mechanism} Results:")
    for frac, nrmse in nrmse_values.items():
        print(f"  Missing Fraction {frac * 100:.0f}%: NRMSE = {nrmse:.4f}")




MCAR NRMSE at 10% Missing: nan
MCAR NRMSE at 20% Missing: nan
MCAR NRMSE at 30% Missing: nan
MCAR NRMSE at 40% Missing: nan
MCAR NRMSE at 50% Missing: nan


AssertionError: Features involved in amputation must be complete, but contains NaNs.

In [None]:
model_path = "tabular_transformer_model.pth"

torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to tabular_transformer_model.pth


In [None]:
new_df = pd.read_csv('synthetic_physionet_data.csv', index_col=0)
new_data = new_df.to_numpy()

new_test_data = scaler.transform(new_data)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

new_test_tensor = torch.tensor(new_test_data, dtype=torch.float32).to(device)


new_results = evaluate_across_mechanisms(model, new_test_tensor, column_indices, missing_fractions)

for mechanism, nrmse_values in new_results.items():
    print(f"\n{mechanism} Results:")
    for frac, nrmse in nrmse_values.items():
        print(f"  Missing Fraction {frac * 100:.0f}%: NRMSE = {nrmse:.4f}")

  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


MCAR NRMSE at 10% Missing: 0.0508
MCAR NRMSE at 20% Missing: 0.0505
MCAR NRMSE at 30% Missing: 0.0482
MCAR NRMSE at 40% Missing: 0.0477
MCAR NRMSE at 50% Missing: 0.0478
MAR NRMSE at 10% Missing: 0.0431
MAR NRMSE at 20% Missing: 0.0514
MAR NRMSE at 30% Missing: 0.0496
MAR NRMSE at 40% Missing: 0.0414
MAR NRMSE at 50% Missing: 0.0590
MNAR NRMSE at 10% Missing: 0.0603
MNAR NRMSE at 20% Missing: 0.0521
MNAR NRMSE at 30% Missing: 0.0480
MNAR NRMSE at 40% Missing: 0.0466


  data_group = stats.zscore(data_group)


MNAR NRMSE at 50% Missing: 0.0470

MCAR Results:
  Missing Fraction 10%: NRMSE = 0.0508
  Missing Fraction 20%: NRMSE = 0.0505
  Missing Fraction 30%: NRMSE = 0.0482
  Missing Fraction 40%: NRMSE = 0.0477
  Missing Fraction 50%: NRMSE = 0.0478

MAR Results:
  Missing Fraction 10%: NRMSE = 0.0431
  Missing Fraction 20%: NRMSE = 0.0514
  Missing Fraction 30%: NRMSE = 0.0496
  Missing Fraction 40%: NRMSE = 0.0414
  Missing Fraction 50%: NRMSE = 0.0590

MNAR Results:
  Missing Fraction 10%: NRMSE = 0.0603
  Missing Fraction 20%: NRMSE = 0.0521
  Missing Fraction 30%: NRMSE = 0.0480
  Missing Fraction 40%: NRMSE = 0.0466
  Missing Fraction 50%: NRMSE = 0.0470
