In [16]:
import os
import logging
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from pathlib import Path
import math

#### Setup logging

In [17]:
# Configure root logging once for the notebook: INFO and above, with every
# record prefixed by timestamp, logger name and level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by the helper functions defined in later cells.
logger = logging.getLogger(__name__)

## Data Loading + Preprocessing

#### Loading the data

In [18]:
def load_data(file_path, index_col=0):
    """
    Read a CSV file into a NumPy array, logging progress and problems.

    Args:
        file_path (str or Path): Location of the CSV file.
        index_col (int, optional): Column used as the DataFrame index (and
            therefore excluded from the returned array). Defaults to 0.

    Returns:
        numpy.ndarray: The DataFrame's values as produced by ``DataFrame.to_numpy()``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the parsed DataFrame is empty.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        csv_path = Path(file_path)
        if not csv_path.exists():
            raise FileNotFoundError(f"File not found: {csv_path}")

        logger.info(f"Loading data from {csv_path}")
        frame = pd.read_csv(csv_path, index_col=index_col)

        # Refuse to continue with a file that parsed to nothing.
        if frame.empty:
            raise ValueError("The loaded DataFrame is empty")

        # Missing cells are reported but not treated as fatal.
        n_missing = frame.isnull().sum().sum()
        if n_missing > 0:
            logger.warning(f"Dataset contains {n_missing} missing values")

        logger.info(f"Loaded data shape: {frame.shape}")
        return frame.to_numpy()

    except Exception as exc:
        # Log for the notebook reader, then let the caller see the original error.
        logger.error(f"Error loading data: {str(exc)}")
        raise

#### Preparing the data

In [19]:
def prepare_data_for_training(data, test_size=0.2, random_state=42, device=None):
    """
    Produce standardized train/test tensors from a raw data array.

    The array is split with ``train_test_split``, a ``StandardScaler`` is fit
    on the training partition only and applied to both partitions, and the
    results are converted to ``float32`` tensors on ``device``.

    Args:
        data (numpy.ndarray): Input data, one row per sample.
        test_size (float, optional): Proportion held out for testing. Defaults to 0.2.
        random_state (int, optional): Seed passed to the splitter. Defaults to 42.
        device (torch.device, optional): Target device. When None, CUDA is
            used if available, otherwise CPU.

    Returns:
        tuple: (train_tensor, test_tensor, scaler) - the two PyTorch tensors
        and the fitted scaler.

    Raises:
        Exception: Any error is logged and re-raised unchanged.
    """
    try:
        # Resolve the target device lazily so callers may omit it.
        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        logger.info(f"Using device: {device}")

        train_split, test_split = train_test_split(
            data, test_size=test_size, random_state=random_state
        )
        logger.info(f"Train set shape: {train_split.shape}, Test set shape: {test_split.shape}")

        # Fit on the training partition only; the test partition reuses those statistics.
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(train_split)
        test_scaled = scaler.transform(test_split)

        # Non-finite values are only reported here, not repaired.
        if not np.isfinite(train_scaled).all():
            logger.warning("Train data contains NaN or infinite values after scaling")

        if not np.isfinite(test_scaled).all():
            logger.warning("Test data contains NaN or infinite values after scaling")

        train_tensor = torch.tensor(train_scaled, dtype=torch.float32).to(device)
        test_tensor = torch.tensor(test_scaled, dtype=torch.float32).to(device)

        logger.info(f"Train tensor shape: {train_tensor.shape}")
        logger.info(f"Test tensor shape: {test_tensor.shape}")

        return train_tensor, test_tensor, scaler

    except Exception as exc:
        logger.error(f"Error preparing data: {str(exc)}")
        raise


In [20]:
# Dataset location, relative to the notebook's working directory
file_path = os.path.join('..', 'data', 'physionet_wo_missing.csv')

# Read the CSV into a NumPy array
data = load_data(file_path)

# Prefer the GPU whenever PyTorch can see one
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Split, standardize and move both partitions to the chosen device
train_tensor, test_tensor, scaler = prepare_data_for_training(data, device=device)

# Sanity-check the resulting shapes and device
print(f"Train Tensor Shape: {train_tensor.shape}")
print(f"Test Tensor Shape: {test_tensor.shape}")
print(f"Device Used: {device}")

2025-03-29 16:18:42,605 - __main__ - INFO - Loading data from ..\data\physionet_wo_missing.csv
2025-03-29 16:18:42,613 - __main__ - INFO - Loaded data shape: (1598, 39)
2025-03-29 16:18:42,614 - __main__ - INFO - Using device: cuda
2025-03-29 16:18:42,615 - __main__ - INFO - Train set shape: (1278, 39), Test set shape: (320, 39)
2025-03-29 16:18:42,618 - __main__ - INFO - Train tensor shape: torch.Size([1278, 39])
2025-03-29 16:18:42,618 - __main__ - INFO - Test tensor shape: torch.Size([320, 39])


Train Tensor Shape: torch.Size([1278, 39])
Test Tensor Shape: torch.Size([320, 39])
Device Used: cuda


## Model Configuration

In [21]:
import torch.nn as nn

#### Positional Encoding

In [22]:
class PositionalEncoding(nn.Module):
    """
    Fixed sinusoidal positional encoding added to a sequence of embeddings.

    A table of shape [1, max_len, d_model] is precomputed once and stored as
    a buffer named ``pe`` (so it moves with the module across devices but is
    not a trainable parameter). Even columns hold sines and odd columns hold
    cosines of the position scaled by geometrically spaced frequencies.

    Args:
        d_model (int): Width of the embeddings; even and odd values are both
            supported.
        max_len (int, optional): Longest sequence length the table covers.
            Defaults to 1000.
    """
    def __init__(self, d_model, max_len=1000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the cosine block must be trimmed to d_model // 2 columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Add the positional table to ``x``.

        Args:
            x (torch.Tensor): Embeddings of shape [batch_size, seq_len, d_model].

        Returns:
            torch.Tensor: ``x`` plus the first ``seq_len`` rows of the table.

        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_len`` the table was
                built with.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            # Slicing would silently truncate the table and the addition would
            # then fail with an opaque broadcasting error.
            raise ValueError(
                f"Sequence length {seq_len} exceeds positional encoding max_len {max_len}"
            )
        return x + self.pe[:, :seq_len, :]

#### Feature-wise attention

In [23]:
class FeatureWiseAttention(nn.Module):
    """
    Single-head scaled dot-product self-attention over the feature axis.

    Each position of the input is projected to a query, a key and a value;
    every position then receives a softmax-weighted mix of all values, which
    lets the model weigh relationships between features during imputation.

    Args:
        d_model (int): Width of the input and of the three projections.
    """
    def __init__(self, d_model):
        super().__init__()
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        # Dividing the logits by sqrt(d_model) keeps them in a range where
        # softmax is not saturated.
        self.scale = math.sqrt(d_model)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Input of shape [batch_size, seq_len, d_model].

        Returns:
            tuple: (output, attn) where ``output`` is [batch_size, seq_len, d_model]
            and ``attn`` is the [batch_size, seq_len, seq_len] weight matrix
            whose rows sum to one.
        """
        queries, keys, values = self.query(x), self.key(x), self.value(x)

        # Pairwise similarity of every position with every other position.
        logits = (queries @ keys.transpose(-2, -1)) / self.scale
        weights = logits.softmax(dim=-1)

        return weights @ values, weights

#### Transformer model

In [24]:
class EnhancedTransformerModel(nn.Module):
    """
    Transformer encoder for tabular imputation that treats each feature of a
    row as one token.

    Every scalar feature value is embedded to ``d_model`` dimensions, summed
    with a learned per-column embedding and a positional encoding, passed
    through a stack of pre-norm transformer encoder layers and a final
    feature-wise attention, and projected back to one scalar per feature.
    A learned ``Linear(1, 1)`` skip path from the raw input is added to the
    projection.

    Args:
        num_features (int): Number of columns (tokens) per row.
        d_model (int, optional): Embedding width. Defaults to 128.
        num_heads (int, optional): Attention heads per encoder layer. Defaults to 8.
        num_layers (int, optional): Number of encoder layers. Defaults to 3.
        dropout (float, optional): Dropout probability used in the embedding,
            encoder layers and output head. Defaults to 0.2.
        use_layer_norm (bool, optional): Apply LayerNorm inside the feature
            embedding. Defaults to True.
        feedforward_dim (int, optional): Hidden width of each encoder layer's
            feed-forward block. Defaults to 512.
        activation (str, optional): ``'relu'`` selects ReLU; ``'gelu'`` or any
            other value selects GELU. Defaults to ``'gelu'``.
    """
    def __init__(self, num_features, d_model=128, num_heads=8, num_layers=3, dropout=0.2,
                 use_layer_norm=True, feedforward_dim=512, activation='gelu'):
        super(EnhancedTransformerModel, self).__init__()

        self.d_model = d_model
        self.num_features = num_features

        # Scalar value -> d_model embedding, optionally normalized.
        self.feature_embedding = nn.Sequential(
            nn.Linear(1, d_model),
            nn.LayerNorm(d_model) if use_layer_norm else nn.Identity(),
            nn.Dropout(dropout)
        )

        # Learned identity of each column.
        self.column_embedding = nn.Embedding(num_features, d_model)

        # PositionalEncoding's default table holds 1000 positions; widen it when
        # there are more features so that forward() also works on wider tables.
        self.positional_encoding = PositionalEncoding(d_model, max_len=max(num_features, 1000))

        # Anything other than 'relu' falls back to GELU.
        activation_fn = nn.ReLU() if activation == 'relu' else nn.GELU()

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=feedforward_dim,
            dropout=dropout,
            activation=activation_fn,
            batch_first=True,
            norm_first=True  # Apply normalization before attention & feedforward
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)

        # Final attention across features; its weights are returned to the caller.
        self.feature_attention = FeatureWiseAttention(d_model)

        # Two-stage head mapping each token back to one scalar.
        self.output_projection = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            activation_fn,
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, 1)
        )

        # Learned scalar skip path from the raw input value.
        self.direct_connection = nn.Linear(1, 1)

        self._initialize_weights()

    def _initialize_weights(self):
        """
        Apply Xavier-uniform initialization to every parameter with more than
        one dimension (weight matrices and the embedding table); biases and
        LayerNorm parameters keep their defaults.
        """
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, x, column_indices, mask=None):
        """
        Reconstruct every feature of every row.

        Args:
            x (torch.Tensor): Values of shape [batch_size, num_features]. NaN
                entries are treated as missing and replaced by 0.0 before
                embedding.
            column_indices (torch.Tensor): Integer column ids of shape
                [num_features], used to look up the column embeddings.
            mask (torch.Tensor, optional): Boolean [batch_size, num_features]
                tensor, True where a value is missing; forwarded to the
                encoder as ``src_key_padding_mask``.

        Returns:
            tuple: (output, attention_weights) with ``output`` of shape
            [batch_size, num_features] and ``attention_weights`` as returned
            by the feature-wise attention layer.
        """
        # A NaN would survive the embedding, the attention value mix (0 * NaN is
        # NaN) and the skip path, turning the whole output into NaN even when the
        # position is masked. Zero is used as the neutral fill value; for inputs
        # standardized as in this notebook it corresponds to the column mean.
        # Finite inputs pass through unchanged.
        x = torch.where(torch.isnan(x), torch.zeros_like(x), x)

        x_input = x.unsqueeze(-1)  # [batch_size, num_features, 1]

        x_embed = self.feature_embedding(x_input)  # [batch_size, num_features, d_model]

        # Column embeddings broadcast over the batch dimension.
        col_embed = self.column_embedding(column_indices).unsqueeze(0)  # [1, num_features, d_model]
        x_embed = x_embed + col_embed

        x_embed = self.positional_encoding(x_embed)

        # mask=None is the encoder's own default, so one call covers both cases.
        # NOTE(review): a row whose features are all masked leaves the encoder
        # nothing to attend to - verify behaviour for such rows.
        x_encoded = self.transformer_encoder(x_embed, src_key_padding_mask=mask)

        x_attended, attention_weights = self.feature_attention(x_encoded)

        output = self.output_projection(x_attended) + self.direct_connection(x_input)

        return output.squeeze(-1), attention_weights

    def impute(self, x, mask=None):
        """
        Fill in missing values of ``x`` with the model's predictions.

        The method runs in the module's current mode with autograd enabled;
        for deterministic inference call ``model.eval()`` and wrap the call in
        ``torch.no_grad()``, since the dropout layers are otherwise active.

        Args:
            x (torch.Tensor): Input of shape [batch_size, num_features];
                missing entries may be NaN or any placeholder value.
            mask (torch.Tensor, optional): Boolean mask, True where a value
                is missing.

        Returns:
            torch.Tensor: When ``mask`` is given, ``x`` with masked positions
            replaced by predictions and all other positions untouched. When
            ``mask`` is None, the model's prediction for every position.
        """
        column_indices = torch.arange(self.num_features, device=x.device)

        # Call the module (not .forward) so registered hooks run as usual.
        imputed_values, _ = self(x, column_indices, mask)

        if mask is None:
            return imputed_values

        # Keep observed values exactly as given; only masked slots are replaced.
        return torch.where(mask, imputed_values, x)

#### Model Instantiation

In [25]:
def create_enhanced_transformer(num_features, device, d_model=128, num_heads=8, num_layers=3,
                                dropout=0.2, use_layer_norm=True, feedforward_dim=512,
                                activation='gelu'):
    """
    Build an EnhancedTransformerModel and move it to ``device``.

    Args:
        num_features (int): Number of input columns.
        device (torch.device or str): Device the model is moved to.
        d_model (int, optional): Embedding width. Defaults to 128.
        num_heads (int, optional): Attention heads per encoder layer. Defaults to 8.
        num_layers (int, optional): Number of encoder layers. Defaults to 3.
        dropout (float, optional): Dropout probability. Defaults to 0.2.
        use_layer_norm (bool, optional): LayerNorm in the feature embedding.
            Defaults to True.
        feedforward_dim (int, optional): Encoder feed-forward width. Defaults to 512.
        activation (str, optional): Activation name passed to the model.
            Defaults to ``'gelu'``.

    Returns:
        EnhancedTransformerModel: The constructed model, on ``device``.
    """
    # The last four options used to be hard-coded here; they are now tunable
    # while keeping the same defaults, so existing calls behave as before.
    model = EnhancedTransformerModel(
        num_features=num_features,
        d_model=d_model,
        num_heads=num_heads,
        num_layers=num_layers,
        dropout=dropout,
        use_layer_norm=use_layer_norm,
        feedforward_dim=feedforward_dim,
        activation=activation
    )
    return model.to(device)

#### Model usage

In [27]:
# One token per input column, so the model size follows the training tensor's width.
num_features = train_tensor.size(1)
model = create_enhanced_transformer(num_features=num_features, device=device)

In [28]:
# Column ids 0..num_features-1, created directly on the model's device.
column_indices = torch.arange(num_features, device=device)

In [29]:
# Run one forward pass over the full training tensor and print the resulting
# shapes as a sanity check.
# NOTE(review): this runs with autograd enabled and with the module in whatever
# mode it is currently in (dropout is active in train mode). If the cell is only
# a shape check, consider model.eval() plus torch.no_grad() - first confirm that
# nothing downstream needs gradients from `output`.
output, attention_weights = model(train_tensor, column_indices)
print(f"Output shape: {output.shape}")
print(f"Attention weights shape: {attention_weights.shape}")

Output shape: torch.Size([1278, 39])
Attention weights shape: torch.Size([1278, 39, 39])
