In [32]:
# dependencies
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Data

In [42]:
import pandas as pd
# from sklearn.preprocessing import StandardScaler

# Load Data
train_data_path = 'data/train_data.csv'
pseudo_train_data_path = 'data/combined_labeled_data.csv'
test_data_path = 'data/test_data.csv'

train_data = pd.read_csv(train_data_path)
pseudo_train_data = pd.read_csv(pseudo_train_data_path)
test_data = pd.read_csv(test_data_path)

# Drop unnecessary columns
train_data = train_data.drop(columns=['DssTime','Event'])
test_data = test_data.drop(columns=['DssTime','Event'])

# # Scale Data
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)


In [66]:
# Create the dataset class
class CancerDataset(Dataset):
    def __init__(self, data, features, label_column):
        self.data = data
        self.features = features
        self.label_column = label_column
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        inputs = torch.tensor(self.data[self.features].iloc[idx].values, dtype=torch.float32)
        label = torch.tensor(self.data[self.label_column].iloc[idx], dtype=torch.float32)
        return inputs, label

# Define feature columns
# all_columns = ['ESR1', 'PGR', 'ERBB2', 'MKI67', 'PLAU', 'ELAVL1', 'EGFR', 'BTRC', 'FBXO6', 'SHMT2', 'KRAS', 
#                'SRPK2', 'YWHAQ', 'PDHA1', 'EWSR1', 'ZDHHC17', 'ENO1', 'DBN1', 'PLK1', 'GSK3B', 'Age', 'Size', 
#                'Menopausal State', 'Radio Therapy', 'Chemotherapy', 'Hormone Therapy', 'Neoplasm Histologic Grade',
#                'Cellularity', 'Surgery-breast conserving', 'Surgery-mastectomy']
train_columns = train_data.columns
label_column = "Label"

# Create datasets and dataloaders
train_dataset = CancerDataset(train_data, train_columns, label_column)
test_dataset = CancerDataset(test_data, train_columns, label_column)  # Use same columns as during training

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Model

In [67]:
# Define the unified Transformer binary classifier
class UnifiedTransformerBinaryClassifier(nn.Module):
    def __init__(self, input_dim, transformer_dim=64, num_heads=4, num_layers=2, dropout=0.1):
        super(UnifiedTransformerBinaryClassifier, self).__init__()
        
        # Define the transformer model
        self.embedding = nn.Linear(input_dim, transformer_dim)  # Linear layer to project input to transformer dim
        
        # Transformer Encoder Layer
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=transformer_dim, nhead=num_heads, dropout=dropout),
            num_layers=num_layers
        )
        
        # Final classification layer
        self.fc = nn.Linear(transformer_dim, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        # Apply the initial embedding to get input in the transformer dimension
        x = self.embedding(x)
        
        # Add a batch dimension (needed for the transformer)
        x = x.unsqueeze(1)  # Shape: [batch_size, 1, input_dim]
        
        # Pass through the transformer encoder
        x = self.transformer_encoder(x)
        
        # Get the output from the transformer (we'll use the last output for classification)
        x = x[:, -1, :]  # Shape: [batch_size, transformer_dim]
        
        # Classification layer
        x = self.fc(x)
        
        # Sigmoid activation for binary classification
        x = self.sigmoid(x)
        
        return x

# Training

In [68]:
# Define training loop
def train_model(model, train_loader, epochs, optimizer, loss_fn, device):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader):
            # Send data to the device (GPU/CPU)
            inputs, labels = inputs.to(device), labels.to(device)
            
            # Zero the gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(inputs).squeeze(1)
            
            # Compute loss
            loss = loss_fn(outputs, labels)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader)}")

In [69]:
# Define the model, optimizer, and loss function
# torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyper parameters
transformer_dim=64
num_heads=4
num_layers=2
dropout=0.1

input_dim = len(train_columns)  # Number of input features
model = UnifiedTransformerBinaryClassifier(input_dim, transformer_dim, num_heads, num_layers, dropout).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.BCELoss()

# Train the model
epochs = 30
train_model(model, train_loader, epochs, optimizer, loss_fn, device)



Epoch 1/30, Loss: 0.8123042345046997
Epoch 2/30, Loss: 0.7097771326700847
Epoch 3/30, Loss: 0.6917547782262167
Epoch 4/30, Loss: 0.6845372398694356
Epoch 5/30, Loss: 0.6677679459253947
Epoch 6/30, Loss: 0.6779388586680094
Epoch 7/30, Loss: 0.6515095988909404
Epoch 8/30, Loss: 0.6496336698532105
Epoch 9/30, Loss: 0.610536766052246
Epoch 10/30, Loss: 0.5673736055692037
Epoch 11/30, Loss: 0.5049219210942586
Epoch 12/30, Loss: 0.4391743838787079
Epoch 13/30, Loss: 0.43664355874061583
Epoch 14/30, Loss: 0.3910779962937037
Epoch 15/30, Loss: 0.3021206135551135
Epoch 16/30, Loss: 0.20717207367221516
Epoch 17/30, Loss: 0.09863994481662909
Epoch 18/30, Loss: 0.17177195213735102
Epoch 19/30, Loss: 0.12560502427319686
Epoch 20/30, Loss: 0.30115052685141563
Epoch 21/30, Loss: 0.13317374462882678
Epoch 22/30, Loss: 0.08182375188916921
Epoch 23/30, Loss: 0.13373701988408962
Epoch 24/30, Loss: 0.05785323406259219
Epoch 25/30, Loss: 0.0222954036667943
Epoch 26/30, Loss: 0.04253771541019281
Epoch 27/30

# Evaluation

In [70]:
def evaluate_model(model, test_loader, device):
    model.eval()  # Set the model to evaluation mode
    all_preds = []
    all_labels = []
    
    with torch.no_grad():  # No need to compute gradients during evaluation
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            # Forward pass
            outputs = model(inputs).squeeze(1)  # Shape: [batch_size]
            preds = (outputs > 0.5).float()  # Binary classification (threshold 0.5)
            
            # Collect all predictions and true labels
            all_preds.extend(preds.cpu().numpy())  # Move to CPU and convert to numpy
            all_labels.extend(labels.cpu().numpy())
    
    # Convert lists to numpy arrays
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    
    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_preds)

    # Print the results
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-Score: {f1:.4f}')
    if auc is not None:
        print(f'AUC: {auc:.4f}')
    else:
        print('AUC: Not available (single class prediction)')
    
    return accuracy, precision, recall, f1, auc

In [71]:
# Evaluate the model
evaluate_model(model, test_loader, device)

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
AUC: 1.0000


(1.0, 1.0, 1.0, 1.0, 1.0)