In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score


In [2]:
# Locations of the two input tables: per-sample disease labels and per-sample PRS values.
labels_csv_path = "E://table//PRS/all_labels.csv"
prs_csv_path = "E://table//PRS/all_prs.csv"

# Read each CSV into its own DataFrame; both are joined on the sample ID later on.
data_labels = pd.read_csv(labels_csv_path)
data_prs = pd.read_csv(prs_csv_path)


In [None]:
# Data Preparation
class PRSDataset(Dataset):
    """Map-style dataset that pairs each sample's feature row with its label row.

    Args:
        prs_data: Row-aligned feature table (e.g. a pandas DataFrame); row ``i``
            holds the features of sample ``i`` (positional, not index-label based).
        labels: Label table aligned row-for-row with ``prs_data``. May be 2-D
            (one column per task) or 1-D (a single task).

    Raises:
        ValueError: If ``prs_data`` and ``labels`` have different lengths.

    Note:
        Both inputs are converted to ``float32`` tensors once at construction, so
        later mutations of the original objects are not reflected in the items
        returned by ``__getitem__``.
    """

    def __init__(self, prs_data, labels):
        if len(prs_data) != len(labels):
            raise ValueError(
                f"prs_data and labels must have the same number of rows "
                f"(got {len(prs_data)} and {len(labels)})"
            )
        # Keep the original objects available under their historical attribute names.
        self.prs_data = prs_data
        self.labels = labels
        # Convert once up front instead of slicing pandas and allocating a new
        # tensor on every __getitem__ call (which runs once per sample per epoch).
        self._features = torch.tensor(np.asarray(prs_data), dtype=torch.float32)
        self._targets = torch.tensor(np.asarray(labels), dtype=torch.float32)

    def __len__(self):
        return len(self.prs_data)

    def __getitem__(self, idx):
        """Return ``(features, labels)`` for the sample at position ``idx`` as float32 tensors."""
        return self._features[idx], self._targets[idx]

In [None]:
# Preprocessing
# Merge the PRS table with the label table on the sample ID. This is an inner
# join, so only IDs present in both tables are kept.
merged_data = data_prs.merge(data_labels, left_on='FID', right_on='eid')
prs_data = merged_data.iloc[:, 1:81]  # PRS feature columns (positions 1-80, right after the ID column)
# NOTE(review): position 81 is skipped — presumably the 'eid' key column from the
# labels table; confirm against the actual CSV layout.
disease_labels = merged_data.iloc[:, 82:]  # Label columns (everything from position 82 on)

# Split into train and test sets BEFORE scaling so that no test-set statistics
# leak into the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(prs_data, disease_labels, test_size=0.2, random_state=42)

# Standardize the PRS features: fit mean/std on the training split only, then
# apply the same transform to the test split. Results are wrapped back into
# DataFrames (same index/columns) and `data_prs` itself is left unmodified, so
# re-running this cell is safe.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

# Wrap both splits in datasets and batch them; only the training data is shuffled.
dataset_train = PRSDataset(X_train, y_train)
dataset_test = PRSDataset(X_test, y_test)

train_loader = DataLoader(dataset_train, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset_test, batch_size=64, shuffle=False)


In [None]:
# Define Attention Mechanism
class Attention(nn.Module):
    """Attention pooling: learn one scalar score per position and pool over dim 1.

    A single linear layer maps each ``input_dim``-sized vector to a scalar score;
    the scores are softmax-normalised along dim 1 and used to form a weighted sum
    of the input vectors along that same axis.

    Args:
        input_dim: Size of the last dimension of the input.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.attention_weights = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Pool ``x`` along dim 1.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_dim)``. A 2-D tensor of
                shape ``(batch, input_dim)`` is also accepted and treated as a
                batch of length-1 sequences.

        Returns:
            Tuple ``(context, weights)`` where ``context`` has shape
            ``(batch, input_dim)`` and ``weights`` has shape ``(batch, seq_len, 1)``
            and sums to 1 along dim 1.
        """
        if x.dim() == 2:
            # Without a sequence axis, softmax(dim=1) would run over the size-1
            # score axis and the sum below would collapse the feature axis,
            # yielding shape (batch,). Insert a length-1 sequence axis instead so
            # the pooled output keeps its feature dimension.
            x = x.unsqueeze(1)
        scores = self.attention_weights(x)        # (batch, seq_len, 1)
        weights = torch.softmax(scores, dim=1)    # normalise across positions
        context = torch.sum(weights * x, dim=1)   # (batch, input_dim)
        return context, weights



In [None]:
# Define Multi-Task Model with Attention Mechanism
class MultiTaskAttentionModel(nn.Module):
    """Shared MLP encoder, attention pooling, and one sigmoid head per task.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the two encoder layers and of the pooled representation.
        num_tasks: Number of independent binary outputs (one linear head each).
    """

    def __init__(self, input_dim, hidden_dim, num_tasks):
        super().__init__()
        self.shared_encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        )
        self.attention = Attention(hidden_dim)
        self.task_heads = nn.ModuleList([nn.Linear(hidden_dim, 1) for _ in range(num_tasks)])

    def forward(self, x):
        """Run the model on a batch.

        Args:
            x: Tensor whose last dimension is ``input_dim``; in this notebook it
                is ``(batch, input_dim)``.

        Returns:
            Tuple ``(task_outputs, attention_weights)``: ``task_outputs`` is a list
            with one ``(batch, 1)`` tensor of sigmoid probabilities per task, and
            ``attention_weights`` is whatever the attention layer returns.
        """
        shared_features = self.shared_encoder(x)
        if shared_features.dim() == 2:
            # The attention layer pools over dim 1. Feeding it a 2-D
            # (batch, hidden_dim) tensor makes it sum away the feature axis and
            # return shape (batch,), which the Linear(hidden_dim, 1) heads then
            # reject. Add an explicit length-1 sequence axis so pooling returns
            # (batch, hidden_dim).
            # NOTE: with a single encoded vector per sample the softmax runs over
            # one position, so every attention weight is 1.0 and the pooled vector
            # equals the encoder output.
            shared_features = shared_features.unsqueeze(1)
        attended_features, attention_weights = self.attention(shared_features)
        task_outputs = [torch.sigmoid(head(attended_features)) for head in self.task_heads]
        return task_outputs, attention_weights


In [None]:
# Initialize model, loss, and optimizer
# Seed torch's global RNG so weight initialisation (and any shuffling that draws
# from it) is repeatable across "Restart & Run All"; 42 matches the random_state
# used for the train/test split.
torch.manual_seed(42)

input_dim = prs_data.shape[1]   # number of PRS feature columns
hidden_dim = 128                # width of the shared encoder layers
num_tasks = y_train.shape[1]    # one binary task per label column

model = MultiTaskAttentionModel(input_dim, hidden_dim, num_tasks)
criterion = nn.BCELoss()  # Binary Cross Entropy Loss for multi-label classification (model outputs are already sigmoid probabilities)
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [None]:
# Training Loop
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    """Train a multi-task model and print the mean batch loss after every epoch.

    Args:
        model: Module whose forward returns ``(task_outputs, extra)``, where
            ``task_outputs`` is a sequence with one prediction tensor per task.
        train_loader: Sized iterable of ``(inputs, labels)`` batches; ``labels``
            is indexed as ``labels[:, task]``, i.e. one column per task.
        criterion: Loss applied per task to ``(predictions, targets)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of full passes over ``train_loader``.

    Raises:
        ValueError: If ``train_loader`` contains no batches.
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        # Fail early with a clear message instead of a ZeroDivisionError when
        # the epoch average is computed.
        raise ValueError("train_loader is empty; nothing to train on")

    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs, _ = model(inputs)
            # Take the task count from the model output rather than a notebook
            # global. reshape(-1) flattens (batch, 1) to (batch,) and, unlike a
            # bare squeeze(), keeps the batch axis when the final batch holds a
            # single sample, so prediction and target shapes always match.
            task_losses = [
                criterion(task_output.reshape(-1), labels[:, task_idx])
                for task_idx, task_output in enumerate(outputs)
            ]
            loss = torch.stack(task_losses).mean()  # unweighted average over tasks
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / num_batches:.4f}")



In [None]:
# Evaluation Function
def evaluate_model(model, test_loader):
    """Compute the per-task ROC AUC of ``model`` on ``test_loader`` and print the average.

    Args:
        model: Module whose forward returns ``(task_outputs, extra)``, where
            ``task_outputs`` is a list of ``(batch, 1)`` prediction tensors.
        test_loader: Iterable of ``(inputs, labels)`` batches with one label
            column per task.

    Returns:
        List with one AUC per task. A task whose labels contain fewer than two
        distinct values has no defined ROC AUC; it is reported as ``nan`` and
        left out of the printed average.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    model.eval()
    batch_preds = []
    batch_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs, _ = model(inputs)
            batch_preds.append(torch.cat(outputs, dim=1))  # (batch, num_tasks)
            batch_labels.append(labels)
    if not batch_preds:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    # .cpu() is a no-op for CPU tensors and makes .numpy() safe for GPU tensors.
    all_preds = torch.cat(batch_preds).cpu().numpy()
    all_labels = torch.cat(batch_labels).cpu().numpy()

    auc_scores = []
    # The task count comes from the predictions themselves, not a notebook global.
    for task_idx in range(all_preds.shape[1]):
        task_labels = all_labels[:, task_idx]
        if np.unique(task_labels).size < 2:
            # ROC AUC needs both classes; record NaN instead of letting one
            # degenerate task abort the whole evaluation.
            print(f"Task {task_idx}: only one class present in labels; AUC-ROC undefined (recorded as NaN)")
            auc_scores.append(float("nan"))
            continue
        auc_scores.append(roc_auc_score(task_labels, all_preds[:, task_idx]))

    defined_scores = [score for score in auc_scores if not np.isnan(score)]
    average_auc = np.mean(defined_scores) if defined_scores else float("nan")
    print(f"Average AUC-ROC: {average_auc:.4f}")
    return auc_scores


In [None]:
# Run the experiment: fit on the training batches, then score the held-out split.
epochs = 20
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=epochs,
)
# Left as the final expression so the notebook displays the per-task AUC list.
evaluate_model(model=model, test_loader=test_loader)

# Follow-up analysis (not implemented in this notebook)
# 1. Apply SHAP or another interpretability tool to the trained model to estimate feature importance.
# 2. Visualize the attention weights returned by the model to see what was most attended for the different disease tasks.

