In [1]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

class MLGANN_Dataset(Dataset):
    def __init__(self, drug_ids, target_ids, labels, adjacency_matrix, feature_matrix):
        self.drug_ids = np.clip(drug_ids, 0, feature_matrix.shape[0] - 1)  # Fix out-of-bounds IDs
        self.target_ids = np.clip(target_ids, 0, feature_matrix.shape[0] - 1)
        self.labels = labels
        self.adjacency_matrix = adjacency_matrix
        self.feature_matrix = feature_matrix

    def __len__(self):
        return len(self.drug_ids)

    def __getitem__(self, idx):
        drug_id = self.drug_ids[idx]
        target_id = self.target_ids[idx]
        label = self.labels[idx]

        # Extract drug and target features
        drug_feature = self.feature_matrix[drug_id]
        target_feature = self.feature_matrix[target_id]

        return {
            "drug_id": drug_id,
            "target_id": target_id,
            "drug_feature": drug_feature,
            "target_feature": target_feature,
            "label": label,
            "adjacency_matrix": self.adjacency_matrix  # Full adjacency matrix for GNN
        }

def generate_negative_samples(AY, num_neg_samples):
    """
    Generates negative samples by selecting (drug, target) pairs where AY == 0.
    """
    nd, nt = AY.shape
    neg_samples = []
    
    while len(neg_samples) < num_neg_samples:
        d = np.random.randint(0, nd)  # Random drug index
        t = np.random.randint(0, nt)  # Random target index
        if AY[d, t] == 0:  # Ensure it's not a positive sample
            neg_samples.append((d, t, 0))  # Label 0 for negative sample

    return neg_samples

def load_data(AM, AY, feature_matrix, batch_size=32):
    """
    Prepares dataset for MLGANN with negative sampling.
    - AD: Drug-drug similarity matrices
    - AT: Target-target similarity matrices
    - AY: Drug-target interaction matrix (binary)
    - feature_matrix: Initial node feature matrix (drugs & targets)
    """
    # Extract positive samples (confirmed interactions)
    pos_samples = list(zip(*np.where(AY == 1)))  # [(drug_id, target_id)]
    pos_samples = [(d, t, 1) for d, t in pos_samples]  # Label = 1

    # Generate negative samples (same number as positive samples)
    neg_samples = generate_negative_samples(AY, num_neg_samples=len(pos_samples))

    # Combine positive and negative samples
    all_samples = pos_samples + neg_samples
    np.random.shuffle(all_samples)  # Shuffle for randomness

    # Convert to numpy arrays
    drug_ids, target_ids, labels = zip(*all_samples)
    drug_ids, target_ids, labels = np.array(drug_ids), np.array(target_ids), np.array(labels)

    # Train-test split (2/3 training, 1/3 testing)
    drug_train, drug_test, target_train, target_test, label_train, label_test = train_test_split(
        drug_ids, target_ids, labels, test_size=1/3, random_state=42
    )

    # Create PyTorch datasets
    train_dataset = MLGANN_Dataset(drug_train, target_train, label_train, AM, feature_matrix)
    test_dataset = MLGANN_Dataset(drug_test, target_test, label_test, AM, feature_matrix)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader


In [2]:
import pandas as pd

target_labels = pd.read_csv("Datasets/raw/target_labels.csv")
targets = pd.read_csv("Datasets/raw/protein_sequences.csv")['pdb_id'].tolist()

AY = target_labels.filter(items = targets).to_numpy()
# AY

AM = pd.read_csv("Datasets/processed/AM.csv", index_col=0).to_numpy()
AM

array([[1.        , 0.08139535, 0.04615385, ..., 0.        , 0.        ,
        0.        ],
       [0.08139535, 1.        , 0.06349206, ..., 0.        , 0.        ,
        0.        ],
       [0.04615385, 0.06349206, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [3]:
nd, nt = 304, 405
feature_dim = 128
feature_matrix = np.random.rand(1722, feature_dim)  # Shape: (nd + nt, feature_dim)

print("Feature matrix shape:", feature_matrix.shape)

Feature matrix shape: (1722, 128)


In [4]:
train_loader, test_loader = load_data(AM, AY, feature_matrix, batch_size=128)

In [5]:
for batch in train_loader:
    print(batch)
    break

{'drug_id': tensor([ 39, 237, 145, 114,  94, 131, 297, 244, 274, 281, 115, 224, 282, 195,
        283,  20, 253, 221, 294, 154,  17,  11, 166, 122,  51,  80, 187,  43,
        204, 201, 281, 233, 257,  57, 284, 136, 243, 109,  82, 281, 210,  47,
          4, 282, 146,  77,  29, 209, 179, 256, 100, 175, 252, 215, 167, 278,
        178,  52, 108, 200, 132,  20, 209, 243,  72, 104, 153, 253, 121,   4,
        279, 282, 227,  14, 100,  19, 165,  30,  42, 215, 303, 221, 134,  41,
        116,  59, 165, 283, 132, 147,  40,  49, 154, 204, 222, 245, 190, 250,
        153, 282,  59, 187,   6, 204, 103, 269, 228,   7, 293, 259,  41, 241,
         82,  77, 287,  60, 216, 138, 186,   2,   8, 131, 115, 298,  80,  12,
        115, 170]), 'target_id': tensor([ 17,   2, 345,  58,  86,  61,  86, 126, 123,  10,   4, 383,  54, 120,
        196, 295,  24, 377, 122,  79,   5,   4, 177, 262,  25,  74,  72, 342,
        389,  87, 332, 308, 316, 140, 356, 143,  99, 389,  35,  26, 365, 102,
        139, 350,  

In [6]:
import torch
from torch_geometric.data import Data

def create_pyg_graph(adjacency_matrix, feature_matrix):
    edge_index = torch.nonzero(torch.tensor(adjacency_matrix)).t().contiguous()
    x = torch.tensor(feature_matrix, dtype=torch.float)

    # Ensure edges are within bounds
    max_index = x.shape[0] - 1
    edge_index = edge_index[:, (edge_index[0] <= max_index) & (edge_index[1] <= max_index)]

    return Data(x=x, edge_index=edge_index)



In [7]:
from tqdm import tqdm  # Status bar for training

def train_model(model, train_loader, graph, adjacency_matrix, optimizer, criterion, num_epochs=100, device="cpu"):
    model.to(device)
    graph = graph.to(device)
    adjacency_matrix = adjacency_matrix.to(device)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=True)

        for batch in progress_bar:
            drug_ids = batch["drug_id"].to(device)
            target_ids = batch["target_id"].to(device)
            labels = batch["label"].float().to(device)

            optimizer.zero_grad()
            outputs = model(graph, drug_ids, target_ids, adjacency_matrix)
            loss = criterion(outputs, labels)
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=total_loss / len(train_loader))

        print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {total_loss:.4f}")


In [8]:
from src.models.mlgann import MLGANN
from torch import nn

graph = create_pyg_graph(AM, feature_matrix)

# Define model, optimizer, loss function
input_dim = feature_matrix.shape[1]  # Feature size
hidden_dim = 256
output_dim = 1  # Binary classification
num_layers = 2  # Number of GCN layers

# Initialize MLGANN model
model = MLGANN(input_dim, hidden_dim, output_dim, num_layers, num_heads=4, dropout=0.2)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=0.01, weight_decay=1e-4)
criterion = nn.BCELoss()  # Binary cross-entropy loss

print("Feature Matrix Shape:", feature_matrix.shape)
# print("Max Drug ID:", max(drug_ids), "Max Target ID:", max(target_ids))
print("Graph Nodes:", graph.num_nodes)
# Train model
train_model(model, train_loader, graph, torch.tensor(AM, dtype=torch.float), optimizer, criterion, num_epochs=20)


Feature Matrix Shape: (1722, 128)
Graph Nodes: 1722


Epoch 1/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:10<00:00,  1.07it/s, loss=32.9] 


Epoch [1/20] - Loss: 361.9457


Epoch 2/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:09<00:00,  1.16it/s, loss=46.2]


Epoch [2/20] - Loss: 507.9688


Epoch 3/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:09<00:00,  1.17it/s, loss=48.4]


Epoch [3/20] - Loss: 532.6562


Epoch 4/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:09<00:00,  1.16it/s, loss=44.9]


Epoch [4/20] - Loss: 493.9293


Epoch 5/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:09<00:00,  1.16it/s, loss=47.5]


Epoch [5/20] - Loss: 522.6562


Epoch 6/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:09<00:00,  1.15it/s, loss=47.4]


Epoch [6/20] - Loss: 521.0938


Epoch 7/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:09<00:00,  1.12it/s, loss=49.8]


Epoch [7/20] - Loss: 547.5000


Epoch 8/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:10<00:00,  1.09it/s, loss=48.3]


Epoch [8/20] - Loss: 531.4048


Epoch 9/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:10<00:00,  1.01it/s, loss=53.1]


Epoch [9/20] - Loss: 584.4793


Epoch 10/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:12<00:00,  1.12s/it, loss=52.9]


Epoch [10/20] - Loss: 581.9721


Epoch 11/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:12<00:00,  1.11s/it, loss=51.4]


Epoch [11/20] - Loss: 565.8745


Epoch 12/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:14<00:00,  1.29s/it, loss=52.3]


Epoch [12/20] - Loss: 575.7812


Epoch 13/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:14<00:00,  1.31s/it, loss=52.8]


Epoch [13/20] - Loss: 581.2703


Epoch 14/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:13<00:00,  1.24s/it, loss=52.5]


Epoch [14/20] - Loss: 577.3438


Epoch 15/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:14<00:00,  1.31s/it, loss=52]  


Epoch [15/20] - Loss: 572.0312


Epoch 16/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:13<00:00,  1.22s/it, loss=52]  


Epoch [16/20] - Loss: 572.0312


Epoch 17/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:12<00:00,  1.16s/it, loss=51.9]


Epoch [17/20] - Loss: 570.4688


Epoch 18/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:13<00:00,  1.25s/it, loss=52.8]


Epoch [18/20] - Loss: 580.4688


Epoch 19/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:12<00:00,  1.10s/it, loss=53.9]


Epoch [19/20] - Loss: 592.8125


Epoch 20/20: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:12<00:00,  1.10s/it, loss=54.7]

Epoch [20/20] - Loss: 601.2500





In [12]:
def evaluate_model(model, test_loader, graph, adjacency_matrix, device="cpu"):
    model.eval()  # Set model to evaluation mode
    graph = graph.to(device)
    adjacency_matrix = adjacency_matrix.to(device)

    all_labels = []
    all_predictions = []
    all_scores = []

    progress_bar = tqdm(test_loader, desc="Evaluating", leave=True)  # Add progress bar

    with torch.no_grad():  # Disable gradient tracking
        for batch in progress_bar:
            drug_ids = batch["drug_id"].to(device)
            target_ids = batch["target_id"].to(device)
            labels = batch["label"].to(device)

            # **Forward pass with adjacency matrix**
            outputs = model(graph, drug_ids, target_ids, adjacency_matrix).squeeze()
            scores = outputs.cpu().numpy()
            predictions = (outputs > 0.5).long().cpu().numpy()  # Convert to binary (0/1)
            labels = labels.cpu().numpy()

            all_labels.extend(labels)
            all_predictions.extend(predictions)
            all_scores.extend(scores)

            # **Update progress bar with evaluation step**
            progress_bar.set_postfix(current_batch_accuracy=(predictions == labels).mean())

    # **Compute Metrics**
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions, zero_division=1)
    recall = recall_score(all_labels, all_predictions, zero_division=1)
    f1 = f1_score(all_labels, all_predictions, zero_division=1)
    auc = roc_auc_score(all_labels, all_scores)

    # **Print Final Evaluation Metrics**
    print("\nðŸš€ Model Evaluation Results:")
    print(f"âœ… Accuracy:  {accuracy:.4f}")
    print(f"âœ… Precision: {precision:.4f}")
    print(f"âœ… Recall:    {recall:.4f}")
    print(f"âœ… F1-score:  {f1:.4f}")
    print(f"âœ… AUC-ROC:   {auc:.4f}")

    return accuracy, precision, recall, f1, auc


In [13]:
evaluate_model(model, test_loader, graph, torch.tensor(AM, dtype=torch.float))

Evaluating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 6/6 [00:03<00:00,  1.70it/s, current_batch_accuracy=0.667]


ðŸš€ Model Evaluation Results:
âœ… Accuracy:  0.4737
âœ… Precision: 0.4321
âœ… Recall:    0.1064
âœ… F1-score:  0.1707
âœ… AUC-ROC:   0.4806





(0.47368421052631576,
 0.43209876543209874,
 0.10638297872340426,
 0.17073170731707316,
 0.48063628431438354)

In [11]:
print("Min Score:", min(all_scores))
print("Max Score:", max(all_scores))


NameError: name 'all_scores' is not defined