Load Dataset

In [None]:
# --------------------------
# Import necessary libraries
# --------------------------
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import pickle
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)
from google.colab import drive
from google.colab import files

# --------------------------
# Mount Google Drive
# --------------------------
# Exposes the user's Drive under /content/drive (Colab-only API).
drive.mount('/content/drive')

# --------------------------
# Load dataset
# --------------------------
data_path = '/content/drive/MyDrive/Partially_Oversampled_Data.csv'
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the warning recorded in this cell's
# output is presumably the mixed-type DtypeWarning that this setting avoids —
# confirm on the next run.
data = pd.read_csv(data_path, low_memory=False)


Mounted at /content/drive


  data = pd.read_csv(data_path)


Feature Selection, Model Training, and Evaluation

In [None]:

# --------------------------
# Feature Selection
# --------------------------
# Feature families to feed the model. The prefixes used in this notebook are
# 'enzymes', 'substructure', 'pathways' and 'targets'; edit the list to run a
# different combination, e.g. ['enzymes', 'substructure', 'pathways', 'targets'].
selected_features = ['substructure', 'pathways', 'targets']

# Collect every column named '<prefix>_similarity_*' for the chosen families,
# grouped by prefix in the order given above.
selected_columns = [
    col
    for prefix in selected_features
    for col in data.columns
    if col.startswith(prefix + '_similarity_')
]
if not selected_columns:
    # Fail here with a clear message instead of deep inside StandardScaler.
    raise ValueError(
        f"No '<prefix>_similarity_*' columns found for prefixes {selected_features}"
    )

# --------------------------
# Data preprocessing
# --------------------------
X = data[selected_columns]
y = data['interaction']

# Split BEFORE scaling: the scaler must learn its mean/std from the training
# rows only, otherwise statistics of the test rows leak into training.
# The split itself is unchanged (same random_state, same stratification).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept only so any later cell that still references X_scaled keeps working;
# it is not used for training or evaluation below.
X_scaled = scaler.transform(X)

# NOTE(review): the file name suggests the data were oversampled before this
# split. If so, copies of the same sample could land in both train and test —
# TODO confirm how the oversampling was done.

# --------------------------
# Move data to tensors
# --------------------------
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# --------------------------
# Create DataLoader
# --------------------------
batch_size = 512
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# --------------------------
# Prepare device
# --------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ Using device: {device}")

# --------------------------
# Define DNN model
# --------------------------
class DNNModel(nn.Module):
    """Fully connected classifier.

    Each hidden layer is Linear -> ReLU -> Dropout; the final Linear layer
    emits raw logits (no softmax), as expected by nn.CrossEntropyLoss.

    Args:
        input_size: number of input features per sample.
        num_classes: number of output logits.
        hidden_sizes: widths of the hidden layers, in order. The default
            (256, 128, 64) reproduces the original fixed architecture.
        dropout: dropout probability applied after every hidden activation.
    """

    def __init__(self, input_size, num_classes, hidden_sizes=(256, 128, 64), dropout=0.1):
        super().__init__()
        layers = []
        in_features = input_size
        for width in hidden_sizes:
            layers += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(dropout)]
            in_features = width
        layers.append(nn.Linear(in_features, num_classes))
        # Same layer order as the hard-coded version, so state_dict keys
        # ('model.0.weight', ...) are unchanged for the default arguments.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for `x`; the last dimension has size num_classes."""
        return self.model(x)

# Seed torch's global RNG so weight initialisation, dropout masks and
# DataLoader shuffling are repeatable from run to run.
torch.manual_seed(42)

input_size = X_train_tensor.shape[1]
# NOTE(review): assumes the labels are integer-coded 0..num_classes-1, which
# nn.CrossEntropyLoss needs for class-index targets — TODO confirm.
num_classes = len(y.unique())
model = DNNModel(input_size, num_classes).to(device)

# --------------------------
# Define Loss, Optimizer, Scheduler
# --------------------------
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0025)
# Halve the learning rate once the validation loss has stalled for more than
# `patience` epochs. `verbose=True` was dropped: it is deprecated in recent
# PyTorch releases and not accepted by newer ones; LR changes are printed
# explicitly in the loop instead.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=10, factor=0.5)

# --------------------------
# Training Loop with Early Stopping
# --------------------------
# NOTE(review): test_loader doubles as the validation set for LR scheduling,
# early stopping and best-weights selection, so the test metrics reported
# afterwards are optimistically biased. A separate validation split would be
# needed for an unbiased estimate.
epochs = 300
early_stopping_threshold = 15
best_val_loss = float('inf')
early_stopping_counter = 0
best_state = None  # copy of the weights at the lowest validation loss so far

for epoch in range(epochs):
    # ---- one pass over the training data ----
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # ---- mean per-batch loss on the held-out split ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            val_loss += criterion(outputs, y_batch).item()

    val_loss /= len(test_loader)

    lr_before = optimizer.param_groups[0]['lr']
    scheduler.step(val_loss)
    lr_after = optimizer.param_groups[0]['lr']
    if lr_after != lr_before:
        print(f"Epoch {epoch + 1}: learning rate reduced from {lr_before:.6g} to {lr_after:.6g}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stopping_counter = 0
        # Clone the tensors: state_dict() returns references that the
        # optimizer keeps updating in place.
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
    else:
        early_stopping_counter += 1

    if early_stopping_counter >= early_stopping_threshold:
        print(f"🛑 Early stopping at epoch {epoch+1}")
        break

    if (epoch + 1) % 10 == 0:
        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch [{epoch + 1}/{epochs}] - Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")

# Without this, evaluation would use the weights from the last epoch run,
# i.e. `early_stopping_threshold` epochs past the best one.
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"Restored weights from best epoch (Val Loss: {best_val_loss:.4f})")

# --------------------------
# Evaluation
# --------------------------
# Score the held-out split in inference mode (dropout disabled, no autograd).
model.eval()
all_predictions, all_targets, all_probs = [], [], []

with torch.no_grad():
    for features, labels in test_loader:
        features = features.to(device)
        labels = labels.to(device)
        logits = model(features)
        probs = torch.softmax(logits, dim=1)
        # Predicted class = index of the highest class probability.
        predictions = probs.argmax(dim=1)

        all_probs.extend(probs.cpu().numpy())
        all_predictions.extend(predictions.cpu().numpy())
        all_targets.extend(labels.cpu().numpy())

# Support-weighted averages across classes. Note that weighted recall equals
# accuracy by construction, so those two numbers always coincide.
weighted_kwargs = {'average': 'weighted', 'zero_division': 0}
accuracy = accuracy_score(all_targets, all_predictions)
f1 = f1_score(all_targets, all_predictions, **weighted_kwargs)
precision = precision_score(all_targets, all_predictions, **weighted_kwargs)
recall = recall_score(all_targets, all_predictions, **weighted_kwargs)

print("\n Evaluation Results:")
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("F1 Score", f1),
    ("Precision", precision),
    ("Recall", recall),
):
    print(f"{metric_label}: {metric_value:.4f}")

# --------------------------
#  Save evaluation results
# --------------------------
# The pickle holds the scalar metrics plus the raw labels and per-class
# probabilities of the evaluated samples.
method_name = "DNN"
results_path = 'classification_results.pkl'
results_dict = {
    "Method": method_name,
    "Accuracy": accuracy,
    "F1": f1,
    "Precision": precision,
    "Recall": recall,
    'y_true': np.array(all_targets),
    'y_pred_prob': np.array(all_probs)
}

with open(results_path, 'wb') as out_file:
    pickle.dump(results_dict, out_file)

print(f"✅ Results exported to '{results_path}'")

# --------------------------
#  Download the file
# --------------------------
# Triggers a browser download of the pickle (Colab-only API).
files.download(results_path)


✅ Using device: cuda




Epoch [10/300] - Train Loss: 0.6389, Val Loss: 0.6272
Epoch [20/300] - Train Loss: 0.5131, Val Loss: 0.5543
Epoch [30/300] - Train Loss: 0.4487, Val Loss: 0.5362
Epoch [40/300] - Train Loss: 0.4065, Val Loss: 0.5275
Epoch [50/300] - Train Loss: 0.3819, Val Loss: 0.5488
Epoch [60/300] - Train Loss: 0.3499, Val Loss: 0.5421
Epoch [70/300] - Train Loss: 0.2171, Val Loss: 0.5105
🛑 Early stopping at epoch 79

✅ Evaluation Results:
Accuracy: 0.8810
F1 Score: 0.8791
Precision: 0.8792
Recall: 0.8810
✅ Results exported to 'classification_results.pkl'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# --------------------------
#  Upload multiple saved pickle files
# --------------------------
from google.colab import files
import pickle
import pandas as pd
from IPython.display import display

PICKLE_SUFFIX = '.pkl'
SUMMARY_COLUMNS = ["Set of Features", "ACC", "F1", "Precision", "Recall"]

uploaded = files.upload()  # Upload multiple .pkl files

# --------------------------
#  Extract evaluation results
# --------------------------
results_list = []

for filename in uploaded.keys():
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only upload result files produced by this notebook / a trusted source.
    with open(filename, 'rb') as f:
        result = pickle.load(f)

    # The file name encodes the feature set, e.g. 'S+P+E+T.pkl' -> 'S+P+E+T'.
    # Strip only a trailing '.pkl' so names containing '.pkl' elsewhere are
    # left intact.
    if filename.endswith(PICKLE_SUFFIX):
        feature_set = filename[:-len(PICKLE_SUFFIX)]
    else:
        feature_set = filename

    # One table row per uploaded file, rounded to 4 decimals. AUC / AUPR are
    # not stored as scalars in the pickle and would have to be computed from
    # its 'y_true' / 'y_pred_prob' entries if they are wanted here.
    results_list.append({
        "Set of Features": feature_set,
        "ACC": round(result['Accuracy'], 4),
        "F1": round(result['F1'], 4),
        "Precision": round(result['Precision'], 4),
        "Recall": round(result['Recall'], 4)
    })

# --------------------------
#  Create DataFrame
# --------------------------
# Explicit columns keep the header (and a valid CSV) even if nothing was uploaded.
results_df = pd.DataFrame(results_list, columns=SUMMARY_COLUMNS)

# --------------------------
#  Display Table
# --------------------------
display(results_df)

# Save the summary to CSV and trigger a browser download (Colab-only API).
results_df.to_csv('evaluation_summary.csv', index=False)
files.download('evaluation_summary.csv')


Saving E.pkl to E.pkl
Saving E+P+S.pkl to E+P+S.pkl
Saving E+P+T.pkl to E+P+T.pkl
Saving E+T.pkl to E+T.pkl
Saving E+T+S.pkl to E+T+S.pkl
Saving P.pkl to P.pkl
Saving P+E.pkl to P+E.pkl
Saving P+S.pkl to P+S.pkl
Saving P+S+T.pkl to P+S+T.pkl
Saving P+T.pkl to P+T.pkl
Saving S.pkl to S.pkl
Saving S+E.pkl to S+E.pkl
Saving S+P+E+T.pkl to S+P+E+T.pkl
Saving S+T.pkl to S+T.pkl
Saving T.pkl to T.pkl


Unnamed: 0,Set of Features,ACC,F1,Precision,Recall
0,E,0.5671,0.5487,0.5611,0.5671
1,E+P+S,0.8856,0.884,0.8843,0.8856
2,E+P+T,0.8749,0.8727,0.8727,0.8749
3,E+T,0.868,0.8657,0.8659,0.868
4,E+T+S,0.8843,0.8826,0.8834,0.8843
5,P,0.5421,0.515,0.5265,0.5421
6,P+E,0.8228,0.8195,0.8196,0.8228
7,P+S,0.8671,0.8651,0.8651,0.8671
8,P+S+T,0.881,0.8791,0.8792,0.881
9,P+T,0.817,0.8125,0.8118,0.817


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# --------------------------
#  Import necessary libraries
# --------------------------
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import pickle
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)
from google.colab import drive
from google.colab import files

# --------------------------
#  Mount Google Drive
# --------------------------
# Exposes the user's Drive under /content/drive (Colab-only API).
drive.mount('/content/drive')

# --------------------------
#  Load dataset
# --------------------------
data_path = '/content/drive/MyDrive/Partially_Oversampled_Data.csv'  # <- Update if needed
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the warning recorded in this cell's
# output is presumably the mixed-type DtypeWarning that this setting avoids —
# confirm on the next run.
data = pd.read_csv(data_path, low_memory=False)

# --------------------------
#  Data preprocessing
# --------------------------
# Use every column as a feature except the label and the identifier columns.
X = data.drop(columns=['interaction', 'Index', 'drug1', 'drug2'])
y = data['interaction']

# Split BEFORE scaling: the scaler must learn its mean/std from the training
# rows only, otherwise statistics of the test rows leak into training.
# The split itself is unchanged (same random_state, same stratification).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept only so any later cell that still references X_scaled keeps working;
# it is not used for training or evaluation below.
X_scaled = scaler.transform(X)

# NOTE(review): the file name suggests the data were oversampled before this
# split. If so, copies of the same sample could land in both train and test —
# TODO confirm how the oversampling was done.

# --------------------------
#  Move data to tensors
# --------------------------
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# --------------------------
#  Create DataLoader
# --------------------------
batch_size = 512
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# --------------------------
#  Prepare device
# --------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ Using device: {device}")

# --------------------------
#  Define DNN model
# --------------------------
class DNNModel(nn.Module):
    """Fully connected classifier.

    Each hidden layer is Linear -> ReLU -> Dropout; the final Linear layer
    emits raw logits (no softmax), as expected by nn.CrossEntropyLoss.

    Args:
        input_size: number of input features per sample.
        num_classes: number of output logits.
        hidden_sizes: widths of the hidden layers, in order. The default
            (256, 128, 64) reproduces the original fixed architecture.
        dropout: dropout probability applied after every hidden activation.
    """

    def __init__(self, input_size, num_classes, hidden_sizes=(256, 128, 64), dropout=0.1):
        super().__init__()
        layers = []
        in_features = input_size
        for width in hidden_sizes:
            layers += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(dropout)]
            in_features = width
        layers.append(nn.Linear(in_features, num_classes))
        # Same layer order as the hard-coded version, so state_dict keys
        # ('model.0.weight', ...) are unchanged for the default arguments.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for `x`; the last dimension has size num_classes."""
        return self.model(x)

# Seed torch's global RNG so weight initialisation, dropout masks and
# DataLoader shuffling are repeatable from run to run.
torch.manual_seed(42)

input_size = X_train_tensor.shape[1]
# NOTE(review): assumes the labels are integer-coded 0..num_classes-1, which
# nn.CrossEntropyLoss needs for class-index targets — TODO confirm.
num_classes = len(y.unique())
model = DNNModel(input_size, num_classes).to(device)

# --------------------------
#  Define Loss, Optimizer, Scheduler
# --------------------------
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0025)
# Halve the learning rate once the validation loss has stalled for more than
# `patience` epochs. `verbose=True` was dropped: it is deprecated in recent
# PyTorch releases and not accepted by newer ones; LR changes are printed
# explicitly in the loop instead.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=10, factor=0.5)

# --------------------------
#  Training Loop with Early Stopping
# --------------------------
# NOTE(review): test_loader doubles as the validation set for LR scheduling,
# early stopping and best-weights selection, so the test metrics reported
# afterwards are optimistically biased. A separate validation split would be
# needed for an unbiased estimate.
epochs = 300
early_stopping_threshold = 15
best_val_loss = float('inf')
early_stopping_counter = 0
best_state = None  # copy of the weights at the lowest validation loss so far

for epoch in range(epochs):
    # ---- one pass over the training data ----
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # ---- mean per-batch loss on the held-out split ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            val_loss += criterion(outputs, y_batch).item()

    val_loss /= len(test_loader)

    lr_before = optimizer.param_groups[0]['lr']
    scheduler.step(val_loss)
    lr_after = optimizer.param_groups[0]['lr']
    if lr_after != lr_before:
        print(f"Epoch {epoch + 1}: learning rate reduced from {lr_before:.6g} to {lr_after:.6g}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stopping_counter = 0
        # Clone the tensors: state_dict() returns references that the
        # optimizer keeps updating in place.
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
    else:
        early_stopping_counter += 1

    if early_stopping_counter >= early_stopping_threshold:
        print(f"🛑 Early stopping at epoch {epoch+1}")
        break

    if (epoch + 1) % 10 == 0:
        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch [{epoch + 1}/{epochs}] - Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")

# Without this, evaluation would use the weights from the last epoch run,
# i.e. `early_stopping_threshold` epochs past the best one.
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"Restored weights from best epoch (Val Loss: {best_val_loss:.4f})")

# --------------------------
#  Evaluation
# --------------------------
# Score the held-out split in inference mode (dropout disabled, no autograd).
model.eval()
all_predictions, all_targets, all_probs = [], [], []

with torch.no_grad():
    for features, labels in test_loader:
        features = features.to(device)
        labels = labels.to(device)
        logits = model(features)
        probs = torch.softmax(logits, dim=1)
        # Predicted class = index of the highest class probability.
        predictions = probs.argmax(dim=1)

        all_probs.extend(probs.cpu().numpy())
        all_predictions.extend(predictions.cpu().numpy())
        all_targets.extend(labels.cpu().numpy())

# Support-weighted averages across classes. Note that weighted recall equals
# accuracy by construction, so those two numbers always coincide.
weighted_kwargs = {'average': 'weighted', 'zero_division': 0}
accuracy = accuracy_score(all_targets, all_predictions)
f1 = f1_score(all_targets, all_predictions, **weighted_kwargs)
precision = precision_score(all_targets, all_predictions, **weighted_kwargs)
recall = recall_score(all_targets, all_predictions, **weighted_kwargs)

print("\n Evaluation Results:")
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("F1 Score", f1),
    ("Precision", precision),
    ("Recall", recall),
):
    print(f"{metric_label}: {metric_value:.4f}")

# --------------------------
#  Save evaluation results
# --------------------------
# The pickle holds the scalar metrics plus the raw labels and per-class
# probabilities of the evaluated samples.
method_name = "DNN"
results_path = 'classification_results.pkl'
results_dict = {
    "Method": method_name,
    "Accuracy": accuracy,
    "F1": f1,
    "Precision": precision,
    "Recall": recall,
    'y_true': np.array(all_targets),
    'y_pred_prob': np.array(all_probs)
}

with open(results_path, 'wb') as out_file:
    pickle.dump(results_dict, out_file)

print(f" Results exported to '{results_path}'")

# --------------------------
#  Download the file
# --------------------------
# Triggers a browser download of the pickle (Colab-only API).
files.download(results_path)


Mounted at /content/drive


  data = pd.read_csv(data_path)


✅ Using device: cuda




Epoch [10/300] - Train Loss: 0.6425, Val Loss: 0.6100
Epoch [20/300] - Train Loss: 0.5273, Val Loss: 0.5701
Epoch [30/300] - Train Loss: 0.4573, Val Loss: 0.5525
Epoch [40/300] - Train Loss: 0.4204, Val Loss: 0.5441
Epoch [50/300] - Train Loss: 0.3956, Val Loss: 0.5436
Epoch [60/300] - Train Loss: 0.2389, Val Loss: 0.4958
Epoch [70/300] - Train Loss: 0.1807, Val Loss: 0.4928
🛑 Early stopping at epoch 72

✅ Evaluation Results:
Accuracy: 0.8843
F1 Score: 0.8822
Precision: 0.8825
Recall: 0.8843
✅ Results exported to 'classification_results.pkl'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>