In [None]:
!pip install torch torchvision torchaudio
!pip install datasets numpy pandas scikit-learn matplotlib

In [None]:
import torch
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

In [None]:
from datasets import load_dataset
# Load the "BindingDB_filtered" configuration of BALM/BALM-benchmark and keep
# its "train" split as the working table.
dataset = load_dataset("BALM/BALM-benchmark", name="BindingDB_filtered")
data = dataset["train"]

# Print the first record to see which fields each example carries.
print("Sample:", data[0])

In [None]:
from datasets import load_dataset
# NOTE(review): this cell repeats the load performed in the previous cell
# verbatim (same repository, same configuration, same split); it rebinds
# `dataset` and `data` to equivalent objects and is a candidate for removal.
dataset = load_dataset(path="BALM/BALM-benchmark", name="BindingDB_filtered")
data = dataset["train"]
print("Sample:", data[0])

In [None]:
# Character vocabulary used for both drug and target strings. Index 0 is
# reserved: it is used for padding and also for any character not listed here.
chars = list("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789=#()%+-[]@")
char2idx = {ch: pos for pos, ch in enumerate(chars, start=1)}


def encode(seq, max_len):
    """Convert a string into a fixed-length list of vocabulary indices.

    Args:
        seq: String to encode; only its first ``max_len`` characters are used.
        max_len: Length of the returned list.

    Returns:
        A list of ``max_len`` ints. Known characters map to their 1-based
        position in ``chars``; unknown characters map to 0; the tail is
        right-padded with 0.
    """
    ids = []
    for ch in seq[:max_len]:
        ids.append(char2idx.get(ch, 0))
    padding = [0] * (max_len - len(ids))
    return ids + padding

In [None]:
from torch.utils.data import Dataset, DataLoader

class BALMDataset(Dataset):
    """Map-style dataset yielding ``(drug, target, y)`` tensors per record.

    Each record is read by its ``"Drug"``, ``"Target"`` and ``"Y"`` keys.
    The two string fields are turned into fixed-length index sequences with
    ``encode``; ``"Y"`` becomes a float tensor.

    Args:
        data: Indexable collection of records supporting ``len()``.
        drug_max: Encoded length of the ``"Drug"`` field.
        target_max: Encoded length of the ``"Target"`` field.
    """

    def __init__(self, data, drug_max=100, target_max=1000):
        self.data = data
        self.drug_max, self.target_max = drug_max, target_max

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded drug, encoded target and label for record ``idx``."""
        record = self.data[idx]
        drug_ids = encode(record["Drug"], self.drug_max)
        target_ids = encode(record["Target"], self.target_max)
        label = torch.tensor(record["Y"], dtype=torch.float)
        return torch.tensor(drug_ids), torch.tensor(target_ids), label

In [None]:
from sklearn.model_selection import train_test_split

# Materialise every record so scikit-learn can shuffle and split them.
all_data = list(data)

# 80/20 split with a fixed random_state so the partition is repeatable.
train_data, test_data = train_test_split(all_data, random_state=42, test_size=0.2)

train_dataset, test_dataset = BALMDataset(train_data), BALMDataset(test_data)

# Only the training loader shuffles; the test loader keeps the split order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=8)

print("Train size:", len(train_dataset))
print("Test size:", len(test_dataset))


In [None]:
import torch.nn as nn

class RNNModel(nn.Module):
    """LSTM regressor over a (drug, target) pair of index sequences.

    Both inputs pass through the same embedding table and the same LSTM. The
    last layer's final hidden state of each sequence is concatenated and
    mapped to one scalar per sample by a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table; must exceed the
            largest token index fed to the model.
        embed_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.

    NOTE(review): no ``padding_idx`` or sequence packing is used, so positions
    holding index 0 are processed by the LSTM like any other token.
    """

    def __init__(self, vocab_size=100, embed_dim=128, hidden_dim=128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        # Input is the concatenation of the drug and target hidden states.
        self.fc = nn.Linear(hidden_dim*2, 1)

    def forward(self, drug, target):
        """Predict one value per (drug, target) pair.

        Args:
            drug: Integer index tensor of shape ``(batch, drug_len)``.
            target: Integer index tensor of shape ``(batch, target_len)``.

        Returns:
            Float tensor of shape ``(batch,)``.
        """
        d = self.embed(drug)
        t = self.embed(target)

        # Keep only the final hidden state of each sequence.
        _, (d_h, _) = self.lstm(d)
        _, (t_h, _) = self.lstm(t)

        x = torch.cat([d_h[-1], t_h[-1]], dim=1)
        # Squeeze only the trailing feature axis. A bare squeeze() would also
        # drop the batch axis when batch == 1 and return a 0-d tensor, which
        # cannot be iterated when collecting predictions.
        return self.fc(x).squeeze(-1)

# Build the network with its default sizes, then move it to the selected device.
model = RNNModel()
model = model.to(device)


In [None]:
import torch
import torch.nn as nn
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Select the compute device: CUDA when available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Data loading and preprocessing
dataset = load_dataset("BALM/BALM-benchmark", name="BindingDB_filtered")
data = dataset["train"]

# Shared character vocabulary; indices start at 1 so that 0 can stand for
# both padding and characters that are not in the list.
chars = list("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789=#()%+-[]@")
char2idx = dict(zip(chars, range(1, len(chars) + 1)))

def encode(seq, max_len):
    """Encode ``seq`` as a list of exactly ``max_len`` vocabulary indices.

    The string is truncated to ``max_len`` characters, each character is
    looked up in ``char2idx`` (0 when missing), and the result is
    right-padded with 0.
    """
    clipped = seq[:max_len]
    ids = [char2idx.get(ch, 0) for ch in clipped]
    pad_count = max_len - len(ids)
    return ids + [0] * pad_count

class BALMDataset(Dataset):
    """Dataset wrapper that encodes records into ``(drug, target, y)`` tensors.

    Records are indexed by the keys ``"Drug"``, ``"Target"`` and ``"Y"``.
    ``drug_max`` and ``target_max`` set the fixed encoded lengths of the two
    string fields.
    """

    def __init__(self, data, drug_max=100, target_max=1000):
        self.data = data
        self.drug_max = drug_max
        self.target_max = target_max

    def __len__(self):
        """Number of records in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode record ``idx`` and return it as three tensors."""
        row = self.data[idx]
        drug_tensor = torch.tensor(encode(row["Drug"], self.drug_max))
        target_tensor = torch.tensor(encode(row["Target"], self.target_max))
        y_tensor = torch.tensor(row["Y"], dtype=torch.float)
        return drug_tensor, target_tensor, y_tensor

# Turn the split into a plain list of records, then hold out 20% for testing
# with a fixed random_state so the partition is repeatable.
all_data = list(data)
train_data, test_data = train_test_split(all_data, random_state=42, test_size=0.2)

train_dataset, test_dataset = BALMDataset(train_data), BALMDataset(test_data)

# Shuffle training batches only; evaluation keeps the split order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=8)

# RNNModel class definition
class RNNModel(nn.Module):
    """LSTM regressor over a (drug, target) pair of index sequences.

    The drug and target sequences share one embedding table and one LSTM.
    The last layer's final hidden state of each sequence is concatenated and
    projected to a single value per sample.

    Args:
        vocab_size: Number of rows in the embedding table; must exceed the
            largest token index fed to the model.
        embed_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.

    NOTE(review): no ``padding_idx`` or sequence packing is used, so positions
    holding index 0 are processed by the LSTM like any other token.
    """

    def __init__(self, vocab_size=100, embed_dim=128, hidden_dim=128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        # Input is the concatenation of the drug and target hidden states.
        self.fc = nn.Linear(hidden_dim*2, 1)

    def forward(self, drug, target):
        """Predict one value per (drug, target) pair.

        Args:
            drug: Integer index tensor of shape ``(batch, drug_len)``.
            target: Integer index tensor of shape ``(batch, target_len)``.

        Returns:
            Float tensor of shape ``(batch,)``.
        """
        d = self.embed(drug)
        t = self.embed(target)

        # Keep only the final hidden state of each sequence.
        _, (d_h, _) = self.lstm(d)
        _, (t_h, _) = self.lstm(t)

        x = torch.cat([d_h[-1], t_h[-1]], dim=1)
        # Squeeze only the trailing feature axis. A bare squeeze() would also
        # drop the batch axis when batch == 1 and return a 0-d tensor, which
        # cannot be iterated when collecting predictions.
        return self.fc(x).squeeze(-1)

# Model instantiation
# Seed PyTorch's global RNG before building the model so that weight
# initialisation and the DataLoader shuffle order are repeatable, matching
# the fixed random_state used for the train/test split.
# NOTE(review): some CUDA kernels may still be nondeterministic.
torch.manual_seed(42)
model = RNNModel().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for drug, target, y in train_loader:
        drug, target, y = drug.to(device), target.to(device), y.to(device)
        optimizer.zero_grad()
        # Flatten so the prediction is always 1-D like `y`, even when the
        # final batch holds a single sample.
        pred = model(drug, target).reshape(-1)
        loss = loss_fn(pred, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

In [None]:
# Put the model in evaluation mode and collect predictions and labels over
# the whole test loader.
model.eval()
preds, targets = [], []

# No gradients are needed while scoring the test set.
with torch.no_grad():
    for drug, target, y in test_loader:
        drug, target, y = drug.to(device), target.to(device), y.to(device)
        # Flatten to 1-D before extending: if the last batch holds a single
        # sample the model output can be 0-d, and extending a list with a
        # 0-d array raises TypeError.
        pred = model(drug, target).reshape(-1)
        preds.extend(pred.cpu().numpy())
        targets.extend(y.cpu().numpy())


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Summarise test-set error with mean absolute and mean squared error.
mae = mean_absolute_error(y_true=targets, y_pred=preds)
mse = mean_squared_error(y_true=targets, y_pred=preds)
print(f"Test MSE: {mse:.4f}, Test MAE: {mae:.4f}")


In [None]:
import matplotlib.pyplot as plt

# Scatter predictions against the true values using an explicit Axes object.
fig, ax = plt.subplots()
ax.scatter(targets, preds, alpha=0.5)
ax.set_xlabel("True Y")
ax.set_ylabel("Predicted Y")
ax.set_title("RNN Predictions vs True Values")
plt.show()


In [None]:
print("True Y vs Predicted Y for first 10 samples:")
# Slice rather than index with range(10) so a test set with fewer than ten
# samples prints what is available instead of raising IndexError.
for i, (true_y, pred_y) in enumerate(zip(targets[:10], preds[:10]), start=1):
    print(f"Sample {i}: True = {true_y:.4f}, Predicted = {pred_y:.4f}")
