In [1]:
# Import libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the BBBP dataset from [1](https://zhuanlan.zhihu.com/p/47802053)
df = pd.read_csv('BBBP.csv')

# Extract the SMILES strings and the labels
smiles = df['smiles']
labels = df['p_np']

# Define the SMILES alphabet
alphabet = '#%()+-.0123456789=@ABCDEFGHIKLMNOPRSTVXYZ[\\]abcdefgilmnorsu'

# Create a dictionary to map each character to an index
char_to_idx = {c: i for i, c in enumerate(alphabet)}

# Define the maximum length of SMILES strings
max_len = 100

# Define a function to encode a SMILES string into a one-hot vector
def encode_smiles(smiles):
  # Pad the string with spaces if it is shorter than max_len
  smiles = smiles.ljust(max_len)
  # Truncate the string if it is longer than max_len
  smiles = smiles[:max_len]
  # Create an empty vector of shape (max_len, len(alphabet))
  vector = np.zeros((max_len, len(alphabet)))
  # Loop over each character and set the corresponding element to 1
  for i, c in enumerate(smiles):
    # If the character is in the alphabet, use its index
    if c in char_to_idx:
      vector[i, char_to_idx[c]] = 1
    # Otherwise, use the index of the last element (unknown character)
    else:
      vector[i, -1] = 1
  # Return the vector as a torch tensor
  return torch.tensor(vector, dtype=torch.float32)

# Encode all the SMILES strings in the dataset
X = torch.stack([encode_smiles(s) for s in smiles])
# Convert the labels to a torch tensor
y = torch.tensor(labels, dtype=torch.float32)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the BiLSTM model
class BiLSTM(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(BiLSTM, self).__init__()
    # Define the BiLSTM layer
    self.bilstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
    # Define the output layer
    self.fc = nn.Linear(2 * hidden_size, output_size)
    # Define the sigmoid activation
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    # Pass the input through the BiLSTM layer
    x, _ = self.bilstm(x)
    # Take the last output of the BiLSTM
    x = x[:, -1, :]
    # Pass the output through the output layer and activation
    x = self.fc(x)
    x = self.sigmoid(x)
    # Return the output
    return x

# Define the hyperparameters
input_size = len(alphabet)
hidden_size = 128
output_size = 1
batch_size = 32
num_epochs = 10
learning_rate = 0.01

# Create an instance of the model
model = BiLSTM(input_size, hidden_size, output_size)

# Define the loss function and the optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
for epoch in range(num_epochs):
  # Shuffle the data
  perm = torch.randperm(X_train.size(0))
  X_train = X_train[perm]
  y_train = y_train[perm]
  # Loop over batches
  for i in range(0, X_train.size(0), batch_size):
    # Get the current batch
    X_batch = X_train[i:i+batch_size]
    y_batch = y_train[i:i+batch_size]
    # Zero the gradients
    optimizer.zero_grad()
    # Forward pass
    y_pred = model(X_batch)
    # Compute the loss
    loss = criterion(y_pred, y_batch.unsqueeze(1))
    # Backward pass
    loss.backward()
    # Update the parameters
    optimizer.step()
  # Print the loss at the end of the epoch
  print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

# Evaluate the model on the test set
with torch.no_grad():
  # Get the predictions
  y_pred = model(X_test)
  # Round them to 0 or 1
  y_pred = torch.round(y_pred)
  # Compute the accuracy
  acc = accuracy_score(y_test, y_pred)
  # Print the accuracy
  print(f'Accuracy: {acc:.4f}')


Epoch 1, Loss: 0.4550
Epoch 2, Loss: 0.2617
Epoch 3, Loss: 0.5893
Epoch 4, Loss: 0.1998
Epoch 5, Loss: 0.3860
Epoch 6, Loss: 0.5639
Epoch 7, Loss: 0.3518
Epoch 8, Loss: 0.8351
Epoch 9, Loss: 0.6198
Epoch 10, Loss: 0.2152
Accuracy: 0.7707
