In [3]:
# Import libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the BBBP dataset from 1
df = pd.read_csv('BBBP.csv')

# Extract the SMILES strings and the labels
smiles = df['smiles']
labels = df['p_np']

# Define the SMILES alphabet
alphabet = '#%()+-.0123456789=@ABCDEFGHIKLMNOPRSTVXYZ[\\]abcdefgilmnorsu'

# Create a dictionary to map each character to an index
char_to_idx = {c: i for i, c in enumerate(alphabet)}

# Define the maximum length of SMILES strings
max_len = 100

# Define a function to encode a SMILES string into a one-hot vector
def encode_smiles(smiles):
  # Pad the string with spaces if it is shorter than max_len
  smiles = smiles.ljust(max_len)
  # Truncate the string if it is longer than max_len
  smiles = smiles[:max_len]
  # Create an empty vector of shape (max_len, len(alphabet))
  vector = np.zeros((max_len, len(alphabet)))
  # Loop over each character and set the corresponding element to 1
  for i, c in enumerate(smiles):
    # If the character is in the alphabet, use its index
    if c in char_to_idx:
      vector[i, char_to_idx[c]] = 1
    # Otherwise, use the index of the last element (unknown character)
    else:
      vector[i, -1] = 1
  # Return the vector as a torch tensor
  return torch.tensor(vector, dtype=torch.float32)

# Encode all the SMILES strings in the dataset
X = torch.stack([encode_smiles(s) for s in smiles])
# Convert the labels to a torch tensor
y = torch.tensor(labels, dtype=torch.float32)

# Define the LSTM model
class LSTM(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(LSTM, self).__init__()
    # Define the LSTM layer
    self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
    # Define the output layer
    self.fc = nn.Linear(hidden_size, output_size)
    # Define the sigmoid activation
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    # Pass the input through the LSTM layer
    x, _ = self.lstm(x)
    # Take the last output of the LSTM
    x = x[:, -1, :]
    # Pass the output through the output layer and activation
    x = self.fc(x)
    x = self.sigmoid(x)
    # Return the output
    return x

# Define the hyperparameters
input_size = len(alphabet)
hidden_size = 128
output_size = 1
batch_size = 32
num_epochs = 10
learning_rate = 0.01

# Create an instance of the model
model = LSTM(input_size, hidden_size, output_size)

# Define the loss function and the optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Define the number of folds for cross validation
k_fold = 5

# Define a list to store the accuracy scores for each fold
acc_scores = []

# Loop over the folds
for k in range(k_fold):
  # Print the current fold number
  print(f'Fold {k+1}')

  # Split the data into train and validation sets
  # Use a fraction of 1/k_fold for the validation set
  val_size = int(len(X) / k_fold)
  val_indices = list(range(k * val_size, (k + 1) * val_size))
  train_indices = list(set(range(len(X))) - set(val_indices))
  X_train = X[train_indices]
  y_train = y[train_indices]
  X_val = X[val_indices]
  y_val = y[val_indices]

  # Create data loaders for the train and validation sets
  train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
  val_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=True)

  # Train the model on the train set
  for epoch in range(num_epochs):
    # Shuffle the data
    perm = torch.randperm(X_train.size(0))
    X_train = X_train[perm]
    y_train = y_train[perm]
    # Loop over batches
    for i in range(0, X_train.size(0), batch_size):
      # Get the current batch
      X_batch = X_train[i:i+batch_size]
      y_batch = y_train[i:i+batch_size]
      # Zero the gradients
      optimizer.zero_grad()
      # Forward pass
      y_pred = model(X_batch)
      # Compute the loss
      loss = criterion(y_pred, y_batch.unsqueeze(1))
      # Backward pass
      loss.backward()
      # Update the parameters
      optimizer.step()
    # Print the loss at the end of the epoch
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

  # Evaluate the model on the validation set
  with torch.no_grad():
    # Get the predictions
    y_pred = model(X_val)
    # Round them to 0 or 1
    y_pred = torch.round(y_pred)
    # Compute the accuracy
    acc = accuracy_score(y_val, y_pred)
    # Print the accuracy
    print(f'Accuracy: {acc:.4f}')
    # Append the accuracy to the list
    acc_scores.append(acc)

# Print the average accuracy over the folds
print(f'Average accuracy: {np.mean(acc_scores):.4f}')


Fold 1
Epoch 1, Loss: 0.6361
Epoch 2, Loss: 0.4033
Epoch 3, Loss: 0.1474
Epoch 4, Loss: 0.7559
Epoch 5, Loss: 0.5962
Epoch 6, Loss: 0.9445
Epoch 7, Loss: 0.6073
Epoch 8, Loss: 0.5812
Epoch 9, Loss: 0.4046
Epoch 10, Loss: 0.1479
Accuracy: 0.5683
Fold 2
Epoch 1, Loss: 0.5065
Epoch 2, Loss: 0.2474
Epoch 3, Loss: 0.3931
Epoch 4, Loss: 0.8445
Epoch 5, Loss: 0.3127
Epoch 6, Loss: 0.2936
Epoch 7, Loss: 0.2157
Epoch 8, Loss: 0.8074
Epoch 9, Loss: 0.1785
Epoch 10, Loss: 0.1716
Accuracy: 0.6293
Fold 3
Epoch 1, Loss: 0.0946
Epoch 2, Loss: 0.2602
Epoch 3, Loss: 0.3240
Epoch 4, Loss: 0.4023
Epoch 5, Loss: 0.0534
Epoch 6, Loss: 0.1556
Epoch 7, Loss: 0.0659
Epoch 8, Loss: 0.7786
Epoch 9, Loss: 0.5834
Epoch 10, Loss: 0.2615
Accuracy: 0.9366
Fold 4
Epoch 1, Loss: 0.2146
Epoch 2, Loss: 0.0205
Epoch 3, Loss: 0.0429
Epoch 4, Loss: 0.2677
Epoch 5, Loss: 0.9725
Epoch 6, Loss: 0.0370
Epoch 7, Loss: 0.1618
Epoch 8, Loss: 0.1327
Epoch 9, Loss: 0.1768
Epoch 10, Loss: 0.0400
Accuracy: 0.9927
Fold 5
Epoch 1, Loss