In [1]:
# Import libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score

# Load the BBBP dataset from [1]
df = pd.read_csv('BBBP.csv')

# Extract the SMILES strings and the labels
smiles = df['smiles']
labels = df['p_np']

# Define the SMILES alphabet
alphabet = '#%()+-.0123456789=@ABCDEFGHIKLMNOPRSTVXYZ[\\]abcdefgilmnorsu'

# Create a dictionary to map each character to an index
char_to_idx = {c: i for i, c in enumerate(alphabet)}

# Define the maximum length of SMILES strings
max_len = 100

# Define a function to encode a SMILES string into a one-hot vector
def encode_smiles(smiles):
  # Pad the string with spaces if it is shorter than max_len
  smiles = smiles.ljust(max_len)
  # Truncate the string if it is longer than max_len
  smiles = smiles[:max_len]
  # Create an empty vector of shape (max_len, len(alphabet))
  vector = np.zeros((max_len, len(alphabet)))
  # Loop over each character and set the corresponding element to 1
  for i, c in enumerate(smiles):
    # If the character is in the alphabet, use its index
    if c in char_to_idx:
      vector[i, char_to_idx[c]] = 1
    # Otherwise, use the index of the last element (unknown character)
    else:
      vector[i, -1] = 1
  # Return the vector as a torch tensor
  return torch.tensor(vector, dtype=torch.float32)

# Encode all the SMILES strings in the dataset
X = torch.stack([encode_smiles(s) for s in smiles])
# Convert the labels to a torch tensor
y = torch.tensor(labels, dtype=torch.float32)

# Define the FC model
class FC(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(FC, self).__init__()
    # Define the layers
    self.fc1 = nn.Linear(input_size, hidden_size)
    self.fc2 = nn.Linear(hidden_size, output_size)
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    # Flatten the input
    x = x.view(-1, input_size)
    # Apply the first layer and activation
    x = self.fc1(x)
    x = self.relu(x)
    # Apply the second layer and activation
    x = self.fc2(x)
    x = self.sigmoid(x)
    # Return the output
    return x

# Define the hyperparameters
input_size = max_len * len(alphabet)
hidden_size = 128
output_size = 1
batch_size = 32
num_epochs = 10
learning_rate = 0.01

# Create an instance of the model
model = FC(input_size, hidden_size, output_size)

# Define the loss function and the optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Define the number of folds
k = 5

# Create a KFold object
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Initialize a list to store the validation accuracies
val_accs = []

# Loop over the folds
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
  # Get the train and validation data
  X_train = X[train_idx]
  y_train = y[train_idx]
  X_val = X[val_idx]
  y_val = y[val_idx]
  # Train the model
  for epoch in range(num_epochs):
    # Shuffle the data
    perm = torch.randperm(X_train.size(0))
    X_train = X_train[perm]
    y_train = y_train[perm]
    # Loop over batches
    for i in range(0, X_train.size(0), batch_size):
      # Get the current batch
      X_batch = X_train[i:i+batch_size]
      y_batch = y_train[i:i+batch_size]
      # Zero the gradients
      optimizer.zero_grad()
      # Forward pass
      y_pred = model(X_batch)
      # Compute the loss
      loss = criterion(y_pred, y_batch.unsqueeze(1))
      # Backward pass
      loss.backward()
      # Update the parameters
      optimizer.step()
    # Print the loss at the end of the epoch
    print(f'Fold {fold+1}, Epoch {epoch+1}, Loss: {loss.item():.4f}')
  # Evaluate the model on the validation set
  with torch.no_grad():
    # Get the predictions
    y_pred = model(X_val)
    # Round them to 0 or 1
    y_pred = torch.round(y_pred)
    # Compute the accuracy
    acc = accuracy_score(y_val, y_pred)
    # Print the accuracy
    print(f'Fold {fold+1}, Accuracy: {acc:.4f}')
    # Append the accuracy to the list
    val_accs.append(acc)

# Compute the mean and standard deviation of the validation accuracies
mean_acc = np.mean(val_accs)
std_acc = np.std(val_accs)

# Print the summary
print(f'Mean validation accuracy: {mean_acc:.4f}')
print(f'Standard deviation of validation accuracy: {std_acc:.4f}')


Fold 1, Epoch 1, Loss: 0.0767
Fold 1, Epoch 2, Loss: 0.3149
Fold 1, Epoch 3, Loss: 0.0372
Fold 1, Epoch 4, Loss: 0.1532
Fold 1, Epoch 5, Loss: 0.0250
Fold 1, Epoch 6, Loss: 0.0936
Fold 1, Epoch 7, Loss: 0.0250
Fold 1, Epoch 8, Loss: 0.0008
Fold 1, Epoch 9, Loss: 0.0260
Fold 1, Epoch 10, Loss: 0.0002
Fold 1, Accuracy: 0.8610
Fold 2, Epoch 1, Loss: 0.0583
Fold 2, Epoch 2, Loss: 0.8482
Fold 2, Epoch 3, Loss: 0.0098
Fold 2, Epoch 4, Loss: 0.0018
Fold 2, Epoch 5, Loss: 0.0064
Fold 2, Epoch 6, Loss: 0.0001
Fold 2, Epoch 7, Loss: 0.0000
Fold 2, Epoch 8, Loss: 0.0000
Fold 2, Epoch 9, Loss: 0.0002
Fold 2, Epoch 10, Loss: 0.0003
Fold 2, Accuracy: 0.9756
Fold 3, Epoch 1, Loss: 0.0002
Fold 3, Epoch 2, Loss: 0.0100
Fold 3, Epoch 3, Loss: 0.0005
Fold 3, Epoch 4, Loss: 0.0191
Fold 3, Epoch 5, Loss: 0.0001
Fold 3, Epoch 6, Loss: 0.0000
Fold 3, Epoch 7, Loss: 0.0000
Fold 3, Epoch 8, Loss: 0.0003
Fold 3, Epoch 9, Loss: 0.0048
Fold 3, Epoch 10, Loss: 0.0026
Fold 3, Accuracy: 0.9951
Fold 4, Epoch 1, Loss: