In [1]:
pip install jupyterlab jupyterlab-optuna rdkit pysmiles prettytable pybel optuna

Collecting jupyterlab-optuna
  Downloading jupyterlab_optuna-0.2.2-py3-none-any.whl.metadata (4.1 kB)
Collecting rdkit
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Collecting pysmiles
  Downloading pysmiles-2.0.0-py2.py3-none-any.whl.metadata (14 kB)
Collecting prettytable
  Downloading prettytable-3.15.1-py3-none-any.whl.metadata (33 kB)
Collecting pybel
  Downloading pybel-0.15.5-py3-none-any.whl.metadata (19 kB)
Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting bottle>=0.13.0 (from jupyterlab-optuna)
  Downloading bottle-0.13.2-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting pbr (from pysmiles)
  Downloading pbr-6.1.1-py2.py3-none-any.whl.metadata (3.4 kB)
Collecting click-plugins (from pybel)
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl.metadata (6.4 kB)
Collecting bel-resources>=0.0.3 (from pybel)
  Downloading bel_resources-0.0.3-py3-none-any.whl.metadata (2.5 kB)
Collecting more-itertools

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from prettytable import PrettyTable
import multiprocessing
from scipy import stats
import os, sys, gc
import time
import optuna
import optuna.visualization as vis

In [3]:
start = time.time()

file_dir = os.path.dirname("../")
sys.path.append(file_dir)

from anima.smiles import SMILES

sml = SMILES()

cores = int(multiprocessing.cpu_count())
print(f"Cores: {cores}, GPUs: {torch.cuda.device_count()}")

# Checking CUDA
torch.cuda.empty_cache()
use_cuda = True
print(f"Cuda available: {torch.cuda.is_available()}")
device = torch.device(
    "cuda" if (use_cuda and torch.cuda.is_available()) else "cpu"
)

Cores: 24, GPUs: 1
Cuda available: True


In [4]:
database = pd.read_csv("../anima-master/databases/OMEAD_41801.csv")

print(f"\nDatabase shape: {database.shape}\n")

train_db = database
train_db["homo_lumo_gap"] = train_db.lumo - train_db.homo
smiles = np.array(train_db.smiles)
#train_db.to_csv("changed_test200k.csv")
print(f"\nNew size of train database: {train_db.shape}\n")


Database shape: (41801, 20)


New size of train database: (41801, 21)



In [5]:
# Define SMILES vocabulary

print("\nDefining SMILES vocabulary\n")
vocab = sml.smilesVOC(smiles, n_jobs=cores)

vocab_size = len(vocab)

print(f"\nSize of vocabulary: {vocab_size}\n")


Defining SMILES vocabulary



[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    1.2s
[Parallel(n_jobs=24)]: Done 402 tasks      | elapsed:    1.5s
[Parallel(n_jobs=24)]: Done 752 tasks      | elapsed:    1.8s
[Parallel(n_jobs=24)]: Done 1202 tasks      | elapsed:    2.3s
[Parallel(n_jobs=24)]: Done 1752 tasks      | elapsed:    2.8s
[Parallel(n_jobs=24)]: Done 2402 tasks      | elapsed:    3.9s
[Parallel(n_jobs=24)]: Done 3152 tasks      | elapsed:    6.6s
[Parallel(n_jobs=24)]: Done 4002 tasks      | elapsed:    9.4s
[Parallel(n_jobs=24)]: Done 4952 tasks      | elapsed:   12.7s
[Parallel(n_jobs=24)]: Done 6002 tasks      | elapsed:   16.4s
[Parallel(n_jobs=24)]: Done 7152 tasks      | elapsed:   20.4s
[Parallel(n_jobs=24)]: Done 8402 tasks      | elapsed:   24.6s
[Parallel(n_jobs=24)]: Done 9752 tasks      | elapsed:   29.6s
[Parallel(n_jobs=24)]: Done 11202 tasks 


Size of vocabulary: 33



In [6]:
# Preparing SMILES into SEQUENCES or 1h tensors
# for embeddings
all_sequences = []

l = 1
for i in smiles:
    print(l, end="\r")
    all_sequences.append(torch.tensor(sml.smilesToSequence(i, vocab)))
    l += 1

# for embeddings
packing = torch.nn.utils.rnn.pack_sequence(
    all_sequences,
    enforce_sorted = False
)

packing_padding = torch.nn.utils.rnn.pad_packed_sequence(
    packing,
    batch_first = True
)

# Check dimensions
# for embedding
# BATCH x SEQUENCE x INFO
print(f"\nCheck packing shape: {packing_padding[0][:,:,:].size()}\n")

temp = packing_padding[0][:,:,0]
random_state = 1
test_size = 0.12

# inputs
x_train, x_test = train_test_split(
    temp.numpy(),
    test_size=test_size,
    random_state=1
)
del temp

gap = np.array(train_db.homo_lumo_gap)
# targets / outputs
y_train, y_test = train_test_split(
    gap,
    test_size=test_size,
    random_state=1
)

max_length = x_train.shape[-1]

# max length of tensor sequences
print(f"\nMax length of tensor sequences: {x_train.shape[-1]}")

print(f"\nTest size: {x_test.shape}\n")
print(f"\nTrain size: {x_train.shape}\n")

41801
Check packing shape: torch.Size([41801, 224, 1])


Max length of tensor sequences: 224

Test size: (5017, 224)


Train size: (36784, 224)



In [7]:
# Define TOOLS
def model_evaluation(model, x_test, y_test, device):
    batch_size=64

    test_data = TensorDataset(
        torch.tensor(x_test),
        torch.tensor(y_test)
    )

    test_loader = DataLoader(
        test_data,
        shuffle= False,
        batch_size= batch_size,
        drop_last = False
    )

    running_mae = []
    running_mse = []
    mae = torch.nn.L1Loss().to(device)
    mse = torch.nn.SmoothL1Loss().to(device)

    model.eval().to(device)

    with torch.no_grad():
        for batch_idx, data in enumerate(test_loader):
            inputs, targets = data
            inputs, targets = inputs.to(device), targets.to(device)
            targets = targets.double()

            output = model(inputs)

            loss_mae = mae(output, targets)
            loss_mse = mse(output, targets)

            running_mae.append(loss_mae.item())
            running_mse.append(loss_mse.item())
            
    model.train()
    return np.mean(running_mae), np.mean(running_mse)


def model_predictions(model, x, device, batch_size = 64):
    # predictions

    model.eval().to(device)

    pred_data = TensorDataset(torch.tensor(x))
    pred_loader = DataLoader(
        pred_data,
        shuffle=False,
        batch_size=batch_size,
        drop_last=False,
    )

    batches = len(x) / batch_size

    with torch.no_grad():
        for batch_idx, data in enumerate(pred_loader):
            print(f"Batch: {batch_idx + 1:010.2f} of {batches:010.2f}", end="\r")
            inputs = data[0]
            inputs = inputs.to(device)

            output = model(inputs)
            if batch_idx == 0:
                temp = output.cpu().detach().numpy()
            else:
                temp = np.append(temp, output.cpu().detach().numpy())
            del inputs, output

    return np.reshape(temp, -1)

In [8]:
class SMILESTransformerRegressor(nn.Module):
    def __init__(self, vocab_size, embed_dim, n_heads, n_layers, hidden_dim, max_len, output_dim, dropout):
        super(SMILESTransformerRegressor, self).__init__()

        self.embeddings = nn.Embedding(vocab_size + 1, embed_dim, padding_idx=0)

        # Positional encoding
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, embed_dim))


        # Transformer encoding
        encoder_layer = nn.TransformerEncoderLayer(
                d_model=embed_dim,
                nhead=n_heads,
                dim_feedforward=hidden_dim,
                dropout=dropout,
                batch_first=True
                )

        self.transformer_encoder = nn.TransformerEncoder(
                encoder_layer,
                num_layers=n_layers)

        # Regression head
        self.fc = nn.Sequential(
                nn.Linear(embed_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, output_dim)
                )

    def forward(self, x):
         """
         x: Tensor of shape (batch_size, seq_len)
         """
         batch_size, seq_len = x.shape

         # Embed and add positional encoding
         x_embed = self.embeddings(x) # (batch size, seq_len, embed_dim)

         # If sequence is shorter than max_len, crop pos encoding
         pos_enc = self.positional_encoding[:, :seq_len, :].to(x.device) # (1, seq_len, embed_dim)
         x_embed = x_embed + pos_enc

         # Transformer expects input shape (batch_size, seq_len, embed_dim)
         x_transformed = self.transformer_encoder(x_embed)

         # Pooling: mean over sequence dimension
         x_pooled = x_transformed.mean(dim=1) # (batch_size, embed_dim)

         out = self.fc(x_pooled) # (batch_size, 1)
         return out.squeeze(1) # (batch_size, )

In [9]:
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Define the objective function for Optuna
def objective(trial):
    try:
        # Suggest hyperparameters
        embed_dim = trial.suggest_categorical("embed_dim", [64, 128, 256, 512])
        n_heads = trial.suggest_categorical("n_heads", [2, 4, 8])
        n_layers = trial.suggest_int("n_layers", 1, 6)
        hidden_dim = trial.suggest_int("hidden_dim", 128, 1024, log=True)
        weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)
        batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256])
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
        dropout = trial.suggest_float('dropout', 0.0, 0.5)

    
        # Optional: Check compatibility of embed_dim and n_heads
        if embed_dim % n_heads != 0:
            raise optuna.exceptions.TrialPruned()  # Invalid combo
    
        # Model Initialization
        model = SMILESTransformerRegressor(
            vocab_size,
            embed_dim=embed_dim,
            n_heads=n_heads,
            n_layers=n_layers,
            hidden_dim=hidden_dim,
            max_len=max_length,
            output_dim=1,
            dropout=dropout
        ).float().to(device)
            
    
        # Loss and Optimizer
        criterion = nn.SmoothL1Loss().double().to(device)
        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=learning_rate,
            weight_decay=weight_decay)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=10)
                                    
    
        # Data Loaders
        train_data = TensorDataset(torch.tensor(x_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
        train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=False, pin_memory=True)

        # Data loaders for validation
        val_data = TensorDataset(torch.tensor(x_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32))
        val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size, drop_last=False, pin_memory=True)
    
        # Training Loop (Reduced Epochs for Speed)
        best_val_loss = float('inf')
        patience_counter = 0
        epochs = 500  # Reduce for faster optimization
        #accumulation_steps = 2
        #model.train()
    
        for epoch in range(epochs):
            model.train()
            train_losses = []

            for x_batch, y_batch in train_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                optimizer.zero_grad()
                output = model(x_batch)
                loss = criterion(output, y_batch)
                loss.backward()
                optimizer.step()
                train_losses.append(loss.item())
            

            # Evaluate on validation set
            model.eval()
            val_losses = []

            with torch.no_grad():
                for x_val, y_val in val_loader:
                    x_val, y_val = x_val.to(device), y_val.to(device)
                    output = model(x_val)
                    val_loss = criterion(output, y_val)
                    val_losses.append(val_loss.item())

            avg_train_loss = sum(train_losses) / len(train_losses)
            avg_val_loss = sum(val_losses) / len(val_losses)
            scheduler.step(avg_val_loss)

            # Prune if it's bad
            trial.report(avg_val_loss, epoch)
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()

            print(f"Trial {trial.number} | Epoch {epoch} | Val Loss: {avg_val_loss:.5f} | Train Loss: {avg_train_loss:.5f}")


            # Early stopping manually (optional)
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= 30:  # patience limit
                    break

            # Track overfitting (optional)
            overfit_gap = avg_val_loss - avg_train_loss
            trial.set_user_attr("overfit_gap", overfit_gap)

        return best_val_loss  # this is what Optuna will minimize

    except RuntimeError as e:
        if 'out of memory' in str(e):
            print("CUDA OOM. Skipping this trial.")
            torch.cuda.empty_cache()
            gc.collect()
            return float('inf')
        else:
            raise
    finally:
        torch.cuda.empty_cache()
        gc.collect()

In [None]:
# Create Optuna study and optimize the objective function
study = optuna.create_study(direction='minimize', study_name='RNN Hyperparameter Optimization')
study.optimize(objective, n_trials=50)

# Output the best hyperparameters
print('Best hyperparameters found:')
print(study.best_params)

[I 2025-03-24 11:24:36,748] A new study created in memory with name: RNN Hyperparameter Optimization
[W 2025-03-24 11:31:05,955] Trial 0 failed with parameters: {'embed_dim': 128, 'n_heads': 8, 'n_layers': 2, 'hidden_dim': 880, 'weight_decay': 0.0015423589604909055, 'batch_size': 256, 'learning_rate': 0.0012962594688829706, 'dropout': 0.09246618965108322} because of the following error: The value nan is not acceptable.
[W 2025-03-24 11:31:05,955] Trial 0 failed with value nan.
[W 2025-03-24 11:46:07,397] Trial 1 failed with parameters: {'embed_dim': 64, 'n_heads': 8, 'n_layers': 5, 'hidden_dim': 867, 'weight_decay': 2.6980243151556872e-06, 'batch_size': 128, 'learning_rate': 0.00946910464098346, 'dropout': 0.4400440177505704} because of the following error: The value nan is not acceptable.
[W 2025-03-24 11:46:07,398] Trial 1 failed with value nan.
[I 2025-03-24 11:54:20,068] Trial 2 finished with value: 0.029659311577345783 and parameters: {'embed_dim': 128, 'n_heads': 2, 'n_layers': 4

In [None]:
import optuna.visualization as vis

# Visualize parameter importance and optimization history
vis.plot_param_importances(study).show()
vis.plot_optimization_history(study).show()

In [None]:
import json
with open("best_param.json", 'w') as f:
    json.dump(study.best_params, f, indent=4)
    f.close()

In [None]:
print("Script done!")