In [1]:
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
from transformers import DataCollatorForLanguageModeling

# Load pre-trained model tokenizer (vocabulary)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Load the pre-trained PyTorch model and place it on the GPU when one is
# available. An unconditional .cuda() raises on machines without CUDA, so fall
# back to the CPU instead of crashing.
import torch  # imported here because this cell runs before the later torch imports

model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased').to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

# Data collator for masked language modeling: it batches the examples and
# randomly replaces a fraction of the tokens with masks to build the MLM targets.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,  # True if you're doing masked language modeling
    mlm_probability=0.15  # Probability of masking a token
)


In [2]:
# Sanity check: show the concrete class of the loaded model object.
print(type(model))

<class 'transformers.models.distilbert.modeling_distilbert.DistilBertForMaskedLM'>


In [3]:
# Sanity check: confirm the model's class is DistilBertForMaskedLM or a subclass of it.
print(issubclass(type(model), DistilBertForMaskedLM))

True


# Pre-Processing

# Normalization: 
Converts SMILES to a consistent, canonical format using RDKit, which helps in removing duplicates and maintaining consistency.
# Tokenization: 
Splits each SMILES string into sub-word tokens using the pre-trained DistilBERT tokenizer (not into individual characters) and converts these tokens into numerical token IDs. This step is crucial for the model to learn from the sequence data.
# Padding: 
Ensures that all input sequences to the model are of the same length by padding shorter sequences with zeros (token ID 0) up to the length of the longest tokenized sequence.

In [4]:
from rdkit import Chem
from transformers import DistilBertTokenizer
import numpy as np

# Function to read SMILES from a file
def read_smiles(file_path):
    """Read SMILES strings from a text file, one per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped, so an empty file yields an empty list rather than ``['']``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: The non-empty, stripped lines in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r') as file:
        # Iterate lazily instead of loading the whole file into one string.
        stripped_lines = (line.strip() for line in file)
        return [line for line in stripped_lines if line]

# Normalize SMILES
def normalize_smiles(smiles_list):
    """Convert SMILES strings to RDKit's canonical form.

    Entries for which ``Chem.MolFromSmiles`` returns ``None`` are dropped;
    the remaining entries keep their input order.

    Args:
        smiles_list: Iterable of SMILES strings.

    Returns:
        list[str]: Canonical SMILES for every entry that could be parsed.
    """
    normalized = []
    for smile in smiles_list:
        # Parse each string exactly once; the molecule is reused for both the
        # validity check and the canonical output.
        mol = Chem.MolFromSmiles(smile)
        if mol is not None:
            normalized.append(Chem.MolToSmiles(mol, canonical=True))
    return normalized

# Tokenize SMILES
def tokenize_smiles(smiles_list, tokenizer=None):
    """Encode SMILES strings into lists of token IDs.

    Args:
        smiles_list: Iterable of SMILES strings.
        tokenizer: Optional tokenizer exposing ``encode(text, add_special_tokens=...)``.
            When omitted, the 'distilbert-base-uncased' tokenizer is loaded, which
            matches the previous behavior. Passing an already-loaded tokenizer
            avoids reloading it on every call.

    Returns:
        tuple: ``(tokenized, tokenizer)`` where ``tokenized`` holds one list of
        token IDs (including special tokens) per input string and ``tokenizer``
        is the tokenizer that produced them.
    """
    # NOTE(review): the 'uncased' checkpoint presumably lower-cases its input,
    # which would merge aromatic 'c' and aliphatic 'C' atoms in SMILES — confirm
    # whether a case-preserving tokenizer is needed.
    if tokenizer is None:
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    tokenized = [tokenizer.encode(smile, add_special_tokens=True) for smile in smiles_list]
    return tokenized, tokenizer

# Load and process the data
file_path = 'smiles_train.txt'
smiles = read_smiles(file_path)
normalized_smiles = normalize_smiles(smiles)
tokenized_smiles, tokenizer = tokenize_smiles(normalized_smiles)

# Right-pad every token sequence to the length of the longest one. The pad value
# is taken from the tokenizer instead of assuming that the [PAD] ID is 0.
max_length = max(len(s) for s in tokenized_smiles)
pad_token_id = tokenizer.pad_token_id
padded_smiles = [s + [pad_token_id] * (max_length - len(s)) for s in tokenized_smiles]

# Insights on the data
print(f"Total number of molecules: {len(smiles)}")
print(f"Number of valid molecules after normalization: {len(normalized_smiles)}")
# max_length counts token IDs (special tokens included), not SMILES characters.
print(f"Max length of SMILES: {max_length}")
print(f"Vocabulary size: {len(tokenizer)}")  # Vocabulary size is determined by the tokenizer itself

# Example output
print("Example of normalized SMILES:", normalized_smiles[0])
print("Example of tokenized and padded SMILES:", padded_smiles[0])


Total number of molecules: 1036643
Number of valid molecules after normalization: 1036643
Max length of SMILES: 96
Vocabulary size: 30522
Example of normalized SMILES: COc1ccc(N2CCN(C(=O)c3cc4ccccc4[nH]3)CC2)cc1
Example of tokenized and padded SMILES: [101, 2522, 2278, 2487, 9468, 2278, 1006, 1050, 2475, 9468, 2078, 1006, 1039, 1006, 1027, 1051, 1007, 1039, 2509, 9468, 2549, 9468, 9468, 2278, 2549, 1031, 18699, 1033, 1017, 1007, 10507, 2475, 1007, 10507, 2487, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
import torch

# Choose the compute device: fall back to the CPU unless CUDA is usable.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print("Using CPU")
else:
    device = torch.device("cuda")
    # Report the name of the first CUDA device.
    print("Using GPU:", torch.cuda.get_device_name(0))

In [None]:
# Repeats the earlier sanity check: show the concrete class of the model object.
print(type(model))

In [None]:
# Repeats the earlier sanity check: the model's class is DistilBertForMaskedLM or a subclass.
print(issubclass(type(model), DistilBertForMaskedLM))


In [5]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

class SmilesDataset(Dataset):
    """Map-style dataset that pairs token ID sequences with their attention masks.

    Both containers are indexed with the same position, so they are expected to
    hold one entry per example in the same order.
    """

    def __init__(self, input_ids, attention_mask):
        """Store the two parallel, indexable containers without copying them."""
        self.input_ids, self.attention_mask = input_ids, attention_mask

    def __len__(self):
        """Return the number of examples, taken from ``input_ids``."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict keyed by model input name."""
        ids = self.input_ids[idx]
        mask = self.attention_mask[idx]
        return dict(input_ids=ids, attention_mask=mask)

# Convert the padded token lists to a single (num_examples, max_length) tensor
input_ids = torch.tensor(padded_smiles, dtype=torch.long)

# Attention mask: 1 for real tokens, 0 for padding positions (which hold ID 0).
# One vectorized comparison replaces a nested Python loop over every token and
# produces the same values.
attention_mask = (input_ids > 0).long()

# Split the dataset into training and evaluation sets (80/20, fixed seed so the
# split is reproducible)
train_inputs, eval_inputs, train_masks, eval_masks = train_test_split(input_ids, attention_mask, test_size=0.2, random_state=42)

# Create training dataset
train_dataset = SmilesDataset(train_inputs, train_masks)

# Create evaluation dataset
eval_dataset = SmilesDataset(eval_inputs, eval_masks)

# Training arguments: one epoch, batches of 64 per device, evaluating and
# checkpointing once per epoch.
training_args = TrainingArguments(
    output_dir='./model_output',
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,   # Load the best model at the end of training
    evaluation_strategy="epoch",   # Evaluate at the end of every epoch
    save_strategy="epoch",         # Save once per epoch, matching the evaluation schedule
    save_steps=5000,                # NOTE(review): presumably only consulted when save_strategy="steps" — confirm
    report_to="all"                # Report to all available platforms
)

# Initialize the Trainer with the model, the training arguments, both datasets
# and the masked-language-modeling collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)

# Train the model; as the last expression of the cell, the returned training
# summary is displayed as the cell output
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2665,0.240668


There were missing keys in the checkpoint model loaded: ['vocab_projector.weight'].


TrainOutput(global_step=12959, training_loss=0.38036279680552476, metrics={'train_runtime': 17914.2631, 'train_samples_per_second': 46.294, 'train_steps_per_second': 0.723, 'total_flos': 2.061277923873715e+16, 'train_loss': 0.38036279680552476, 'epoch': 1.0})

In [15]:
# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def generate_smiles(model, start_sequence, max_length):
    """Greedily extend ``start_sequence`` one token at a time with a masked LM.

    At every step a fresh [MASK] token (followed by [SEP]) is appended to the
    tokens generated so far, the model predicts the masked position, and the
    most likely token is appended. The previous implementation overwrote the
    last real token with [MASK], so the newest token was never visible to the
    model as context for the next prediction.

    Relies on the module-level ``tokenizer`` for encoding and decoding.

    Args:
        model: Masked language model whose output exposes ``.logits`` with one
            row of vocabulary scores per input position.
        start_sequence: Prompt text that seeds the generation.
        max_length: Maximum number of new tokens to generate.

    Returns:
        str: The decoded prompt plus generated tokens, special tokens removed.
    """
    model.eval()
    # Run on whatever device holds the model's weights, so inputs and model
    # always agree regardless of any global device setting.
    model_device = next(model.parameters()).device
    with torch.no_grad():
        input_ids = tokenizer.encode(start_sequence, return_tensors='pt').to(model_device)

        # encode() closes the prompt with [SEP]; drop it so that new tokens
        # continue the prompt instead of following an end-of-sequence marker.
        if input_ids.shape[1] > 0 and input_ids[0, -1].item() == tokenizer.sep_token_id:
            generated_ids = input_ids[:, :-1]
        else:
            generated_ids = input_ids

        batch_size = generated_ids.shape[0]
        mask_column = generated_ids.new_full((batch_size, 1), tokenizer.mask_token_id)
        sep_column = generated_ids.new_full((batch_size, 1), tokenizer.sep_token_id)

        for _ in range(max_length):
            # Layout: <tokens so far> [MASK] [SEP]; the prediction target is
            # the [MASK] slot, i.e. the second-to-last position.
            model_input = torch.cat([generated_ids, mask_column, sep_column], dim=-1)

            outputs = model(model_input)
            next_token_logits = outputs.logits[:, -2, :]

            # Greedy decoding: always take the highest-scoring token.
            next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)

            # Predicting [SEP] means the model considers the sequence finished.
            if next_token_id.item() == tokenizer.sep_token_id:
                break

            generated_ids = torch.cat([generated_ids, next_token_id], dim=-1)

        generated_smiles = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        return generated_smiles

# Generate a fixed number of sequences from the same prompt.
# NOTE(review): generate_smiles picks tokens with argmax and uses no randomness,
# so all of these sequences are presumably identical — confirm whether sampling
# or varied prompts were intended.
num_sequences = 10
generated_smiles_list = [
    generate_smiles(model, "Molecule Start Sequence ", max_length=100)
    for _attempt in range(num_sequences)
]

# Persist the results, one generated sequence per line.
with open('generated_molecules.txt', 'w') as f:
    f.writelines(smile + '\n' for smile in generated_smiles_list)

print("Generated SMILES have been saved to 'generated_molecules.txt'.")


Generated SMILES have been saved to 'generated_molecules.txt'.
