In [None]:
import torch
from tokenizers import Regex, Tokenizer, models, pre_tokenizers, trainers
from transformers import (
    BertConfig,
    BertForMaskedLM,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
)
# Atom-level SMILES pattern. Bracket atoms such as "[nH]" or "[C@@H]" are listed
# first so they are kept whole; "Br"/"Cl" are matched before the single letters.
SMILES_PATTERN = r"(\[[^\]]+\]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\||\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
MAX_SMILES_TOKENS = 128  # Adjust based on max SMILES length

# Raw `tokenizers` object: used only for training and serialisation.
# unk_token lets characters outside the learned vocabulary map to "[UNK]".
bpe_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
bpe_tokenizer.pre_tokenizer = pre_tokenizers.Split(
    # Split treats a plain str as a literal substring; the pattern has to be
    # wrapped in Regex to be interpreted as a regular expression.
    pattern=Regex(SMILES_PATTERN),
    behavior="isolated",
)

# Train the tokenizer on the MOSES dataset
bpe_trainer = trainers.BpeTrainer(special_tokens=SPECIAL_TOKENS)
bpe_tokenizer.train(files=["moses_smiles.txt"], trainer=bpe_trainer)

# Save the tokenizer
bpe_tokenizer.save("smiles_tokenizer.json")

# Wrap the trained tokenizer in the transformers interface. The data collator
# needs the mask/pad token attributes, and downstream code calls `tokenizer(...)`
# and `tokenizer.convert_tokens_to_ids(...)`, none of which exist on the raw object.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=bpe_tokenizer,
    model_max_length=MAX_SMILES_TOKENS,  # makes truncation=True cut at the model limit
    pad_token="[PAD]",
    unk_token="[UNK]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Define model configuration
config = BertConfig(
    vocab_size=bpe_tokenizer.get_vocab_size(),
    hidden_size=256,  # Embedding dimension
    num_hidden_layers=4,
    num_attention_heads=8,
    max_position_embeddings=MAX_SMILES_TOKENS,
    pad_token_id=tokenizer.pad_token_id,  # keep the model's padding id in sync with the tokenizer
)
model = BertForMaskedLM(config)


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,  # Mask 15% of tokens
)

In [None]:
# Hyperparameters for the masked-LM training run, collected in one mapping so
# they can be inspected or tweaked before the arguments object is built.
training_options = dict(
    output_dir="./smiles_bert",
    overwrite_output_dir=True,
    num_train_epochs=50,
    per_device_train_batch_size=64,
    logging_steps=100,
    save_steps=1000,
)
training_args = TrainingArguments(**training_options)

# `model` and `data_collator` come from the setup cell; `tokenized_dataset`
# (the preprocessed dataset) is not defined in this cell and must already
# exist in the kernel.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()

In [None]:
# Inference only: disable dropout so repeated calls give identical embeddings.
# Assumes `tokenizer` exposes the transformers tokenizer interface (callable,
# with `convert_tokens_to_ids`), as the calls below require.
model.eval()

# Get the static (context-independent) embedding for a token (e.g., "C").
# detach() returns a plain tensor that is not tied to the autograd graph.
token_id = tokenizer.convert_tokens_to_ids("C")
embedding = model.bert.embeddings.word_embeddings.weight[token_id].detach()

# Get contextual embeddings for a SMILES string
inputs = tokenizer("CCO", return_tensors="pt", padding=True, truncation=True)
# Training may have moved the model to an accelerator; inputs must be on the same device.
inputs = inputs.to(model.device)
with torch.no_grad():  # no gradients needed for feature extraction
    outputs = model(**inputs, output_hidden_states=True)
last_hidden_states = outputs.hidden_states[-1]  # Shape: [batch_size, seq_len, hidden_size]