In [12]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, RobertaTokenizerFast, RobertaForSequenceClassification, pipeline
import selfies as sf
import numpy as np

In [13]:
generation_tokenizer = T5Tokenizer.from_pretrained("QizhiPei/biot5-base-text2mol", model_max_length=512)
base_generation_model = 'models/biot5-base-text2mol-finetuned'
augmented_generation_model = 'models/biot5-base-text2mol-augmented'
classifier = pipeline("text-classification", model = "models/chemberta")

generation_size = 1000

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
for model_path in [base_generation_model, augmented_generation_model]:
    generation_model = T5ForConditionalGeneration.from_pretrained(model_path, use_safetensors=True)
    task_definition = 'Definition: You are given a molecule description in English. Your job is to generate the molecule SELFIES that fits the description.\n\n'
    text_input = 'The molecule is a COVID-19 drug candidate.'
    task_input = f'Now complete the following example -\nInput: {text_input}\nOutput: '

    model_input = task_definition + task_input
    input_ids = generation_tokenizer(model_input, return_tensors="pt").input_ids

    generation_config = generation_model.generation_config
    generation_config.max_length = 512
    generation_config.num_beams = 1

    mol_selfies = []
    mol_smiles = []
    dupes = 0

    for i in range(generation_size):  
        outputs = generation_model.generate(input_ids, generation_config=generation_config)
        output_selfies = generation_tokenizer.decode(outputs[0], skip_special_tokens=True).replace(' ', '')
        mol_selfies.append(output_selfies)        
        output_smiles = sf.decoder(output_selfies)
        mol_smiles.append(output_smiles)

    mol_activity = [1 if result["label"] == 'LABEL_1' else 0 for result in classifier(mol_smiles)]
    predicted_activity_rate = np.average(mol_activity)
    print(model_path, predicted_activity_rate)

models/biot5-base-text2mol-finetuned 1.0
models/biot5-base-text2mol-augmented 1.0
