In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="2"

In [2]:
!which python
!pwd

/home/zinoviev/miniconda3/envs/airi-summer-p16/bin/python
/home/zinoviev/2024-08-23_Project-16


In [3]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import re
from rdkit.Chem import MolFromSmiles
import string
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
atoms_tokens = ['Ag','Al','As','Au','B','Ba','Bi','Br','C','Ca',
              'Cd','Cl','Co','Cr','Cs','Cu','F','Fe','Ga','Gd',
              'Ge','H','Hg','I','In','K','Li','M','Mg','Mn',
              'Mo','N','Na','O','P','Pt','Ru','S','Sb','Sc',
              'Se','Si','Sn','V','W','Z','Zn','c','e','n','o','p','s']
atoms_tokens = sorted(atoms_tokens, key=lambda s: len(s), reverse=True)
SMI_REGEX_PATTERN = r"(\[|\]|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9]|" + \
                                                                  '|'.join(atoms_tokens) + ")"
regex = re.compile(SMI_REGEX_PATTERN)
def clean_output_sequence(output_sequence):
    return output_sequence.replace('</s>', '').replace('<sm_', '').replace(' sm_', '').replace('>', '').strip()

def add_special_symbols(text):
  output = []
  for word in text.split():
      tokens = [token for token in regex.findall(word)]
      if len(tokens) > 4 and (word == ''.join(tokens)) and MolFromSmiles(word):
          output.append(''.join(['<sm_'+t+'>' for t in tokens]))
      else:
          output.append(word)
  return ' '.join(output)

In [4]:
import pandas 

df_train = pandas.read_csv("_data/train_split_fluor.csv")
df_train[~df_train["Absorption max (nm)"].isna()][["Chromophore", "Solvent", "Absorption max (nm)"]].to_csv("train_absorption.csv", index=False)

df_test = pandas.read_csv("_data/test_split_fluor.csv")
df_test[~df_test["Absorption max (nm)"].isna()][["Chromophore", "Solvent", "Absorption max (nm)"]].to_csv("test_absorption.csv", index=False)

In [5]:
from datasets import load_dataset
dataset = load_dataset("csv", data_files={"train":"train_absorption.csv", "test":"test_absorption.csv"})

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [6]:
from transformers import T5ForSequenceClassification, T5Config

config = T5Config.from_pretrained('insilicomedicine/nach0_base')
config.num_labels=1
model = T5ForSequenceClassification.from_pretrained('insilicomedicine/nach0_base',
 config=config,
 ignore_mismatched_sizes=True)
tokenizer = AutoTokenizer.from_pretrained('insilicomedicine/nach0_base')

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at insilicomedicine/nach0_base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def preprocess(d):
    prompt = add_special_symbols(f"{d["Chromophore"]}, {d["Solvent"]}")
    inputs = tokenizer(prompt,padding="longest", max_length=512, truncation=True, return_tensors="pt")
    inputs["input_ids"] = inputs["input_ids"][0]
    inputs["attention_mask"] = inputs["attention_mask"][0]
    inputs["label"] = d["Absorption max (nm)"]
    return inputs

dataset_map = dataset.map(preprocess, batched=False)



Map:   0%|          | 0/15649 [00:00<?, ? examples/s]

Map:   0%|          | 0/1646 [00:00<?, ? examples/s]

In [8]:
from transformers import DataCollatorForTokenClassification, DataCollatorForLanguageModeling, DataCollatorWithPadding, Trainer, TrainingArguments


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
run_name = "2024-08-28_run-005_model-nach0_pred-abs-max_learning-rate-1e-4_epochs-10_batch-size-08_NB-V02"

training_args = TrainingArguments(
    output_dir=run_name,
    # run_name=run_name,
    learning_rate=5e-5,
    # learning_rate=3e-4,
    per_device_train_batch_size=8,
    # per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="tensorboard",
    resume_from_checkpoint=False,
    lr_scheduler_type="cosine",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_map["train"],
    eval_dataset=dataset_map["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,108442.384,73679.757812
2,37259.856,22399.857422
3,12803.386,10412.444336
4,8590.442,6585.351074
5,6028.5585,4738.245605
6,4590.3885,3956.91626
7,3766.2395,2971.237549
8,3043.654,2604.481201
9,2925.155,2507.635742
10,2894.338,2483.632812


There were missing keys in the checkpoint model loaded: ['transformer.encoder.embed_tokens.weight', 'transformer.decoder.embed_tokens.weight'].


TrainOutput(global_step=19570, training_loss=21892.981677950946, metrics={'train_runtime': 3461.1938, 'train_samples_per_second': 45.213, 'train_steps_per_second': 5.654, 'total_flos': 2.267919237221946e+16, 'train_loss': 21892.981677950946, 'epoch': 10.0})