In [1]:
# Restrict this kernel to the GPUs with indices 1 and 2 via the CUDA_VISIBLE_DEVICES environment variable.
# NOTE(review): presumably this must run before any CUDA-using library initialises the GPU — confirm.
%env CUDA_VISIBLE_DEVICES=1,2

env: CUDA_VISIBLE_DEVICES=1,2


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from datasets import Dataset

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from tqdm.notebook import trange, tqdm

import selfies as sf

In [None]:
# Register tqdm's pandas integration; the prompt-building cell relies on `DataFrame.progress_apply`.
tqdm.pandas()

In [None]:
# Load the pretrained BioT5 mol2text checkpoint: the seq2seq model and its tokenizer
# (tokenizer inputs are capped at 512 tokens).
model = T5ForConditionalGeneration.from_pretrained("QizhiPei/biot5-base-mol2text")
tokenizer = T5Tokenizer.from_pretrained(
    "QizhiPei/biot5-base-mol2text",
    model_max_length=512,
)

# Проверка исходной модели на работоспособность

In [None]:
# Example molecule, written as a SELFIES string.
selfies_input = '[C][C][Branch1][C][O][C][C][=Branch1][C][=O][C][=Branch1][C][=O][O-1]'

# Instruction part of the prompt.
task_definition = (
    'Definition: You are given a molecule SELFIES. '
    'Your job is to generate the molecule description in English that fits the molecule SELFIES.\n\n'
)

# Example part of the prompt: the molecule wrapped in <bom>/<eom> markers, ending where the answer starts.
task_input = (
    'Now complete the following example -\n'
    f'Input: <bom>{selfies_input}<eom>\n'
    'Output: '
)

In [None]:
# Full prompt: the task definition immediately followed by the example to complete.
model_input = f"{task_definition}{task_input}"
# Tokenise the prompt into PyTorch token ids.
input_ids = tokenizer(model_input, return_tensors="pt")["input_ids"]

In [None]:
# Sanity-check generation with the untouched pretrained model.
# `generation_config` is the model's own config object (not a copy), so the two
# assignments below change `model.generation_config` for later `generate` calls too.
generation_config = model.generation_config
generation_config.max_length = 512
generation_config.num_beams = 1  # single beam, i.e. no beam search

outputs = model.generate(input_ids, generation_config=generation_config)

# Decode the first returned sequence, dropping special tokens.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Обучение

## Подготовка датасета

In [None]:
# Load the raw training split from CSV (path is relative to the notebook's working directory).
df = pd.read_csv('train_split_fluor.csv')

In [None]:
# Keep the molecule, the solvent and the target column, drop incomplete rows,
# and exclude rows whose solvent is recorded as "gas".
# The original note lists "Absorption max (nm)" and "Emission max (nm)" as candidate targets.
df = (
    df[["Chromophore", "Solvent", "Absorption max (nm)"]]
    .dropna()
    .loc[lambda frame: frame["Solvent"] != "gas"]
)

In [None]:
def gen_task(chromophore, solvent):
    """Build the text prompt for one (chromophore, solvent) pair.

    Both molecules are converted with ``sf.encoder`` and embedded in the instruction
    template between ``<bom>``/``<eom>`` markers.

    Args:
        chromophore: Molecule string for the chromophore, as accepted by ``sf.encoder``
            (presumably SMILES — confirm against the dataset).
        solvent: Molecule string for the solvent, as accepted by ``sf.encoder``.

    Returns:
        The prompt string ending in ``"Output: "``, or ``None`` when either molecule
        could not be encoded. The offending pair is printed so it can be inspected.
    """
    # NOTE(review): the two sentences below are concatenated without a separating space
    # ("...SELFIES.Your job..."). The text is deliberately left byte-for-byte unchanged,
    # because checkpoints fine-tuned on this prompt expect exactly this wording.
    task_definition = (
        'Definition: You are given chromophore molecule SELFIES and solvent chromophore SELFIES.'
        'Your job is to generate the wavelength of max chromophore molecule absorption fits the molecules SELFIES.\n\n')
    try:
        chromophore_selfies = sf.encoder(chromophore)
        solvent_selfies = sf.encoder(solvent)
    except Exception:
        # Catch encoding failures only: unlike a bare `except:`, this lets
        # KeyboardInterrupt / SystemExit propagate so a long apply can be stopped.
        print(chromophore, solvent)
        return None

    # Guard against an encoder that signals failure by returning None instead of
    # raising; otherwise the literal text "None" would end up inside the prompt.
    if chromophore_selfies is None or solvent_selfies is None:
        print(chromophore, solvent)
        return None

    return (
        f'{task_definition}'
        'Now complete the following example-\n'
        'Input: '
        f'Chromophore: <bom>{chromophore_selfies}<eom> Solvent: <bom>{solvent_selfies}<eom>\nOutput: ')

In [None]:
# Build one prompt per row (with a progress bar); rows that cannot be encoded get None.
df["input"] = df.progress_apply(
    lambda row: gen_task(row["Chromophore"], row["Solvent"]),
    axis=1,
)

In [None]:
# Drop rows with any missing value — in particular those whose prompt is None because gen_task failed.
df = df.dropna()

In [None]:
# Target text is the absorption maximum as an integer string.
# Note: astype(int) truncates any fractional part rather than rounding.
df["output"] = df["Absorption max (nm)"].astype(int).astype(str)

In [None]:
# Preview the first three prompt/target pairs.
df[["input", "output"]].head(3)

In [None]:
# Persist the prepared dataset for the training section.
# The DataFrame index is written too (no index=False), so it comes back as an extra unnamed column on read.
df.to_csv("absorption_p1.csv")

## Обучение

In [None]:
# Root directory for this run; later cells write checkpoints to {mp}/results, logs to {mp}/logs
# and the final model to {mp}/model.
mp = "./_MODELS_AND_RESULTS/justtext-absorption-all-10epoch"

In [None]:
# Reload the prepared dataset and make sure the targets are text again
# (read_csv may parse the numeric-looking "output" column as numbers).
df = pd.read_csv("absorption_p1.csv").assign(
    output=lambda frame: frame["output"].astype(str)
)

In [None]:
# Sanity check: show the first prompt and its target.
print(df.iloc[0].input)
print(df.iloc[0].output)

In [None]:
# Hold out part of the data for evaluation. The fixed random_state makes the split
# reproducible across kernel restarts, so metrics and checkpoints from different
# runs refer to the same train/eval partition.
train, test = train_test_split(df[["input", "output"]], random_state=42)

# Wrap both splits as Hugging Face datasets for the Trainer.
train_dataset = Dataset.from_pandas(train)
eval_dataset = Dataset.from_pandas(test)

In [None]:
def tokenize_function(example):
    """Tokenise prompts and targets for seq2seq training.

    Args:
        example: Mapping with "input" and "output" entries — either single strings
            or lists of strings (the form ``Dataset.map(..., batched=True)`` passes).

    Returns:
        The tokenizer output for the prompts, with an added "labels" entry holding
        the target token ids in which every padding position is replaced by -100.
    """
    inputs = tokenizer(example["input"], padding="max_length", truncation=True, max_length=512)
    targets = tokenizer(example["output"], padding="max_length", truncation=True, max_length=512)

    # Targets are only a few tokens long but padded to 512. Without masking, the loss
    # would be dominated by predicting padding; -100 is the label value the loss ignores.
    pad_id = tokenizer.pad_token_id
    label_ids = targets["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        labels = [[-100 if tok == pad_id else tok for tok in seq] for seq in label_ids]
    else:
        # Single example: one flat id sequence.
        labels = [-100 if tok == pad_id else tok for tok in label_ids]

    inputs["labels"] = labels
    return inputs

# Tokenise both splits in batches with the same function.
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

In [None]:
# Fine-tuning configuration. Everything is written below the run directory `mp`.
training_args = TrainingArguments(
    # --- outputs and logging ---
    output_dir=f"{mp}/results",        # checkpoints
    logging_dir=f"{mp}/logs",          # training logs
    logging_steps=1,                   # log every optimisation step
    # --- schedule: save and evaluate once per epoch ---
    num_train_epochs=10,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    # --- optimisation ---
    learning_rate=1.33e-4/2,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- model selection: reload the checkpoint chosen by eval loss when training ends ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# The Trainer fine-tunes the already loaded `model` on the tokenised splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

trainer.train()

In [None]:
# Save the fine-tuned model and its tokenizer side by side under {mp}/model.
model.save_pretrained(f"{mp}/model")
tokenizer.save_pretrained(f"{mp}/model")

In [1]:
import subprocess

In [2]:
# One-off clean-up helper. This cell previously sent SIGKILL to the hard-coded PID
# 1031650 on every run; PIDs are recycled, so re-running the notebook could kill an
# unrelated process. The kill is now opt-in: set the PID explicitly when a stale
# process (presumably an old training kernel still holding GPU memory — confirm) must go.
STALE_TRAINING_PID = None  # e.g. 1031650

if STALE_TRAINING_PID is not None:
    # check=True surfaces a failed kill (e.g. no such process) instead of hiding it.
    subprocess.run(["kill", "-9", str(int(STALE_TRAINING_PID))], check=True)

## Проверка своей модели

In [None]:
import torch

In [None]:
# Run inference on the GPU when one is visible, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Put the model in evaluation mode so dropout is disabled during generation;
# after training the module may still be in train mode, which makes outputs noisy.
model.eval();

In [None]:
# Spot-check one example: generate a prediction for row i and compare it with the target.
i = 0
input_text = df["input"].iloc[i]
# Token ids are moved to the same device as the model.
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate output text
output_ids = model.generate(input_ids)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Prints the generated text followed by the expected target.
print(output_text, df["output"].iloc[i])

### Проверка всех

In [None]:
# Predict the absorption maximum for a fixed slice of rows and collect the numeric
# predictions next to the ground truth.
# NOTE(review): the rows come from the full `df`, which also contains the training
# split, so the resulting error is not a clean held-out estimate — confirm whether
# `test` should be used here instead.
pred = []
real = []
unparsed = []  # (row position, generated text) for generations that are not integers

# Clamp the upper bound so a shorter DataFrame does not raise IndexError.
for i in trange(1000, min(1100, len(df))):
    input_text = df["input"].iloc[i]
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    # Generate output text
    output_ids = model.generate(input_ids)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    try:
        predicted = int(output_text)
    except ValueError:
        # A single non-numeric generation must not abort the whole evaluation;
        # skip the row (keeping pred/real aligned) and report it afterwards.
        unparsed.append((i, output_text))
        continue
    pred.append(predicted)
    real.append(int(df["output"].iloc[i]))

if unparsed:
    print(f"Skipped {len(unparsed)} non-numeric generations, e.g. {unparsed[:5]}")

In [None]:
import numpy as np

In [None]:
# Convert the collected predictions and targets to NumPy arrays for metric computation.
pred, real = np.array(pred), np.array(real)

In [None]:
from sklearn.metrics import mean_squared_error as mse

In [None]:
# Mean squared error between predictions and targets
# (the metric is symmetric, so the argument order does not change the value).
mse(pred, real)

### Другой метод

In [None]:
# Alternative path: tokenise with the tokenizer's __call__ API instead of `encode`.
i = 1000
model_input = df["input"].iloc[i]
# Move the ids to whatever device the model currently lives on; the model was moved
# to the GPU earlier, and CPU ids would make `generate` fail with a device mismatch.
input_ids = tokenizer(model_input, return_tensors="pt").input_ids.to(model.device)

In [None]:
# Generate with an explicit generation config.
# `generation_config` is the model's own config object (not a copy), so these
# assignments change `model.generation_config` itself.
generation_config = model.generation_config
generation_config.max_length = 512
generation_config.num_beams = 1  # single beam, i.e. no beam search

outputs = model.generate(input_ids, generation_config=generation_config)

# Decode the first returned sequence, dropping special tokens.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Получение эмбеддингов

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# One-element list: the example SELFIES string (`selfies_input`) wrapped in <bom>/<eom> markers.
text = [f'<bom>{selfies_input}<eom>']

In [None]:
# Load the same checkpoint through sentence-transformers to obtain embeddings.
# NOTE(review): this checkpoint is a T5 seq2seq model, not a packaged sentence-transformers
# model; presumably a default pooling layer is added on load — confirm the pooling used.
m = SentenceTransformer("QizhiPei/biot5-base-mol2text")

In [None]:
# Encode the text and display the result as a table (presumably one row per input string).
# Note: this rebinds `df`, replacing the dataset DataFrame used in the earlier cells.
embeddings = m.encode(text)
df = pd.DataFrame(embeddings)
df