## Load Data

In [19]:
!pip install cloud-tpu-client



In [None]:
!pip install timm transformers pytorch-lightning Levenshtein rdkit wandb

In [75]:
!pip install -U torch_xla>=1.13

## Main

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import timm
import torch
from torch import nn
from PIL import Image
from torch.utils.data import DataLoader, Dataset
import torch.backends.cudnn as cudnn
import wandb
import pytorch_lightning as pl
from torch.nn.utils.rnn import pad_sequence,pack_padded_sequence
from transformers import (AutoProcessor,
                          AutoTokenizer,
                          VisionEncoderDecoderModel,
                          RobertaTokenizerFast,
                          TrOCRForCausalLM,
                          AutoModel,
                          TrOCRConfig,
                          ViTModel,
                          ViTConfig,
                          ViTImageProcessor
                         )
from sklearn.model_selection import train_test_split
from rdkit import RDLogger,Chem
from rdkit.Chem import AllChem,DataStructs,MolFromSmiles,Draw
import torch.nn.functional as F
import os
from rdkit.DataStructs import TanimotoSimilarity
from Levenshtein import distance as levenshtein_distance
pl.seed_everything(56)

INFO:lightning_fabric.utilities.seed:Seed set to 56


56

In [2]:
RDLogger.DisableLog('rdApp.*')
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
class CFG:
    wandb=False
    encoder="google/vit-base-patch16-384"
    decoder="entropy/roberta_zinc_480m"
    train_path = './validation-00000-of-00001-53255e68092e122d.parquet'#'./all_ChEMBLSmiles.csv'
    train_folder = './train/'
    betas=(0.9, 0.999)
    img_size = 512
    max_pred_len = 128
    val_split_size = 1e-6
    scheduler = None
    emb_dim = 512
    attention_dim = 512
    freq_threshold = 2
    decoder_dim = 512
    img_size=512
    dropout = 0.4
    eps=1e-6
    num_workers = 2
    batch_size = 64
    encoder_lr = 1e-4
    decoder_lr = 2e-4
    weight_decay = 0.01
    fine_tune_encoder = False
    max_epoches=6
    seed=56

## Data

In [4]:
class PLDataset(Dataset):
    def __init__(self, df, tokenizer,processor):
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer
        self.processor = processor
        self.fn = lambda x : 255 if x == 255 else 0

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        label = self.df.iloc[idx]['smiles']
        image = self.processor(self._gen_smiles(label),return_tensors='pt').pixel_values
        label_enc = self.tokenizer.encode_plus(label, padding='max_length',max_length=128, truncation=True, return_tensors='pt')
        return {'image':image.squeeze(0),
                'input_ids':label_enc.input_ids.squeeze(0),
                'attention_mask':label_enc.attention_mask.squeeze(0)}

    def _gen_smiles(self,smiles):
        try:
            mol = MolFromSmiles(smiles)
            img = Draw.MolToImage(mol,size=(384,384))
            img = img.convert('L').convert('RGB')
            return img
        except:
            return Image.open('/content/Без названия (3).png')


In [5]:
class PLDataModule(pl.LightningDataModule):
    def __init__(self,tokenizer,processor):
        super().__init__()
        self.cfg = CFG()
        self.is_setup = False
        self.tokenizer = tokenizer
        self.processor = processor

    def prepare_data(self):
        self.train_df = pd.read_parquet(CFG.train_path)

    def setup(self, stage: str):
        #self.train_df, self.val_df = train_test_split(self.train_data, test_size=self.cfg.val_split_size,random_state=self.cfg.seed)
        #self.train_df = self.train_df.reset_index(drop=True)
        #self.val_df = self.val_df.reset_index(drop=True)
        self.train_dataset = PLDataset(self.train_df,self.tokenizer,self.processor)
        #self.val_dataset = PLDataset(self.val_df,self.tokenizer,self.processor)
        self.is_setup = True

    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                          batch_size=self.cfg.batch_size,
                          num_workers=self.cfg.num_workers,
                          pin_memory=True,
                          shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset,
                          batch_size=self.cfg.batch_size,
                          num_workers=self.cfg.num_workers,
                          pin_memory=True,
                          shuffle=False)

In [6]:
def char_accuracy(y_p,y):
    y_p,y = list(y_p),list(y)
    ln = min(len(y_p),len(y))
    score = 0
    for i in range(ln):
        if y_p[i] == y[i]:
            score += 1
    return score / max(len(y_p),len(y))

def correct_part(y_p):
    if Chem.MolFromSmiles(y_p) is None:
        return 0
    else:
        return 1

def tanimoto(y_p,y):
    try:
        mol1 = Chem.MolFromSmiles(y_p)
        mol2 = Chem.MolFromSmiles(y)

        vec_1 = AllChem.RDKFingerprint(mol1)
        vec_2 = AllChem.RDKFingerprint(mol2)
        return DataStructs.TanimotoSimilarity(vec_1,vec_2)
    except:
        return 0

In [7]:
class AverageMeter:
    def __init__(self):
        self.reset()

    def reset(self):
        self.preds = []
        self.targets = []

    def update(self,preds,targets):
        self.preds += preds
        self.targets += targets

    def calc_metrics(self):
        f = dict()

        f['char_acc'] = np.mean([char_accuracy(x,y) for x,y in zip(self.preds,self.targets)])
        f['corrent_part'] = np.mean([correct_part(x) for x in self.preds])
        f['tanimoto'] = np.mean([tanimoto(x,y) for x,y in zip(self.preds,self.targets)])

        return f

In [8]:
class PLModule(pl.LightningModule):
    def __init__(self,model,tokenizer):
        super().__init__()
        self.cfg = CFG()
        self.avg_meter = AverageMeter()
        self.model = model
        self.tokenizer = tokenizer

    def forward(self,image,input_ids=None,attention_mask=None):
        return self.model(pixel_values=image,labels=input_ids,decoder_attention_mask=attention_mask)

    def training_step(self, batch, _):
        loss = self(**batch).loss
        self.log_dict({'train_loss':loss.item()})
        return loss

    def validation_stepPAS(self, batch, _):
        labels = batch['input_ids'].detach().cpu().numpy()
        labels = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in labels]

        logits = self.model.generate(
            batch['image'],
            num_beams=4,
            max_length=128
        )

        logits = logits.detach().cpu().numpy()
        logits = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in logits]
        self.avg_meter.update(logits,labels)

    def predict_step(self,batch,_):

        logits = self.model.generate(
            batch['image'],
            num_beams=1,
            max_length=128
        )

        logits = logits.detach().cpu().numpy()
        logits = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in logits]
        return logits

    def on_validation_epoch_end(self):
        f = self.avg_meter.calc_metrics()
        self.log_dict(f)
        print(f)
        self.avg_meter.reset()

    def configure_optimizers(self):
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.encoder.named_parameters()],
                "lr":self.cfg.encoder_lr
            },
            {
                "params": [p for n, p in self.model.decoder.named_parameters()],
                "lr": self.cfg.decoder_lr
            },
        ]
        return torch.optim.AdamW(optimizer_grouped_parameters,
                                 betas=self.cfg.betas,
                                 weight_decay=self.cfg.weight_decay,
                                 eps=self.cfg.eps)

In [9]:
tokenizer = RobertaTokenizerFast.from_pretrained(CFG.decoder)
processor = AutoProcessor.from_pretrained(CFG.encoder)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
encoder = ViTModel(ViTConfig(hidden_size=384,
                             hidden_act='gelu',
                             image_size=384,
                             num_attention_heads=6,
                             num_hidden_layers=12,
                             num_channels=3,
                             intermediate_size=384 * 4,
                             patch_size=16))

In [11]:
decoder = TrOCRForCausalLM(TrOCRConfig(vocab_size=len(tokenizer),
                                       d_model=256,
                                       decoder_attention_heads=8,
                                       decoder_ffn_dim=1024,
                                       decoder_layers=6,
                                       activation_function='gelu',
                                       max_position_embeddings=384,
                                       dropout=0.2))

In [12]:
model = VisionEncoderDecoderModel(encoder=encoder,decoder=decoder)
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

In [13]:
dm = PLDataModule(tokenizer,processor)
dm.prepare_data()
dm.setup(0)

In [14]:
model_pl = PLModule(model,tokenizer)

In [15]:
wandb.login(key="673ae6e9b51cc896110db5327738b993795fffad")
os.environ['WANDB_API_KEY'] = "673ae6e9b51cc896110db5327738b993795fffad"
wandb.init(project='MOLECULA',name='TrOCR_tiny')

[34m[1mwandb[0m: Currently logged in as: [33mandrey20007[0m ([33mandrey2007[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [16]:
lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='epoch')
checkpoint_cb = pl.callbacks.ModelCheckpoint(
    dirpath='./outputs/',
    filename='base_model{epoch:02d}',
    monitor='tanimoto',
    mode='max',
    save_last=True
)

trainer = pl.Trainer(
    accelerator="tpu",
    precision=32,
    callbacks = [lr_monitor,checkpoint_cb],
    logger = pl.loggers.WandbLogger(),
    min_epochs=1,
    devices='auto',
    check_val_every_n_epoch=1,
    max_epochs=CFG.max_epoches
)



MisconfigurationException: `XLAAccelerator` can not run on your system since the accelerator is not available. The following accelerator(s) is available and can be passed into `accelerator` argument of `Trainer`: ['cpu'].