In [None]:
# Mount Google Drive at /content/drive (Colab-only API). The training log
# directory configured further down is a path under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install wandb
!pip install transformers==4.0.0
!pip install catalyst==20.12

In [None]:
# Authenticate with Weights & Biases; the training cell attaches a WandbLogger
# callback, so this has to succeed before any experiment is started.
!wandb login

In [None]:
# Fetch the original paper's repository; both the `egfr` package and the
# dataset JSON used below are read from this checkout.
# NOTE(review): the clone is not pinned to a commit, so upstream changes can
# silently alter results — consider checking out a fixed revision.
!git clone https://github.com/lehgtrung/egfr-att

In [1]:
from pathlib import Path
import json
from transformers import AutoTokenizer, RobertaForSequenceClassification, RobertaConfig
import pandas as pd
from dataclasses import dataclass
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from catalyst import dl
from catalyst.utils import set_global_seed


# Local checkout of the original paper's repository (relative to the current
# working directory).
ORIGINAL_PAPER_PATH = Path("egfr-att")
import sys
# Make the checkout importable. The membership test keeps the cell idempotent:
# re-running it must not keep appending duplicate entries to sys.path.
if ORIGINAL_PAPER_PATH.as_posix() not in sys.path:
    sys.path.append(ORIGINAL_PAPER_PATH.as_posix())


# Must come after the sys.path update above, since `egfr` lives in the checkout.
from egfr.dataset import EGFRDataset, train_cross_validation_split


# Prefer the GPU, but fall back to the CPU so the notebook still runs (slowly)
# on a runtime without CUDA instead of failing later inside the training loop.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Fixed seed for reproducibility, applied through Catalyst's global seeding helper.
SEED = 21
set_global_seed(SEED)


# Dataset file shipped inside the cloned repository.
DATA_PATH = ORIGINAL_PAPER_PATH / "egfr/data/egfr_10_full_ft_pd_lines.json"

In [None]:
# Name of this run; it is embedded in the default log directory below and in
# the Weights & Biases group/run names built in the training cell.
EXPERIMENT_NAME = "transformer-no-descriptor"


@dataclass
class Config:
    """Hyper-parameters of the SMILES-only transformer experiment.

    Field order is part of the interface (it defines the positional order of
    the generated ``__init__``), so new fields should be appended at the end.
    """

    # --- tokenizer -------------------------------------------------------
    # Identifier handed to AutoTokenizer.from_pretrained.
    tokenizer_path: str = "seyonec/PubChem10M_SMILES_BPE_450k"

    # --- encoder architecture (forwarded to RobertaConfig) ---------------
    hidden_size: int = 768
    num_hidden_layers: int = 2
    num_attention_heads: int = 12
    intermediate_size: int = 3072
    hidden_dropout_prob: float = 0.1
    attention_probs_dropout_prob: float = 0.1

    # --- batching --------------------------------------------------------
    # Per-loader batch size, and the accumulation step count handed to the
    # optimizer callback.
    batch_size: int = 16
    accumulation_steps: int = 8

    # --- training length / early stopping --------------------------------
    num_epochs: int = 100
    patience: int = 10

    # --- learning-rate schedule ------------------------------------------
    # `scheduler` selects the schedule by name; `warmup_prop` is passed to
    # OneCycleLR as `pct_start`.
    scheduler: str = "OneCycleLR"
    max_lr: float = 5e-4
    warmup_prop: float = 0.2

    # --- output ----------------------------------------------------------
    # Evaluated once, when the class body runs: later changes to
    # EXPERIMENT_NAME do not affect this default.
    logdir: str = "drive/MyDrive/logdir_" + EXPERIMENT_NAME


# Default configuration instance used by the rest of the notebook.
config = Config()

In [None]:
class SequenceEGFRDataset(EGFRDataset):
    """EGFR dataset whose SMILES strings are tokenised into id tensors.

    The parent class is constructed with ``infer=True``. The attributes
    ``smiles``, ``mord_ft``, ``non_mord_ft`` and ``label`` are assumed to be
    populated by ``EGFRDataset.__init__`` (defined in the cloned repository,
    not in this notebook) — TODO confirm against that implementation.
    """

    def __init__(self, data, tokenizer):
        """Build the dataset and convert its fields to tensors.

        Args:
            data: forwarded unchanged to ``EGFRDataset.__init__``.
            tokenizer: object exposing ``encode(str)`` and ``pad_token_id``.
        """
        super().__init__(data, infer=True)
        self.tokenizer, self.pad_token_id = tokenizer, tokenizer.pad_token_id
        self.encode_smiles()

        # Replace the remaining parent-provided containers with tensors, in
        # the same order as before: two float feature sets, then the labels.
        for attr_name, tensor_type in (
            ("mord_ft", torch.FloatTensor),
            ("non_mord_ft", torch.FloatTensor),
            ("label", torch.LongTensor),
        ):
            setattr(self, attr_name, tensor_type(getattr(self, attr_name)))

    def encode_smiles(self):
        """Replace every entry of ``self.smiles`` with a LongTensor of token ids.

        The attribute is overwritten in place, so this is meant to be called
        exactly once (it is invoked from ``__init__``).
        """
        to_ids = self.tokenizer.encode
        self.smiles = [torch.LongTensor(to_ids(text)) for text in self.smiles]

    def collate_fn(self, batch):
        """Merge a list of samples into one padded batch.

        Each sample is unpacked as a 4-tuple (token ids, two feature tensors,
        label) — presumably what ``EGFRDataset.__getitem__`` yields when
        ``infer=True``; verify against the parent class.

        Returns:
            Tuple ``(smiles, mord_ft, non_mord_ft, labels)`` where ``smiles``
            is padded batch-first with ``self.pad_token_id`` and the other
            three are stacked along a new leading dimension.
        """
        token_seqs, mord_rows, non_mord_rows, targets = zip(*batch)
        padded_ids = pad_sequence(
            token_seqs,
            batch_first=True,
            padding_value=self.pad_token_id,
        )
        return (
            padded_ids,
            torch.stack(mord_rows),
            torch.stack(non_mord_rows),
            torch.stack(targets),
        )

    def make_loader(self, *args, **kwargs):
        """Return a ``DataLoader`` over this dataset using ``self.collate_fn``.

        All positional and keyword arguments are forwarded to ``DataLoader``.
        """
        return DataLoader(
            self,
            *args,
            collate_fn=self.collate_fn,
            **kwargs,
        )


In [None]:
def init_scheduler(
    optimizer: torch.optim.Optimizer,
    num_steps_per_epoch: int,
    config: Config
):
    """Create the learning-rate scheduler selected by ``config.scheduler``.

    Args:
        optimizer: optimizer whose learning rate is scheduled.
        num_steps_per_epoch: number of scheduler steps taken in one epoch.
        config: supplies ``scheduler``, ``max_lr``, ``num_epochs`` and
            ``warmup_prop`` (used as OneCycleLR's ``pct_start``).

    Returns:
        ``(scheduler, 'batch')`` when ``config.scheduler == 'OneCycleLR'``;
        ``(None, None)`` for any other value, meaning "no scheduler".
    """
    # Guard clause: only one schedule is supported, everything else is a no-op.
    if config.scheduler != 'OneCycleLR':
        return None, None

    one_cycle = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=config.max_lr,
        pct_start=config.warmup_prop,
        steps_per_epoch=num_steps_per_epoch,
        epochs=config.num_epochs,
    )
    # 'batch' is the stepping mode the caller forwards to its scheduler callback.
    return one_cycle, 'batch'


In [None]:
class EgfrNoDescriptorRunner(dl.Runner):
    """Catalyst runner that trains on SMILES token ids only.

    The two feature tensors carried in each batch are ignored; the model sees
    nothing but the padded token ids and an attention mask derived from them.
    """

    def _handle_batch(self, batch):
        """Run one forward pass and record the loss and callback inputs.

        Args:
            batch: 4-tuple ``(smiles, _, _, labels)``; ``smiles`` holds padded
                token ids and ``labels`` the binary targets.

        Side effects:
            Sets ``self.batch_metrics['loss']`` to the BCE-with-logits loss,
            ``self.input['targets']`` to the labels and
            ``self.output['logits']`` to the model logits.
        """
        smiles, _, _, labels = batch

        # Batches are padded to a common length. Without an explicit mask the
        # model attends to the padding positions, so a sample's prediction
        # would depend on what else happens to be in its batch. Build the mask
        # from the pad id stored on the model config; unwrap a possible
        # `.module` wrapper (e.g. DataParallel) to reach that config.
        core_model = getattr(self.model, "module", self.model)
        pad_token_id = getattr(getattr(core_model, "config", None), "pad_token_id", None)
        # With no pad id available, attention_mask=None reproduces the model's
        # default of attending to every position.
        attention_mask = None if pad_token_id is None else (smiles != pad_token_id).long()

        out = self.model(input_ids=smiles, attention_mask=attention_mask)

        # Labels get a trailing dimension added so they line up with the
        # logits, and are cast to float as the BCE loss requires.
        self.batch_metrics['loss'] = \
          torch.nn.functional.binary_cross_entropy_with_logits(out.logits, labels.float().unsqueeze(-1))
        self.input = {'targets': labels}
        self.output = {'logits': out.logits}


In [None]:
def experiment(train, valid, config, experiment_name, fold_idx, check=True):
    """Train and validate the SMILES-only RoBERTa classifier on one fold.

    The model is built from a fresh ``RobertaConfig`` (it is not loaded from
    pretrained weights; only the tokenizer is pretrained) with a single output
    logit, then trained through ``EgfrNoDescriptorRunner`` on ``DEVICE``.

    Args:
        train: training split, forwarded unchanged to ``SequenceEGFRDataset``.
        valid: validation split, forwarded unchanged to ``SequenceEGFRDataset``.
        config: ``Config`` instance holding architecture, batching, schedule
            and logging settings.
        experiment_name: base name for the Weights & Biases group and run.
        fold_idx: fold index, appended to the W&B run name.
        check: forwarded to ``runner.train``. Defaults to ``True`` to keep the
            notebook's existing behaviour.
            NOTE(review): Catalyst documents check mode as a short pipeline
            smoke test, not a full run — confirm, and pass ``check=False``
            for real training.

    Returns:
        None. The model is moved back to the CPU before returning.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_path)
    pad_token_id = tokenizer.pad_token_id

    model_config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        hidden_size=config.hidden_size,
        num_hidden_layers=config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        hidden_dropout_prob=config.hidden_dropout_prob,
        attention_probs_dropout_prob=config.attention_probs_dropout_prob,
        pad_token_id=pad_token_id,
        # One logit: the runner applies a binary cross-entropy loss to it.
        num_labels=1
    )
    model = RobertaForSequenceClassification(config=model_config)

    train_dataset = SequenceEGFRDataset(train, tokenizer)
    valid_dataset = SequenceEGFRDataset(valid, tokenizer)

    loaders = {
        'train': train_dataset.make_loader(batch_size=config.batch_size, shuffle=True),
        'valid': valid_dataset.make_loader(batch_size=config.batch_size)
    }

    # Created with Adam's default learning rate; when a scheduler is active it
    # controls the learning rate from then on.
    optimizer = torch.optim.Adam(model.parameters())

    callbacks = [
        dl.OptimizerCallback(accumulation_steps=config.accumulation_steps),
        dl.EarlyStoppingCallback(patience=config.patience),
        dl.WandbLogger(
            project='egfr-project',
            entity='dimaorekhov',
            # Use the function argument rather than the notebook-level global,
            # so callers can actually control the names.
            group=f"{experiment_name}_CV",
            name=f"{experiment_name}_fold_{fold_idx}",
            config=vars(config)
        ),
        dl.AUCCallback(activation='Sigmoid')
    ]

    scheduler, mode = init_scheduler(optimizer, len(loaders['train']), config)
    if scheduler is not None:
        callbacks.append(dl.SchedulerCallback(mode=mode))

    # parents=True: the log directory is nested, and its parent folders may
    # not exist yet.
    # NOTE(review): every fold writes to the same logdir, so artefacts of an
    # earlier fold may be overwritten by a later one — confirm this is intended.
    Path(config.logdir).mkdir(parents=True, exist_ok=True)

    runner = EgfrNoDescriptorRunner(device=DEVICE)
    runner.train(
        model=model,
        loaders=loaders,
        optimizer=optimizer,
        scheduler=scheduler,
        num_epochs=config.num_epochs,
        verbose=True,
        logdir=config.logdir,
        callbacks=callbacks,
        check=check
    )

    # Move the weights off the training device once the fold is finished.
    model.to(torch.device("cpu"))


In [None]:
# Run one experiment per (train, valid) pair produced by the cross-validation
# splitter from the cloned repository; the loop index identifies the fold.
for i, (train, valid) in enumerate(train_cross_validation_split(DATA_PATH.as_posix())):
    experiment(
        train=train,
        valid=valid,
        config=config,
        experiment_name=EXPERIMENT_NAME,
        fold_idx=i,
    )