In [1]:
# Mount Google Drive at /content/drive; the training log directory configured
# further down lives under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install wandb
!pip install transformers==4.0.0
!pip install catalyst==20.11

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/ca/5e/9df94df3bfee51b92b54a5e6fa277d6e1fcdf1f27b1872214b98f55ec0f7/wandb-0.10.12-py2.py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 12.5MB/s 
[?25hCollecting sentry-sdk>=0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/b1/5c/018bf9a5c24343a664deaea70e61f33f53bb1bd3caf193110f827bfd07e2/sentry_sdk-0.19.5-py2.py3-none-any.whl (128kB)
[K     |████████████████████████████████| 133kB 57.9MB/s 
Collecting watchdog>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/e6/76/39d123d37908a772b6a281d85fbb4384d9db7e13d19d10ad409006bd2962/watchdog-1.0.1.tar.gz (97kB)
[K     |████████████████████████████████| 102kB 14.7MB/s 
Collecting shortuuid>=0.5.0
  Downloading https://files.pythonhosted.org/packages/25/a6/2ecc1daa6a304e7f1b216f0896b26156b78e7c38e1211e9b798b4716c53d/shortuuid-1.0.1-py3-none-any.whl
Collecting subprocess32>=3.5.3
[?25l  Downloading https://fil

In [3]:
# Authenticate with Weights & Biases (prompts for an API key); required by the
# WandbLogger callback used during training.
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
# Fetch the original paper's repository; its `egfr` package and the data file
# under egfr/data are used by the cells below.
!git clone https://github.com/lehgtrung/egfr-att

Cloning into 'egfr-att'...
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 421 (delta 0), reused 0 (delta 0), pack-reused 416[K
Receiving objects: 100% (421/421), 18.81 MiB | 18.57 MiB/s, done.
Resolving deltas: 100% (210/210), done.


In [4]:
from pathlib import Path
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from dataclasses import dataclass
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from catalyst import dl
from catalyst.utils import set_global_seed


# Local checkout of the original paper's repository (created by the clone cell).
ORIGINAL_PAPER_PATH = Path("egfr-att")
import sys
# Make the `egfr` package importable; skip the append when the entry is already
# present so re-running this cell does not grow sys.path.
if ORIGINAL_PAPER_PATH.as_posix() not in sys.path:
    sys.path.append(ORIGINAL_PAPER_PATH.as_posix())


from egfr.dataset import EGFRDataset, train_cross_validation_split


# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# runtime without CUDA.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Fixed seed passed to catalyst's global seeding helper.
SEED = 21
set_global_seed(SEED)


# Data file shipped inside the cloned repository.
DATA_PATH = ORIGINAL_PAPER_PATH / "egfr/data/egfr_10_full_ft_pd_lines.json"

In [2]:
# Label shared by the log directory name and the W&B group / run names.
EXPERIMENT_NAME = 'chemberta-no-descriptor'


@dataclass
class Config:
    """Settings for one fine-tuning run: model, batching, schedule and logging."""

    # Model: checkpoint id given to `from_pretrained`, and which pretrained
    # parts stay trainable (consumed by `freeze_pretrained`).
    pretrained_path: str = "seyonec/PubChem10M_SMILES_BPE_450k"
    finetune_embeddings: bool = False
    n_layers_to_finetune: int = 2

    # Batching: loader batch size and the `accumulation_steps` value given to
    # the optimizer callback.
    batch_size: int = 16
    accumulation_steps: int = 8

    # Training length: epoch cap and early-stopping patience.
    num_epochs: int = 100
    patience: int = 10

    # Learning-rate schedule; `warmup_prop` is passed as OneCycleLR's `pct_start`.
    scheduler: str = 'OneCycleLR'
    max_lr: float = 0.0005
    warmup_prop: float = 0.2

    # Base directory for training logs.
    logdir: str = f'drive/MyDrive/logdir_{EXPERIMENT_NAME}'


# Single shared instance used by the cells below.
config = Config()

In [None]:
def get_tokenizer_info(tokenizer):
    """Print every special token of ``tokenizer`` together with its id(s).

    One line is printed per entry of ``tokenizer.special_tokens_map`` in the
    form ``<key>: <token> <id>``.

    Args:
        tokenizer: object exposing ``special_tokens_map`` and, for each key in
            it, either a ``<key>_id`` or a ``<key>_ids`` attribute.
    """
    for key, value in tokenizer.special_tokens_map.items():
        # Single tokens expose `<key>_id`; list-valued entries such as
        # `additional_special_tokens` expose `<key>_ids` instead, so a plain
        # getattr on `<key>_id` would raise AttributeError for them.
        if hasattr(tokenizer, f"{key}_id"):
            token_id = getattr(tokenizer, f"{key}_id")
        else:
            token_id = getattr(tokenizer, f"{key}_ids", None)
        print(f"{key}:", value, token_id)


def freeze_module(module: torch.nn.Module):
    """Turn off gradient tracking for every parameter of ``module`` (in place)."""
    for param in module.parameters():
        param.requires_grad_(False)


def freeze_pretrained(model: 'RobertaForSequenceClassification', config: Config):
    """Freeze the pretrained encoder except for its last few layers.

    The embeddings are frozen unless ``config.finetune_embeddings`` is set, and
    all but the last ``config.n_layers_to_finetune`` entries of
    ``model.roberta.encoder.layer`` are frozen. Parameters outside
    ``model.roberta.embeddings`` and ``model.roberta.encoder.layer`` are not
    touched by this function.

    Args:
        model: model exposing ``roberta.embeddings`` and ``roberta.encoder.layer``.
        config: settings providing ``finetune_embeddings`` and
            ``n_layers_to_finetune``.
    """
    if not config.finetune_embeddings:
        freeze_module(model.roberta.embeddings)

    layers = list(model.roberta.encoder.layer)
    # Clamp at zero: when more layers are requested for fine-tuning than exist,
    # nothing must be frozen (a negative stop index used to freeze everything).
    n_layers_to_freeze = max(len(layers) - config.n_layers_to_finetune, 0)
    for layer in layers[:n_layers_to_freeze]:
        freeze_module(layer)
        

class SequenceEGFRDataset(EGFRDataset):
    """``EGFRDataset`` variant whose SMILES strings are tokenized into id tensors.

    NOTE(review): relies on the parent class populating ``self.smiles``,
    ``self.mord_ft``, ``self.non_mord_ft`` and ``self.label``, and presumably on
    its item access yielding ``(smiles, mord_ft, non_mord_ft, label)`` tuples --
    confirm against ``egfr.dataset``.
    """

    def __init__(self, data, tokenizer):
        """Build the parent dataset, tokenize the SMILES and tensorize the rest."""
        super().__init__(data, infer=True)
        self.tokenizer = tokenizer
        self.pad_token_id = tokenizer.pad_token_id
        self.encode_smiles()

        # Convert the descriptor features and labels to tensors once, up front.
        self.mord_ft = torch.FloatTensor(self.mord_ft)
        self.non_mord_ft = torch.FloatTensor(self.non_mord_ft)
        self.label = torch.LongTensor(self.label)

    def encode_smiles(self):
        """Replace each entry of ``self.smiles`` with a LongTensor of token ids."""
        encoded = []
        for smiles_string in self.smiles:
            token_ids = self.tokenizer.encode(smiles_string)
            encoded.append(torch.LongTensor(token_ids))
        self.smiles = encoded

    def collate_fn(self, batch):
        """Pad the token sequences of a batch and stack the remaining fields."""
        token_seqs, mord_batch, non_mord_batch, label_batch = zip(*batch)
        # Sequences differ in length, so pad them with the tokenizer's pad id.
        padded_ids = pad_sequence(
            token_seqs, batch_first=True, padding_value=self.pad_token_id
        )
        return (
            padded_ids,
            torch.stack(mord_batch),
            torch.stack(non_mord_batch),
            torch.stack(label_batch),
        )

    def make_loader(self, *args, **kwargs):
        """Return a ``DataLoader`` over this dataset that uses ``collate_fn``."""
        return DataLoader(self, *args, collate_fn=self.collate_fn, **kwargs)


In [None]:
def init_scheduler(
    optimizer: torch.optim.Optimizer,
    num_steps_per_epoch: int,
    config: Config
):
    """Create the learning-rate scheduler selected by ``config.scheduler``.

    Returns:
        A ``(scheduler, mode)`` pair. For ``'OneCycleLR'`` this is the scheduler
        instance and the string ``'batch'``; for any other value it is
        ``(None, None)``.
    """
    if config.scheduler != 'OneCycleLR':
        return None, None

    one_cycle = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=config.max_lr,
        epochs=config.num_epochs,
        steps_per_epoch=num_steps_per_epoch,
        pct_start=config.warmup_prop,
    )
    return one_cycle, 'batch'


class EgfrNoDescriptorRunner(dl.Runner):
    """Runner that feeds only the token ids to the model, not the descriptors."""

    def _handle_batch(self, batch):
        """Run the model on one batch and record its loss, targets and logits."""
        # The two descriptor tensors in the batch are deliberately unused.
        input_ids, _mord_ft, _non_mord_ft, targets = batch
        model_out = self.model(input_ids=input_ids, labels=targets)

        self.batch_metrics['loss'] = model_out.loss
        self.input = {'targets': targets}
        self.output = {'logits': model_out.logits}


In [None]:
def experiment(train, valid, config, experiment_name, fold_idx):
    """Fine-tune and validate the sequence classifier on one cross-validation fold.

    Args:
        train: training split, forwarded to ``SequenceEGFRDataset``.
        valid: validation split, forwarded to ``SequenceEGFRDataset``.
        config: ``Config`` with model, optimisation and logging settings.
        experiment_name: label used for the W&B group and run names.
        fold_idx: index of the fold; part of the W&B run name and of the
            per-fold log sub-directory.
    """
    #  initialize model:
    model = AutoModelForSequenceClassification.from_pretrained(config.pretrained_path)
    tokenizer = AutoTokenizer.from_pretrained(config.pretrained_path)
    freeze_pretrained(model, config)

    #  initialize data:
    train_dataset = SequenceEGFRDataset(train, tokenizer)
    valid_dataset = SequenceEGFRDataset(valid, tokenizer)

    loaders = {
        'train': train_dataset.make_loader(batch_size=config.batch_size, shuffle=True),
        'valid': valid_dataset.make_loader(batch_size=config.batch_size)
    }

    optimizer = torch.optim.Adam(model.parameters())

    callbacks = [
        dl.OptimizerCallback(accumulation_steps=config.accumulation_steps),
        dl.EarlyStoppingCallback(patience=config.patience),
        dl.WandbLogger(
            project='egfr-project',
            entity='dimaorekhov',
            # Use the argument rather than the module-level constant so the
            # caller controls how runs are grouped and named.
            group=f"{experiment_name}_CV",
            name=f"{experiment_name}_fold_{fold_idx}",
            config=config.__dict__
        ),
        dl.AUCCallback()
    ]

    scheduler, mode = init_scheduler(optimizer, len(loaders['train']), config)
    if scheduler is not None:
        callbacks.append(dl.SchedulerCallback(mode=mode))

    # Give each fold its own sub-directory so later folds do not overwrite the
    # logs/checkpoints of earlier ones; create missing parents as well.
    fold_logdir = Path(config.logdir) / f"fold_{fold_idx}"
    fold_logdir.mkdir(parents=True, exist_ok=True)

    runner = EgfrNoDescriptorRunner(device=DEVICE)
    runner.train(
        model=model,
        loaders=loaders,
        optimizer=optimizer,
        scheduler=scheduler,
        num_epochs=config.num_epochs,
        verbose=True,
        logdir=fold_logdir.as_posix(),
        callbacks=callbacks
    )

    # Move the weights off the GPU before the next fold builds a fresh model.
    model.to(torch.device("cpu"))

In [None]:
# Run one training experiment per cross-validation fold of the data file.
cv_folds = train_cross_validation_split(DATA_PATH.as_posix())
for fold_idx, (train_split, valid_split) in enumerate(cv_folds):
    experiment(train_split, valid_split, config, EXPERIMENT_NAME, fold_idx)