In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install wandb
!pip install transformers==4.0.0
!pip install catalyst==20.11

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/ca/5e/9df94df3bfee51b92b54a5e6fa277d6e1fcdf1f27b1872214b98f55ec0f7/wandb-0.10.12-py2.py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 12.5MB/s 
[?25hCollecting sentry-sdk>=0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/b1/5c/018bf9a5c24343a664deaea70e61f33f53bb1bd3caf193110f827bfd07e2/sentry_sdk-0.19.5-py2.py3-none-any.whl (128kB)
[K     |████████████████████████████████| 133kB 57.9MB/s 
Collecting watchdog>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/e6/76/39d123d37908a772b6a281d85fbb4384d9db7e13d19d10ad409006bd2962/watchdog-1.0.1.tar.gz (97kB)
[K     |████████████████████████████████| 102kB 14.7MB/s 
Collecting shortuuid>=0.5.0
  Downloading https://files.pythonhosted.org/packages/25/a6/2ecc1daa6a304e7f1b216f0896b26156b78e7c38e1211e9b798b4716c53d/shortuuid-1.0.1-py3-none-any.whl
Collecting subprocess32>=3.5.3
[?25l  Downloading https://fil

In [3]:
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
!git clone https://github.com/lehgtrung/egfr-att

Cloning into 'egfr-att'...
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 421 (delta 0), reused 0 (delta 0), pack-reused 416[K
Receiving objects: 100% (421/421), 18.81 MiB | 18.57 MiB/s, done.
Resolving deltas: 100% (210/210), done.


In [5]:
from pathlib import Path
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from dataclasses import dataclass
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from catalyst import dl
from catalyst.utils import set_global_seed


ORIGINAL_PAPER_PATH = Path("egfr-att")
import sys
sys.path.append(ORIGINAL_PAPER_PATH.as_posix())


from egfr.dataset import EGFRDataset


DEVICE = torch.device('cuda')


SEED = 21
set_global_seed(SEED)


DATA_PATH = ORIGINAL_PAPER_PATH / "egfr/data/egfr_10_full_ft_pd_lines.json"

In [6]:
EXPERIMENT_NAME = 'chemberta-no-descriptor'


@dataclass
class Config:

    pretrained_path: str = "seyonec/PubChem10M_SMILES_BPE_450k"
    finetune_embeddings: bool = False
    n_layers_to_finetune: int = 2

    batch_size: int = 16
    accumulation_steps: int = 8
  
    num_epochs: int = 100
    patience: int = 10

    scheduler: str = 'OneCycleLR'
    max_lr: float = 0.0005
    warmup_prop: float = 0.2

    logdir: str = f'drive/MyDrive/logdir_{EXPERIMENT_NAME}'


config = Config()

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(config.pretrained_path)
tokenizer = AutoTokenizer.from_pretrained(config.pretrained_path)


def get_tokenizer_info(tokenizer):
    for key, value in tokenizer.special_tokens_map.items():
        print(f"{key}:", value, getattr(tokenizer, f"{key}_id"))

get_tokenizer_info(tokenizer)


PAD_TOKEN_ID = tokenizer.pad_token_id

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=515.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=336422980.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_45

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=164540.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=101307.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=772.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=62.0, style=ProgressStyle(description_w…


bos_token: <s> 0
eos_token: </s> 2
unk_token: <unk> 3
sep_token: </s> 2
pad_token: <pad> 1
cls_token: <s> 0
mask_token: <mask> 4


In [8]:
def freeze_module(module: torch.nn.Module):
    for p in module.parameters():
        p.requires_grad = False


def freeze_pretrained(model: 'RobertaForSequenceClassification', config: Config):
    if not config.finetune_embeddings:
        freeze_module(model.roberta.embeddings)

    n_layers = len(model.roberta.encoder.layer)
    layer_idx_to_stop = n_layers - config.n_layers_to_finetune
    for i, layer in enumerate(model.roberta.encoder.layer):
        if i == layer_idx_to_stop:
            break
        freeze_module(layer)


freeze_pretrained(model, config)

In [9]:
class SequenceEGFRDataset(EGFRDataset):

    def __init__(self, data, tokenizer):
        super().__init__(data, infer=True)
        self.tokenizer = tokenizer
        self.pad_token_id = tokenizer.pad_token_id
        self.encode_smiles()

        self.mord_ft = torch.FloatTensor(self.mord_ft)
        self.non_mord_ft = torch.FloatTensor(self.non_mord_ft)
        self.label = torch.LongTensor(self.label)

    def encode_smiles(self):
        self.smiles = [
            torch.LongTensor(self.tokenizer.encode(s))
            for s in self.smiles
        ]

    def collate_fn(self, batch):
        smiles, mord_ft, non_mord_ft, labels = zip(*batch)
        smiles = pad_sequence(
            smiles, batch_first=True, padding_value=self.pad_token_id
        )
        mord_ft = torch.stack(mord_ft)
        non_mord_ft = torch.stack(non_mord_ft)
        labels = torch.stack(labels)
        return smiles, mord_ft, non_mord_ft, labels

    def make_loader(self, *args, **kwargs):
        return DataLoader(self, *args, collate_fn=self.collate_fn, **kwargs)


In [10]:
train, valid = train_test_split(
    pd.read_json(DATA_PATH, lines=True), test_size=0.2, random_state=42 #  42 hard code is from original paper code 
)


train_dataset = SequenceEGFRDataset(train, tokenizer)
valid_dataset = SequenceEGFRDataset(valid, tokenizer)

In [11]:
print('Max train smiles length:', max(len(s) for s in train_dataset.smiles))
print('Max valid smiles length:', max(len(s) for s in valid_dataset.smiles))

Max train smiles length: 100
Max valid smiles length: 93


In [12]:
loaders = {
    'train': train_dataset.make_loader(batch_size=config.batch_size, shuffle=True),
    'valid': valid_dataset.make_loader(batch_size=config.batch_size)
}

In [13]:
def init_scheduler(
    optimizer: torch.optim.Optimizer,
    num_steps_per_epoch: int,
    config: Config
):

    if config.scheduler == 'OneCycleLR':
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=config.max_lr,
            epochs=config.num_epochs,
            steps_per_epoch=num_steps_per_epoch,
            pct_start=config.warmup_prop
        )
        return scheduler, 'batch'

    return None, None


In [14]:
optimizer = torch.optim.Adam(model.parameters())

callbacks = [
    dl.OptimizerCallback(accumulation_steps=config.accumulation_steps),
    dl.EarlyStoppingCallback(patience=config.patience),
    dl.WandbLogger(
        project='egfr-project',
        entity='dimaorekhov',
        group='chemberta-no-descriptor',
        name=EXPERIMENT_NAME,
        config=config.__dict__
    ),
    dl.AUCCallback()
]

scheduler, mode = init_scheduler(optimizer, len(loaders['train']), config)
if scheduler is not None:
    callbacks.append(dl.SchedulerCallback(mode=mode))

In [15]:
class EgfrNoDescriptorRunner(dl.Runner):

    def _handle_batch(self, batch):
        smiles, _, _, labels = batch
        out = self.model(input_ids=smiles, labels=labels)
        self.batch_metrics['loss'] = out.loss
        self.input = {'targets': labels}
        self.output = {'logits': out.logits}


In [16]:
# be careful not to override log dir
Path(config.logdir).mkdir(exist_ok=True)

In [17]:
runner = EgfrNoDescriptorRunner(device=DEVICE)
runner.train(
    model=model,
    loaders=loaders,
    optimizer=optimizer,
    scheduler=scheduler,        
    num_epochs=config.num_epochs,
    verbose=True,
    logdir=config.logdir,
    callbacks=callbacks
)

[34m[1mwandb[0m: Currently logged in as: [33mdimaorekhov[0m (use `wandb login --relogin` to force relogin)


1/100 * Epoch (train):   1% 2/175 [00:00<00:38,  4.48it/s, loss=0.588, lr=2.000e-05, momentum=0.950]


Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`.  Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate


To get the last learning rate computed by the scheduler, please use `get_last_lr()`.



1/100 * Epoch (train): 100% 175/175 [00:09<00:00, 18.75it/s, loss=0.246, lr=2.296e-05, momentum=0.949]
1/100 * Epoch (valid): 100% 44/44 [00:01<00:00, 32.46it/s, loss=0.296]
[2020-12-11 21:42:34,635] 
1/100 * Epoch 1 (_base): lr=2.296e-05 | momentum=0.9494
1/100 * Epoch 1 (train): auc/class_00=0.5750 | auc/class_01=0.5516 | auc/mean=0.5633 | loss=0.4340 | lr=2.099e-05 | momentum=0.9498
1/100 * Epoch 1 (valid): auc/class_00=0.7100 | auc/class_01=0.7127 | auc/mean=0.7113 | loss=0.3737
2/100 * Epoch (train): 100% 175/175 [00:09<00:00, 17.72it/s, loss=0.320, lr=3.175e-05, momentum=0.948]
2/100 * Epoch (valid): 100% 44/44 [00:01<00:00, 29.86it/s, loss=0.215]
[2020-12-11 21:43:17,165] 
2/100 * Epoch 2 (_base): lr=3.175e-05 | momentum=0.9476
2/100 * Epoch 2 (train): auc/class_00=0.7816 | auc/class_01=0.7635 | auc/mean=0.7726 | loss=0.3513 | lr=2.689e-05 | momentum=0.9486
2/100 * Epoch 2 (valid): auc/class_00=0.8351 | auc/class_01=0.8287 | auc/mean=0.8319 | loss=0.3180
3/100 * Epoch (train): 1

VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
auc/class_00/train,0.9814
auc/class_01/train,0.97945
auc/mean/train,0.98043
loss/train,0.12535
lr/train,0.00039
momentum/train,0.87387
auc/class_00/valid,0.93625
auc/class_01/valid,0.93569
auc/mean/valid,0.93597
loss/valid,0.25785


0,1
auc/class_00/train,▁▅▆▇▇█████████
auc/class_01/train,▁▄▆▇▇█████████
auc/mean/train,▁▄▆▇▇█████████
loss/train,█▆▅▄▃▂▂▂▂▁▁▁▁▁
lr/train,▁▁▁▂▂▃▃▄▅▅▆▇▇█
momentum/train,███▇▇▆▆▅▄▄▃▂▂▁
auc/class_00/valid,▁▅▇███████████
auc/class_01/valid,▁▅▇███████████
auc/mean/valid,▁▅▇███████████
loss/valid,█▅▃▁▁▂▂▂▃▃▆▄▂▃


Top best models:
drive/MyDrive/logdir_chemberta-no-descriptor/checkpoints/train.4.pth	0.2249
