# Requirements

In [None]:
#!g1.1
%pip install --upgrade pip
%pip install transformers
%pip install pyyaml==5.4.1
%pip install gdown
%pip install wandb
%pip install sentencepiece

In [8]:
#!g1.1
!wget https://gist.githubusercontent.com/ArseniyBolotin/7623835da1631b00fb150bcd5b0d909f/raw/wandb_writer.py -O wandb_writer.py

--2022-05-17 17:11:29--  https://gist.githubusercontent.com/ArseniyBolotin/7623835da1631b00fb150bcd5b0d909f/raw/wandb_writer.py
Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2609 (2.5K) [text/plain]
Saving to: ‘wandb_writer.py’


2022-05-17 17:11:29 (23.2 MB/s) - ‘wandb_writer.py’ saved [2609/2609]



# Config

In [2]:
#!g1.1
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
#!g1.1
MODEL_TYPE = "microsoft/deberta-v3-large"

In [4]:
#!g1.1
from transformers import logging
logging.set_verbosity_error()

In [5]:
#!g1.1
# Ids reserved for special target tokens. BOS and EOS wrap the label-index
# sequence built by AAPDDataset.target_to_tensor_with_specials.
# NOTE(review): PAD and UNK are not referenced in the visible cells —
# presumably used by a sequence decoder elsewhere; confirm.
PAD = 0
UNK = 1
BOS = 2
EOS = 3

# Number of output classes handed to the classifier head.
N_CLASSES = 54
# Target vocabulary = one id per class plus the four special ids above.
tgt_vocab_size = N_CLASSES + 4

In [6]:
#!g1.1
BATCH_SIZE = 16

# Download AAPD

In [7]:
#!g1.1
import gdown

url = 'https://drive.google.com/drive/folders/1qw05BnA1O-XDgJ50OgNGFSlTa9Kls00j?usp=sharing'
gdown.download_folder(url, quiet=True)

['/content/AAPD/label_test',
 '/content/AAPD/label_train',
 '/content/AAPD/label_val',
 '/content/AAPD/test.tsv',
 '/content/AAPD/text_test',
 '/content/AAPD/text_train',
 '/content/AAPD/text_val',
 '/content/AAPD/train.tsv',
 '/content/AAPD/validation.tsv']

In [None]:
#!g1.1
!mkdir AAPD
!mv *.tsv AAPD
!mv text_* AAPD
!mv label_* AAPD


# Datasets and Dataloaders

In [9]:
#!g1.1
def apply_to_dict_values(dict, f):
    """Replace every value of ``dict`` in place with ``f(value)``.

    Keys are left untouched and nothing is returned; the mapping passed in is
    the one that gets modified.
    """
    for name, old_value in dict.items():
        dict[name] = f(old_value)

In [10]:
#!g1.1
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

class AAPDDataset(Dataset):
    """AAPD dataset backed by a headerless, tab-separated file.

    Column 0 is read as the label string (one character per class, '1' marking
    an active class) and column 1 as the document text. Text is tokenized
    lazily, one item at a time, in ``__getitem__``.
    """

    def __init__(self, path, max_length=512):
        """Load the TSV at ``path`` and the tokenizer for ``MODEL_TYPE``.

        Args:
            path: Location of the ``.tsv`` split.
            max_length: Token length every item is padded/truncated to.
                Defaults to 512 (the value the notebook attributes to DocBERT).
        """
        self.path = path
        self.max_length = max_length
        self.data = self._read_tsv(self.path)
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_TYPE)

    @staticmethod
    def _read_tsv(path):
        """Read the split with every column kept as text.

        ``dtype=str`` stops pandas from inferring a numeric dtype for the label
        column, so bit-strings such as ``"0011"`` keep their leading zeros and
        stay iterable character by character.
        """
        return pd.read_csv(path, sep='\t', header=None, dtype=str)

    def __len__(self):
        """Number of rows in the split."""
        return self.data.shape[0]

    @staticmethod
    def target_to_tensor(target):
        """Turn a label string into a float tensor with one entry per character."""
        return torch.tensor([float(label) for label in target])

    @staticmethod
    def target_to_tensor_with_specials(target):
        """Encode the active classes as ``[BOS, idx + 4, ..., EOS]``.

        Positions whose character is '1' are shifted by 4 to skip the special
        ids. The label entries are floats, so the resulting tensor is floating
        point rather than integer.
        """
        return torch.tensor([BOS] + [float(index) + 4 for index, label in enumerate(target) if label == '1'] + [EOS])

    def __getitem__(self, idx):
        """Return ``(tokenizer_output, multi_hot_target)`` for row ``idx``."""
        data = self.tokenizer(
            self.data.iloc[idx, 1],
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        # The tokenizer returns tensors with a leading batch axis of size 1;
        # flatten them so the DataLoader can stack items itself.
        apply_to_dict_values(data, lambda x: x.flatten())
        return data, AAPDDataset.target_to_tensor(self.data.iloc[idx, 0])

In [11]:
#!g1.1
train_dataset = AAPDDataset('./AAPD/train.tsv')
val_dataset = AAPDDataset('./AAPD/validation.tsv')
test_dataset = AAPDDataset('./AAPD/test.tsv')

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"


In [14]:
#!g1.1
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Models

In [29]:
#!g1.1
import torch
import torch.nn as nn


class DocDeberta(nn.Module):
    """Pretrained encoder (``MODEL_TYPE``) with a linear classification head.

    The head maps the last-layer hidden state at sequence position 0 to
    ``n_classes`` raw logits; no activation is applied here.
    """

    def __init__(self, n_classes):
        """Load the encoder and size the head from its configuration.

        Args:
            n_classes: Number of logits produced per example.
        """
        super().__init__()
        self.deberta = AutoModel.from_pretrained(MODEL_TYPE)
        # Take the width from the loaded checkpoint instead of hard-coding
        # 1024, so the head still matches if MODEL_TYPE is switched to a
        # checkpoint with a different hidden size.
        self.hidden_size = self.deberta.config.hidden_size
        self.n_classes = n_classes
        self.dense = nn.Linear(self.hidden_size, self.n_classes)

    def forward(self, inputs):
        """Return logits for a batch.

        Args:
            inputs: Mapping of keyword arguments forwarded to the encoder
                (e.g. the tokenizer output).
        """
        output = self.deberta(**inputs)
        # Index 0 along the second axis selects the first token's hidden state.
        return self.dense(output.last_hidden_state[:, 0, :])

In [39]:
model = DocDeberta(N_CLASSES).to(device)

In [31]:
#!g1.1
import torch.optim as optim

LEARNING_RATE = 2e-5 # !DocBERT

In [40]:
#!g1.1
# Use the LEARNING_RATE constant defined above rather than repeating the
# literal, so changing it in one place actually affects training.
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE, betas=(0.9, 0.99))

In [19]:
#!g1.1
from wandb_writer import WandbWriter
# Logging is disabled: eval_model and train_epoch skip every writer call when
# wb_writer is falsy.
wb_writer = None
# To enable logging, presumably assign a writer here instead, e.g.
# WandbWriter("BERT+SGM experiment")  # NOTE(review): constructor signature not shown here — confirm

In [21]:
#!g1.1
from sklearn import metrics

def get_metrics(y, y_pre):
    """Compute Hamming loss plus macro/micro F1, precision and recall.

    Args:
        y: Ground-truth labels, passed unchanged to the sklearn scorers.
        y_pre: Predicted labels, passed unchanged to the sklearn scorers.

    Returns:
        dict mapping metric name to its value.
    """
    def averaged(scorer, average):
        # All averaged scorers share the same call shape.
        return scorer(y, y_pre, average=average)

    return {
        "hamming_loss": metrics.hamming_loss(y, y_pre),
        "macro_f1": averaged(metrics.f1_score, "macro"),
        "macro_precision": averaged(metrics.precision_score, "macro"),
        "macro_recall": averaged(metrics.recall_score, "macro"),
        "micro_f1": averaged(metrics.f1_score, "micro"),
        "micro_precision": averaged(metrics.precision_score, "micro"),
        "micro_recall": averaged(metrics.recall_score, "micro"),
    }

In [22]:
#!g1.1
def eval_model(model, dataloader, wb_writer, suffix):
    """Evaluate ``model`` on ``dataloader`` and optionally log the metrics.

    The model is put into eval mode and left there. Predictions are obtained
    by thresholding the sigmoid of the logits at 0.5.

    Args:
        model: Callable taking the batch input mapping and returning logits.
        dataloader: Iterable yielding ``(inputs, targets)`` batches.
        wb_writer: Optional writer; when truthy each metric is logged via
            ``add_scalar`` and the writer's step is advanced once.
        suffix: Appended to each metric name when logging (ignored if falsy).

    Returns:
        The dict produced by ``get_metrics``.
    """
    model.eval()

    all_targets, all_predictions = [], []
    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            apply_to_dict_values(batch_inputs, lambda tensor: tensor.to(device))
            batch_labels = batch_labels.to(device)
            probabilities = torch.sigmoid(model(batch_inputs))
            hard_labels = (probabilities > 0.5).type(torch.DoubleTensor)
            all_targets += batch_labels.tolist()
            all_predictions += hard_labels.tolist()

    results = get_metrics(all_targets, all_predictions)

    if not wb_writer:
        return results

    for metric_name, value in results.items():
        logged_name = (metric_name + suffix) if suffix else metric_name
        wb_writer.add_scalar(logged_name, value)
    wb_writer.next_step()
    wb_writer.add_scalar("Step", wb_writer.step)

    return results

In [33]:
#!g1.1
eval_model(model, val_dataloader, None, "")

  _warn_prf(average, modifier, msg_start, len(result))


{'hamming_loss': 0.5411481481481482,
 'macro_f1': 0.042777163321996006,
 'macro_precision': 0.033038174949101204,
 'macro_recall': 0.543097869778542,
 'micro_f1': 0.0795060795060795,
 'micro_precision': 0.04300415729571321,
 'micro_recall': 0.5258333333333334}

In [34]:
#!g1.1
from tqdm import tqdm

def train_epoch(model, optimizer, dataloader, val_dataloader, val_freq, wb_writer=None):
    """Run one optimisation pass over ``dataloader``.

    Each batch is scored with ``BCEWithLogitsLoss`` and followed by one
    optimizer step. Every ``val_freq`` batches the model is evaluated on
    ``val_dataloader`` and the batch count plus metrics are printed.

    Args:
        model: Module mapping the batch input mapping to logits.
        optimizer: Optimizer updating ``model``'s parameters.
        dataloader: Iterable yielding ``(inputs, targets)`` training batches.
        val_dataloader: Batches used for the periodic evaluation.
        val_freq: Evaluate after every ``val_freq``-th batch.
        wb_writer: Optional writer; when truthy the batch loss is logged and
            the writer's step is advanced after every batch.
    """
    model.train()
    loss_fn = torch.nn.BCEWithLogitsLoss()
    progress = tqdm(dataloader, leave=False)
    for step, (batch_inputs, batch_labels) in enumerate(progress, start=1):
        apply_to_dict_values(batch_inputs, lambda tensor: tensor.to(device))
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        loss = loss_fn(model(batch_inputs), batch_labels)
        loss.backward()
        optimizer.step()

        if wb_writer:
            wb_writer.add_scalar("Batch train loss", loss.item())
            wb_writer.next_step()
            wb_writer.add_scalar("Step", wb_writer.step)

        if step % val_freq != 0:
            continue
        print(step, eval_model(model, val_dataloader, wb_writer, '_validation'))
        # eval_model leaves the model in eval mode; switch back before the next batch.
        model.train()

In [41]:
train_epoch(model, optimizer,  train_dataloader, val_dataloader, 100)



RuntimeError: ignored

In [None]:
#!g1.1
# Full training run: for every epoch, train once over the training split,
# then report metrics on the validation and test splits and write a checkpoint.
EPOCHS = 10
for epoch in range(1, EPOCHS + 1):
    # Validate every 100 batches inside the epoch as well.
    train_epoch(model, optimizer, train_dataloader, val_dataloader, 100, wb_writer)
    log = eval_model(model, val_dataloader, wb_writer, '_epoch_validation')
    print(log)
    log = eval_model(model, test_dataloader, wb_writer, '_epoch_test')
    print(log)
    # Saves the whole module object (not just its state_dict), one file per epoch.
    # NOTE(review): the 'docbert_base_' prefix does not reflect MODEL_TYPE — confirm the intended name.
    torch.save(model, 'docbert_base_' + str(epoch) + '.pt')