In [1]:
class CFG:
  train_df='./train/train_df.csv'
  test_df='./test/test_df.csv'
  target_cols=['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
  classes=5
  model='bert-base-cased'
  embedd_dim=768
  criterion = 'mse' # ['crossentropy', 'mse', 'l1', 'focal']
  main_metric = 'acc@1'
  model_file = './models/ed_best_bert_base.pt'
  # just use it
  apex=True
  gradient_checkpointing=True
  num_cycles=0.5
  num_warmup_steps=0
  epochs=10
  encoder_lr=1e-5
  decoder_lr=1e-5
  min_lr=1e-6
  eps=1e-6
  betas=(0.9, 0.999)
  batch_size=4
  max_len=512
  weight_decay=0.01
  # gradient_accumulation_steps=1
  max_grad_norm=1000
  seed=42
  scheduler='cosine' # ['linear', 'cosine']
  batch_scheduler=False
  #
  colab=False

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
if CFG.colab:
  from google.collab import drive
  drive.mount('/content/drive')
  import os
  os.chdir('/content/drive/MyDrive/lab/bert_finetune')

# Load libraries and data

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

from tqdm import tqdm
import gc

In [5]:
def set_seed(seed):
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available(): # для GPU отдельный seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seed(CFG.seed)
# есть стохастические операции на GPU
# сделаем их детерминированными для воспроизводимости
torch.backends.cudnn.determinstic = True
torch.backends.cudnn.benchmark = False

In [6]:
CFG.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model, use_fast=False)

In [7]:
train_df = pd.read_csv(CFG.train_df, index_col='index')
test_df = pd.read_csv(CFG.test_df, index_col='index')

In [8]:
train_df.head()

Unnamed: 0_level_0,text,0,1,2,3,4,5
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Mortar assault leaves at least 18 dead,22,2,60,0,64,0
2,Goal delight for Sheva,0,0,0,93,0,38
3,Nigeria hostage feared dead is freed,18,0,52,66,20,65
4,Bombers kill shoppers,66,39,94,0,86,0
5,"Vegetables, not fruit, slow brain decline",0,0,25,26,2,46


In [9]:
df = pd.concat((train_df, test_df))
df.columns = ['text', 'Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise']
df = df.drop('Disgust', axis=1)

In [10]:
df[CFG.target_cols] = df[CFG.target_cols].apply(lambda it: it/it.sum() if it.sum() else 0, axis=1)
df.head()

Unnamed: 0_level_0,text,Anger,Fear,Joy,Sadness,Surprise
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Mortar assault leaves at least 18 dead,0.150685,0.410959,0.0,0.438356,0.0
2,Goal delight for Sheva,0.0,0.0,0.709924,0.0,0.290076
3,Nigeria hostage feared dead is freed,0.081448,0.235294,0.298643,0.090498,0.294118
4,Bombers kill shoppers,0.268293,0.382114,0.0,0.349593,0.0
5,"Vegetables, not fruit, slow brain decline",0.0,0.252525,0.262626,0.020202,0.464646


In [11]:
tmp, valid_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, test_df = train_test_split(tmp, test_size=2/8, random_state=42)

# Dataset

In [12]:
class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = (df['text']).values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def get_text(self, idx):
        # tokenization
        inputs = self.cfg.tokenizer.encode_plus(
            self.texts[idx], 
            return_tensors=None, 
            add_special_tokens=True, 
            max_length=CFG.max_len,
            pad_to_max_length=True,
            # padding='longest',
            truncation=True
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs
    
    def get_labels(self, idx):
        if CFG.criterion != 'crossentropy' and CFG.criterion != 'focal':
           return torch.tensor(self.labels[idx], dtype=torch.float)
        return torch.tensor(self.labels[idx]).type(torch.LongTensor)

    def __getitem__(self, idx):
        inputs = self.get_text(idx)
        label = self.get_labels(idx)
        return inputs, label

def collate(inputs):
		# reduce sequence length
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [13]:
class TransformerModel(nn.Module):

    def __init__(self):
        super(TransformerModel, self).__init__()

        self.model = AutoModel.from_pretrained(CFG.model)
        if CFG.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.clf =  nn.Linear(CFG.embedd_dim, CFG.classes)
        if not CFG.criterion == 'crossentropy':
            self.sm = nn.Softmax(dim=-1)
        torch.nn.init.xavier_uniform_(self.clf.weight)

    def forward(self, input_id, mask):
        # sequence has [CLF] token in the beginning
        # bert() returns first vector as pooling of sentence
        _, x = self.model(input_ids= input_id, attention_mask=mask, return_dict=False)
        out = self.clf(x)
        if not CFG.criterion == 'crossentropy':
            return self.sm(out)
        return out

# Train loop and metrics and other

In [14]:
from scipy.stats import wasserstein_distance, pearsonr
from sklearn.metrics import f1_score, accuracy_score

In [15]:
def calculate_metrics(y, pred):
    APd = torch.mean(torch.tensor([pearsonr(pred[i], y[i])[0] for i in range(pred.size()[0])]))
    APe = torch.mean(torch.tensor([pearsonr(pred[:, i], y[:, i])[0] for i in range(CFG.classes)]))
    RMSED = torch.mean(torch.tensor([np.sqrt(nn.functional.mse_loss(pred[i:i+1, :], y[i:i+1, :])) for i in range(pred.size()[0])]))
    WD = torch.mean(torch.tensor([wasserstein_distance(pred[i], y[i]) for i in range(pred.size()[0])]))

    y = torch.argmax(y, dim=1)
    pred = torch.argmax(pred, dim=1)
    acc_1 = accuracy_score(y, pred)


    return {'acc@1': acc_1, 'APd': APd, 'APe': APe, 'RMSED': RMSED, 'WD': WD}

In [16]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "model" not in n],
         'lr': decoder_lr, 'weight_decay': 0.0}
    ]
    return optimizer_parameters

def get_scheduler(cfg, optimizer, num_train_steps):
    if cfg.scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    elif cfg.scheduler == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    return scheduler

In [17]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, input, target):
        target = target.view(-1,1)

        logpt = F.log_softmax(input)
        logpt = logpt.gather(1,target)
        logpt = logpt.view(-1)
        pt = torch.exp(logpt)

        loss = -1 * (1-pt)**self.gamma * logpt
        if self.reduction == 'mean': 
          return loss.mean()
        elif self.reduction == 'sum':
          return loss.sum()
        else:
          raise NotImplementedError(f'Not implemented reduction: {self.reduction}')

In [18]:
train_ds = TrainDataset(CFG, train_df)
valid_ds = TrainDataset(CFG, valid_df)

In [19]:
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=CFG.batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=CFG.batch_size, shuffle=False)

In [20]:
model = TransformerModel()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
if CFG.criterion == 'crossentropy':
  criterion = nn.CrossEntropyLoss()
elif CFG.criterion == 'mse':
  criterion = nn.MSELoss()
elif CFG.criterion == 'l1':
  criterion = nn.SmoothL1Loss()
elif CFG.criterion == 'focal':
  criterion = FocalLoss(5)
else:
  raise NotImplementedError('Change loss')

In [22]:
# optimizer_parameters = get_optimizer_params(model,
#                                            encoder_lr=CFG.encoder_lr, 
#                                            decoder_lr=CFG.decoder_lr,
#                                            weight_decay=CFG.weight_decay)
# optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
optimizer = AdamW(model.parameters(), lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

In [23]:
num_train_steps = int(len(train_df) / CFG.batch_size * CFG.epochs)
scheduler = get_scheduler(CFG, optimizer, num_train_steps)

In [24]:
model.to(CFG.device)
criterion.to(CFG.device)

MSELoss()

In [25]:
def train_loop(model, optimizer, criterion, train_loader, valid_loader, epochs, scheduler):
  best_score = 0
  # multiplies gradient so it won't vanish (torch use float16)
  scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
  for e in range(epochs):
    # train
    model.train()
    train_loss = []
    for inputs, labels in tqdm(train_loader):
      inputs = collate(inputs)
      # move inputs to device
      mask = inputs['attention_mask'].to(CFG.device)
      input_id = inputs['input_ids'].squeeze(1).to(CFG.device)
      labels = labels.to(CFG.device)
      # forward
      with torch.cuda.amp.autocast(enabled=CFG.apex):
        y_preds = model(input_id, mask)
        loss = criterion(y_preds, labels)
      # calculate loss
      train_loss.append(loss.detach().cpu().item())

      optimizer.zero_grad()
      # loss.backward()
      scaler.scale(loss).backward()
      # gradient clipping
      grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
      scaler.step(optimizer)
      scaler.update()
      # optimizer.step()
      if CFG.batch_scheduler:
          scheduler.step()
    train_loss = np.mean(train_loss)
    # valid
    model.eval()
    with torch.no_grad():
      valid_loss = []
      preds = []
      y = []
      for inputs, labels in valid_loader:
        inputs = collate(inputs)
        # move inputs to device
        mask = inputs['attention_mask'].to(CFG.device)
        input_id = inputs['input_ids'].squeeze(1).to(CFG.device)
        labels = labels.to(CFG.device)
        # forward
        with torch.cuda.amp.autocast(enabled=CFG.apex):
          y_preds = model(input_id, mask)
          loss = criterion(y_preds, labels)
        preds.append(y_preds.detach().cpu())
        y.append(labels.detach().cpu())
        # calculate loss
        valid_loss.append(loss.detach().cpu().item())
    valid_loss = np.mean(valid_loss)
    preds = torch.concat(preds, dim=0)
    y = torch.concat(y, dim=0)
    metrics = calculate_metrics(y, preds)
    # save best model
    if best_score < metrics[CFG.main_metric]:
      torch.save(model.state_dict(), CFG.model_file)
      best_score = metrics[CFG.main_metric]
    print('best_score =', best_score)
    print(f'EPOCH {e + 1}:, train_loss = {train_loss: .5f}, valid_loss = {valid_loss: .5f}', *[f'{name} = {value: .5f}' for name, value in metrics.items()])
    

In [26]:
torch.cuda.empty_cache()
train_loop(model, optimizer, criterion, train_loader, valid_loader, CFG.epochs, scheduler)

100%|██████████| 188/188 [00:18<00:00, 10.25it/s]


best_score = 0.396
EPOCH 1:, train_loss =  0.05340, valid_loss =  0.04196 acc@1 =  0.39600 APd =  0.34826 APe =  0.43224 RMSED =  0.19380 WD =  0.13394


100%|██████████| 188/188 [00:17<00:00, 11.02it/s]


best_score = 0.528
EPOCH 2:, train_loss =  0.03393, valid_loss =  0.03114 acc@1 =  0.52800 APd =  0.55859 APe =  0.55954 RMSED =  0.15773 WD =  0.09698


100%|██████████| 188/188 [00:14<00:00, 12.96it/s]


best_score = 0.532
EPOCH 3:, train_loss =  0.02384, valid_loss =  0.03270 acc@1 =  0.53200 APd =  0.53384 APe =  0.56815 RMSED =  0.16080 WD =  0.09613


100%|██████████| 188/188 [00:13<00:00, 14.13it/s]


best_score = 0.536
EPOCH 4:, train_loss =  0.01747, valid_loss =  0.02917 acc@1 =  0.53600 APd =  0.59517 APe =  0.60443 RMSED =  0.14939 WD =  0.08581


100%|██████████| 188/188 [00:13<00:00, 14.13it/s]


best_score = 0.592
EPOCH 5:, train_loss =  0.01401, valid_loss =  0.02909 acc@1 =  0.59200 APd =  0.61341 APe =  0.61605 RMSED =  0.14657 WD =  0.08193


100%|██████████| 188/188 [00:13<00:00, 14.11it/s]


best_score = 0.604
EPOCH 6:, train_loss =  0.01050, valid_loss =  0.02721 acc@1 =  0.60400 APd =  0.62335 APe =  0.63999 RMSED =  0.14092 WD =  0.07817


100%|██████████| 188/188 [00:13<00:00, 14.12it/s]


best_score = 0.604
EPOCH 7:, train_loss =  0.00937, valid_loss =  0.02808 acc@1 =  0.57200 APd =  0.60711 APe =  0.63569 RMSED =  0.14465 WD =  0.08044


100%|██████████| 188/188 [00:13<00:00, 14.11it/s]


best_score = 0.604
EPOCH 8:, train_loss =  0.00771, valid_loss =  0.02967 acc@1 =  0.58000 APd =  0.59982 APe =  0.62493 RMSED =  0.14807 WD =  0.07970


100%|██████████| 188/188 [00:13<00:00, 14.13it/s]


best_score = 0.604
EPOCH 9:, train_loss =  0.00697, valid_loss =  0.02907 acc@1 =  0.56800 APd =  0.60556 APe =  0.62528 RMSED =  0.14675 WD =  0.07865


100%|██████████| 188/188 [00:13<00:00, 14.12it/s]


best_score = 0.604
EPOCH 10:, train_loss =  0.00610, valid_loss =  0.02745 acc@1 =  0.58800 APd =  0.62383 APe =  0.64091 RMSED =  0.14166 WD =  0.07837


# Test

In [27]:
def test(model, test_loader):
    model.eval()
    with torch.no_grad():
      test_loss = []
      preds = []
      y = []
      for step, (inputs, labels) in enumerate(test_loader):
        inputs = collate(inputs)
        # move inputs to device
        mask = inputs['attention_mask'].to(CFG.device)
        input_id = inputs['input_ids'].squeeze(1).to(CFG.device)
        labels = labels.to(CFG.device)
        # forward
        with torch.cuda.amp.autocast(enabled=CFG.apex):
          y_preds = model(input_id, mask)
          loss = criterion(y_preds, labels)
        preds.append(y_preds.detach().cpu())
        y.append(labels.detach().cpu())
        # calculate loss
        test_loss.append(loss.detach().cpu().item())
    test_loss = np.mean(test_loss)
    preds = torch.concat(preds, dim=0)
    y = torch.concat(y, dim=0)
    metrics = calculate_metrics(y, preds)

    print(f'Test metrics: test_loss={test_loss: 0.5f}', *[f'{name} = {value: 0.5f}' for name, value in metrics.items()])

In [28]:
test_ds = TrainDataset(CFG, test_df)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=CFG.batch_size, shuffle=False)

In [29]:
CFG.batch_size

4

In [30]:
test(model, valid_loader)

Test metrics: test_loss= 0.02745 acc@1 =  0.58800 APd =  0.62383 APe =  0.64091 RMSED =  0.14166 WD =  0.07837


In [31]:
test(model, test_loader)

Test metrics: test_loss= 0.02760 acc@1 =  0.63600 APd =  0.66730 APe =  0.66488 RMSED =  0.13617 WD =  0.07353
