In [1]:
class CFG:
  dataset='./dataset/polarization_ds.csv'
  train_df='./dataset/media_split/train.csv'
  valid_df='./dataset/media_split/valid.csv'
  test_df='./dataset/media_split/test.csv'
  target_cols=['left', 'center', 'right'] # 'bias_text'
  classes=3
  model='microsoft/deberta-v3-base'
  embedd_dim=768
  criterion = 'mse' # ['crossentropy', 'mse', 'l1', 'focal']
  main_metric = 'f1_macro'
  model_file = './models/best_deberta_base.pt'
  # just use it
  apex=True
  gradient_checkpointing=True
  num_cycles=0.5
  num_warmup_steps=0
  epochs=5
  encoder_lr=2e-5
  decoder_lr=2e-5
  min_lr=1e-6
  eps=1e-6
  betas=(0.9, 0.999)
  batch_size=32
  max_len=512
  weight_decay=0.01
  # gradient_accumulation_steps=1
  max_grad_norm=1
  seed=42
  scheduler='cosine' # ['linear', 'cosine']
  batch_scheduler=True
  #
  colab=False

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
if CFG.colab:
  from google.collab import drive
  drive.mount('/content/drive')
  import os
  os.chdir('/content/drive/MyDrive/lab/bert_finetune')

# Load libraries and data

In [4]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

from tqdm import tqdm
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def set_seed(seed):
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available(): # для GPU отдельный seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seed(CFG.seed)
# есть стохастические операции на GPU
# сделаем их детерминированными для воспроизводимости
torch.backends.cudnn.determinstic = True
torch.backends.cudnn.benchmark = False

In [6]:
CFG.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model, use_fast=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
df = pd.read_csv(CFG.dataset, index_col='id')
train_df = pd.read_csv(CFG.train_df)
valid_df = pd.read_csv(CFG.valid_df)

In [8]:
df.head()

Unnamed: 0_level_0,topic,bias_text,title,content
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
wzYhj24VaSbhhr8T,sexual_misconduct,right,Two more people back parts of Tara Reade’s cla...,Two more people corroborated some of Tara Read...
pcgmZXUEXb0xWWB1,marijuana_legalization,right,Colorado lawmakers set limits on pot edibles p...,DENVER — The Mile High City was jammed with po...
oYxUHzLvAtQhGoXk,foreign_policy,right,"Trump, world leaders, mark 100-year WWI annive...",President Donald Trump and dozens of world lea...
EHAhlgvwQ8U2ZvGB,race_and_racism,right,It’s Not Just MSNBC Making Flip Assumptions Ab...,"Last night , the official Twitter feed of MSNB..."
JD0fSIsxc4p1DuDe,china,center,The Global Stories Of 2019 That You Probably M...,"Sure , everybody thinks it 's great when a sto..."


In [9]:
target = (df['bias_text'] == 'center') + (df['bias_text'] == 'right') * 2
df['bias_text'] = target

In [10]:
df['left'] = (df['bias_text'] == 0).astype(float)
df['center'] = (df['bias_text'] == 1).astype(float)
df['right'] = (df['bias_text'] == 2).astype(float)

In [11]:
train_df = df.loc[train_df['ID']]
valid_df = df.loc[valid_df['ID']]

In [12]:
train_df.head()

Unnamed: 0_level_0,topic,bias_text,title,content,left,center,right
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
zl7kc7EmAyIdUMIo,immigration,2,"Shutdown Theater, Again",President Trump and Senate Minority Leader Chu...,0.0,0.0,1.0
xpbjYTJYPdlw6HmJ,culture,1,Can the developing world endure the coronavirus?,“ The 360 ” shows you diverse perspectives on ...,0.0,1.0,0.0
k4SGI3GXarnz5dJl,elections,0,Sanders’ California supporters can’t quite say...,LOS ANGELES — Actress Rosario Dawson took the ...,1.0,0.0,0.0
0jIpietfnrPRGHKQ,white_house,1,Trump says he doesn't know if Rudy Giuliani is...,President Donald Trump said on Friday that he ...,0.0,1.0,0.0
zMlSt7dyJvanHqJq,politics,0,Trump's historic moment arrives,Washington ( CNN ) Donald Trump became the 45t...,1.0,0.0,0.0


# Dataset

In [13]:
class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = (df['title'] + ' ' + df['content']).values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def get_text(self, idx):
        # tokenization
        inputs = self.cfg.tokenizer.encode_plus(
            self.texts[idx], 
            return_tensors=None, 
            add_special_tokens=True, 
            max_length=CFG.max_len,
            pad_to_max_length=True,
            # padding='longest',
            truncation=True
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs
    
    def get_labels(self, idx):
        if CFG.criterion != 'crossentropy' and CFG.criterion != 'focal':
           return torch.tensor(self.labels[idx], dtype=torch.float)
        return torch.tensor(self.labels[idx]).type(torch.LongTensor)

    def __getitem__(self, idx):
        inputs = self.get_text(idx)
        label = self.get_labels(idx)
        return inputs, label

def collate(inputs):
		# reduce sequence length
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [14]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

In [15]:
class TransformerModel(nn.Module):

    def __init__(self):
        super(TransformerModel, self).__init__()

        self.model = AutoModel.from_pretrained(CFG.model)
        if CFG.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.clf =  nn.Linear(CFG.embedd_dim, CFG.classes)
        if not CFG.criterion == 'crossentropy':
            self.sm = nn.Softmax(dim=-1)
        torch.nn.init.xavier_uniform_(self.clf.weight)

    def forward(self, input_id, mask):
        # sequence has [CLF] token in the beginning
        # bert() returns first vector as pooling of sentence
        x = self.model(input_ids= input_id, attention_mask=mask)[0]
        out = self.pool(x, mask)
        out = self.clf(out)
        if not CFG.criterion == 'crossentropy':
            return self.sm(out)
        return out

# Train loop, metrics and other

In [16]:
from sklearn.metrics import f1_score, accuracy_score

In [17]:
def calculate_metrics(y, pred):
  y = torch.argmax(y, dim=1)
  pred = torch.argmax(pred, dim=1)
  f1 = f1_score(y, pred, average='macro')
  acc = accuracy_score(y, pred)
  return {'f1_macro': f1, 'acc': acc}

In [18]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "model" not in n],
         'lr': decoder_lr, 'weight_decay': 0.0}
    ]
    return optimizer_parameters

def get_scheduler(cfg, optimizer, num_train_steps):
    if cfg.scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    elif cfg.scheduler == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    return scheduler

In [19]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, input, target):
        target = target.view(-1,1)

        logpt = F.log_softmax(input)
        logpt = logpt.gather(1,target)
        logpt = logpt.view(-1)
        pt = torch.exp(logpt)

        loss = -1 * (1-pt)**self.gamma * logpt
        if self.reduction == 'mean': 
          return loss.mean()
        elif self.reduction == 'sum':
          return loss.sum()
        else:
          raise NotImplementedError(f'Not implemented reduction: {self.reduction}')

In [20]:
train_ds = TrainDataset(CFG, train_df)
valid_ds = TrainDataset(CFG, valid_df)

In [21]:
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=CFG.batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=CFG.batch_size, shuffle=False)

In [22]:
model = TransformerModel()

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
if CFG.criterion == 'crossentropy':
  criterion = nn.CrossEntropyLoss()
elif CFG.criterion == 'mse':
  criterion = nn.MSELoss()
elif CFG.criterion == 'l1':
  criterion = nn.SmoothL1Loss()
elif CFG.criterion == 'focal':
  criterion = FocalLoss(5)
else:
  raise NotImplementedError('Change loss')

In [24]:
optimizer_parameters = get_optimizer_params(model,
                                           encoder_lr=CFG.encoder_lr, 
                                           decoder_lr=CFG.decoder_lr,
                                           weight_decay=CFG.weight_decay)
optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

In [25]:
num_train_steps = int(len(train_df) / CFG.batch_size * CFG.epochs)
scheduler = get_scheduler(CFG, optimizer, num_train_steps)

In [26]:
model.to(CFG.device)
criterion.to(CFG.device)

MSELoss()

In [27]:
best_score = 0

In [28]:
def train_loop(model, optimizer, criterion, train_loader, valid_loader, epochs, scheduler):
  best_score = 0
  # multiplies gradient so it won't vanish (torch use float16)
  scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
  for e in range(epochs):
    # train
    model.train()
    train_loss = []
    for inputs, labels in tqdm(train_loader):
      inputs = collate(inputs)
      # move inputs to device
      mask = inputs['attention_mask'].to(CFG.device)
      input_id = inputs['input_ids'].squeeze(1).to(CFG.device)
      labels = labels.to(CFG.device)
      # forward
      with torch.cuda.amp.autocast(enabled=CFG.apex):
        y_preds = model(input_id, mask)
        loss = criterion(y_preds, labels)
      # calculate loss
      train_loss.append(loss.detach().cpu().item())

      optimizer.zero_grad()
      # loss.backward()
      scaler.scale(loss).backward()
      # gradient clipping
      grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
      scaler.step(optimizer)
      scaler.update()
      # optimizer.step()
      if CFG.batch_scheduler:
          scheduler.step()
    train_loss = np.mean(train_loss)
    # valid
    model.eval()
    with torch.no_grad():
      valid_loss = []
      preds = []
      y = []
      for inputs, labels in valid_loader:
        inputs = collate(inputs)
        # move inputs to device
        mask = inputs['attention_mask'].to(CFG.device)
        input_id = inputs['input_ids'].squeeze(1).to(CFG.device)
        labels = labels.to(CFG.device)
        # forward
        with torch.cuda.amp.autocast(enabled=CFG.apex):
          y_preds = model(input_id, mask)
          loss = criterion(y_preds, labels)
        preds.append(y_preds.detach().cpu())
        y.append(labels.detach().cpu())
        # calculate loss
        valid_loss.append(loss.detach().cpu().item())
    valid_loss = np.mean(valid_loss)
    preds = torch.concat(preds, dim=0)
    y = torch.concat(y, dim=0)
    metrics = calculate_metrics(y, preds)
    # save best model
    if best_score < metrics[CFG.main_metric]:
      torch.save(model.state_dict(), CFG.model_file)
      best_score = metrics[CFG.main_metric]
    print('best_score =', best_score)
    print(f'EPOCH {e + 1}:, train_loss = {train_loss}, valid_loss = {valid_loss}', *[f'{name} = {value}' for name, value in metrics.items()])
    

In [29]:
torch.cuda.empty_cache()

In [30]:
train_loop(model, optimizer, criterion, train_loader, valid_loader, CFG.epochs, scheduler)

100%|██████████| 416/416 [14:02<00:00,  2.03s/it]


best_score = 0.275029626415907
EPOCH 1:, train_loss = 0.24768156379174727, valid_loss = 0.2216248935138857 f1_macro = 0.275029626415907 acc = 0.4367572156196944


100%|██████████| 416/416 [14:04<00:00,  2.03s/it]


best_score = 0.3058787095987227
EPOCH 2:, train_loss = 0.21367977294497764, valid_loss = 0.21490701992769498 f1_macro = 0.3058787095987227 acc = 0.4859932088285229


100%|██████████| 416/416 [14:03<00:00,  2.03s/it]


best_score = 0.3458828365790134
EPOCH 3:, train_loss = 0.20428220025048807, valid_loss = 0.20512174714255976 f1_macro = 0.3458828365790134 acc = 0.5411714770797963


100%|██████████| 416/416 [14:06<00:00,  2.03s/it]


best_score = 0.36821932841059374
EPOCH 4:, train_loss = 0.19305806026722377, valid_loss = 0.20510840697868452 f1_macro = 0.36821932841059374 acc = 0.5246179966044142


100%|██████████| 416/416 [14:05<00:00,  2.03s/it]


best_score = 0.36821932841059374
EPOCH 5:, train_loss = 0.18241622069707283, valid_loss = 0.20925698046748703 f1_macro = 0.3464206238941396 acc = 0.47283531409168084


100%|██████████| 416/416 [14:04<00:00,  2.03s/it]


best_score = 0.36821932841059374
EPOCH 6:, train_loss = 0.1770691338998194, valid_loss = 0.20700339852152644 f1_macro = 0.3488355454069236 acc = 0.4847198641765705


100%|██████████| 416/416 [14:04<00:00,  2.03s/it]


best_score = 0.36821932841059374
EPOCH 7:, train_loss = 0.17467433522240475, valid_loss = 0.20937290586329796 f1_macro = 0.3361847371919315 acc = 0.4592529711375212


100%|██████████| 416/416 [14:05<00:00,  2.03s/it]


best_score = 0.36821932841059374
EPOCH 8:, train_loss = 0.17358122286028588, valid_loss = 0.20937477897953344 f1_macro = 0.3363956204376075 acc = 0.4588285229202037


100%|██████████| 416/416 [14:05<00:00,  2.03s/it]


best_score = 0.36821932841059374
EPOCH 9:, train_loss = 0.17331649187522438, valid_loss = 0.2100283108853005 f1_macro = 0.3345209280511991 acc = 0.45543293718166383


100%|██████████| 416/416 [14:05<00:00,  2.03s/it]


best_score = 0.36821932841059374
EPOCH 10:, train_loss = 0.1729615069925785, valid_loss = 0.20996148441288923 f1_macro = 0.33512637210455276 acc = 0.45585738539898135


# Test

In [31]:
def test(model, test_loader):
    model.eval()
    with torch.no_grad():
      test_loss = []
      preds = []
      y = []
      for step, (inputs, labels) in enumerate(test_loader):
        inputs = collate(inputs)
        # move inputs to device
        mask = inputs['attention_mask'].to(CFG.device)
        input_id = inputs['input_ids'].squeeze(1).to(CFG.device)
        labels = labels.to(CFG.device)
        # forward
        with torch.cuda.amp.autocast(enabled=CFG.apex):
          y_preds = model(input_id, mask)
          loss = criterion(y_preds, labels)
        preds.append(y_preds.detach().cpu())
        y.append(labels.detach().cpu())
        # calculate loss
        test_loss.append(loss.detach().cpu().item())
    test_loss = np.mean(test_loss)
    preds = torch.concat(preds, dim=0)
    y = torch.concat(y, dim=0)
    metrics = calculate_metrics(y, preds)

    print(f'Test metrics: test_loss={test_loss}', *[f'{name} = {value}' for name, value in metrics.items()])

In [32]:
model.load_state_dict(torch.load(CFG.model_file))

<All keys matched successfully>

In [33]:
test_df = pd.read_csv(CFG.test_df)
test_df = df.loc[test_df['ID']]
test_ds = TrainDataset(CFG, test_df)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=CFG.batch_size, shuffle=False)

In [34]:
test(model, valid_loader)



Test metrics: test_loss=0.20510840697868452 f1_macro = 0.36821932841059374 acc = 0.5246179966044142


In [35]:
test(model, test_loader)



Test metrics: test_loss=0.20383579177515848 f1_macro = 0.3883497747222216 acc = 0.4946153846153846
