In [1]:
class CFG:
  dataset='./dataset/polarization_ds.csv'
  train_df='./dataset/media_split/train.csv'
  valid_df='./dataset/media_split/valid.csv'
  test_df='./dataset/media_split/test.csv'
  target_cols='bias_text' # 'bias_text', ['left', 'center', 'right'] 
  classes=3
  model='bert-base-cased'
  embedd_dim=768
  criterion = 'crossentropy' # ['crossentropy', 'mse', 'l1', 'focal']
  main_metric = 'f1_macro'
  model_file = './models/best_bert_large_ce.pt'
  # just use it
  apex=True
  gradient_checkpointing=True
  num_cycles=0.5
  num_warmup_steps=0
  epochs=5
  encoder_lr=2e-5
  decoder_lr=2e-5
  min_lr=1e-6
  eps=1e-6
  betas=(0.9, 0.999)
  batch_size=32
  max_len=512
  weight_decay=0.01
  # gradient_accumulation_steps=1
  max_grad_norm=1
  seed=42
  scheduler='cosine' # ['linear', 'cosine']
  batch_scheduler=True
  #
  collab=False

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
if CFG.collab:
  from google.colab import drive
  drive.mount('/content/drive')
  import os
  os.chdir('/content/drive/MyDrive/lab/bert_finetune')

# Load libraries and data

In [4]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

from tqdm import tqdm
import gc

In [5]:
def set_seed(seed):
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available(): # для GPU отдельный seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seed(CFG.seed)
# есть стохастические операции на GPU
# сделаем их детерминированными для воспроизводимости
torch.backends.cudnn.determinstic = True
torch.backends.cudnn.benchmark = False

In [6]:
CFG.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model)

In [7]:
df = pd.read_csv(CFG.dataset, index_col='id')
train_df = pd.read_csv(CFG.train_df)
valid_df = pd.read_csv(CFG.valid_df)

In [8]:
df.head()

Unnamed: 0_level_0,topic,bias_text,title,content
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
wzYhj24VaSbhhr8T,sexual_misconduct,right,Two more people back parts of Tara Reade’s cla...,Two more people corroborated some of Tara Read...
pcgmZXUEXb0xWWB1,marijuana_legalization,right,Colorado lawmakers set limits on pot edibles p...,DENVER — The Mile High City was jammed with po...
oYxUHzLvAtQhGoXk,foreign_policy,right,"Trump, world leaders, mark 100-year WWI annive...",President Donald Trump and dozens of world lea...
EHAhlgvwQ8U2ZvGB,race_and_racism,right,It’s Not Just MSNBC Making Flip Assumptions Ab...,"Last night , the official Twitter feed of MSNB..."
JD0fSIsxc4p1DuDe,china,center,The Global Stories Of 2019 That You Probably M...,"Sure , everybody thinks it 's great when a sto..."


In [9]:
target = (df['bias_text'] == 'center') + (df['bias_text'] == 'right') * 2
df['bias_text'] = target

In [10]:
df['left'] = (df['bias_text'] == 0).astype(float)
df['center'] = (df['bias_text'] == 1).astype(float)
df['right'] = (df['bias_text'] == 2).astype(float)

In [11]:
train_df = df.loc[train_df['ID']]
valid_df = df.loc[valid_df['ID']]

In [12]:
train_df.head()

Unnamed: 0_level_0,topic,bias_text,title,content,left,center,right
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
zl7kc7EmAyIdUMIo,immigration,2,"Shutdown Theater, Again",President Trump and Senate Minority Leader Chu...,0.0,0.0,1.0
xpbjYTJYPdlw6HmJ,culture,1,Can the developing world endure the coronavirus?,“ The 360 ” shows you diverse perspectives on ...,0.0,1.0,0.0
k4SGI3GXarnz5dJl,elections,0,Sanders’ California supporters can’t quite say...,LOS ANGELES — Actress Rosario Dawson took the ...,1.0,0.0,0.0
0jIpietfnrPRGHKQ,white_house,1,Trump says he doesn't know if Rudy Giuliani is...,President Donald Trump said on Friday that he ...,0.0,1.0,0.0
zMlSt7dyJvanHqJq,politics,0,Trump's historic moment arrives,Washington ( CNN ) Donald Trump became the 45t...,1.0,0.0,0.0


# Dataset

In [13]:
class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = (df['title'] + ' ' + df['content']).values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def get_text(self, idx):
        # tokenization
        inputs = self.cfg.tokenizer.encode_plus(
            self.texts[idx], 
            return_tensors=None, 
            add_special_tokens=True, 
            max_length=CFG.max_len,
            pad_to_max_length=True,
            truncation=True
        )
        
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        
        return inputs
    
    def get_labels(self, idx):
        #if CFG.criterion != 'crossentropy':
        #    return torch.tensor(self.labels[idx], dtype=torch.float)
        return torch.tensor(self.labels[idx]).type(torch.LongTensor)

    def __getitem__(self, idx):
        inputs = self.get_text(idx)
        label = self.get_labels(idx)
        return inputs, label

def collate(inputs):
		# reduce sequence length
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [14]:
class TransformerModel(nn.Module):

    def __init__(self):
        super(TransformerModel, self).__init__()

        self.model = AutoModel.from_pretrained(CFG.model)
        if CFG.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.clf =  nn.Linear(CFG.embedd_dim, CFG.classes)
        if CFG.criterion != 'crossentropy' and CFG.criterion != 'focal':
            self.sm = nn.Softmax(dim=-1)
        torch.nn.init.xavier_uniform_(self.clf.weight)

    def forward(self, input_id, mask):
        # sequence has [CLF] token in the beginning
        # bert() returns first vector as pooling of sentence
        _, x = self.model(input_ids= input_id, attention_mask=mask, return_dict=False)
        out = self.clf(x)
        if CFG.criterion != 'crossentropy'  and CFG.criterion != 'focal':
            return self.sm(out)
        return out

# Train loop, metrics and other

In [15]:
from sklearn.metrics import f1_score, accuracy_score

In [16]:
def calculate_metrics(y, pred):
  if CFG.criterion != 'crossentropy' and CFG.criterion != 'focal':
    y = torch.argmax(y, dim=1)
  pred = torch.argmax(pred, dim=1)
  f1 = f1_score(y, pred, average='macro')
  acc = accuracy_score(y, pred)
  return {'f1_macro': f1, 'acc': acc}

In [17]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "model" not in n],
         'lr': decoder_lr, 'weight_decay': 0.0}
    ]
    return optimizer_parameters

def get_scheduler(cfg, optimizer, num_train_steps):
    if cfg.scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    elif cfg.scheduler == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    return scheduler

In [18]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=1, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, input, target):
        target = target.view(-1,1)

        logpt = F.log_softmax(input)
        logpt = logpt.gather(1,target)
        logpt = logpt.view(-1)
        pt = torch.exp(logpt)

        loss = -1 * (1-pt)**self.gamma * logpt
        if self.reduction == 'mean': 
          return loss.mean()
        elif self.reduction == 'sum':
          return loss.sum()
        else:
          raise NotImplementedError(f'Not implemented reduction: {self.reduction}')

In [19]:
train_ds = TrainDataset(CFG, train_df)
valid_ds = TrainDataset(CFG, valid_df)

In [20]:
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=CFG.batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=CFG.batch_size, shuffle=False)

In [21]:
model = TransformerModel()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
if CFG.criterion == 'crossentropy':
  criterion = nn.CrossEntropyLoss()
elif CFG.criterion == 'mse':
  criterion = nn.MSELoss()
elif CFG.criterion == 'l1':
  criterion = nn.SmoothL1Loss()
elif CFG.criterion == 'focal':
  criterion = FocalLoss(5)
else:
  raise NotImplementedError('Change loss')

In [23]:
#optimizer_parameters = get_optimizer_params(model,
#                                            encoder_lr=CFG.encoder_lr, 
#                                            decoder_lr=CFG.decoder_lr,
#                                            weight_decay=CFG.weight_decay)
#optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

In [24]:
optimizer = AdamW(model.parameters(), lr=CFG.encoder_lr, 
                  #eps=CFG.eps, 
                  betas=CFG.betas)

In [25]:
num_train_steps = int(len(train_df) / CFG.batch_size * CFG.epochs)
scheduler = get_scheduler(CFG, optimizer, num_train_steps)

In [26]:
model.to(CFG.device)
criterion.to(CFG.device)

CrossEntropyLoss()

In [1]:
def train_loop(model, optimizer, criterion, train_loader, valid_loader, epochs, scheduler):
  best_score = 0
  # multiplies gradient so it won't vanish (torch use float16)
  # scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
  for e in range(epochs):
    # train
    model.train()
    train_loss = []
    for inputs, labels in tqdm(train_loader):
      inputs = collate(inputs)
      # move inputs to device
      mask = inputs['attention_mask'].to(CFG.device)
      input_id = inputs['input_ids'].squeeze(1).to(CFG.device)
      labels = labels.to(CFG.device)
      # forward
      # with torch.cuda.amp.autocast(enabled=CFG.apex):
        y_preds = model(input_id, mask)
        loss = criterion(y_preds, labels)
      # calculate loss
      train_loss.append(loss.detach().cpu().item())

      optimizer.zero_grad()
      loss.backward()
      # scaler.scale(loss).backward()
      # gradient clipping
      grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
      # scaler.step(optimizer)
      # scaler.update()
      optimizer.step()
      if CFG.batch_scheduler:
          scheduler.step()
    train_loss = np.mean(train_loss)
    # valid
    model.eval()
    with torch.no_grad():
      valid_loss = []
      preds = []
      y = []
      for inputs, labels in valid_loader:
        inputs = collate(inputs)
        # move inputs to device
        mask = inputs['attention_mask'].to(CFG.device)
        input_id = inputs['input_ids'].squeeze(1).to(CFG.device)
        labels = labels.to(CFG.device)
        # forward
        # with torch.cuda.amp.autocast(enabled=CFG.apex):
        y_preds = model(input_id, mask)
        loss = criterion(y_preds, labels)
        preds.append(y_preds.detach().cpu())
        y.append(labels.detach().cpu())
        # calculate loss
        valid_loss.append(loss.detach().cpu().item())
    valid_loss = np.mean(valid_loss)
    preds = torch.concat(preds, dim=0)
    y = torch.concat(y, dim=0)
    metrics = calculate_metrics(y, preds)
    # save best model
    if best_score < metrics[CFG.main_metric]:
      torch.save(model.state_dict(), CFG.model_file)
      best_score = metrics[CFG.main_metric]
    print('best_score =', best_score)
    print(f'EPOCH {e + 1}:, train_loss = {train_loss}, valid_loss = {valid_loss}', *[f'{name} = {value}' for name, value in metrics.items()])
    

IndentationError: unexpected indent (2448289404.py, line 17)

In [28]:
train_loop(model, optimizer, criterion, train_loader, valid_loader, CFG.epochs, scheduler)

100%|██████████| 831/831 [08:08<00:00,  1.70it/s]


best_score = 0.22713647787894967
EPOCH 1:, train_loss = 0.7943838798182106, valid_loss = 2.3569706678390503 f1_macro = 0.22713647787894967 acc = 0.25212224108658743


100%|██████████| 831/831 [08:12<00:00,  1.69it/s]


best_score = 0.22713647787894967
EPOCH 2:, train_loss = 0.4399778932225403, valid_loss = 2.8060171153094315 f1_macro = 0.1899071630152068 acc = 0.20500848896434634


100%|██████████| 831/831 [08:11<00:00,  1.69it/s]


best_score = 0.22713647787894967
EPOCH 3:, train_loss = 0.33930398156209685, valid_loss = 2.772665658512631 f1_macro = 0.20061660551908314 acc = 0.23132427843803055


100%|██████████| 831/831 [08:11<00:00,  1.69it/s]


best_score = 0.22713647787894967
EPOCH 4:, train_loss = 0.28871909121026773, valid_loss = 2.745092105221104 f1_macro = 0.21199500714232125 acc = 0.2597623089983022


100%|██████████| 831/831 [08:12<00:00,  1.69it/s]


best_score = 0.22713647787894967
EPOCH 5:, train_loss = 0.2523892844268418, valid_loss = 2.8727190172350086 f1_macro = 0.2070872676739145 acc = 0.24405772495755518


# Test

In [29]:
def test(model, valid_loader):
    model.eval()
    with torch.no_grad():
      valid_loss = []
      preds = []
      y = []
      for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        # move inputs to device
        mask = inputs['attention_mask'].to(CFG.device)
        input_id = inputs['input_ids'].squeeze(1).to(CFG.device)
        labels = labels.to(CFG.device)
        # forward
        with torch.cuda.amp.autocast(enabled=CFG.apex):
          y_preds = model(input_id, mask)
          loss = criterion(y_preds, labels)
        preds.append(y_preds.detach().cpu())
        y.append(labels.detach().cpu())
        # calculate loss
        valid_loss.append(loss.detach().cpu().item())
    valid_loss = np.mean(valid_loss)
    preds = torch.concat(preds, dim=0)
    y = torch.concat(y, dim=0)
    metrics = calculate_metrics(y, preds)

    print(f'Test metrics: valid_loss={valid_loss}', *[f'{name} = {value}' for name, value in metrics.items()])

In [30]:
model.load_state_dict(torch.load(CFG.model_file))

<All keys matched successfully>

In [31]:
test_df = pd.read_csv(CFG.test_df)
test_df = df.loc[test_df['ID']]
test_ds = TrainDataset(CFG, test_df)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=CFG.batch_size, shuffle=False)

In [32]:
CFG.batch_size

32

In [33]:
test(model, valid_loader)

Test metrics: valid_loss=2.3569706678390503 f1_macro = 0.22713647787894967 acc = 0.25212224108658743


In [34]:
test(model, test_loader)

Test metrics: valid_loss=1.4832368914673968 f1_macro = 0.38279101560086454 acc = 0.39076923076923076
