In [1]:
class CFG:
  dataset='./dataset/polarization_ds.csv'
  train_df='./dataset/media_split/train.csv'
  valid_df='./dataset/media_split/valid.csv'
  test_df='./dataset/media_split/test.csv'
  target_cols=['left', 'center', 'right'] # 'bias_text'
  classes=3
  model='bert-large-cased'
  embedd_dim=1024
  criterion = 'mse' # ['crossentropy', 'mse', 'l1', 'focal']
  main_metric = 'f1_macro'
  model_file = './models/bert_large_triplet.pt'
  # just use it
  apex=True
  gradient_checkpointing=True
  num_cycles=0.5
  num_warmup_steps=0
  epochs=5
  encoder_lr=2e-5
  decoder_lr=2e-5
  min_lr=1e-6
  eps=1e-6
  betas=(0.9, 0.999)
  batch_size=8
  max_len=512
  weight_decay=0.01
  # gradient_accumulation_steps=1
  max_grad_norm=1
  seed=0
  scheduler='cosine' # ['linear', 'cosine']
  batch_scheduler=True
  #
  colab=False

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
if CFG.colab:
  from google.colab import drive
  drive.mount('/content/drive')
  import os
  os.chdir('/content/drive/MyDrive/lab/bert_finetune')

# Load libraries and data

In [4]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

from tqdm import tqdm
import gc

In [5]:
def set_seed(seed):
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available(): # для GPU отдельный seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seed(CFG.seed)
# есть стохастические операции на GPU
# сделаем их детерминированными для воспроизводимости
torch.backends.cudnn.determinstic = True
torch.backends.cudnn.benchmark = False

In [6]:
CFG.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model)

In [7]:
df = pd.read_csv(CFG.dataset, index_col='id')
train_df = pd.read_csv(CFG.train_df)
valid_df = pd.read_csv(CFG.valid_df)

In [8]:
df.head(3)

Unnamed: 0_level_0,topic,source,bias,url,title,date,authors,content,content_original,source_url,bias_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
wzYhj24VaSbhhr8T,sexual_misconduct,New York Post,2.0,https://nypost.com/2020/05/03/two-more-people-...,Two more people back parts of Tara Reade’s cla...,2020-05-03,,Two more people corroborated some of Tara Read...,Two more people corroborated some of Tara Read...,www.nypost.com,right
pcgmZXUEXb0xWWB1,marijuana_legalization,Washington Times,2.0,http://www.washingtontimes.com/news/2014/apr/2...,Colorado lawmakers set limits on pot edibles p...,2014-04-21,Valerie Richardson,DENVER — The Mile High City was jammed with po...,DENVER — The Mile High City was jammed with po...,www.washingtontimes.com,right
oYxUHzLvAtQhGoXk,foreign_policy,TheBlaze.com,2.0,https://www.theblaze.com/news/2018/11/11/trump...,"Trump, world leaders, mark 100-year WWI annive...",2018-11-11,Teri Webster,President Donald Trump and dozens of world lea...,President Donald Trump and dozens of world lea...,www.theblaze.com,right


In [9]:
target = (df['bias_text'] == 'center') + (df['bias_text'] == 'right') * 2
df['bias_text'] = target

In [10]:
df['left'] = (df['bias_text'] == 0).astype(float)
df['center'] = (df['bias_text'] == 1).astype(float)
df['right'] = (df['bias_text'] == 2).astype(float)

In [11]:
train_df = df.loc[train_df['ID']]
valid_df = df.loc[valid_df['ID']]

In [12]:
train_df.head()

Unnamed: 0_level_0,topic,source,bias,url,title,date,authors,content,content_original,source_url,bias_text,left,center,right
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
zl7kc7EmAyIdUMIo,immigration,National Review,2.0,https://www.nationalreview.com/2018/12/governm...,"Shutdown Theater, Again",2018-12-12,"Kevin D. Williamson, Kyle Smith, Andrew C. Mcc...",President Trump and Senate Minority Leader Chu...,President Trump and Senate Minority Leader Chu...,www.nationalreview.com,2,0.0,0.0,1.0
xpbjYTJYPdlw6HmJ,culture,Yahoo! The 360,1.0,https://news.yahoo.com/can-the-developing-worl...,Can the developing world endure the coronavirus?,2020-06-30,Mike Bebernes,“ The 360 ” shows you diverse perspectives on ...,“The 360” shows you diverse perspectives on th...,www.news.yahoo.com,1,0.0,1.0,0.0
k4SGI3GXarnz5dJl,elections,Politico,0.0,http://www.politico.com/story/2016/07/bernie-s...,Sanders’ California supporters can’t quite say...,2016-07-02,"Daniel Strauss, Henry C. Jackson, Nick Gass",LOS ANGELES — Actress Rosario Dawson took the ...,LOS ANGELES — Actress Rosario Dawson took the ...,www.politico.com,0,1.0,0.0,0.0
0jIpietfnrPRGHKQ,white_house,Business Insider,1.0,https://www.businessinsider.com/trump-distance...,Trump says he doesn't know if Rudy Giuliani is...,2019-10-11,Sonam Sheth,President Donald Trump said on Friday that he ...,President Donald Trump said on Friday that he ...,www.businessinsider.com,1,0.0,1.0,0.0
zMlSt7dyJvanHqJq,politics,CNN (Web News),0.0,http://www.cnn.com/2017/01/20/politics/donald-...,Trump's historic moment arrives,2017-01-20,Stephen Collinson,Washington ( CNN ) Donald Trump became the 45t...,Washington (CNN) Donald Trump became the 45th ...,www.cnn.com,0,1.0,0.0,0.0


In [13]:
biases = train_df[['bias', 'source_url']].groupby(by = 'source_url').agg(['min', 'max'])['bias']

In [14]:
sources_for_triplet = biases[biases['min'] != biases['max']].index
sources_for_triplet

Index(['www.aljazeera.com', 'www.blogs.wsj.com', 'www.bostonglobe.com',
       'www.cbsnews.com', 'www.chicago.suntimes.com', 'www.cnn.com',
       'www.csmonitor.com', 'www.dailymail.co.uk', 'www.foxnews.com',
       'www.marketwatch.com', 'www.nationalreview.com', 'www.news.yahoo.com',
       'www.npr.org', 'www.nypost.com', 'www.politico.com',
       'www.scientificamerican.com', 'www.theatlantic.com',
       'www.thedailybeast.com', 'www.thehill.com', 'www.theweek.com',
       'www.time.com', 'www.townhall.com', 'www.usatoday.com', 'www.vox.com',
       'www.washingtontimes.com', 'www.wsj.com'],
      dtype='object', name='source_url')

In [15]:
train_df.shape

(26590, 14)

In [16]:
triplet_df = train_df[train_df['source_url'].isin(sources_for_triplet)]
triplet_df.shape

(24294, 14)

# Dataset

In [17]:
class TripletDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = (df['title'] + ' ' + df['content']).values
        self.labels = df['bias_text'].values
        self.media = df['source_url'].values

    def __len__(self):
        return len(self.texts)

    def get_anchor(self, idx):
        # tokenization
        inputs = self.cfg.tokenizer.encode_plus(
            self.texts[idx], 
            return_tensors=None, 
            add_special_tokens=True, 
            max_length=CFG.max_len,
            pad_to_max_length=True,
            truncation=True
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        labels = torch.tensor(self.labels[idx], dtype=torch.float)
        return inputs, labels

    def get_pos(self, idx):
        media = self.media[idx]
        labels = self.labels[idx]
        idxs = np.arange(len(self.texts))[(self.labels == labels) & (self.media != media)]
        new_idx = np.random.choice(idxs)
        return self.get_anchor(new_idx)
    
    def get_neg(self, idx):
        media = self.media[idx]
        labels = self.labels[idx]
        idxs = np.arange(len(self.texts))[(self.labels != labels) & (self.media == media)]
        new_idx = np.random.choice(idxs)
        return self.get_anchor(new_idx)
        
    def __getitem__(self, idx):
        inputs_a, label_a = self.get_anchor(idx)
        inputs_p, label_p = self.get_pos(idx)
        inputs_n, label_n = self.get_neg(idx)
        return inputs_a, inputs_p, inputs_n

def collate(inputs):
		# reduce sequence length
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [18]:
class TransformerModel(nn.Module):

    def __init__(self):
        super(TransformerModel, self).__init__()

        self.model = AutoModel.from_pretrained(CFG.model)
        if CFG.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.clf =  nn.Linear(CFG.embedd_dim, CFG.classes)
        if not CFG.criterion == 'crossentropy':
            self.sm = nn.Softmax(dim=-1)
        torch.nn.init.xavier_uniform_(self.clf.weight)

    def forward(self, input_id, mask):
        # sequence has [CLF] token in the beginning
        # bert() returns first vector as pooling of sentence
        _, x = self.model(input_ids= input_id, attention_mask=mask, return_dict=False)
        # out = self.clf(x)
        # if not CFG.criterion == 'crossentropy':
        #     return self.sm(out)
        return x

# Train loop, metrics and other

In [19]:
from sklearn.metrics import f1_score, accuracy_score

In [20]:
def calculate_metrics(y, pred):
  y = torch.argmax(y, dim=1)
  pred = torch.argmax(pred, dim=1)
  f1 = f1_score(y, pred, average='macro')
  acc = accuracy_score(y, pred)
  return {'f1_macro': f1, 'acc': acc}

In [21]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "model" not in n],
         'lr': decoder_lr, 'weight_decay': 0.0}
    ]
    return optimizer_parameters

def get_scheduler(cfg, optimizer, num_train_steps):
    if cfg.scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    elif cfg.scheduler == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    return scheduler

In [22]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, input, target):
        target = target.view(-1,1)

        logpt = F.log_softmax(input)
        logpt = logpt.gather(1,target)
        logpt = logpt.view(-1)
        pt = torch.exp(logpt)

        loss = -1 * (1-pt)**self.gamma * logpt
        if self.reduction == 'mean': 
          return loss.mean()
        elif self.reduction == 'sum':
          return loss.sum()
        else:
          raise NotImplementedError(f'Not implemented reduction: {self.reduction}')

In [23]:
class TripletLoss(nn.Module):
    def __init__(self, eps=1e-6):
        super(TripletLoss, self).__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.relu = nn.ReLU()
        self.eps = eps

    def forward(self, pred_a, pred_p, pred_n):
        ap = torch.sqrt(self.mse(pred_a, pred_p).sum(dim=-1))
        an = torch.sqrt(self.mse(pred_a, pred_n).sum(dim=-1))
        return self.relu(ap - an + self.eps).mean()

In [24]:
trip_ds = TripletDataset(CFG, triplet_df)

In [25]:
train_loader = torch.utils.data.DataLoader(trip_ds, batch_size=CFG.batch_size, shuffle=True)

In [26]:
model = TransformerModel()

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
criterion = TripletLoss()

In [28]:
#optimizer_parameters = get_optimizer_params(model,
#                                            encoder_lr=CFG.encoder_lr, 
#                                            decoder_lr=CFG.decoder_lr,
#                                            weight_decay=CFG.weight_decay)
#optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

In [29]:
optimizer = AdamW(model.parameters(), lr=CFG.encoder_lr, weight_decay=1e-4, eps=CFG.eps, betas=CFG.betas)

In [30]:
num_train_steps = int(len(train_df) / CFG.batch_size * CFG.epochs)
scheduler = get_scheduler(CFG, optimizer, num_train_steps)

In [31]:
model.to(CFG.device)
criterion.to(CFG.device)

TripletLoss(
  (mse): MSELoss()
  (relu): ReLU()
)

In [32]:
def train_loop(model, optimizer, criterion, train_loader, epochs, scheduler):
  best_score = 0
  # multiplies gradient so it won't vanish (torch use float16)
  scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
  for e in range(epochs):
    # train
    model.train()
    train_loss = []
    for inputs_a, inputs_p, inputs_n in tqdm(train_loader):
      inputs_a = collate(inputs_a)
      inputs_p = collate(inputs_p)
      inputs_n = collate(inputs_n)
      # move inputs to device
      mask_a = inputs_a['attention_mask'].to(CFG.device)
      input_id_a = inputs_a['input_ids'].squeeze(1).to(CFG.device)
      mask_p = inputs_p['attention_mask'].to(CFG.device)
      input_id_p = inputs_p['input_ids'].squeeze(1).to(CFG.device)
      mask_n = inputs_n['attention_mask'].to(CFG.device)
      input_id_n = inputs_n['input_ids'].squeeze(1).to(CFG.device)
      # forward
      with torch.cuda.amp.autocast(enabled=CFG.apex):
          pred_a = model(input_id_a, mask_a)
          pred_p = model(input_id_p, mask_p)
          pred_n = model(input_id_n, mask_n)
          loss = criterion(pred_a, pred_p, pred_n)
      # calculate loss
      train_loss.append(loss.detach().cpu().item())

      optimizer.zero_grad()
      scaler.scale(loss).backward()
      # gradient clipping
      grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
      scaler.step(optimizer)
      scaler.update()
      if CFG.batch_scheduler:
          scheduler.step()
    train_loss = np.mean(train_loss)
    print(f'EPOCH {e + 1}:, train_loss = {train_loss}')
    

In [33]:
train_loop(model, optimizer, criterion, train_loader, CFG.epochs, scheduler)

100%|██████████| 3037/3037 [1:01:56<00:00,  1.22s/it]


EPOCH 1:, train_loss = 0.013815938194993918


100%|██████████| 3037/3037 [1:01:53<00:00,  1.22s/it]


EPOCH 2:, train_loss = 7.812964524129023e-05


100%|██████████| 3037/3037 [1:01:52<00:00,  1.22s/it]


EPOCH 3:, train_loss = 4.8298887953777385e-05


100%|██████████| 3037/3037 [1:01:51<00:00,  1.22s/it]


EPOCH 4:, train_loss = 4.744101919715035e-05


100%|██████████| 3037/3037 [1:01:51<00:00,  1.22s/it]

EPOCH 5:, train_loss = 4.417610556736843e-05





In [35]:
torch.save(model.state_dict(), CFG.model_file)

# Test

In [29]:
def test(model, test_loader):
    model.eval()
    with torch.no_grad():
      test_loss = []
      preds = []
      y = []
      for step, (inputs, labels) in enumerate(test_loader):
        inputs = collate(inputs)
        # move inputs to device
        mask = inputs['attention_mask'].to(CFG.device)
        input_id = inputs['input_ids'].squeeze(1).to(CFG.device)
        labels = labels.to(CFG.device)
        # forward
        # with torch.cuda.amp.autocast(enabled=CFG.apex):
        y_preds = model(input_id, mask)
        loss = criterion(y_preds, labels)
        preds.append(y_preds.detach().cpu())
        y.append(labels.detach().cpu())
        # calculate loss
        test_loss.append(loss.detach().cpu().item())
    test_loss = np.mean(test_loss)
    preds = torch.concat(preds, dim=0)
    y = torch.concat(y, dim=0)
    metrics = calculate_metrics(y, preds)

    print(f'Test metrics: test_loss={test_loss}', *[f'{name} = {value}' for name, value in metrics.items()])

In [30]:
model.load_state_dict(torch.load(CFG.model_file))

<All keys matched successfully>

In [31]:
test_df = pd.read_csv(CFG.test_df)
test_df = df.loc[test_df['ID']]
test_ds = TrainDataset(CFG, test_df)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=CFG.batch_size, shuffle=False)

In [32]:
test(model, valid_loader)

Test metrics: test_loss=0.4329216697731534 f1_macro = 0.2260958910521593 acc = 0.2593378607809847


In [33]:
test(model, test_loader)

Test metrics: test_loss=0.2522725470182372 f1_macro = 0.4679760415932736 acc = 0.5246153846153846
