In [1]:
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.cuda.amp as amp
from torch.utils.tensorboard import SummaryWriter

from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup

In [2]:
# Device: GPU
device=torch.device("cuda:0")

# Hyperparams
max_sent_len=256
batch_size=16
accum_steps=1
lr=5e-5
epochs=5

In [3]:
# Pre-Trained Tokenizer
tokenizer=AutoTokenizer.from_pretrained("roberta-base")
# Pre-Trained LM
pretrained=AutoModel.from_pretrained("roberta-base").to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
class STSBenchmark(Dataset):
    def __init__(self, path):
        self.data=[]
        self.label=[]
        
        # Read Dataset
        with open(path, "r") as f:
            data=f.read().split("\n")
            f.close()
        # Remove Empty Data
        data.remove("")
        
        for _data in data:
            label, sent0, sent1=_data.split("\t")[4:7]
            
            # Encode Sentence
            enc0=tokenizer.encode(sent0, truncation=True, max_length=max_sent_len)
            enc1=tokenizer.encode(sent1, truncation=True, max_length=max_sent_len)
            
            # Append Data
            self.data.append(enc0[:-1]+[tokenizer.sep_token_id]+enc1[1:])
            self.label.append(float(label))
            
        print(len(self.data), "data")
            
    def __getitem__(self, idx):
        return self.data[idx], self.label[idx]
    
    def __len__(self):
        return len(self.data)

In [5]:
def collate_fn(batch):
    max_seq_len=0
    for data, _ in batch:
        if len(data)>max_seq_len: max_seq_len=len(data)
            
    batch_data=[]
    batch_label=[]
    for data, label in batch:
        data.extend([tokenizer.pad_token_id]*(max_seq_len-len(data)))
        batch_data.append(data)
        
        batch_label.append(label)
        
    return torch.tensor(batch_data), torch.tensor(batch_label)

In [6]:
# STS Benchmark Train Set
dataset_train=STSBenchmark(path="../dataset/stsbenchmark/sts-train.csv")
dataloader_train=DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

5749 data


In [7]:
class CrossEncoder(nn.Module):
    def __init__(self, pretrained):
        super().__init__()
        
        # Pre-Trained LM
        self.pretrained=pretrained
        # Pooling Layer: MLP
        self.pooler=nn.Linear(pretrained.config.hidden_size, 1)
        
    def forward(self, x):
        x=self.pretrained(x)
        cls=x.last_hidden_state[:,0,:]
        return self.pooler(cls)

In [8]:
# Model: Cross-Encoder
model=CrossEncoder(pretrained=pretrained).to(device)
model.train()

# Loss: MSE
mse_loss=nn.MSELoss()

# Optimizer, Scheduler
optimizer=AdamW(model.parameters(), lr=lr)
scheduler=get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=int(epochs*len(dataset_train)/(accum_steps*batch_size))
)

# Mixed Precision: GradScaler
scaler=amp.GradScaler()

# Tensorboard
writer=SummaryWriter()

step_global=0
for epoch in range(epochs):
    _loss=0
    optimizer.zero_grad()
    
    for step, (data, label) in enumerate(dataloader_train):
        # Load Data, Label
        data=data.to(device)
        label=label.to(device)
        
        # Forward
        with amp.autocast():
            pred=model(data)
            loss=mse_loss(pred, label.unsqueeze(-1))/accum_steps
        # Backward
        scaler.scale(loss).backward()
        _loss+=loss.item()
        
        # Step
        if (step+1)%accum_steps==0:
            step_global+=1
            
            # Tensorboard
            writer.add_scalar(
                f'loss_train/Cross-Encoder_batch{int(accum_steps*batch_size)}_lr{lr}_epochs{epochs}',
                _loss,
                step_global
            )
            _loss=0
            
            # Optimizer, Scheduler
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()
            
    # Save Model
    model.to(torch.device('cpu'))
    torch.save(
        model,
        f'../model/Cross-Encoder_batch{int(accum_steps*batch_size)}_lr{lr}_epoch{epoch+1}of{epochs}'
    )
    model.to(device)



### Evaluate

In [1]:
import torch
import torch.nn as nn

from transformers import AutoTokenizer

import numpy as np
from scipy import spatial, stats

In [2]:
class CrossEncoder(nn.Module):
    def __init__(self, pretrained):
        super().__init__()
        
        # Pre-Trained LM
        self.pretrained=pretrained
        # Pooling Layer: MLP
        self.pooler=nn.Linear(pretrained.config.hidden_size, 1)
        
    def forward(self, x):
        x=self.pretrained(x)
        cls=x.last_hidden_state[:,0,:]
        return self.pooler(cls)

In [3]:
# Device: GPU
device=torch.device("cuda:3")

# Pre-Trained Tokenizer
tokenizer=AutoTokenizer.from_pretrained("roberta-base")

# Load Trained Model: Cross-Encoder
model=torch.load("../model/Cross-Encoder_batch16_lr5e-05_epoch5of5").to(device)
model.eval()
print("Loaded!")

Loaded!


In [4]:
# STS Benchmark Test Set
with open('../dataset/stsbenchmark/sts-test.csv', 'r') as f:
    stsb_test=f.read()
    f.close()
    
preds=[]
labels=[]
for data in stsb_test.split("\n")[:-1]:
    label, sent0, sent1=data.split("\t")[4:7]
    labels.append(float(label))
    
    # Encode Sentence
    enc0=tokenizer.encode(sent0)
    enc1=tokenizer.encode(sent1)
    
    # Forward
    input_=torch.tensor([enc0[:-1]+[tokenizer.sep_token_id]+enc1[1:]])
    pred=model(input_.to(device))
    
    preds.append(pred[0].item())

In [5]:
np.corrcoef(preds, labels)

array([[1.        , 0.89041308],
       [0.89041308, 1.        ]])

In [6]:
stats.spearmanr(preds, labels)

SpearmanrResult(correlation=0.8837970016428108, pvalue=0.0)