In [1]:
import math
import torch
import pandas as pd
from torch import nn
from torch.utils.data import DataLoader
from sentence_transformers.readers import InputExample
from sentence_transformers import SentenceTransformer, models
from sentence_transformers.evaluation import BinaryClassificationEvaluator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np
import torch.nn.functional as F
from sentence_transformers import losses, util

## Loading Data

In [3]:
# Loading data
train_df = pd.read_csv('data/train_df.csv')
val_df = pd.read_csv('data/val_df.csv')
train_df.score = (train_df.score >= 0.5).astype(int)
val_df.score = (val_df.score >= 0.5).astype(int)
train_df.shape, val_df.shape

((27383, 4), (9090, 4))

## Preparing Data

In [4]:
def prepare_data(train_df, val_df, include_context=False):
    train_samples = []
    val_samples = []
    
    for i, row in train_df.iterrows():
        texts = [row['anchor'], row['target']] 
        if include_context:
            texts += [row['context'],]
        inp_example = InputExample(texts=texts, label=row['score'])
        train_samples.append(inp_example)

    for i, row in val_df.iterrows():
        texts = [row['anchor'], row['target']]
        if include_context:
            texts += [row['context'],]
        inp_example = InputExample(texts=texts, label=row['score'])
        val_samples.append(inp_example)            
    return train_samples, val_samples

# Bi Encoder

In [5]:
model_name = 'distilbert-base-uncased'
bs = 32
num_epochs = 30

In [6]:
def get_model(pool=True):
    word_embedding_model = models.Transformer(model_name, do_lower_case=True)
    if not pool:
        # word_embedding_model.auto_model.config.output_hidden_states = True
        model = SentenceTransformer(modules=[word_embedding_model,])
        return model
        
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    return model    

In [47]:
from sentence_transformers.evaluation import SentenceEvaluator
from sentence_transformers.util import batch_to_device

class LabelAccuracyEvaluator(SentenceEvaluator):
    def __init__(self, dataloader: DataLoader, model = None):
        self.dataloader = dataloader
        self.model = model

    def __call__(self) -> float:
        self.model.eval()
        total = 0
        correct = 0

        self.dataloader.collate_fn = self.model.model.smart_batching_collate
        device = self.model.model.device
        for step, batch in enumerate(self.dataloader):
            features, label_ids = batch
            for idx in range(len(features)):
                features[idx] = batch_to_device(features[idx], device)
            label_ids = label_ids.to(device)
            with torch.no_grad():
                _, prediction = self.model(features, labels=None)

            total += prediction.size(0)
            correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item()
        accuracy = correct/total

        print(f"Accuracy: {accuracy:.4f} ({correct}/{total})\n")
        return accuracy

### Without Context

In [7]:
# preparing data
train_samples, val_samples = prepare_data(train_df, val_df, include_context=False)
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=bs)
val_dataloader = DataLoader(val_samples, shuffle=False, batch_size=bs)

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

In [8]:
class ClassifierHead(nn.Module):
    def __init__(self, model, emb_dim: int, num_labels: int, loss_fct=nn.BCEWithLogitsLoss()):
        super().__init__()
        self.model = model
        self.num_labels = num_labels
        self.classifier = nn.Linear(2*emb_dim, num_labels)
        self.loss_fct = loss_fct

    def forward(self, sentence_features, labels):
        reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features]
        rep_a, rep_b = reps
        features = torch.cat([rep_a, rep_b], 1)
        output = self.classifier(features)

        if labels is not None:
            loss = self.loss_fct(output.view(-1), labels.float())
            return loss
        else:
            return reps, output

In [9]:
model = get_model()
train_loss = ClassifierHead(model=model, num_labels=1, emb_dim=model.get_sentence_embedding_dimension())

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
%%time

model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          show_progress_bar=False)

CPU times: user 22min 11s, sys: 7min 19s, total: 29min 30s
Wall time: 19min 4s


In [11]:
test_evaluator = LabelAccuracyEvaluator(val_dataloader, train_loss)
test_evaluator()

0.5784196241218242

### With Context

In [12]:
train_samples, val_samples = prepare_data(train_df, val_df, include_context=True)
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=bs)
val_dataloader = DataLoader(val_samples, shuffle=False, batch_size=bs)

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

### 1. Concatenation

In [13]:
class ClassifierHeadConcatContext(nn.Module):
    def __init__(self, model, emb_dim: int, num_labels: int, loss_fct=nn.BCEWithLogitsLoss()):
        super().__init__()
        self.model = model
        self.num_labels = num_labels
        self.classifier = nn.Linear(3*emb_dim, num_labels)
        self.loss_fct = loss_fct

    def forward(self, sentence_features, labels):
        reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features]
        rep_a, rep_b, rep_c = reps
        features = torch.cat([rep_a, rep_b, rep_c], 1)
        output = self.classifier(features)

        if labels is not None:
            loss = self.loss_fct(output.view(-1), labels.float())
            return loss
        else:
            return reps, output

In [14]:
model = get_model()
train_loss = ClassifierHeadConcatContext(model=model, num_labels=1, emb_dim=model.get_sentence_embedding_dimension())

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
%%time

model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          show_progress_bar=False)

CPU times: user 38min 56s, sys: 12min 14s, total: 51min 11s
Wall time: 35min 30s


In [16]:
test_evaluator = LabelAccuracyEvaluator(val_dataloader, train_loss)
test_evaluator()

0.5693061451101133

### 2. Element-wise Addition

In [17]:
class ClassifierHeadAddContext(nn.Module):
    def __init__(self, model, emb_dim: int, num_labels: int, loss_fct=nn.BCEWithLogitsLoss()):
        super().__init__()
        self.model = model
        self.num_labels = num_labels
        self.classifier = nn.Linear(emb_dim, num_labels)
        self.loss_fct = loss_fct

    def forward(self, sentence_features, labels):
        reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features]
        rep_a, rep_b, rep_c = reps
        features = (rep_a + rep_b + rep_c)/3
        output = self.classifier(features)

        if labels is not None:
            loss = self.loss_fct(output.view(-1), labels.float())
            return loss
        else:
            return reps, output

In [18]:
model = get_model()
train_loss = ClassifierHeadAddContext(model=model, num_labels=1, emb_dim=model.get_sentence_embedding_dimension())

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
%%time

model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          show_progress_bar=False)

CPU times: user 38min 59s, sys: 12min 19s, total: 51min 19s
Wall time: 35min 33s


In [20]:
test_evaluator = LabelAccuracyEvaluator(val_dataloader, train_loss)
test_evaluator()

0.5519564366994452

### 3. Attention Mechanism

In [21]:
class ClassifierHeadAttentionContext(nn.Module):
    def __init__(self, model, emb_dim: int, num_labels: int, loss_fct=nn.BCEWithLogitsLoss()):
        super().__init__()
        self.model = model
        self.num_labels = num_labels
        self.ln = nn.LayerNorm(emb_dim)
        self.multihead_attn = nn.MultiheadAttention(emb_dim, 2, batch_first=True)
        self.classifier = nn.Linear(2*emb_dim, num_labels)
        self.loss_fct = loss_fct

    def forward(self, sentence_features, labels):
        reps = [self.model(sentence_feature)['token_embeddings'] for sentence_feature in sentence_features]
        rep_a, rep_b, rep_c = reps
        
        rep_a = self.ln(rep_a)
        rep_b = self.ln(rep_b)
        rep_c = self.ln(rep_c)
        rep_attn_a, _ = self.multihead_attn(rep_a, rep_c, rep_c)
        rep_attn_b, _ = self.multihead_attn(rep_b, rep_c, rep_c)
        rep_attn_a = self.ln(rep_attn_a)
        rep_attn_b = self.ln(rep_attn_b)
        
        rep_attn_a = rep_attn_a.mean(1)
        rep_attn_b = rep_attn_b.mean(1)
        features = torch.concat([rep_attn_a, rep_attn_b], 1)

        output = self.classifier(features)

        if labels is not None:
            loss = self.loss_fct(output.view(-1), labels.float())
            return loss
        else:
            return reps, output

In [22]:
model = get_model(pool=False)
train_loss = ClassifierHeadAttentionContext(model=model, num_labels=1, emb_dim=768)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
%%time

model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          show_progress_bar=False)

CPU times: user 40min 20s, sys: 12min 41s, total: 53min 1s
Wall time: 37min 22s


In [49]:
test_evaluator = LabelAccuracyEvaluator(val_dataloader, train_loss)
test_evaluator()

Accuracy: 0.5271 (4791/9090)



0.527062706270627