In [17]:
!cp -r 'drive/MyDrive/nla/FAVORDistilbert' '.'

In [3]:
!git clone https://github.com/anordertoreclaim/transformers-plus-performers.git

Cloning into 'transformers-plus-performers'...
remote: Enumerating objects: 54506, done.[K
remote: Total 54506 (delta 0), reused 0 (delta 0), pack-reused 54506[K
Receiving objects: 100% (54506/54506), 41.21 MiB | 29.90 MiB/s, done.
Resolving deltas: 100% (38144/38144), done.


In [4]:
cd transformers-plus-performers/

/content/transformers-plus-performers


In [5]:
!git checkout 1fa01a234da02ca67a05febb2eb12e6d8ee8c5bd

Note: checking out '1fa01a234da02ca67a05febb2eb12e6d8ee8c5bd'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by performing another checkout.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -b with the checkout command again. Example:

  git checkout -b <new-branch-name>

HEAD is now at 1fa01a23 Aligned Albert's self-attention parameters with Performer's one


In [6]:
!pip install .

Processing /content/transformers-plus-performers
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 8.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 42.0MB/s 
Building wheels for collected packages: transformers
  Building wheel for transformers (PEP 517) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.1.0.dev0-cp36-none-any.whl size=1461368 sha256=28e5c76852405c1f45c904055a4981afae520575ebe668ebac

In [7]:
cd ..

/content


In [8]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |▎                               | 10kB 26.8MB/s eta 0:00:01[K     |▋                               | 20kB 31.9MB/s eta 0:00:01[K     |▉                               | 30kB 11.4MB/s eta 0:00:01[K     |█▏                              | 40kB 8.7MB/s eta 0:00:01[K     |█▌                              | 51kB 7.8MB/s eta 0:00:01[K     |█▊                              | 61kB 7.6MB/s eta 0:00:01[K     |██                              | 71kB 8.6MB/s eta 0:00:01[K     |██▍                             | 81kB 8.6MB/s eta 0:00:01[K     |██▋                             | 92kB 8.9MB/s eta 0:00:01[K     |███                             | 102kB 9.1MB/s eta 0:00:01[K     |███▎                            | 112kB 9.1MB/s eta 0:00:01[K     |███▌                

In [9]:
import pandas as pd
import numpy as np
import torch
import argparse
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from torch import nn
import torch.nn.functional as F
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm
from functools import partial
from scipy.optimize import minimize

import os

In [18]:
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertConfig
config = DistilBertConfig.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [19]:
use_favor = True
BATCH_SIZE = 30
EPOCHS = 5

In [20]:
if use_favor:
  config.attention_type = 'performer'
  pretrained = 'FAVORDistilbert'
else:
  pretrained = 'distilbert-base-uncased'

In [33]:
class PairsDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
    
    def __getitem__(self, id):
        row = self.data.iloc[id]
        q1, q2 = row.question1, row.question2
        q1 = torch.LongTensor(self.tokenizer.encode(q1))
        q2 = torch.LongTensor(self.tokenizer.encode(q2))
        target = row.is_duplicate
        
        return q1, q2, target
    
    def __len__(self):
        return len(self.data)
    

def collate_fn(batch):
    q1, q2, targets = zip(*batch)
    q1 = pad_sequence(q1, batch_first=True)
    q2 = pad_sequence(q2, batch_first=True)
    targets = torch.LongTensor(targets)
    
    return q1, q2, targets

train = pd.read_csv('data/train.csv')
train.loc[:, 'question1'][train.loc[:, 'question1'].isna()] = 'nan'
train.loc[:, 'question2'][train.loc[:, 'question2'].isna()] = 'nan'
seed = 42
train_ratio = 0.8
thresh_ratio = 0.35

train, val = train_test_split(train, train_size=train_ratio, random_state=seed)
thresh, val = train_test_split(val, train_size=thresh_ratio, random_state=seed)

train_loader = DataLoader(PairsDataset(train, tokenizer), 
                          batch_size=BATCH_SIZE, 
                          shuffle=True,
                          drop_last=True,
                          collate_fn=collate_fn)

thresh_loader = DataLoader(PairsDataset(thresh, tokenizer), 
                           batch_size=BATCH_SIZE,
                           collate_fn=collate_fn)

val_loader = DataLoader(PairsDataset(val, tokenizer), 
                        batch_size=BATCH_SIZE,
                        collate_fn=collate_fn)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [35]:
class SiameseTransformer(nn.Module):
    def __init__(self, model, d_model, reduction='mean'):
        super(SiameseTransformer, self).__init__()
        self.transformer = transformer
        self.classifier = nn.Linear(3 * d_model, 2)
        assert reduction in ['mean', 'cls', 'max'], 'Invalid reduction mode'
        self.reduction = reduction
        self.dropout = nn.Dropout(0.2)
        
    def _embed(self, x, mask):
        x = self.transformer(x, mask).last_hidden_state

        if self.reduction == 'mean':
            x = x.masked_fill(~mask[..., None], 0).sum(1) / mask.sum(1)[:, None]
        elif self.reduction == 'max':
            x = x.masked_fill(~mask[..., None], 0).max(1)[0]
        else:
            x = x[:, 0, :]
        return x
        
    def forward(self, q1, mask1, q2, mask2):
        q1 = self._embed(q1, mask1)
        q2 = self._embed(q2, mask2)
        q1 = self.dropout(q1)
        q2 = self.dropout(q2)
        
        embedding = torch.cat([q1, q2, torch.abs(q1 - q2)], axis=-1)
        logits = self.classifier(embedding)
        return logits
    
    def compute_similarity(self, q1, mask1, q2, mask2):
        q1 = self._embed(q1, mask1)
        q2 = self._embed(q2, mask2)
        sim = F.cosine_similarity(q1, q2)
        return sim


class OptimizedRounder:
    """
    An optimizer for rounding thresholds
    to maximize F1 score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
    """
    def __init__(self, initial_coef=0.5):
        self.thresh = None
        self.initial_coef = initial_coef

    def _f_score_loss(self, threshold, similarities, trues):
        """
        Get loss according to
        using current coefficients
        
        :param threshold: Prediction threshold
        :param similarities: Predicted cosine similarities
        :param true: The ground truth labels
        """
        preds = pd.cut(similarities, [-np.inf] + list(threshold) + [np.inf], labels = [0, 1])

        return -f1_score(trues, preds)

    def fit(self, similarities, trues):
        """
        Optimize prediction threshold
        
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        loss_partial = partial(self._f_score_loss, similarities=similarities, trues=trues)
        self.thresh = minimize(loss_partial, self.initial_coef, method='nelder-mead')['x']

    def predict(self, similarities):
        """
        Make predictions with optimized threshold
        
        :param threshold: Prediction threshold
        :param similarities: Predicted cosine similarities
        """
        return pd.cut(similarities, [-np.inf] + list(self.thresh) + [np.inf], labels = [0, 1])

In [36]:
transformer = DistilBertModel.from_pretrained(pretrained, config=config)
model = SiameseTransformer(transformer, 768, reduction='cls')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
criterion = torch.nn.CrossEntropyLoss()
model.to(device)

SiameseTransformer(
  (transformer): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): PerformerAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(

In [39]:
best_f1 = 0

for epoch in range(1, EPOCHS):
    model.train()
    losses = []
    for q1, q2, targets in tqdm(train_loader):
        q1, q2, targets = (x.to(device) for x in (q1, q2, targets))
    
        mask1, mask2 = q1 != 0, q2 != 0
        logits = model(q1, mask1, q2, mask2)
        loss = criterion(logits, targets)
        losses.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print('Average loss', np.mean(losses))
    model.eval()
    with torch.no_grad():
        similarities = []
        trues = []
        
        for q1, q2, targets in thresh_loader:
            q1, q2 = q1.to(device), q2.to(device)
            mask1, mask2 = q1 != 0, q2 != 0
            sim = model.compute_similarity(q1, mask1, q2, mask2)
            similarities.extend(sim.cpu().numpy().tolist())
            trues.extend(targets.numpy().tolist())
        
        threshold_tuner = OptimizedRounder(0.5)
        threshold_tuner.fit(similarities, trues)
        
        similarities = []
        trues = []
        
        for q1, q2, targets in val_loader:
            q1, q2 = q1.to(device), q2.to(device)
            mask1, mask2 = q1 != 0, q2 != 0
            sim = model.compute_similarity(q1, mask1, q2, mask2)
            similarities.extend(sim.cpu().numpy().tolist())
            trues.extend(targets.numpy().tolist())
        
        preds = threshold_tuner.predict(similarities)
        f1 = f1_score(trues, preds)
        print(f'Epoch: {epoch + 1}, f1: {f1}')
        
        if f1 > best_f1:
            best_f1 = f1   
            torch.save({'model': model.state_dict(),
                        'f1': f1,
                        'threshold': threshold_tuner.thresh},
                        f'epoch{epoch + 1}.pth')

HBox(children=(FloatProgress(value=0.0, max=10781.0), HTML(value='')))


Average loss 0.40602461274621016
Epoch: 2, f1: 0.734335613971912


HBox(children=(FloatProgress(value=0.0, max=10781.0), HTML(value='')))


Average loss 0.3809136891292349
Epoch: 3, f1: 0.7479143834850485


HBox(children=(FloatProgress(value=0.0, max=10781.0), HTML(value='')))


Average loss 0.36349707449751895
Epoch: 4, f1: 0.7538688612577155


HBox(children=(FloatProgress(value=0.0, max=10781.0), HTML(value='')))


Average loss 0.34993410423534854
Epoch: 5, f1: 0.7584152252212193


In [38]:
 torch.save({'model': model.state_dict(),
                        'f1': f1,
                        'threshold': threshold_tuner.thresh},
                        f'epoch{epoch + 1}.pth')