In [None]:
!pip install transformers

In [2]:
import pandas as pd
import torch
import re
import numpy as np

from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from transformers import AutoTokenizer, AutoModel
from transformers import get_scheduler

from tqdm import tqdm

from sklearn.metrics import accuracy_score

In [None]:
# Download the RUSSE task archive from the Russian SuperGLUE site.
# NOTE(review): --no-check-certificate turns off TLS certificate validation for
# this download — confirm that this is still required.
!wget --no-check-certificate https://russiansuperglue.com/tasks/download/RUSSE

In [None]:
# Extract the downloaded archive into the WiC/ directory.
!unzip RUSSE -d WiC

In [5]:
# Every split is a JSON Lines file (one record per line), so the same reader
# options apply to all three of them.
jsonl_kwargs = {'orient': 'records', 'lines': True}
train = pd.read_json('/content/WiC/RUSSE/train.jsonl', **jsonl_kwargs)
val = pd.read_json('/content/WiC/RUSSE/val.jsonl', **jsonl_kwargs)
test = pd.read_json('/content/WiC/RUSSE/test.jsonl', **jsonl_kwargs)

# Number of rows per split.
for title, frame in [('Train size:', train), ('Val size:', val), ('Test size:', test)]:
    print(title, len(frame))
print('\n')

# Label balance of the two splits whose `label` column is read here.
for title, frame in [('Train labels counts\n', train), ('Eval labels counts\n', val)]:
    print(title, frame['label'].value_counts().to_dict(), '\n')

Train size: 19845
Val size: 8505
Test size: 18892


Train labels counts
 {False: 12653, True: 7192} 

Eval labels counts
 {False: 5366, True: 3139} 



# Model

In [None]:
# The tokenizer and the encoder are loaded from the same checkpoint.
# output_hidden_states=True makes the encoder return the hidden states of every
# layer, which the classification head further down sums up.
checkpoint = "ai-forever/ruBert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint, output_hidden_states=True)

In [7]:
# Longest training sentence measured in token ids, with the `[CLS]` and `[SEP]`
# special tokens included. Only the train split is scanned.
all_sentences = pd.concat([train['sentence1'], train['sentence2']])
max_len = max(
    (len(tokenizer.encode(sentence, add_special_tokens=True)) for sentence in all_sentences),
    default=0,
)

print('Max sentence length: ', max_len)

Max sentence length:  71


In [8]:
def convert_ids_to_mask(text, start, end, max_length=False, debug=False):
    """Mark the word pieces of ``text`` that overlap the character span ``[start, end)``.

    The sentence is tokenized with the notebook-level ``tokenizer``. Each word
    piece is then aligned back to character offsets by walking through the
    text one character at a time; a token gets ``1.0`` when at least one of its
    characters lies inside the span and ``0.0`` otherwise. The first and the
    last token (the special tokens added by the tokenizer) never carry text.

    Args:
        text: Sentence that contains the target word.
        start: Character offset where the target span begins (inclusive).
        end: Character offset where the target span ends (exclusive).
        max_length: When truthy, the mask is zero-padded or truncated to exactly
            this many entries. The default ``False`` keeps the natural length.
        debug: When true, print one ``token mask`` line per token and return
            ``None`` instead of the mask.

    Returns:
        A list of floats with one entry per token (or ``max_length`` entries),
        or ``None`` when ``debug`` is true.
    """
    # The tokens printed by the debug mode are lower-cased and the tokens are
    # compared against 'и'/'е' — presumably the tokenizer strips diacritics
    # (TODO confirm). The folding is therefore applied AFTER lower-casing as
    # well, so that upper-case 'Й'/'Ё' do not derail the alignment. Both
    # replacements are one-to-one, so character offsets stay valid.
    text = text.replace('й', 'и').replace('ё', 'е')
    lowered = text.lower().replace('й', 'и').replace('ё', 'е')
    text_len = len(lowered)

    input_ids = tokenizer(text, return_tensors='pt')['input_ids']
    tokens = tokenizer.convert_ids_to_tokens(input_ids.reshape(-1))
    last = len(tokens) - 1

    # NOTE(review): a token whose spelling does not occur in the text (for
    # example an unknown-token placeholder) contributes no characters, and the
    # cursor then only advances by the single skip step below — confirm that
    # such sentences are rare enough to ignore.
    n = 0          # cursor into `lowered`
    symbols = []   # per token: the character offsets it covers
    for t, token in enumerate(tokens):
        if t == 0 or t == last:
            # Special tokens get an offset that a non-negative span never contains.
            symbols.append([-1])
            continue

        symbol = []
        for s in token.replace('#', ''):
            # The bounds check keeps a trailing mismatch from raising IndexError.
            if n < text_len and lowered[n] == s:
                symbol.append(n)
                n += 1
        symbols.append(symbol)

        if n < text_len:
            # Skip one separator character (typically a space) unless the next
            # token starts right here. A token that consists only of '#' is
            # empty after stripping and is treated like a mismatch.
            next_piece = tokens[t + 1].replace('#', '')
            if not next_piece or lowered[n] != next_piece[0]:
                n += 1

    mask = [1. if any(start <= i < end for i in symbol) else 0. for symbol in symbols]

    if debug:
        for t, m in zip(tokens, mask):
            print(f'{t:<15} {m}')
        return None

    if max_length:
        if max_length > len(mask):
            mask.extend([0.] * (max_length - len(mask)))
        else:
            mask = mask[:max_length]
    return mask

# Sanity check: offsets 57..67 cover the word 'деревеньку' (plus the space in
# front of it), so only the word pieces of that word should be marked with 1.0.
convert_ids_to_mask('Солнце стояло уже высоко, когда справа от дороги я увидел деревеньку дворов в пятнадцать', 57, 68, debug=True)

[CLS]           0.0
солнце          0.0
стояло          0.0
уже             0.0
высоко          0.0
,               0.0
когда           0.0
справа          0.0
от              0.0
дороги          0.0
я               0.0
увидел          0.0
деревень        1.0
##ку            1.0
дворов          0.0
в               0.0
пятнадцать      0.0
[SEP]           0.0


In [9]:
class TrainDataset(Dataset):
    """Labelled sentence pairs for training and evaluation.

    Each item is a dict of tensors: token ids, attention mask and target-word
    mask for both sentences, plus a ``labels`` tensor holding ``1.`` for a true
    label and ``-1.`` for a false one (the encoding the hinge loss expects).

    Args:
        df: DataFrame with the columns ``idx``, ``sentence1``, ``sentence2``,
            ``start1``, ``end1``, ``start2``, ``end2`` and ``label``.
        max_length: Positive number of tokens every sequence and word mask is
            padded or truncated to. Defaults to 71, the value previously
            hard-coded here.
    """

    def __init__(self, df, max_length=71):
        self.df = df.reset_index(drop=True).drop(columns='idx')
        self.max_length = max_length

    def tokenize(self, text):
        """Tokenize ``text`` into fixed-length PyTorch tensors."""
        return tokenizer(text, return_tensors='pt', padding='max_length',
                         truncation=True, max_length=self.max_length)

    def __len__(self):
        return self.df.shape[0]

    def _sentence_features(self, row, suffix):
        """Build the ids / attention mask / word mask entries for sentence ``suffix``."""
        sentence = row[f'sentence{suffix}']
        encoding = self.tokenize(sentence)
        word_mask = convert_ids_to_mask(sentence, row[f'start{suffix}'], row[f'end{suffix}'],
                                        max_length=self.max_length)
        # Read the fields by key: positional unpacking of `.values()` silently
        # breaks as soon as the tokenizer returns its fields in another order
        # or returns an additional one.
        return {
            f'input_ids_{suffix}': encoding['input_ids'].reshape(-1),
            f'attention_mask_{suffix}': encoding['attention_mask'].reshape(-1),
            f'word{suffix}_mask': torch.tensor(word_mask),
        }

    def __getitem__(self, index):
        row = self.df.iloc[index, :]
        output = {}
        output.update(self._sentence_features(row, 1))
        output.update(self._sentence_features(row, 2))
        output['labels'] = torch.tensor([1.] if row['label'] else [-1.])
        # `device` is a notebook-level global; it has to be defined before the
        # first item is requested.
        return {k: v.to(device) for k, v in output.items()}

class TestDataset(Dataset):
    """Unlabelled sentence pairs for inference.

    Each item is a dict of tensors: token ids, attention mask and target-word
    mask for both sentences. No ``labels`` entry is produced.

    Args:
        df: DataFrame with the columns ``sentence1``, ``sentence2``,
            ``start1``, ``end1``, ``start2`` and ``end2``.
        max_length: Positive number of tokens every sequence and word mask is
            padded or truncated to. Defaults to 71, the value previously
            hard-coded here.
    """

    def __init__(self, df, max_length=71):
        self.df = df.reset_index(drop=True)
        self.max_length = max_length

    def tokenize(self, text):
        """Tokenize ``text`` into fixed-length PyTorch tensors."""
        return tokenizer(text, return_tensors='pt', padding='max_length',
                         truncation=True, max_length=self.max_length)

    def __len__(self):
        return self.df.shape[0]

    def _sentence_features(self, row, suffix):
        """Build the ids / attention mask / word mask entries for sentence ``suffix``."""
        sentence = row[f'sentence{suffix}']
        encoding = self.tokenize(sentence)
        word_mask = convert_ids_to_mask(sentence, row[f'start{suffix}'], row[f'end{suffix}'],
                                        max_length=self.max_length)
        # Read the fields by key: positional unpacking of `.values()` silently
        # breaks as soon as the tokenizer returns its fields in another order
        # or returns an additional one.
        return {
            f'input_ids_{suffix}': encoding['input_ids'].reshape(-1),
            f'attention_mask_{suffix}': encoding['attention_mask'].reshape(-1),
            f'word{suffix}_mask': torch.tensor(word_mask),
        }

    def __getitem__(self, index):
        row = self.df.iloc[index, :]
        output = {}
        output.update(self._sentence_features(row, 1))
        output.update(self._sentence_features(row, 2))
        # `device` is a notebook-level global; it has to be defined before the
        # first item is requested.
        return {k: v.to(device) for k, v in output.items()}
        

# Wrap the three data frames. The validation split is labelled, so it reuses
# TrainDataset; the test split has no labels to expose.
train_ds = TrainDataset(train)
eval_ds = TrainDataset(val)
test_ds = TestDataset(test)

# Only the training loader shuffles; evaluation and test keep the row order.
BATCH_SIZE = 16
train_dataloader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)
eval_dataloader = DataLoader(dataset=eval_ds, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(dataset=test_ds, batch_size=BATCH_SIZE)

In [12]:
class WiC_Head(torch.nn.Module):
    """
    Word-in-Context head on top of a pretrained encoder.

    original: https://github.com/llightts/CSI5138_Project/blob/master/RoBERTa_WiC_baseline.ipynb

    Logic:
    1. Find the embeddings of the target-word tokens in both sentences
    2. Sum them over all encoder layers
    3. Compute the difference between the two word embeddings
    4. Pass [word1, word2, difference] through two fully connected layers
    5. Get a similarity score in [-1, 1]
    """
    def __init__(self, model, embedding_size = 768):
        """
        Keeps a reference to the provided encoder and adds a two-layer
        classifier on top of the target-word embeddings.

        Args:
            model: Encoder whose output exposes ``hidden_states`` (one tensor
                of shape (batch, seq_len, embedding_size) per layer).
            embedding_size: Hidden size of the encoder.
        """
        super(WiC_Head, self).__init__()
        self.embedding_size = embedding_size
        self.embedder = model
        self.linear_1 = torch.nn.Linear(3*embedding_size, embedding_size, bias = True)
        # Element-wise dropout. Dropout1d would read a 2-D (batch, features)
        # input as (channels, length) and zero out whole samples.
        self.dropout = torch.nn.Dropout(p=0.1)
        self.linear_2 = torch.nn.Linear(embedding_size, 1, bias = True)
        self.activation = torch.nn.ReLU()

    def hinge_loss(self, y_pred, y_true):
        """Mean hinge loss for scores ``y_pred`` against targets ``y_true`` in {-1, 1}."""
        return torch.mean(torch.clamp(1 - y_pred * y_true, min=0))

    def _target_word_embedding(self, input_ids, attention_mask, word_mask):
        """Sum the hidden states of the masked tokens over all layers -> (batch, embedding_size)."""
        encoded = self.embedder(input_ids=input_ids, attention_mask=attention_mask,
                                output_hidden_states=True)
        # Sum over layers first, then over the masked tokens; both steps are
        # linear, so this equals summing the per-layer masked sums.
        layer_sum = torch.stack(tuple(encoded.hidden_states), dim=0).sum(dim=0)
        selector = word_mask.to(layer_sum.dtype).unsqueeze(1)   # (batch, 1, seq_len)
        return torch.bmm(selector, layer_sum).squeeze(1)

    def forward(self, input_ids_1=None, input_ids_2=None, attention_mask_1=None, attention_mask_2=None, 
                labels=None, word1_mask = None, word2_mask = None, **kwargs):
        """
        Score a batch of sentence pairs.

        Args:
            input_ids_1, input_ids_2: Token ids of the two sentences.
            attention_mask_1, attention_mask_2: Matching attention masks.
            labels: Optional targets in {-1, 1}; when given, the loss is returned too.
            word1_mask, word2_mask: (batch, seq_len) masks with 1 on target-word tokens.
            **kwargs: Ignored; lets a whole batch dict be passed with ``**batch``.

        Returns:
            ``logits`` of shape (batch,) with values in [-1, 1], or the tuple
            ``(loss, logits)`` when ``labels`` is given.
        """
        word1_emb = self._target_word_embedding(input_ids_1, attention_mask_1, word1_mask)
        word2_emb = self._target_word_embedding(input_ids_2, attention_mask_2, word2_mask)

        diff = word1_emb - word2_emb

        # Both word embeddings plus their difference feed the classifier.
        word_emb = torch.cat([word1_emb, word2_emb, diff], dim=1)

        layer1_results = self.activation(self.dropout(self.linear_1(word_emb)))
        # No ReLU before tanh: it would confine the score to [0, 1) and make
        # the -1 class of the hinge loss unreachable.
        logits = torch.tanh(self.linear_2(layer1_results)).view(-1)

        outputs = logits
        # Calculate the loss
        if labels is not None:
            # We want separation like an SVM, so use hinge loss
            loss = self.hinge_loss(logits, labels.view(-1))
            outputs = (loss, logits)
        return outputs

In [13]:
# Wrap the pretrained encoder. The head keeps a reference to `model` rather
# than a copy, so training `wic_model` also updates `model`.
wic_model = WiC_Head(model)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# The trailing semicolon hides the module repr that `.to()` would display.
wic_model.to(device);

In [15]:
# Training length: the scheduler needs the total number of optimizer steps.
num_epochs = 2
num_training_steps = num_epochs * len(train_dataloader)

# One optimizer over the encoder and the head together.
optimizer = Adam(wic_model.parameters(), lr=5e-6)

# Linear schedule without warm-up, spanning every training step.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [16]:
def scale(y_pred):
    """Map values from the range [-1, 1] onto [0, 1]: shift by one, then halve."""
    shifted = y_pred + 1
    return shifted / 2

In [17]:
def train_model(train_dataloader, num_epochs):
    """Fine-tune the notebook-level ``wic_model``.

    Uses the notebook globals ``optimizer``, ``lr_scheduler``, ``train_ds`` and
    ``eval_dataloader``. Every 200 batches the current batch loss is printed
    and the accuracy on ``eval_dataloader`` is reported.

    Args:
        train_dataloader: Yields batch dicts that can be passed to
            ``wic_model(**batch)`` and that include ``labels``.
        num_epochs: Number of passes over ``train_dataloader``.
    """
    wic_model.train()
    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1} \n -------------------')
        for n_batch, batch in enumerate(train_dataloader):
            loss, _ = wic_model(**batch)
            # Finish the optimisation step before evaluating, so the training
            # graph is released before the evaluation forward passes start.
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            if n_batch % 200 == 0:
                loss_value, current = loss.item(), n_batch * batch['input_ids_1'].shape[0]
                print(f"Loss train: {loss_value:>7f}  [{current:>5d}/{len(train_ds):>5d}]")
                print('Evaluating...')
                preds, true = test_model(eval_dataloader, eval=True)
                # Bring both predictions and labels from [-1, 1] to [0, 1]
                # and threshold at 0.5.
                preds = scale(preds)
                true = scale(true)
                print(f'Accuracy_score = {accuracy_score(true > 0.5, preds > 0.5):>3f}\n')
                # test_model() switches the model to eval mode; switch back so
                # that dropout stays active for the rest of training.
                wic_model.train()

def test_model(test_dataloader, eval=False):
    """Run the notebook-level ``wic_model`` over a dataloader and collect its scores.

    The model is switched to eval mode and is left in eval mode on return.

    Args:
        test_dataloader: Yields batch dicts that can be passed to ``wic_model(**batch)``.
        eval: When true, every batch must contain ``labels``; they are
            collected and returned as well.

    Returns:
        ``(y_pred, y_true)`` as 1-D float64 NumPy arrays. ``y_true`` is empty
        when ``eval`` is false.
    """
    wic_model.eval()
    pred_chunks = []
    true_chunks = []
    # Inference only: do not build autograd graphs.
    with torch.no_grad():
        for batch in test_dataloader:
            if eval:
                true_chunks.append(batch['labels'].cpu().numpy().reshape(-1))
                _, logits = wic_model(**batch)
            else:
                logits = wic_model(**batch)
            pred_chunks.append(logits.detach().cpu().numpy())
    # Concatenate once at the end instead of re-copying the array per batch.
    # The leading empty float64 array keeps the result float64 and makes an
    # empty dataloader return empty arrays.
    y_pred = np.hstack([np.array([])] + pred_chunks)
    y_true = np.hstack([np.array([])] + true_chunks)
    return y_pred, y_true

In [None]:
# Fine-tune for `num_epochs` epochs; prints the batch loss and the validation
# accuracy every 200 batches.
train_model(train_dataloader, num_epochs)

In [None]:
# Score the unlabelled test split; the second return value (labels) is empty here.
test_logits, _ = test_model(test_dataloader, eval=False)

In [None]:
# A positive score becomes the label "true", anything else "false".
output = ["true" if float(score) > 0 else "false" for score in test_logits]
# One JSON object per prediction; `idx` is the position of the row in the test split.
output = [f'{{"idx": {row_number}, "label": "{label}"}}' for row_number, label in enumerate(output)]

In [None]:
# Write the predictions as JSON Lines. The joined text is a single string, so
# write() is the right call; writelines() would iterate over it character by
# character. The content is ASCII-only, so the explicit encoding does not
# change the bytes written.
with open('RUSSE.jsonl', 'w', encoding='utf-8') as f:
    f.write('\n'.join(output))

In [None]:
# Colab-only: mount Google Drive; the checkpoints below are saved under
# drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# `model` is the same object that WiC_Head holds as its encoder, so this saves
# the encoder in its current state together with the tokenizer files.
# NOTE(review): the path is relative — presumably the working directory is
# /content, where the drive was mounted; confirm.
model.save_pretrained('drive/MyDrive/OTUS/wic')
tokenizer.save_pretrained('drive/MyDrive/OTUS/wic')

('drive/MyDrive/OTUS/wic/tokenizer_config.json',
 'drive/MyDrive/OTUS/wic/special_tokens_map.json',
 'drive/MyDrive/OTUS/wic/vocab.txt',
 'drive/MyDrive/OTUS/wic/added_tokens.json',
 'drive/MyDrive/OTUS/wic/tokenizer.json')

In [None]:
# Save the state dict of the full WiC model (the head's linear layers and,
# presumably, the wrapped encoder as a sub-module — TODO confirm).
torch.save(wic_model.state_dict(), 'drive/MyDrive/OTUS/wic/wic_model.pt')