In [None]:
!git clone https://github.com/RussianNLP/RuCoLA.git

In [None]:
!pip install transformers

In [1]:
import pandas as pd
import numpy as np
import torch

from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_scheduler

from tqdm import tqdm

from sklearn.metrics import f1_score

In [5]:
train = pd.read_csv("/content/RuCoLA/data/in_domain_train.csv", usecols=[1,2])
test = pd.read_csv("/content/RuCoLA/data/in_domain_dev.csv", usecols=[1,2])

idx = train.sample(frac=0.8, random_state=123).index
val = train[~train.index.isin(idx)]
train = train[train.index.isin(idx)]

del idx

print('Train size:', len(train))
print('Val size:', len(val))
print('Test size:', len(test))
print('\n')
print('Train labels counts\n', train['acceptable'].value_counts().to_dict(), '\n')
print('Eval labels counts\n', val['acceptable'].value_counts().to_dict(), '\n')
print('Test labels counts\n', test['acceptable'].value_counts().to_dict(), '\n')

Train size: 6295
Val size: 1574
Test size: 983


Train labels counts
 {1: 4704, 0: 1591} 

Eval labels counts
 {1: 1160, 0: 414} 

Test labels counts
 {1: 733, 0: 250} 



# BERT

In [None]:
tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruBert-base")
model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ruBert-base")

In [17]:
max_len = 0

# For every sentence...
for sent in train['sentence']:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  45


In [15]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
model.to(device);

cuda


In [19]:
class EvalDataset(Dataset):
    
    def __init__(self, X):
        self.text = X.reset_index(drop=True)

    def tokenize(self, text):
        return tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=45)

    def __len__(self):
        return self.text.shape[0]

    def __getitem__(self, index):
        output = self.text[index]
        output = self.tokenize(output)
        return {k: v.reshape(-1).to(device) for k, v in output.items()}

class TrainDataset(Dataset):
    
    def __init__(self, X, label):
        self.text = X.reset_index(drop=True)
        self.label = label.reset_index(drop=True)
    
    def tokenize(self, text):
        return tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=45)

    def __len__(self):
        return self.label.shape[0]

    def __getitem__(self, index):
        output = self.text[index]
        output = self.tokenize(output)
        output.update({'labels': torch.tensor(self.label[index])})
        return {k: v.reshape(-1).to(device) for k, v in output.items()}
        

train_ds = TrainDataset(train['sentence'], train['acceptable'])
train_dataloader = DataLoader(train_ds, batch_size=32, shuffle=True)

eval_ds = TrainDataset(val['sentence'], val['acceptable'])
eval_dataloader = DataLoader(eval_ds, batch_size=32)

test_ds = EvalDataset(test['sentence'])
test_dataloader = DataLoader(test_ds, batch_size=32)

In [20]:
optimizer = Adam(model.parameters(), lr=5e-6)

num_epochs = 2
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [21]:
def train_model(train_dataloader, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1} \n -------------------')
        for n_batch, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            if n_batch % 50 == 0:
                loss_value, current = loss.item(), n_batch * batch['input_ids'].shape[0]
                print(f"Loss train: {loss_value:>7f}  [{current:>5d}/{len(train_ds):>5d}]")
                print('Evaluating...')
                preds, true = test_model(eval_dataloader, eval=True)
                print(f'F1-score = {f1_score(preds, true):>3f}\n')
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad() 

def test_model(test_dataloader, eval=False):
    model.eval()
    y_pred = np.array([])
    y_true = np.array([])
    for n_batch, batch in enumerate(test_dataloader):
        if eval:
            y_true = np.hstack([y_true, batch['labels'].cpu().numpy().reshape(-1)])
        outputs = model(**batch)
        y_pred = np.hstack([y_pred, outputs['logits'].argmax(axis=1).detach().cpu().numpy()])
    return y_pred, y_true

In [22]:
train_model(train_dataloader, num_epochs)

Epoch 1 
 -------------------
Loss train: 0.798126  [    0/ 6295]
Evaluating...
F1-score = 0.174248

Loss train: 0.528468  [ 1600/ 6295]
Evaluating...
F1-score = 0.848574

Loss train: 0.456054  [ 3200/ 6295]
Evaluating...
F1-score = 0.847619

Loss train: 0.654314  [ 4800/ 6295]
Evaluating...
F1-score = 0.848863

Epoch 2 
 -------------------
Loss train: 0.429925  [    0/ 6295]
Evaluating...
F1-score = 0.848889

Loss train: 0.675536  [ 1600/ 6295]
Evaluating...
F1-score = 0.851190

Loss train: 0.416684  [ 3200/ 6295]
Evaluating...
F1-score = 0.852324

Loss train: 0.529038  [ 4800/ 6295]
Evaluating...
F1-score = 0.853503



In [23]:
y_pred, _ = test_model(test_dataloader, eval=False)
print(f'F1-score = {f1_score(y_pred, test["acceptable"]):>3f}\n')

F1-score = 0.861945



# RuGPT3

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("ai-forever/rugpt3large_based_on_gpt2")
model = AutoModelForCausalLM.from_pretrained("ai-forever/rugpt3large_based_on_gpt2")

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
model.to(device);

In [35]:
def calc_loss(text):
    with torch.no_grad():
        inputs = tokenizer.encode(text, return_tensors='pt').reshape(-1).to(device)
        loss = model(input_ids=inputs, labels=inputs).loss.item()
        return loss
calc_loss('Предложение корректное?')

5.507739067077637

Zero shot

In [49]:
#zero shot
from tqdm import tqdm
tqdm.pandas()

def shot(start: str, text: str, end: list):
    first = ' '.join([start, text, end[0]])
    second = ' '.join([start, text, end[1]])

    loss_1, loss_2 =  calc_loss(first), calc_loss(second)
    return 1 if loss_1 > loss_2 else 0

y_pred = test['sentence'].progress_apply(lambda x: shot('Проверь корректность предложения.', x, ['Это предложение корректное.', 'Это предложение некорректно']))

100%|██████████| 983/983 [01:42<00:00,  9.55it/s]


In [50]:
print(f'F1-score = {f1_score(y_pred, test["acceptable"]):>3f}\n')

F1-score = 0.518062



In [45]:
y_pred = test['sentence'].progress_apply(lambda x: shot('Если ли здесь ошибка?', x, ['Предложение правильное.', 'Допущена ошибка']))

100%|██████████| 983/983 [01:07<00:00, 14.61it/s]

F1-score = 0.854810






In [46]:
print(f'F1-score = {f1_score(y_pred, test["acceptable"]):>3f}\n') # Какая большая разница с предыдущей затравкой

F1-score = 0.854810



Few shot

In [55]:
# 2 shots
promt = """Проверь корректность предложения:
Вдруг решетка беззвучно поехала в сторону, и на балконе возникла таинственная фигура, прячущаяся от лунного света, и погрозила Ивану пальцем. => Верно
Этим летом не никуда ездили. => Неверно
"""
y_pred = test['sentence'].apply(lambda x: shot(promt, x, ['=> Верно', '=> Неверно']))
print(f'F1-score = {f1_score(y_pred, test["acceptable"]):>3f}\n')

F1-score = 0.802253



In [58]:
# 2 shots. Change promt
promt = """Проверь корректность предложения:
Вдруг решетка беззвучно поехала в сторону, и на балконе возникла таинственная фигура, прячущаяся от лунного света, и погрозила Ивану пальцем. Предложение правильное.
Этим летом не никуда ездили. Допущена ошибка
"""
y_pred = test['sentence'].apply(lambda x: shot(promt, x, ['Предложение правильное.', 'Допущена ошибка']))
print(f'F1-score = {f1_score(y_pred, test["acceptable"]):>3f}\n')

F1-score = 0.849589



In [63]:
# 4 shots
promt = """Проверь корректность предложения:
Вдруг решетка беззвучно поехала в сторону, и на балконе возникла таинственная фигура, прячущаяся от лунного света, и погрозила Ивану пальцем. Предложение правильное.
Этим летом не никуда ездили. Допущена ошибка
На поверку вся теория оказалась полной чепухой. Предложение правильное.
Симптомов болезни не исчезло. Допущена ошибка
"""
y_pred = test['sentence'].apply(lambda x: shot(promt, x, ['Предложение правильное.', 'Допущена ошибка']))
print(f'F1-score = {f1_score(y_pred, test["acceptable"]):>3f}\n')

F1-score = 0.820859



# Т5

In [None]:
!pip uninstall transformers
!pip install --no-cache-dir transformers sentencepiece

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruT5-base", use_fast=False)
model = T5ForConditionalGeneration.from_pretrained("ai-forever/ruT5-base")

In [3]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device);

In [6]:
class EvalDataset(Dataset):
    
    def __init__(self, X):
        self.text = X.reset_index(drop=True)

    def tokenize(self, text):
        return tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=45)

    def __len__(self):
        return self.text.shape[0]

    def __getitem__(self, index):
        output = self.text[index]
        output = self.tokenize(output)
        return {k: v.reshape(-1).to(device) for k, v in output.items()}

class TrainDataset(Dataset):
    
    def __init__(self, X, label):
        self.text = X.reset_index(drop=True)
        self.label = label.reset_index(drop=True)
    
    def tokenize(self, text, length=45):
        return tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=length)

    def __len__(self):
        return self.label.shape[0]

    def __getitem__(self, index):
        output = self.text[index]
        output = self.tokenize(output)
        output = {k: v.reshape(-1).to(device) for k, v in output.items()}

        label = 'верно' if self.label[index] == 1 else 'неверно'
        label = self.tokenize(label, length=2).input_ids.reshape(-1).to(device)

        output.update({'labels': label})
        return output
        
train_ds = TrainDataset(train['sentence'], train['acceptable'])
train_dataloader = DataLoader(train_ds, batch_size=32, shuffle=True)

eval_ds = TrainDataset(val['sentence'], val['acceptable'])
eval_dataloader = DataLoader(eval_ds, batch_size=32)

test_ds = EvalDataset(test['sentence'])
test_dataloader = DataLoader(test_ds, batch_size=32)

In [7]:
optimizer = Adam(model.parameters(), lr=1e-5)

num_epochs = 2
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [8]:
def train_model(train_dataloader, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1} \n -------------------')
        for n_batch, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            if n_batch % 50 == 0:
                loss_train, current = loss.item(), n_batch * batch['input_ids'].shape[0]
                print(f"Loss train: {loss_train:>7f}  [{current:>5d}/{len(train_ds):>5d}]")
                print('Evaluating...')
                loss_val, _ = test_model(eval_dataloader, eval=True)
                print(f"Loss test: {loss_val:>7f}\n")
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad() 

def test_model(test_dataloader, eval=False):
    model.eval()
    y_pred = np.array([])
    y_true = np.array([])
    loss = []
    for n_batch, batch in enumerate(test_dataloader):
        if not eval:
            gen_tok = model.generate(**batch)
            gen_tok = [1 if 2937 in i else 0 for i in gen_tok]  # tokenizer.decode(2937) == 'верно'
            y_true = np.hstack([y_true, gen_tok])
        else:
            outputs = model(**batch)
            loss.append(outputs.loss.item())
    if not eval:
        return y_true
    else:
        return np.sum(loss)/len(loss), y_true

In [11]:
train_model(train_dataloader, num_epochs)

Epoch 1 
 -------------------
Loss train: 13.994221  [    0/ 6295]
Evaluating...
Loss test: 16.116276

Loss train: 0.341062  [ 1600/ 6295]
Evaluating...
Loss test: 0.341219

Loss train: 0.284652  [ 3200/ 6295]
Evaluating...
Loss test: 0.312012

Loss train: 0.312784  [ 4800/ 6295]
Evaluating...
Loss test: 0.305643

Epoch 2 
 -------------------
Loss train: 0.296331  [    0/ 6295]
Evaluating...
Loss test: 0.289475

Loss train: 0.234875  [ 1600/ 6295]
Evaluating...
Loss test: 0.290852

Loss train: 0.268220  [ 3200/ 6295]
Evaluating...
Loss test: 0.296519

Loss train: 0.258969  [ 4800/ 6295]
Evaluating...
Loss test: 0.287277



In [12]:
y_pred = test_model(test_dataloader, eval=False)
print(f'F1-score = {f1_score(y_pred, test["acceptable"]):>3f}\n')



F1-score = 0.854427



<table>
    <thead>
        <tr>
            <td>Model</td>
            <td>F1 score</td>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>ruBERT-base</td>
            <td>0.86</td>
        </tr>
        <tr>
            <td>ruGPT3-large best zero shot</td>
            <td>0.85</td>
        </tr>
        <tr>
            <td>ruGPT3-large best few shot</td>
            <td>0.85</td>
        </tr>
        <tr>
            <td>ruT5-base</td>
            <td>0.85</td>
        </tr>
    </tbody>
</table>


Модели показали схожий результат, однако я бы отдал предпочтение методу few-shot, потому что там почти ничего обучать не надо)