In [None]:
!pip install transformers

In [None]:
import pandas as pd
import torch
import re
import numpy as np

from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_scheduler

from tqdm import tqdm

from sklearn.metrics import accuracy_score

In [None]:
!wget --no-check-certificate https://russiansuperglue.com/tasks/download/DaNetQA

In [None]:
!unzip DaNetQA -d QA

In [None]:
# Folder produced by `unzip DaNetQA -d QA`. Kept relative to the working directory
# so the read location always matches the extraction target instead of assuming
# the notebook runs from /content.
DATA_DIR = 'QA/DaNetQA'

# Each split is stored as JSON lines: one record per line.
train = pd.read_json(f'{DATA_DIR}/train.jsonl', orient='records', lines=True)
val = pd.read_json(f'{DATA_DIR}/val.jsonl', orient='records', lines=True)
test = pd.read_json(f'{DATA_DIR}/test.jsonl', orient='records', lines=True)

print('Train size:', len(train))
print('Val size:', len(val))
print('Test size:', len(test))
print('\n')
# Class balance of the labelled splits.
print('Train labels counts\n', train['label'].value_counts().to_dict(), '\n')
print('Eval labels counts\n', val['label'].value_counts().to_dict(), '\n')

Train size: 1749
Val size: 821
Test size: 805


Train labels counts
 {True: 1061, False: 688} 

Eval labels counts
 {True: 412, False: 409} 



# Model

In [None]:
# One checkpoint id shared by the tokenizer and the classifier so the two stay in sync.
MODEL_NAME = "ai-forever/ruBert-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Sequence-classification wrapper around the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at ai-forever/ruBert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not ini

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

cuda


In [None]:
# Every training question and passage, each measured on its own.
texts = pd.concat([train['question'], train['passage']])

# Longest tokenized length, counting the `[CLS]` and `[SEP]` special tokens.
max_len = max(
    (len(tokenizer.encode(text, add_special_tokens=True)) for text in texts),
    default=0,
)

print('Max sentence length: ', max_len)  # larger than the maximum possible length

Max sentence length:  745


In [None]:
class TrainDataset(Dataset):
    """Labelled question/passage pairs served as model-ready tensors.

    Relies on the notebook-level ``tokenizer`` and ``device``.
    """

    def __init__(self, df):
        # Keep a positionally indexed copy without the `idx` column.
        self.df = df.drop(columns='idx').reset_index(drop=True)

    def tokenize(self, text1, text2):
        """Encode a text pair as PyTorch tensors padded/truncated to 512 tokens."""
        return tokenizer(
            text1,
            text2,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the encoded pair plus its integer label under the key ``labels``."""
        sample = self.df.iloc[index, :]
        encoded = self.tokenize(sample['question'], sample['passage'])
        encoded.update({'labels': torch.tensor(sample['label'].astype(int))})
        # Flatten every tensor to 1-D and place it on the target device.
        return {name: tensor.reshape(-1).to(device) for name, tensor in encoded.items()}

class TestDataset(Dataset):
    """Unlabelled question/passage pairs served as model-ready tensors.

    Relies on the notebook-level ``tokenizer`` and ``device``.
    """

    def __init__(self, df):
        # Keep a positionally indexed copy without the `idx` column.
        self.df = df.drop(columns='idx').reset_index(drop=True)

    def tokenize(self, text1, text2):
        """Encode a text pair as PyTorch tensors padded/truncated to 512 tokens."""
        return tokenizer(
            text1,
            text2,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the encoded pair; no label is attached."""
        sample = self.df.iloc[index, :]
        encoded = self.tokenize(sample['question'], sample['passage'])
        # Flatten every tensor to 1-D and place it on the target device.
        return {name: tensor.reshape(-1).to(device) for name, tensor in encoded.items()}
        

# Same batch size for all three splits.
BATCH_SIZE = 8

train_ds = TrainDataset(train)
eval_ds = TrainDataset(val)
test_ds = TestDataset(test)

# Only the training loader shuffles; evaluation and test keep the file order.
train_dataloader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
eval_dataloader = DataLoader(eval_ds, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_ds, batch_size=BATCH_SIZE)

In [None]:
num_epochs = 5
# The scheduler is stepped once per batch, so its horizon is batches-per-epoch times epochs.
num_training_steps = len(train_dataloader) * num_epochs

optimizer = Adam(model.parameters(), lr=5e-6)

# Linear schedule with no warm-up steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

In [None]:
def train_model(train_dataloader, num_epochs):
    """Fine-tune the notebook-level ``model`` on ``train_dataloader``.

    Every 50 batches the current batch loss is printed and the model is scored on
    the notebook-level ``eval_dataloader``. Parameter updates use the notebook-level
    ``optimizer`` and ``lr_scheduler``, both stepped once per batch.

    Args:
        train_dataloader: iterable of batches that can be unpacked into ``model(**batch)``
            and that contain ``labels`` so the model returns a loss.
        num_epochs: number of full passes over ``train_dataloader``.
    """
    model.train()
    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1} \n -------------------')
        for n_batch, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            if n_batch % 50 == 0:
                loss_value, current = loss.item(), n_batch * batch['input_ids'].shape[0]
                print(f"Loss train: {loss_value:>7f}  [{current:>5d}/{len(train_ds):>5d}]")
                print('Evaluating...')
                preds, true = test_model(eval_dataloader, eval=True)
                print(f'Accuracy = {accuracy_score(true, preds):>3f}\n')
                # test_model() switches the model to eval mode; switch back so the
                # following batches are trained with dropout enabled again.
                model.train()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

def test_model(test_dataloader, eval=False):
    """Run the notebook-level ``model`` over ``test_dataloader`` and collect predictions.

    Args:
        test_dataloader: iterable of batches that can be unpacked into ``model(**batch)``.
        eval: when True, each batch must contain ``labels``; they are collected and
            returned alongside the predictions.

    Returns:
        ``(y_pred, y_true)`` as 1-D numpy arrays. ``y_pred`` holds the argmax class
        index for every example; ``y_true`` holds the labels when ``eval`` is True
        and is empty otherwise.
    """
    # Remember the caller's mode so evaluating in the middle of training does not
    # silently leave the model in eval mode afterwards.
    was_training = model.training
    model.eval()
    y_pred = np.array([])
    y_true = np.array([])
    try:
        # Inference only: disabling autograd avoids building graphs for every batch.
        with torch.no_grad():
            for batch in test_dataloader:
                if eval:
                    y_true = np.hstack([y_true, batch['labels'].cpu().numpy().reshape(-1)])
                outputs = model(**batch)
                y_pred = np.hstack([y_pred, outputs['logits'].argmax(axis=1).cpu().numpy()])
    finally:
        model.train(was_training)
    return y_pred, y_true

In [None]:
# Launch fine-tuning; loss and validation accuracy are printed every 50 batches.
train_model(train_dataloader, num_epochs)

Epoch 1 
 -------------------
Loss train: 0.669859  [    0/ 1749]
Evaluating...
Accuracy = 0.554202

Loss train: 0.619839  [  400/ 1749]
Evaluating...
Accuracy = 0.510353

Loss train: 0.607884  [  800/ 1749]
Evaluating...
Accuracy = 0.533496

Loss train: 0.529855  [ 1200/ 1749]
Evaluating...
Accuracy = 0.538368

Loss train: 0.690921  [ 1600/ 1749]
Evaluating...
Accuracy = 0.540804

Epoch 2 
 -------------------
Loss train: 0.434038  [    0/ 1749]
Evaluating...
Accuracy = 0.548112

Loss train: 0.425070  [  400/ 1749]
Evaluating...
Accuracy = 0.551766

Loss train: 0.559670  [  800/ 1749]
Evaluating...
Accuracy = 0.550548

Loss train: 0.421098  [ 1200/ 1749]
Evaluating...
Accuracy = 0.551766

Loss train: 0.418260  [ 1600/ 1749]
Evaluating...
Accuracy = 0.573691

Epoch 3 
 -------------------
Loss train: 0.299250  [    0/ 1749]
Evaluating...
Accuracy = 0.583435

Loss train: 0.354687  [  400/ 1749]
Evaluating...
Accuracy = 0.585871

Loss train: 0.441581  [  800/ 1749]
Evaluating...
Accuracy

In [None]:
# Colab-only step: mount Google Drive at /content/drive (later cells write under it).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Model weights and tokenizer files go to the same folder so they can be reloaded together.
SAVE_DIR = 'drive/MyDrive/OTUS/qa'

model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

('drive/MyDrive/OTUS/qa/tokenizer_config.json',
 'drive/MyDrive/OTUS/qa/special_tokens_map.json',
 'drive/MyDrive/OTUS/qa/vocab.txt',
 'drive/MyDrive/OTUS/qa/added_tokens.json',
 'drive/MyDrive/OTUS/qa/tokenizer.json')

In [None]:
# Despite its name, `test_logits` holds argmax class ids, not raw logits.
# The second return value is an empty array because eval=False collects no labels.
test_logits, _ = test_model(test_dataloader, eval=False)

In [None]:
# Class id 1 becomes the string "true"; every other id becomes "false".
labels = ("true" if pred == 1 else "false" for pred in test_logits)
# One JSON object per prediction, numbered by its position in the test set.
output = [f'{{"idx": {n}, "label": "{label}"}}' for n, label in enumerate(labels)]

In [None]:
# Write the predictions as JSON lines. The joined text is written with a single
# write(): writelines() expects an iterable of lines and would iterate the joined
# string one character at a time. The encoding is explicit so the file does not
# depend on the platform default.
with open('DaNetQA.jsonl', 'w', encoding='utf-8') as f:
    f.write('\n'.join(output))