In [1]:
!pip install jsonlines
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Загрузка данных

In [2]:
!wget https://raw.githubusercontent.com/Xeanst/Grammaticality-judgements-with-neural-language-models/main/data/train_data.jsonl
!wget https://raw.githubusercontent.com/Xeanst/Grammaticality-judgements-with-neural-language-models/main/data/val_data.jsonl
!wget https://raw.githubusercontent.com/Xeanst/Grammaticality-judgements-with-neural-language-models/main/data/test_data.jsonl

--2022-10-05 11:18:52--  https://raw.githubusercontent.com/Xeanst/Grammaticality-judgements-with-neural-language-models/main/data/train_data.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 65238 (64K) [text/plain]
Saving to: ‘train_data.jsonl.1’


2022-10-05 11:18:52 (7.93 MB/s) - ‘train_data.jsonl.1’ saved [65238/65238]

--2022-10-05 11:18:52--  https://raw.githubusercontent.com/Xeanst/Grammaticality-judgements-with-neural-language-models/main/data/val_data.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16493 (16K) [

In [3]:
import jsonlines

def _read_jsonl(path):
    """Read every record of a JSON Lines file into a list."""
    with jsonlines.open(path, "r") as reader:
        return list(reader)


# The validation split is kept under the name `dev_data`.
train_data = _read_jsonl("train_data.jsonl")
dev_data = _read_jsonl("val_data.jsonl")
test_data = _read_jsonl("test_data.jsonl")

In [4]:
# Inspect one training record: it carries the fields 'sentence', 'idx' and 'label'.
train_data[10].items()

dict_items([('sentence', 'банкир предполагал что писатель были пожилыми'), ('idx', 0), ('label', 'chto_anim')])

In [5]:
# Inspect the next training record for comparison.
train_data[11].items()

dict_items([('sentence', 'фермеры возле охранника был высоким'), ('idx', 0), ('label', 'pp_anim')])

In [6]:
from transformers import AutoTokenizer

# Load the tokenizer of the "blinoff/roberta-base-russian-v0" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("blinoff/roberta-base-russian-v0")
# Show the raw fields of one training record.
for key, value in train_data[10].items():
    print(key, value)
# Tokenize the sentence and map the ids back to token strings for inspection.
tokenized_data = tokenizer(train_data[10]["sentence"])
token_indexes = tokenized_data["input_ids"]
tokens = tokenizer.convert_ids_to_tokens(token_indexes)
print("")
# Print the "id:token" pairs on a single line.
for index, token in zip(token_indexes, tokens):
    print(f"{index}:{token}", end=" ")

Downloading:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

sentence банкир предполагал что писатель были пожилыми
idx 0
label chto_anim

0:<s> 606:Ð±Ð° 1740:Ð½ÐºÐ¸ 267:ÑĢ 25896:ĠÐ¿ÑĢÐµÐ´Ð¿Ð¾Ð»Ð°Ð³Ð°Ð» 392:ĠÑĩÑĤÐ¾ 6592:ĠÐ¿Ð¸ÑģÐ°ÑĤÐµÐ»ÑĮ 873:ĠÐ±ÑĭÐ»Ð¸ 4617:ĠÐ¿Ð¾Ð¶Ð¸ 4839:Ð»ÑĭÐ¼Ð¸ 2:</s> 

 Напишем соответствующий `Dataset`

In [7]:
from torch.utils.data.dataset import Dataset

class GramDataset(Dataset):
    """Map-style dataset that tokenizes sentences on access.

    Every record of ``data`` is a dict with a ``"sentence"`` string and,
    optionally, an ``"idx"`` value (presumably the acceptability label of the
    sentence -- confirm against the data description).

    Parameters:
        data: sequence of record dicts.
        tokenizer: callable mapping a sentence to a mapping with ``"input_ids"``.
        pos_label: value of ``"idx"`` that is encoded as the target ``1.0``.
    """

    def __init__(self, data, tokenizer, pos_label=1):
        self.data, self.tokenizer, self.pos_label = data, tokenizer, pos_label

    def __len__(self):
        """Number of records."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``{"input_ids": ...}`` plus a float target ``"y"`` when the record has ``"idx"``."""
        record = self.data[index]
        encoded = self.tokenizer(record["sentence"])
        sample = {"input_ids": encoded["input_ids"]}
        if "idx" not in record:
            # Unlabelled record: no target is produced.
            return sample
        sample["y"] = float(record["idx"] == self.pos_label)
        return sample

In [8]:
# Wrap the training records and show the fields of one prepared sample.
train_dataset = GramDataset(train_data, tokenizer)
for key, value in train_dataset[2].items():
    print(key, value, sep="\n")

input_ids
[0, 10228, 1163, 27555, 47634, 5662, 8128, 2]
y
0.0


In [9]:
import torch
import numpy as np
import itertools

def pad_tensor(vec, length, dim, pad_symbol):
    """Pad ``vec`` along dimension ``dim`` up to ``length`` entries.

    Example: for ``vec.shape == [3, 4, 5]``, ``length=7`` and ``dim=1`` a block
    of shape ``(3, 7-4, 5)`` filled with ``pad_symbol`` is appended, giving a
    result of shape ``[3, 7, 5]``.

    Parameters:
        vec: tensor to pad.
        length: target size of dimension ``dim``.
        dim: dimension along which the padding is appended.
        pad_symbol: fill value of the padding.

    Returns:
        A tensor with the dtype and device of ``vec``.
    """
    pad_size = list(vec.shape)
    pad_size[dim] = length - vec.shape[dim]
    # new_full allocates the padding with vec's own dtype and device, so the
    # concatenation neither changes the dtype nor fails for non-CPU tensors.
    padding = vec.new_full(pad_size, pad_symbol)
    return torch.cat([vec, padding], dim=dim)

def pad_tensors(tensors, pad=0):
    """Collate a non-empty list of per-sample values into one tensor.

    * integers -> 1-D ``LongTensor``;
    * floats -> 1-D float tensor;
    * sequences -> each is converted to a ``LongTensor``, padded with ``pad``
      along dimension 0 to the longest one and stacked along a new first
      dimension.

    The kind of the values is decided from the first element only.
    """
    first = tensors[0]
    if isinstance(first, (int, np.integer)):
        return torch.LongTensor(tensors)
    # np.float was a deprecated alias of the builtin float and no longer exists
    # in recent NumPy; np.floating covers the NumPy floating scalar types.
    if isinstance(first, (float, np.floating)):
        return torch.Tensor(tensors)
    tensors = [torch.LongTensor(tensor) for tensor in tensors]
    max_length = max(tensor.shape[0] for tensor in tensors)
    padded = [pad_tensor(tensor, max_length, dim=0, pad_symbol=pad) for tensor in tensors]
    return torch.stack(padded, dim=0)

class FieldBatchDataLoader:
    """Batch iterator over a dataset whose items are dicts of fields.

    Each batch is a dict holding, for every field of the samples, the values of
    that field collated with ``pad_tensors`` and moved to ``device``, plus an
    ``"indexes"`` entry listing the dataset positions of the samples.

    Parameters:
        X: indexable dataset with ``len``; every item is a dict.
        batch_size: maximum number of samples per batch.
        sort_by_length: if true, samples are sorted by length, cut into
            consecutive batches, and the batches (except the last one) are
            shuffled; otherwise the individual samples are shuffled.
        length_field: field whose ``len`` defines the length of a sample;
            ``None`` means the first field of the sample.
        state: seed passed to ``np.random.seed``.
        device: device the collated tensors are moved to.
    """

    def __init__(self, X, batch_size=32, sort_by_length=True,
                 length_field=None, state=115, device="cpu"):
        self.X = X
        self.batch_size = batch_size
        self.sort_by_length = sort_by_length
        self.length_field = length_field
        self.device = device
        # NOTE: this reseeds NumPy's global random generator.
        np.random.seed(state)

    def __len__(self):
        """Number of batches per pass (ceil of len(X) / batch_size)."""
        return (len(self.X)-1) // self.batch_size + 1

    def __iter__(self):
        """Draw a new sample order and restart the iteration."""
        if self.sort_by_length:
            # Sort sample indexes by length: [1, ..., 32] -> [7, 4, 15, ...]
            if self.length_field is not None:
                lengths = [len(x[self.length_field]) for x in self.X]
            else:
                lengths = [len(list(x.values())[0]) for x in self.X]
            order = np.argsort(lengths)
            # Group into batches: [7, 4, 15, 31, 3, ...] -> [[7, 4, 15, 31], [3, ...], ...]
            # A plain list is used because the last batch may be shorter and
            # NumPy cannot build a regular array from ragged rows.
            batched_order = [order[start:start+self.batch_size]
                             for start in range(0, len(self.X), self.batch_size)]
            # Shuffle the batches, keeping the last (possibly shorter) one last.
            head = batched_order[:-1]
            np.random.shuffle(head)
            batched_order[:-1] = head
            # Flatten back to a sequence of sample indexes: -> [3, 11, 21, 19, 27, ...]
            self.order = np.fromiter(itertools.chain.from_iterable(batched_order), dtype=int)
        else:
            self.order = np.arange(len(self.X))
            np.random.shuffle(self.order)
        self.idx = 0
        return self

    def __next__(self):
        """Return the next batch dict; raise ``StopIteration`` when exhausted."""
        if self.idx >= len(self.X):
            raise StopIteration()
        end = min(self.idx + self.batch_size, len(self.X))
        indexes = [self.order[i] for i in range(self.idx, end)]
        # Fetch every sample once; the dataset may do work on each access.
        samples = [self.X[i] for i in indexes]
        batch = dict()
        # Collate all fields; field names are taken from the first sample.
        for field in samples[0]:
            batch[field] = pad_tensors([sample[field] for sample in samples]).to(self.device)
        batch["indexes"] = indexes
        self.idx = end
        return batch

In [10]:
# Draw ten batches of eight samples and print the shape of every batch field.
# The batches are moved to "cuda", so this cell needs a CUDA device.
train_dataloader = iter(FieldBatchDataLoader(train_dataset, batch_size=8, device="cuda"))
for i in range(10):
    batch = next(train_dataloader)
    for field, data in batch.items():
        print(f"{field}:{np.shape(data)}", end="\t")
    print("")

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


input_ids:torch.Size([8, 10])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 5])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 8])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 12])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 10])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 9])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 9])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 7])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 9])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 8])	y:torch.Size([8])	indexes:(8,)	


## Создание и обучение модели

In [11]:
from transformers import AutoModel


# Load the pretrained encoder (same checkpoint as the tokenizer) and move it to the GPU.
roberta_model = AutoModel.from_pretrained("blinoff/roberta-base-russian-v0").to("cuda")

Downloading:   0%|          | 0.00/500M [00:00<?, ?B/s]

Some weights of the model checkpoint at blinoff/roberta-base-russian-v0 were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Run the encoder on the last batch drawn above and show the shapes of its two outputs.
answer = roberta_model(batch['input_ids'])
print(answer["last_hidden_state"].shape)
print(answer["pooler_output"].shape)

torch.Size([8, 8, 768])
torch.Size([8, 768])


Напишем класс-обёртку для классификации

In [13]:
import torch.nn as nn
from transformers.optimization import AdamW

class BasicTransformersClassificationModel(nn.Module):
    """Base class of classifiers built on top of a pretrained encoder.

    Subclasses provide ``build_network`` (creates the task-specific layers) and
    ``forward``. This class sets up the output activation, the loss and the
    optimizer, and implements one training / validation step.

    Parameters:
        model: encoder module; ``model.config.hidden_size`` must be available.
        labels_number: number of outputs; ``1`` selects sigmoid + ``BCELoss``,
            larger values select log-softmax + ``NLLLoss``.
        lr: learning rate of the AdamW optimizer.
        device: device the module is moved to; ``None`` leaves it where it is.
    """

    def __init__(self, model, labels_number, lr=1e-5, device="cpu", **kwargs):
        super().__init__()
        self.model = model
        self.labels_number = labels_number
        self.build_network(labels_number)
        # Output activation and matching loss. The attribute is called
        # `log_softmax` in both cases, although it is a sigmoid for one output.
        multiclass = self.labels_number > 1
        self.log_softmax = nn.LogSoftmax(dim=-1) if multiclass else nn.Sigmoid()
        self.criterion = (nn.NLLLoss(reduction="mean") if multiclass
                          else nn.BCELoss(reduction="mean"))
        self.device = device
        if self.device is not None:
            self.to(self.device)
        # The optimizer is created after the move so it sees the final parameters.
        self.optimizer = AdamW(self.parameters(), lr=lr, weight_decay=0.01)

    @property
    def hidden_size(self):
        """Hidden size reported by the encoder configuration."""
        return self.model.config.hidden_size

    def forward(self, input_ids, **kwargs):
        """Abstract forward pass; must be overridden."""
        raise NotImplementedError("You should implement forward pass in your derived class.") 

    def train_on_batch(self, x, y):
        """Do one optimization step on batch ``x`` with targets ``y``; return the step output."""
        self.train()
        self.optimizer.zero_grad()
        step_output = self._validate(x, y)
        step_output["loss"].backward()
        self.optimizer.step()
        return step_output

    def validate_on_batch(self, x, y):
        """Evaluate batch ``x`` against ``y`` in eval mode without gradients."""
        self.eval()
        with torch.no_grad():
            step_output = self._validate(x, y)
        return step_output

    def _validate(self, x, y):
        """Compute the loss and the predicted labels.

        ``x`` is a dict passed to ``forward`` as keyword arguments.
        Returns ``{"loss": tensor, "labels": tensor}``.
        """
        targets = y if self.device is None else y.to(self.device)
        scores = self(**x)  # same as self.forward(**x), routed through __call__
        batch_loss = self.criterion(scores, targets)
        if self.labels_number > 1:
            predicted = torch.max(scores, dim=-1)[1]
        else:
            predicted = (scores >= 0.5).int()
        return {"loss": batch_loss, "labels": predicted}

class TransformersClassificationModel(BasicTransformersClassificationModel):
    """Classifier applying a linear layer to the encoder's pooled output."""

    def build_network(self, labels_number):
        """Create the projection from the encoder hidden size to the outputs."""
        self.proj_layer = torch.nn.Linear(self.hidden_size, self.labels_number)
        return self

    def forward(self, input_ids, attention_mask=None, **kwargs):
        """Score a batch of token id sequences.

        Parameters:
            input_ids: tensor of token ids.
            attention_mask: optional mask forwarded to the encoder so that
                padded positions can be excluded from attention. When omitted
                the encoder is called without a mask, as before.
            **kwargs: other batch fields (e.g. targets, indexes); ignored.

        Returns:
            For one output, the sigmoid scores with the last dimension dropped;
            otherwise the per-class log-probabilities.
        """
        input_ids = input_ids.to(self.device)
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.device)
        cls_output = self.model(input_ids, attention_mask=attention_mask)["pooler_output"]
        logits = self.proj_layer(cls_output)
        log_probs = self.log_softmax(logits)
        if self.labels_number == 1:
            # Drop the singleton output dimension so the shape matches the targets.
            log_probs = log_probs[...,0]
        return log_probs


In [14]:
# Build a single-output (binary) classifier around the encoder and list its
# parameters with their device and shape.
model = TransformersClassificationModel(
    roberta_model, labels_number=1, device="cuda"
)
for name, elem in model.named_parameters():
    print(name, elem.device, elem.shape)

model.embeddings.word_embeddings.weight cuda:0 torch.Size([50021, 768])
model.embeddings.position_embeddings.weight cuda:0 torch.Size([514, 768])
model.embeddings.token_type_embeddings.weight cuda:0 torch.Size([1, 768])
model.embeddings.LayerNorm.weight cuda:0 torch.Size([768])
model.embeddings.LayerNorm.bias cuda:0 torch.Size([768])
model.encoder.layer.0.attention.self.query.weight cuda:0 torch.Size([768, 768])
model.encoder.layer.0.attention.self.query.bias cuda:0 torch.Size([768])
model.encoder.layer.0.attention.self.key.weight cuda:0 torch.Size([768, 768])
model.encoder.layer.0.attention.self.key.bias cuda:0 torch.Size([768])
model.encoder.layer.0.attention.self.value.weight cuda:0 torch.Size([768, 768])
model.encoder.layer.0.attention.self.value.bias cuda:0 torch.Size([768])
model.encoder.layer.0.attention.output.dense.weight cuda:0 torch.Size([768, 768])
model.encoder.layer.0.attention.output.dense.bias cuda:0 torch.Size([768])
model.encoder.layer.0.attention.output.LayerNorm.wei



In [15]:
# Sanity check: train repeatedly on one and the same batch and watch the loss.
train_dataloader = iter(FieldBatchDataLoader(train_dataset, batch_size=8, device="cuda"))
batch = next(train_dataloader)
labels = batch["y"]
print(labels)
for i in range(100):
    loss = model.train_on_batch(batch, labels)["loss"].item()
    # Report the first five steps and then every tenth step.
    if i < 5 or (i+1) % 10 == 0:
        print(i, loss)
# Loss on the same batch in evaluation mode.
print(model.validate_on_batch(batch, labels)["loss"].item())

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


tensor([1., 0., 0., 1., 1., 0., 0., 1.], device='cuda:0')
0 0.7491636276245117
1 0.6771023273468018
2 0.6412955522537231
3 0.5781126618385315
4 0.562039315700531
9 0.3212659955024719
19 0.0310315303504467
29 0.010011037811636925
39 0.005983233451843262
49 0.004027879796922207
59 0.003377116285264492
69 0.0028777639381587505
79 0.0024890436325222254
89 0.0023962766863405704
99 0.0021653794683516026
0.0016025621443986893


## Обучение классификационной модели

In [16]:
def update_metrics(metrics, batch_output, batch_labels):
    """Fold the results of one batch into the running ``metrics`` dict (in place).

    Parameters:
        metrics: dict with the counters "loss", "n_batches", "correct",
            "total", "TP", "FP" and "FN"; the derived entries "accuracy", "P",
            "R" and "F1" are (re)written on every call.
        batch_output: dict with a scalar tensor "loss" and a tensor "labels"
            of predicted labels.
        batch_labels: tensor of 0/1 gold labels.
    """
    # Running mean of the batch losses.
    seen = metrics["n_batches"]
    metrics["loss"] = (metrics["loss"] * seen + batch_output["loss"].item()) / (seen + 1)
    metrics["n_batches"] += 1

    hits = (batch_output["labels"] == batch_labels).float()
    misses = 1.0 - hits
    per_item = {
        "TP": hits * batch_labels,          # correct and gold label is 1
        "FP": misses * (1-batch_labels),    # wrong and gold label is 0
        "FN": misses * batch_labels,        # wrong and gold label is 1
    }
    metrics["correct"] += hits.sum().item()
    metrics["total"] += hits.shape[0]
    for name, flags in per_item.items():
        metrics[name] += flags.sum().item()

    # Derived scores; max(..., 1) guards against division by zero.
    tp, fp, fn = metrics["TP"], metrics["FP"], metrics["FN"]
    metrics["accuracy"] = metrics["correct"] / max(metrics["total"], 1)
    metrics["P"] = tp / max(tp+fp, 1)
    metrics["R"] = tp / max(tp+fn, 1)
    metrics["F1"] = tp / max(tp+0.5*fn+0.5*fp, 1)

In [17]:
import tqdm

def do_epoch(model, dataloader, mode="validate", epoch=1):
    """Run one pass over ``dataloader`` and return the accumulated metrics.

    Parameters:
        model: object providing ``train_on_batch`` and ``validate_on_batch``.
        dataloader: iterable of batch dicts that contain the targets under "y".
        mode: "train" updates the model on every batch; any other value only
            evaluates it.
        epoch: label shown in the progress bar description.

    Returns:
        The metrics dict filled by ``update_metrics``.
    """
    # Import the progress bar explicitly: a bare `import tqdm` does not load the
    # `tqdm.notebook` submodule, so `tqdm.notebook.tqdm` only resolves if some
    # other package happened to import it first. `tqdm.auto` selects the
    # notebook widget when available and the console bar otherwise.
    from tqdm.auto import tqdm as auto_tqdm

    metrics = {"correct": 0, "total": 0, "loss": 0.0, "n_batches": 0,
               "TP": 0, "FP": 0, "FN": 0}
    func = model.train_on_batch if mode == "train" else model.validate_on_batch
    progress_bar = auto_tqdm(dataloader, leave=True)
    progress_bar.set_description(f"{mode}, epoch={epoch}")
    for batch in progress_bar:
        batch_answers = batch["y"]
        batch_output = func(batch, batch_answers)
        update_metrics(metrics, batch_output, batch_answers)
        # Show the running loss and the scores (in percent) next to the bar.
        postfix = {"loss": round(metrics["loss"], 4), "acc": round(100 * metrics["accuracy"], 2)}
        for key in ["P", "R", "F1"]:
            postfix[key] = round(100 * metrics[key], 2)
        progress_bar.set_postfix(postfix)
    return metrics

In [18]:
# Datasets and loaders for the training and validation splits.
train_dataset = GramDataset(train_data, tokenizer)
dev_dataset = GramDataset(dev_data, tokenizer)
train_dataloader = iter(FieldBatchDataLoader(train_dataset, batch_size=8, device="cuda"))
dev_dataloader = iter(FieldBatchDataLoader(dev_dataset, batch_size=8, device="cuda"))

# Reload the pretrained encoder so training does not start from the weights
# changed by the single-batch experiment above.
roberta_model = AutoModel.from_pretrained("blinoff/roberta-base-russian-v0").to("cuda")
model = TransformersClassificationModel(roberta_model, labels_number=1, device="cuda")
best_val_acc = 0.0
checkpoint = "checkpoint_best.pt"
# Train for five epochs, saving the weights whenever validation accuracy improves.
# NOTE(review): if no epoch exceeds 0.0 accuracy, no checkpoint is written in this
# run and the load below reads whatever file (if any) already exists.
for epoch in range(5):
    do_epoch(model, train_dataloader, mode="train", epoch=epoch+1)
    epoch_metrics = do_epoch(model, dev_dataloader, mode="validate", epoch=epoch+1)
    if epoch_metrics["accuracy"] > best_val_acc:
        best_val_acc = epoch_metrics["accuracy"]
        torch.save(model.state_dict(), checkpoint)
        # print("Saving ")
# Restore the best weights and report their validation metrics.
model.load_state_dict(torch.load(checkpoint))
do_epoch(model, dev_dataloader, mode="validate", epoch="evaluate")

Some weights of the model checkpoint at blinoff/roberta-base-russian-v0 were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/70 [00:00<?, ?it/s]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

{'correct': 125.0,
 'total': 140,
 'loss': 0.28905536969088846,
 'n_batches': 18,
 'TP': 63.0,
 'FP': 5.0,
 'FN': 10.0,
 'accuracy': 0.8928571428571429,
 'P': 0.9264705882352942,
 'R': 0.863013698630137,
 'F1': 0.8936170212765957}

In [19]:
def predict_with_model(model, X, classes):
    """Predict a class for every item of ``X``.

    Parameters:
        model: classifier with a ``device`` attribute; called as ``model(**batch)``.
        X: dataset accepted by ``FieldBatchDataLoader``.
        classes: sequence mapping a predicted label index to the returned value.

    Returns:
        A list aligned with ``X`` holding ``classes[label]`` for every item.
    """
    model.eval()
    dataloader = FieldBatchDataLoader(X, device=model.device)
    answer = [None] * len(X)
    for batch in dataloader:
        with torch.no_grad():
            batch_answer = model(**batch)
        # Same decision rule as the model's own validation step: a score per
        # class -> index of the best class; a single score -> threshold at 0.5.
        if batch_answer.dim() > 1:
            labels = torch.max(batch_answer, dim=-1)[1]
        else:
            labels = (batch_answer >= 0.5).int()
        # Batches arrive in shuffled order; "indexes" maps them back to X.
        for i, label in zip(batch["indexes"], labels.cpu().numpy()):
            answer[i] = classes[int(label)]
    return answer

In [20]:
# Predict a 0/1 class for every test sentence with the restored model.
test_dataset = GramDataset(test_data, tokenizer)
predicted = predict_with_model(model, test_dataset, classes=[0, 1])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [21]:
# List every test sentence whose predicted class differs from its gold "idx".
# Iterating over the data itself avoids hard-coding the size of the test set.
for example, prediction in zip(test_data, predicted):
  if example["idx"] != prediction:
    print(example["sentence"], example["label"], example["idx"], prediction)

спектакли был новым intr_inan 0 1
чиновники игнорировал секретарей tr_direct_anim 0 1
механики знали что спектакль веселил chto_inan 1 0
чиновники игнорировали офицера tr_direct_anim 1 0
рассказ обожал чиновники tr_reverse_inan 0 1
студенты обожал секретарей tr_direct_anim 0 1
спектакли от офицеров веселили pp_inan_train 1 0
чиновник игнорировали рассказ tr_direct_inan 0 1
студенты обожал спектакль tr_direct_inan 0 1
механик знал что чиновники был молодым chto_anim 0 1
чиновники игнорировали офицеров tr_direct_anim 1 0
чиновник которого обожали офицеры был молодым obj_rel_anim 1 0
механик знал что рассказы веселили chto_inan 1 0
студенты игнорировали спектакль tr_direct_inan 1 0
спектакли от секретарей были новыми pp_inan_train 1 0
студенты которых обожал секретарь обедали obj_rel_anim 1 0
студенты которые обожали рассказ были молодыми subj_rel_inan 1 0
студенты которые игнорировали офицера были молодыми subj_rel_anim 1 0
секретарей обожал студенты tr_reverse_anim 0 1
чиновники которые

In [22]:
# Gold labels of the test set, in dataset order.
correct = [example["idx"] for example in test_data]

In [23]:
from sklearn.metrics import accuracy_score
# Share of test sentences classified correctly (gold labels first, predictions second).
accuracy_score(correct, predicted)

0.7571428571428571

In [24]:
from sklearn.metrics import matthews_corrcoef
# Matthews correlation coefficient between gold and predicted test labels.
matthews_corrcoef(correct, predicted, sample_weight=None)

0.5157018453003511