In [1]:
!pip install jsonlines
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jsonlines
  Downloading jsonlines-3.0.0-py3-none-any.whl (8.5 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.0.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 4.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 14.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux20

## Загрузка данных

In [2]:
!wget https://raw.githubusercontent.com/Xeanst/grammaticality_judgments/main/train_data.jsonl
!wget https://raw.githubusercontent.com/Xeanst/grammaticality_judgments/main/val_data.jsonl
!wget https://raw.githubusercontent.com/Xeanst/grammaticality_judgments/main/test_data.jsonl

--2022-06-20 13:29:58--  https://raw.githubusercontent.com/Xeanst/grammaticality_judgments/main/train_data.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13342932 (13M) [text/plain]
Saving to: ‘train_data.jsonl’


2022-06-20 13:30:00 (106 MB/s) - ‘train_data.jsonl’ saved [13342932/13342932]

--2022-06-20 13:30:00--  https://raw.githubusercontent.com/Xeanst/grammaticality_judgments/main/val_data.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2859307 (2.7M) [text/plain]
Saving to: ‘val_data.jsonl’


2022-06-20 13:30:00

In [3]:
import jsonlines

with jsonlines.open("train_data.jsonl", "r") as fin:
    train_data = list(fin)
with jsonlines.open("val_data.jsonl", "r") as fin:
    dev_data = list(fin)
with jsonlines.open("test_data.jsonl", "r") as fin:
    test_data = list(fin)

In [4]:
train_data[10].items()

dict_items([('sentence', 'чертежи которые презирал родитель был плохим'), ('idx', 0), ('label', 'obj_rel_inan')])

In [5]:
train_data[11].items()

dict_items([('sentence', 'руководители которых знал ассистент рассуждал'), ('idx', 0), ('label', 'obj_rel_anim')])

In [6]:
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

for key, value in train_data[10].items():
    print(key, value)
tokenized_data = tokenizer(train_data[10]["sentence"])
token_indexes = tokenized_data["input_ids"]
tokens = tokenizer.convert_ids_to_tokens(token_indexes)
print("")
for index, token in zip(token_indexes, tokens):
    print(f"{index}:{token}", end=" ")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

sentence чертежи которые презирал родитель был плохим
idx 0
label obj_rel_inan

101:[CLS] 14816:че 92965:##рте 14974:##жи 14028:которые 12112:през 25667:##ира 10517:##л 17517:род 49280:##итель 10702:был 556:п 11602:##ло 80517:##хим 102:[SEP] 

 Напишем соответствующий `Dataset`

In [7]:
from torch.utils.data.dataset import Dataset

class GramDataset(Dataset):

    def __init__(self, data, tokenizer, pos_label=1):
        self.data = data
        self.tokenizer = tokenizer
        self.pos_label = pos_label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        item = self.data[index]
        answer = {"input_ids": self.tokenizer(item["sentence"])["input_ids"]}
        if "idx" in item:
            answer["y"] = float(item["idx"] == self.pos_label)
        return answer 

In [8]:
train_dataset = GramDataset(train_data, tokenizer)
for key, value in train_dataset[2].items():
    print(key, value, sep="\n")

input_ids
[101, 24933, 21859, 548, 25829, 561, 11977, 33930, 10292, 102]
y
0.0


In [9]:
import torch
import numpy as np
import itertools

def pad_tensor(vec, length, dim, pad_symbol):
    # vec.shape = [3, 4, 5]
    # length=7, dim=1 -> pad_size = (3, 7-4, 5)
    pad_size = list(vec.shape)
    pad_size[dim] = length - vec.shape[dim]
    answer = torch.cat([vec, torch.ones(*pad_size, dtype=torch.long) * pad_symbol], axis=dim)
    return answer

def pad_tensors(tensors, pad=0):
    # дополняет тензоры из tensors до общей максимальной длины символом pad
    if isinstance(tensors[0], (int, np.integer)):
        return torch.LongTensor(tensors)
    elif isinstance(tensors[0], (float, np.float)):
        return torch.Tensor(tensors)
    tensors = [torch.LongTensor(tensor) for tensor in tensors]
    L = max(tensor.shape[0] for tensor in tensors)
    tensors = [pad_tensor(tensor, L, dim=0, pad_symbol=pad) for tensor in tensors]
    return torch.stack(tensors, axis=0)

class FieldBatchDataLoader:

    def __init__(self, X, batch_size=32, sort_by_length=True, 
                 length_field=None, state=115, device="cpu"):
        self.X = X
        self.batch_size = batch_size
        self.sort_by_length = sort_by_length
        self.length_field = length_field  ## добавилось
        self.device = device
        np.random.seed(state)

    def __len__(self):
        return (len(self.X)-1) // self.batch_size + 1 

    def __iter__(self):
        if self.sort_by_length:
            # отсортировать индексы по длине объектов [1, ..., 32] -> [7, 4, 15, ...]
            # изменилось взятие длины из поля
            if self.length_field is not None:
                lengths = [len(x[self.length_field]) for x in self.X]
            else:
                lengths = [len(list(x.values())[0]) for x in self.X]
            order = np.argsort(lengths)
            # сгруппировать в батчи [7, 4, 15, 31, 3, ...] -> [[7, 4, 15, 31], [3, ...], ...]
            batched_order = np.array([order[start:start+self.batch_size] 
                                      for start in range(0, len(self.X), self.batch_size)])
            # переупорядочить батчи случайно: [[3, 11, 21, 19], [27, ...], ..., [7, ...], ...]
            np.random.shuffle(batched_order[:-1])
            # собрать посл-ть индексов: -> [3, 11, 21, 19, 27, ...]
            self.order = np.fromiter(itertools.chain.from_iterable(batched_order), dtype=int)
        else:
            self.order = np.arange(len(self.X))
            np.random.shuffle(self.order)
        self.idx = 0
        return self

    def __next__(self):
        if self.idx >= len(self.X):
            raise StopIteration()
        end = min(self.idx + self.batch_size, len(self.X))
        indexes = [self.order[i] for i in range(self.idx, end)]
        batch = dict()
        # перебираем все поля
        for field in self.X[indexes[0]]:
            batch[field] = pad_tensors([self.X[i][field] for i in indexes]).to(self.device)
        batch["indexes"] = indexes
        self.idx = end
        return batch

In [10]:
train_dataloader = iter(FieldBatchDataLoader(train_dataset, batch_size=8, device="cuda"))
for i in range(10):
    batch = next(train_dataloader)
    for field, data in batch.items():
        print(f"{field}:{np.shape(data)}", end="\t")
    print("")

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


input_ids:torch.Size([8, 10])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 13])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 16])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 15])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 16])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 19])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 14])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 14])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 17])	y:torch.Size([8])	indexes:(8,)	
input_ids:torch.Size([8, 14])	y:torch.Size([8])	indexes:(8,)	


## Создание и обучение модели

In [11]:
from transformers import AutoModel


bert_model = AutoModel.from_pretrained("bert-base-multilingual-cased").to("cuda")

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
answer = bert_model(batch['input_ids'])
print(answer["last_hidden_state"].shape)
print(answer["pooler_output"].shape)

torch.Size([8, 14, 768])
torch.Size([8, 768])


Напишем класс-обёртку для классификации

In [13]:
import torch.nn as nn
from transformers.optimization import AdamW

class BasicTransformersClassificationModel(nn.Module):

    def __init__(self, model, labels_number, lr=1e-5, device="cpu", **kwargs):
        super(BasicTransformersClassificationModel, self).__init__()
        self.model = model
        self.labels_number = labels_number
        self.build_network(labels_number)
        # определяем функцию потерь
        if self.labels_number > 1:
            self.log_softmax = nn.LogSoftmax(dim=-1)
            self.criterion = nn.NLLLoss(reduction="mean")
        else:
            self.log_softmax = nn.Sigmoid()
            self.criterion = nn.BCELoss(reduction="mean")
        self.device = device
        if self.device is not None:
            self.to(self.device)
        self.optimizer = AdamW(self.parameters(), lr=lr, weight_decay=0.01)

    @property
    def hidden_size(self):
        return self.model.config.hidden_size

    def forward(self, input_ids, **kwargs):
        raise NotImplementedError("You should implement forward pass in your derived class.") 

    def train_on_batch(self, x, y):
        self.train()
        self.optimizer.zero_grad()
        loss = self._validate(x, y)
        loss["loss"].backward()
        self.optimizer.step()
        return loss

    def validate_on_batch(self, x, y):
        self.eval()
        with torch.no_grad():
            return self._validate(x, y)

    def _validate(self, x, y):
        if self.device is not None:
            y = y.to(self.device)
        log_probs = self(**x) #   self.forward(x) = self.__call__(x)
        loss = self.criterion(log_probs, y)
        if self.labels_number > 1:
            _, labels = torch.max(log_probs, dim=-1)
        else:
            labels = (log_probs >= 0.5).int()
        return {"loss": loss, "labels": labels}

class TransformersClassificationModel(BasicTransformersClassificationModel):

    def build_network(self, labels_number):
        self.proj_layer = torch.nn.Linear(self.hidden_size, self.labels_number)
        return self

    def forward(self, input_ids, **kwargs):
        input_ids = input_ids.to(self.device)
        cls_output = self.model(input_ids)["pooler_output"]
        logits = self.proj_layer(cls_output)
        log_probs = self.log_softmax(logits)
        if self.labels_number == 1:
            log_probs = log_probs[...,0]
        return log_probs


In [14]:
model = TransformersClassificationModel(
    bert_model, labels_number=1, device="cuda"
)
for name, elem in model.named_parameters():
    print(name, elem.device, elem.shape)

model.embeddings.word_embeddings.weight cuda:0 torch.Size([119547, 768])
model.embeddings.position_embeddings.weight cuda:0 torch.Size([512, 768])
model.embeddings.token_type_embeddings.weight cuda:0 torch.Size([2, 768])
model.embeddings.LayerNorm.weight cuda:0 torch.Size([768])
model.embeddings.LayerNorm.bias cuda:0 torch.Size([768])
model.encoder.layer.0.attention.self.query.weight cuda:0 torch.Size([768, 768])
model.encoder.layer.0.attention.self.query.bias cuda:0 torch.Size([768])
model.encoder.layer.0.attention.self.key.weight cuda:0 torch.Size([768, 768])
model.encoder.layer.0.attention.self.key.bias cuda:0 torch.Size([768])
model.encoder.layer.0.attention.self.value.weight cuda:0 torch.Size([768, 768])
model.encoder.layer.0.attention.self.value.bias cuda:0 torch.Size([768])
model.encoder.layer.0.attention.output.dense.weight cuda:0 torch.Size([768, 768])
model.encoder.layer.0.attention.output.dense.bias cuda:0 torch.Size([768])
model.encoder.layer.0.attention.output.LayerNorm.we



In [15]:
train_dataloader = iter(FieldBatchDataLoader(train_dataset, batch_size=8, device="cuda"))
batch = next(train_dataloader)
labels = batch["y"]
print(labels)
for i in range(100):
    loss = model.train_on_batch(batch, labels)["loss"].item()
    if i < 5 or (i+1) % 10 == 0:
        print(i, loss)
print(model.validate_on_batch(batch, labels)["loss"].item())

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


tensor([0., 0., 1., 1., 1., 1., 0., 1.], device='cuda:0')
0 0.6855854988098145
1 0.6594206094741821
2 0.6476946473121643
3 0.6109155416488647
4 0.5846738219261169
9 0.4944866895675659
19 0.2943667769432068
29 0.23350226879119873
39 0.09380091726779938
49 0.041193701326847076
59 0.025577982887625694
69 0.018536929041147232
79 0.014987614937126637
89 0.012852350249886513
99 0.010363299399614334
0.0074281468987464905


## Обучение классификационной модели

In [16]:
def update_metrics(metrics, batch_output, batch_labels):
    n_batches = metrics["n_batches"]
    metrics["loss"] = (metrics["loss"] * n_batches + batch_output["loss"].item()) / (n_batches + 1)
    metrics["n_batches"] += 1
    are_equal = (batch_output["labels"] == batch_labels).float()
    are_TP = are_equal * batch_labels
    are_FP = (1.0 - are_equal) * (1-batch_labels)
    are_FN = (1.0 - are_equal) * batch_labels
    metrics["correct"] += are_equal.sum().item()
    metrics["total"] += are_equal.shape[0]
    metrics["TP"] += are_TP.sum().item()
    metrics["FP"] += are_FP.sum().item()
    metrics["FN"] += are_FN.sum().item()
    metrics["accuracy"] = metrics["correct"] / max(metrics["total"], 1)
    metrics["P"] = metrics["TP"] / max(metrics["TP"]+metrics["FP"], 1)
    metrics["R"] = metrics["TP"] / max(metrics["TP"]+metrics["FN"], 1)
    metrics["F1"] = metrics["TP"] / max(metrics["TP"]+0.5*metrics["FN"]+0.5*metrics["FP"], 1)

In [17]:
import tqdm

def do_epoch(model, dataloader, mode="validate", epoch=1):
    metrics = {"correct": 0, "total": 0, "loss": 0.0, "n_batches": 0,
               "TP": 0, "FP": 0, "FN": 0}
    func = model.train_on_batch if mode == "train" else model.validate_on_batch
    progress_bar = tqdm.notebook.tqdm(dataloader, leave=True)
    progress_bar.set_description(f"{mode}, epoch={epoch}")
    for batch in progress_bar:
        batch_answers = batch["y"]
        batch_output = func(batch, batch_answers)
        update_metrics(metrics, batch_output, batch_answers)
        postfix = {"loss": round(metrics["loss"], 4), "acc": round(100 * metrics["accuracy"], 2)}
        for key in ["P", "R", "F1"]:
            postfix[key] =  round(100 * metrics[key], 2)
        progress_bar.set_postfix(postfix)
    return metrics

In [18]:
train_dataset = GramDataset(train_data, tokenizer)
dev_dataset = GramDataset(dev_data, tokenizer)
train_dataloader = iter(FieldBatchDataLoader(train_dataset, batch_size=8, device="cuda"))
dev_dataloader = iter(FieldBatchDataLoader(dev_dataset, batch_size=8, device="cuda"))

bert_model = AutoModel.from_pretrained("bert-base-multilingual-cased").to("cuda")
model = TransformersClassificationModel(bert_model, labels_number=1, device="cuda")
best_val_acc = 0.0
checkpoint = "checkpoint_best.pt"
for epoch in range(5):
    do_epoch(model, train_dataloader, mode="train", epoch=epoch+1)
    epoch_metrics = do_epoch(model, dev_dataloader, mode="validate", epoch=epoch+1)
    if epoch_metrics["accuracy"] > best_val_acc:
        best_val_acc = epoch_metrics["accuracy"]
        torch.save(model.state_dict(), checkpoint)
        # print("Saving ")
model.load_state_dict(torch.load(checkpoint))
do_epoch(model, dev_dataloader, mode="validate", epoch="evaluate")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12666 [00:00<?, ?it/s]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


  0%|          | 0/2713 [00:00<?, ?it/s]

  0%|          | 0/12666 [00:00<?, ?it/s]

  0%|          | 0/2713 [00:00<?, ?it/s]

  0%|          | 0/12666 [00:00<?, ?it/s]

  0%|          | 0/2713 [00:00<?, ?it/s]

  0%|          | 0/12666 [00:00<?, ?it/s]

  0%|          | 0/2713 [00:00<?, ?it/s]

  0%|          | 0/12666 [00:00<?, ?it/s]

  0%|          | 0/2713 [00:00<?, ?it/s]

  0%|          | 0/2713 [00:00<?, ?it/s]

{'F1': 1.0,
 'FN': 0.0,
 'FP': 0.0,
 'P': 1.0,
 'R': 1.0,
 'TP': 10811.0,
 'accuracy': 1.0,
 'correct': 21701.0,
 'loss': 0.0003029712217232707,
 'n_batches': 2713,
 'total': 21701}

In [19]:
def predict_with_model(model, X, classes):
    model.eval()
    dataloader = FieldBatchDataLoader(X, device=model.device)
    answer = [None] * len(X)
    for batch in dataloader:
        with torch.no_grad():
            batch_answer = model(**batch)
        labels = (batch_answer >= 0.5).int().cpu().numpy()
        for i, label in zip(batch["indexes"], labels):
            answer[i] = classes[label]
    return answer

In [20]:
test_dataset = GramDataset(test_data, tokenizer)
predicted = predict_with_model(model, test_dataset, classes=[0, 1])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [51]:
for i in range(10,20):
    print(test_data[i]["idx"], predicted[i])

1 1
0 0
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1


In [52]:
correct = []
for i in range(len(test_data)):
  correct.append(test_data[i]["idx"])

In [53]:
from sklearn.metrics import accuracy_score
accuracy_score(predicted, correct)

0.9999539764359352

In [54]:
from sklearn.metrics import matthews_corrcoef
matthews_corrcoef(correct, predicted, sample_weight=None)

0.9999079569440563

## Результаты

In [25]:
def accuracy_for_group(test_data, label):

  subset = []
  for i in range(len(test_data)):
    if test_data[i]["label"] in label:
      subset.append(test_data[i])

  dataset = GramDataset(subset, tokenizer)
  subset_predicted = predict_with_model(model, dataset, classes=[0, 1]) 

  subset_correct = []

  for i in range(len(subset)):
    subset_correct.append(subset[i]["idx"])

  accuracy = accuracy_score(subset_predicted, subset_correct)

  return accuracy

### Непереходные предложения

In [26]:
intr_inan_acc = accuracy_for_group(test_data, ['intr_inan'])
intr_inan_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


0.8717948717948718

In [71]:
intr_inan_subset = []
for i in range(len(test_data)):
  if test_data[i]["label"]=='intr_inan':
    intr_inan_subset.append(test_data[i])

intr_inan_dataset = GramDataset(intr_inan_subset, tokenizer)
intr_inan_predicted = predict_with_model(model, intr_inan_dataset, classes=[0, 1]) 
intr_inan_predicted
intr_inan_correct = []

for i in range(len(intr_inan_subset)):
  intr_inan_correct.append(intr_inan_subset[i]["idx"])

for i in range(len(intr_inan_subset)):
  if intr_inan_subset[i]["idx"] != intr_inan_predicted[i]:
    print( intr_inan_subset[i]["sentence"], intr_inan_correct[i], intr_inan_predicted[i])

романы был популярным 0 1
рассказы был плохим 0 1
сериалы веселил 0 1
фильмы увлекали 1 0
фильмы был плохим 0 1


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [27]:
intr_anim_acc = accuracy_for_group(test_data, ['intr_anim'])
intr_anim_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

In [28]:
intr_acc = accuracy_for_group(test_data, ['intr_inan','intr_anim'])
intr_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


0.9512195121951219

### Переходные предложения

In [29]:
tr_direct_inan_acc = accuracy_for_group(test_data, ['tr_direct_inan'])
tr_direct_inan_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

In [30]:
tr_direct_anim_acc = accuracy_for_group(test_data, ['tr_direct_anim'])
tr_direct_anim_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

In [31]:
tr_direct_acc = accuracy_for_group(test_data, ['tr_direct_inan','tr_direct_anim'])
tr_direct_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

In [32]:
tr_reverse_inan_acc = accuracy_for_group(test_data, ['tr_reverse_inan'])
tr_reverse_inan_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


0.9981515711645101

In [77]:
tr_reverse_inan_subset = []
for i in range(len(test_data)):
  if test_data[i]["label"]=='tr_reverse_inan':
    tr_reverse_inan_subset.append(test_data[i])

tr_reverse_inan_dataset = GramDataset(tr_reverse_inan_subset, tokenizer)
tr_reverse_inan_predicted = predict_with_model(model, tr_reverse_inan_dataset, classes=[0, 1]) 
tr_reverse_inan_predicted
tr_reverse_inan_correct = []

for i in range(len(tr_reverse_inan_subset)):
  tr_reverse_inan_correct.append(tr_reverse_inan_subset[i]["idx"])

for i in range(len(tr_reverse_inan_subset)):
  if tr_reverse_inan_subset[i]["idx"] != tr_reverse_inan_predicted[i]:
    print(tr_reverse_inan_subset[i]["sentence"], tr_reverse_inan_correct[i], tr_reverse_inan_predicted[i])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


рисунок недолюбливали студенты 1 0


In [33]:
tr_reverse_anim_acc = accuracy_for_group(test_data, ['tr_reverse_anim'])
tr_reverse_anim_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

In [34]:
tr_reverse_acc = accuracy_for_group(test_data, ['tr_reverse_inan','tr_reverse_anim'])
tr_reverse_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


0.999124343257443

In [35]:
tr_acc = accuracy_for_group(test_data, ['tr_direct_inan','tr_direct_anim','tr_reverse_inan','tr_reverse_anim'])
tr_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


0.9995621716287215

### Предложения с распространенным подлежащим

In [36]:
pp_inan_acc = accuracy_for_group(test_data, ['pp_inan'])
pp_inan_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

In [37]:
pp_anim_acc = accuracy_for_group(test_data, ['pp_anim'])
pp_anim_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

In [38]:
pp_acc = accuracy_for_group(test_data, ['pp_inan','pp_anim'])
pp_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

### Сложноподчиненные предложения с союзом "что"

In [39]:
chto_inan_acc = accuracy_for_group(test_data, ['chto_inan'])
chto_inan_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

In [40]:
chto_anim_acc = accuracy_for_group(test_data, ['chto_anim'])
chto_anim_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

In [41]:
chto_acc = accuracy_for_group(test_data, ['chto_inan', 'chto_anim'])
chto_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

### Относительные предложения

In [42]:
subj_rel_inan_acc = accuracy_for_group(test_data, ['subj_rel_inan'])
subj_rel_inan_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

In [55]:
subj_rel_anim_acc = accuracy_for_group(test_data, ['subj_rel_anim'])
subj_rel_anim_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

In [44]:
subj_rel_acc = accuracy_for_group(test_data, ['subj_rel_inan','subj_rel_anim'])
subj_rel_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

In [56]:
obj_rel_inan_acc = accuracy_for_group(test_data, ['obj_rel_inan'])
obj_rel_inan_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

In [57]:
obj_rel_anim_acc = accuracy_for_group(test_data, ['obj_rel_anim'])
obj_rel_anim_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

In [58]:
obj_rel_acc = accuracy_for_group(test_data, ['obj_rel_inan','obj_rel_anim'])
obj_rel_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

In [59]:
rel_acc = accuracy_for_group(test_data, ['subj_rel_inan','subj_rel_anim','obj_rel_inan','obj_rel_anim'])
rel_acc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1.0

### Проверка

In [60]:
example = [{'idx': 1,
  'label': 'ex',
  'sentence': 'Профессиональные тренеры провели занятия по историческому и артистическому фехтованию'},
 {'idx': 0,
  'label': 'ex',
  'sentence': 'Профессиональные тренеры провел занятия по историческому и артистическому фехтованию'},
  {'idx': 1,
  'label': 'ex',
  'sentence': 'Профессиональные тренеры проведут занятия по историческому и артистическому фехтованию'},
 {'idx': 0,
  'label': 'ex',
  'sentence': 'Профессиональные тренеры проведет занятия по историческому и артистическому фехтованию'}]

In [61]:
example_dataset = GramDataset(example, tokenizer)
example_predicted = predict_with_model(model, example_dataset, classes=[0, 1])
example_predicted

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


[1, 0, 1, 0]