In [1]:
import os
import pandas as pd
import numpy as np
import json
import pathlib
import random
import zipfile

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer, BertConfig, AutoTokenizer
from transformers.optimization import AdamW
from transformers import RobertaForSequenceClassification

from scipy.special import expit
from keras.preprocessing.sequence import pad_sequences
from sklearn import metrics

In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [3]:
MAX_LEN = 512

In [4]:
SEED = 128

In [5]:
data_path = ''


склейка контекста и вопроса

In [6]:
import json
t_sentences, t_labels = [], []
with open(data_path + 'train.json', 'r') as f:
  for line in f:
    data = json.loads(line)
    s1, s2 = data['context'], data['question']
    t_sentences.append(s1 + ' ' + s2)
    t_labels.append(1 if data['answer']=='да' else 0)



In [7]:
v_sentences, v_labels = [], []
with open(data_path + 'dev.json', 'r') as f:
  for line in f:
    data = json.loads(line)
    s1, s2 = data['context'], data['question']
    v_sentences.append(s1 + ' ' + s2)
    v_labels.append(1 if data['answer']=='да' else 0)


загрузка модели


In [8]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [9]:
tokenizer = AutoTokenizer.from_pretrained("alexyalunin/RuBioRoBERTa",model_max_length=512)
model = RobertaForSequenceClassification.from_pretrained('alexyalunin/RuBioRoBERTa').to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at alexyalunin/RuBioRoBERTa and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
pip install datasets




In [11]:
from datasets import DatasetDict
from datasets import Dataset
train = Dataset.from_dict({
        'text': t_sentences,
        'labels': t_labels
    })
test = Dataset.from_dict({
        'text': v_sentences,
        'labels': v_labels
    })
dataset_dict = DatasetDict({
    'train': train,
    'test': test
})


In [12]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 1308
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 256
    })
})

приведеение данных к приемлемому для модели виду (токенизация текстов, исключение столбца текстов, разбитие на батчи)

In [13]:
from transformers import DataCollatorWithPadding

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding = True
)


tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Map:   0%|          | 0/1308 [00:00<?, ? examples/s]

Map:   0%|          | 0/256 [00:00<?, ? examples/s]

In [14]:
tokenized_datasets = tokenized_datasets.remove_columns(['text'])


In [15]:
max(len(i) for i in tokenized_datasets['test']['input_ids'])

269

In [16]:
from torch.utils.data import DataLoader
tokenized_datasets.set_format("torch")

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=4, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=4, collate_fn=data_collator
)

In [17]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([4]),
 'input_ids': torch.Size([4, 512]),
 'attention_mask': torch.Size([4, 512])}

In [18]:
outputs = model(**batch.to(device))
print(outputs.loss, outputs.logits.shape)

tensor(0.6530, device='cuda:0', grad_fn=<NllLossBackward0>) torch.Size([4, 2])


выбор оптимизатора и определение затухания скорости обучения


In [19]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=0.0005)

In [20]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

981


In [21]:
torch.cuda.memory_summary(device=None, abbreviated=False)




тренировка

In [22]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

  0%|          | 0/981 [00:00<?, ?it/s]

валидация
F1= (precision+recall)/(2⋅precision⋅recall)
Где:
Precision (точность) — это отношение количества истинно положительных результатов к общему количеству положительных результатов, предсказанных моделью.
Recall (полнота) — это отношение количества истинно положительных результатов к общему количеству истинно положительных результатов в данных.

In [23]:
from datasets import load_metric

metric = load_metric("f1")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

{'f1': 0.6666666666666666}

In [24]:
torch.save(model.state_dict(), 'fine_tuned_model5e-4.pth')
