# NER for bsnlp

Решение задачи `NER` на преобразованных данных `bsnlp`.

### Загрузка данных

Подключаем нужные библиотеки.

In [2]:
import corus
import pandas as pd

import torch
import torch.nn as nn
from tqdm import tqdm

from transformers import BertModel, BertPreTrainedModel, BertTokenizer
from sklearn.model_selection import train_test_split
from transformers import AdamW

Загружаем преобразованные под задачу `NER` данные.

In [3]:
data = pd.read_csv("/datasets/bsnlp/bsnlp_train_processed.csv", encoding="utf-8").fillna(method="ffill")

In [4]:
data.head()

Unnamed: 0,doc_idxs,tokens,tags,clidxs,base_forms
0,ru-110,"Азия Норин , известная как Азия Биби восемь ле...",B_PER I_PER O O O B_PER I_PER O O O O,B_PER-Asia-Bibi I_PER-Asia-Bibi O O O B_PER-As...,Азия Норин O O O Азия Биби O O O O
1,ru-110,Теперь она планирует покинуть Пакистан ва исла...,O O O O B_LOC O O O O,O O O O B_GPE-Pakistan O O O O,O O O O Пакистан O O O O
2,ru-110,Верховный суд в Исламабаде признал ее невиновн...,B_ORG I_ORG I_ORG I_ORG O O O O O O O O O O O ...,B_ORG-Supreme-Court-of-Pakistan I_ORG-Supreme-...,Верховный суд в Исламабаде O O O O O O O O O O...
3,ru-110,Под суд Норин в 2010 году после ссоры с соседя...,O O B_PER O O O O O O O O,O O B_PER-Asia-Bibi O O O O O O O O,O O Норин O O O O O O O O
4,ru-110,"Мусульманки , вместе с которыми она собирала ф...",O O O O O O O O O O O O O O O O O O O O O,O O O O O O O O O O O O O O O O O O O O O,O O O O O O O O O O O O O O O O O O O O O


In [5]:
tag_values = list(set(' '.join(data['tags'].values).split()))
tag_values.append('PAD')
tag2idx = {t: i for i, t in enumerate(tag_values)}
tag2idx

{'B_LOC': 0,
 'I_ORG': 1,
 'I_PER': 2,
 'I_LOC': 3,
 'I_EVT': 4,
 'B_PRO': 5,
 'B_ORG': 6,
 'O': 7,
 'I_PRO': 8,
 'B_EVT': 9,
 'B_PER': 10,
 'PAD': 11}

### Токенизация

Токенизируем данные:

In [6]:
MAX_LEN = 75
bs = 32

In [7]:
tokenizer = BertTokenizer.from_pretrained('/tokenizer_ru/', do_lower_case=False)

In [8]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence.split(), text_labels.split()):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        tokenized_sentence.extend(tokenized_word)

        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [9]:
data.iloc[0]['tags']

'B_PER I_PER O O O B_PER I_PER O O O O'

In [10]:
sentences = data['tokens'].values
labels = data['tags'].values

In [11]:
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sentences, labels)
]

In [12]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

### Построение модели

Функция `dataloader`'а:

In [13]:
class NerDataloader(torch.utils.data.Dataset):
    def __init__(self, text, labels, MAX_LEN):
        self.text = text
        self.labels = labels
        self.MAX_LEN = MAX_LEN
        
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, index):
        sentence = self.text[index]
        input = torch.zeros(MAX_LEN, dtype=torch.long)
        ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + sentence + ['[SEP]'])
        input[:len(ids)] = torch.LongTensor(ids)
        
        mask = torch.LongTensor([int(i != 0.0) for i in input])
        
        taget = torch.zeros(MAX_LEN, dtype=torch.long)
        label = self.labels[index]
        label = [tag2idx['PAD']] + [tag2idx.get(l) for l in label] + [tag2idx['PAD']]
        taget[:len(label)] = torch.LongTensor(label)
        return input,mask, taget

Разбиваем до этого загруженные и токенизированные даннные на train и valid и передаем `dataloader`:

In [14]:
X_train, X_val, Y_train, Y_val = train_test_split(tokenized_texts, labels,
                                                            random_state=2018, test_size=0.1)

In [15]:
train_dataloader =  torch.utils.data.DataLoader(NerDataloader(X_train, Y_train, MAX_LEN), batch_size=16,
                                               shuffle=True, num_workers=6)

val_dataloader =  torch.utils.data.DataLoader(NerDataloader(X_val, Y_val, MAX_LEN), batch_size=16,
                                               shuffle=True, num_workers=6)

In [16]:
batch = next(iter(train_dataloader))

In [17]:
input_ids, attention_mask, label = batch

Строим саму модель, в данном случае в качестве feature extractor берется `RuBERT` от (deeppavlov):

In [18]:
class BertSoftmaxForNer(nn.Module):
    def __init__(self, num_classes, hidden_dropout_prob=768, dropout = 0.1):
        super(BertSoftmaxForNer, self).__init__()
        self.num_labels = num_classes
        self.bert = BertModel.from_pretrained('/bert_ru/')
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_dropout_prob, num_classes)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=None)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        outputs = (logits,) + outputs[2:]
        return outputs

Задаем модель, оптимизатор и т.д.:

In [19]:
model = BertSoftmaxForNer(len(tag2idx)).cuda()
criterion = nn.CrossEntropyLoss(ignore_index=tag2idx['PAD'])

In [20]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [21]:
from transformers import get_linear_schedule_with_warmup

epochs = 3
max_grad_norm = 1.0

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

### Обучение и результаты

Запускаем обучение с выводом `accuracy` и `f1`:

In [22]:
from sklearn.metrics import f1_score, accuracy_score, classification_report
from tqdm import tqdm
import numpy as np

In [23]:
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = tuple(t.cuda() for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        model.zero_grad()

        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)
        
        logits = outputs[0]
        loss = criterion(logits.view(logits.size(0)*logits.size(1),-1), b_labels.view(logits.size(0)*logits.size(1)))
        loss.backward()

        total_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    torch.save(model.state_dict(), '/weights/weights' + str(epoch) + '.pht')

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in val_dataloader:
        batch = tuple(t.cuda() for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))
    print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags,average='weighted')))
    print()

100%|██████████| 128/128 [00:13<00:00,  9.66it/s]


Average train loss: 0.15201487969898153


  0%|          | 0/128 [00:00<?, ?it/s]

Validation Accuracy: 0.991792891195462
Validation F1-Score: 0.9931423561574096



100%|██████████| 128/128 [00:12<00:00, 10.56it/s]


Average train loss: 0.021506608061827137


  0%|          | 0/128 [00:00<?, ?it/s]

Validation Accuracy: 0.9948102106088951
Validation F1-Score: 0.9957354833724985



100%|██████████| 128/128 [00:12<00:00, 10.41it/s]


Average train loss: 0.011972891865298152
Validation Accuracy: 0.9959567919859996
Validation F1-Score: 0.9963425696340653



Получившиеся результаты по различным тегам в виде `precision`, `recall` и `f1`.

In [24]:
print(classification_report(valid_tags, pred_tags))

              precision    recall  f1-score   support

       B_EVT       0.85      0.87      0.86        39
       B_LOC       1.00      1.00      1.00     13173
       B_ORG       0.85      0.95      0.90        73
       B_PER       0.96      0.97      0.97       187
       B_PRO       0.92      0.67      0.77        18
       I_EVT       1.00      0.30      0.46        20
       I_LOC       0.00      0.00      0.00         5
       I_ORG       0.86      0.83      0.85        53
       I_PER       0.94      0.96      0.95        69
       I_PRO       0.90      1.00      0.95         9
           O       0.99      1.00      0.99      2925

    accuracy                           1.00     16571
   macro avg       0.84      0.78      0.79     16571
weighted avg       1.00      1.00      1.00     16571



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
