In [None]:
%%writefile requirements.txt
torch
numpy
pandas
scikit-learn
razdel
ipymarkup

In [None]:
!pip install --upgrade -r requirements.txt

In [None]:
!wget https://web.archive.org/web/20220331225529/http://ai-center.botik.ru/Airec/ai-resources/Persons-1000.zip
!unzip Persons-1000.zip

In [None]:
!cat Persons-1000/collection/001/anno.markup.xml  

# NER

## Датасет

Named Entity Recognition - распознавание именных сущностей. Выделяем в тексте спаны PER, LOC, ORG.

В случае с Persons-1000 только PER. 

In [None]:
import os
import xml.etree.ElementTree as ET
from ipymarkup import show_box_markup
from ipymarkup.palette import palette, BLUE, RED, GREEN

directory = "Persons-1000/collection/"

def read_text_with_markup(directory):
    markup_file_name = os.path.join(directory, "anno.markup.xml")
    text_file_name = os.path.join(directory, "text.txt")
    with open(text_file_name, "r", encoding="windows-1251") as r:
        text = r.read()
    text = text.replace("\n", "\r\n")
    root = ET.parse(markup_file_name).getroot()
    spans = []
    for entry in root.findall("entry"):
        start_pos = int(entry.find("offset").text)
        end_pos = start_pos + int(entry.find("length").text)
        tag = entry.find("class").text
        spans.append((start_pos, end_pos, "PER"))
    return text, spans

data = []
for sample_name in os.listdir(directory):
    sample_path = os.path.join(directory, sample_name)
    data.append(read_text_with_markup(sample_path))

ipymarkup - модуль для вывода NER разметки в ipynb

In [None]:
show_box_markup(data[0][0], data[0][1], palette=palette(PER=GREEN, ORG=BLUE, LOC=RED))

## BIO

BIO разметка: B - begin, I - inner, O - outer. Преобразуем задачу разметки спанов в задачу классификации каждого слова.

In [None]:
from razdel import tokenize
from collections import namedtuple

Sample = namedtuple("Sample", "text,tokens,spans,labels")

samples = []
for text, spans in data:
    labels = []
    tokens = list(tokenize(text))
    for token in tokens:
        label = 0
        for span in spans:
            if token.start == span[0]:
                label = 1
            elif token.start > span[0] and token.stop <= span[1]:
                label = 2
        labels.append(label)
    sample = Sample(text, tokens, spans, labels)
    samples.append(sample)

show_box_markup(samples[0].text, samples[0].spans, palette=palette(PER=BLUE, ORG=RED, LOC=GREEN))
print(samples[0].labels)
print(len(samples))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Бьём на выборки

In [None]:
import random
random.shuffle(samples)

train = samples[:700]
val = samples[700:850]
test = samples[850:]

In [None]:
char_set = ["<pad>", "<unk>"] + list({ch for sample in samples for token in sample.tokens for ch in token.text})
print(char_set)

['<pad>', '<unk>', 'I', '»', ',', 'Ъ', 'G', 'c', 'Т', "'", 'н', '!', '4', 'ц', 'Л', '3', 'x', 'Ф', '[', 'ж', 'T', '(', ':', 'Ы', 'о', 'z', 'K', '\xad', 'р', '0', '$', '&', 'S', '2', 'l', '«', '—', 'p', 'О', 'r', 'Э', 'V', 'Р', 'o', 'D', 'e', 'B', 'g', 'Б', 'O', 'Я', 'X', '=', '?', 'ф', 'М', 'З', '#', 't', '–', 'б', 'H', 'Ь', 'С', 'м', 'й', '>', 'ъ', '“', '…', 'm', 'ы', 'к', 'д', 'Й', '+', '*', '<', 'Г', ')', 'щ', 'п', 'л', 'Н', '1', 'у', '©', 'в', 'W', '/', 'ю', '-', 'u', '_', 'A', 'ч', 'R', 'Y', 'i', 'y', 'ь', '.', 'с', 'L', 'и', 'Ч', 'n', 'г', 'U', '5', 'я', 'E', '€', '6', 'J', 'F', 'f', 'b', '7', '8', '9', 'Д', 'Щ', '|', 'k', '”', '"', 'P', 'х', 'd', 'Ш', 'Ю', '•', 'П', 'w', 'Ё', 'Е', ';', 'Ц', 'Ж', 'Q', 'v', 'q', 's', 'N', 'А', 'з', 'И', 'Х', 'э', ']', 'ш', '%', 'j', 'ё', 'a', 'У', 'т', 'C', 'е', '№', 'К', 'Z', 'і', 'В', 'M', 'h', 'а']


Для каждого слова сохраняем его символьный состав, а в остальном старый добрый пайплайн

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
import numpy as np

def get_next_gen_batch(samples, max_seq_len=500, max_char_seq_len=40, batch_size=32):
    indices = np.arange(len(samples))
    np.random.shuffle(indices)
    batch_begin = 0
    while batch_begin < len(samples):
        batch_indices = indices[batch_begin: batch_begin + batch_size]
        batch = []
        batch_labels = []
        batch_max_len = 0
        for data_ind in batch_indices:
            sample = samples[data_ind]
            inputs = []
            for token in sample.tokens[:max_seq_len]:
                chars = [char_set.index(ch) if ch in char_set else char_set.index("<unk>") for ch in token.text][:max_char_seq_len]
                chars += [0] * (max_char_seq_len - len(chars))
                inputs.append(chars)
            batch_max_len = max(batch_max_len, len(inputs))
            inputs += [[0]*max_char_seq_len] * (max_seq_len - len(inputs))
            batch.append(inputs)
            labels = sample.labels[:max_seq_len]
            labels += [0] * (max_seq_len - len(labels))
            batch_labels.append(labels)
        batch_begin += batch_size
        batch = torch.cuda.LongTensor(batch)[:, :batch_max_len]
        labels = torch.cuda.LongTensor(batch_labels)[:, :batch_max_len]
        yield batch_indices, batch, labels


def train_gen_model(model, train_samples, val_samples, epochs_count=10, 
                    loss_every_nsteps=1000, lr=0.01, save_path="model.pt", device_name="cuda",
                    early_stopping=True):
    params_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Trainable params: {}".format(params_count))
    device = torch.device(device_name)
    model = model.to(device)
    total_loss = 0
    start_time = time.time()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_function = nn.CrossEntropyLoss().cuda()
    prev_avg_val_loss = None
    for epoch in range(epochs_count):
        model.train()
        for step, (_, batch, batch_labels) in enumerate(get_next_gen_batch(train)):
            logits = model(batch) # Прямой проход
            logits = logits.transpose(1, 2)
            loss = loss_function(logits, batch_labels) # Подсчёт ошибки
            loss.backward() # Подсчёт градиентов dL/dw
            optimizer.step() # Градиентный спуск или его модификации (в данном случае Adam)
            optimizer.zero_grad() # Зануление градиентов, чтобы их спокойно менять на следующей итерации
            total_loss += loss.item()
        val_total_loss = 0
        val_batch_count = 0
        model.eval()
        for _, (_, batch, batch_labels) in enumerate(get_next_gen_batch(val)):
            logits = model(batch) # Прямой проход
            logits = logits.transpose(1, 2)
            val_total_loss += loss_function(logits, batch_labels) # Подсчёт ошибки
            val_batch_count += 1
        avg_val_loss = val_total_loss/val_batch_count
        print("Epoch = {}, Avg Train Loss = {:.4f}, Avg val loss = {:.4f}, Time = {:.2f}s".format(epoch, total_loss / loss_every_nsteps, avg_val_loss, time.time() - start_time))
        total_loss = 0
        start_time = time.time()

        if early_stopping and prev_avg_val_loss is not None and avg_val_loss > prev_avg_val_loss:
            model.load_state_dict(torch.load(save_path))
            model.eval()
            break
        prev_avg_val_loss = avg_val_loss
        torch.save(model.state_dict(), save_path)

## Бесконтекстная модель

In [None]:
import torch
from torch import nn

class SuperSimpleModel(nn.Module):
    def __init__(self, char_set_size, char_embedding_dim=16, classes_count=3, char_max_seq_len=40):
        super().__init__()
        
        self.embeddings_layer = nn.Embedding(char_set_size, char_embedding_dim)
        self.out_layer = nn.Linear(char_max_seq_len * char_embedding_dim, classes_count)

    def forward(self, inputs):
        projections = self.embeddings_layer.forward(inputs)
        projections = projections.reshape(projections.size(0), projections.size(1), -1)
        output = self.out_layer.forward(projections)
        return output


model = SuperSimpleModel(len(char_set))
train_gen_model(model, train, val, epochs_count=50, early_stopping=False, lr=0.02)

Trainable params: 4611
Epoch = 0, Avg Train Loss = 0.0276, Avg val loss = 0.3948, Time = 5.20s
Epoch = 1, Avg Train Loss = 0.0052, Avg val loss = 0.1412, Time = 5.12s
Epoch = 2, Avg Train Loss = 0.0023, Avg val loss = 0.0912, Time = 5.15s
Epoch = 3, Avg Train Loss = 0.0019, Avg val loss = 0.0834, Time = 5.13s
Epoch = 4, Avg Train Loss = 0.0018, Avg val loss = 0.0810, Time = 5.17s
Epoch = 5, Avg Train Loss = 0.0017, Avg val loss = 0.0776, Time = 5.13s
Epoch = 6, Avg Train Loss = 0.0017, Avg val loss = 0.0757, Time = 5.16s
Epoch = 7, Avg Train Loss = 0.0016, Avg val loss = 0.0751, Time = 5.23s
Epoch = 8, Avg Train Loss = 0.0016, Avg val loss = 0.0755, Time = 5.06s
Epoch = 9, Avg Train Loss = 0.0016, Avg val loss = 0.0743, Time = 5.12s
Epoch = 10, Avg Train Loss = 0.0015, Avg val loss = 0.0732, Time = 6.25s
Epoch = 11, Avg Train Loss = 0.0016, Avg val loss = 0.0733, Time = 5.12s
Epoch = 12, Avg Train Loss = 0.0015, Avg val loss = 0.0725, Time = 5.22s
Epoch = 13, Avg Train Loss = 0.0015, A

## Метрики

Можно использовать как классические мультиклассификационнные метрики, так и метрики специально для NER.

Например, число точных и частичных совпадений спанов, пропущшенных и лишних спанов.

In [None]:
def get_spans(labels, tokens):
    spans = []
    for i, label in enumerate(labels):
        if label == 1:
            spans.append((tokens[i].start, tokens[i].stop, "PER"))
        elif label == 2:
            spans[-1] = (spans[-1][0], tokens[i].stop, spans[-1][-1])
    return spans


def calc_metrics(true_labels, predicted_labels, samples):
    one_tp = 0
    one_fp = 0
    one_fn = 0
    for true, predicted in zip(true_labels, predicted_labels):
        for l1, l2 in zip(true, predicted):
            if l1 == 1 and l2 == 1:
                one_tp += 1
            elif l1 != 1 and l2 == 1:
                one_fp += 1
            elif l1 == 1 and l2 !=1:
                one_fn += 1
    if one_tp + one_fp == 0:
        print("No positives!")
    else:
        print("1 Precision: {}, 1 Recall: {}".format(float(one_tp)/(one_tp + one_fp), float(one_tp)/(one_tp + one_fn)))

    exact = 0
    partial = 0
    missing = 0
    spurius = 0
    for (true, predicted), sample in zip(zip(true_labels, predicted_labels), samples):
        true_spans = get_spans(true, sample.tokens)
        predicted_spans = get_spans(predicted, sample.tokens)
        for true_span in true_spans:
            is_missing = True
            for predicted_span in predicted_spans:
                if true_span == predicted_span:
                    exact += 1
                    is_missing = False
                    break
                ts = true_span[0]
                te = true_span[1]
                ps = predicted_span[0]
                pe = predicted_span[1]
                # ts te ps pe
                # ps pe ts te
                if ts <= te <= ps <= pe or ps <= pe <= ts <= te:
                    continue
                is_missing = False
                partial += 1
                break
            if is_missing:
                missing += 1
        for predicted_span in predicted_spans:
            is_missing = True
            for true_span in true_spans:
                if true_span == predicted_span:
                    is_missing = False
                    break
                ts = true_span[0]
                te = true_span[1]
                ps = predicted_span[0]
                pe = predicted_span[1]
                if ts <= te <= ps <= pe or ps <= pe <= ts <= te:
                    continue
                is_missing = False
                break
            if is_missing:
                spurius += 1
    print("Exact: {}, partial: {}, missing: {}, spurius: {}".format(exact, partial, missing, spurius))
            


def predict(model, samples):
    model.eval()
    true_labels = []
    predicted_labels = []
    all_indices = []
    for _, (indices, batch, batch_labels) in enumerate(get_next_gen_batch(samples)):
        logits = model(batch)
        plabels = logits.max(dim=2)[1]
        # Убираем неконсистентность
        for sample_num, sample in enumerate(plabels):
            for word_num, label in enumerate(sample):
                if label != 2:
                    continue
                if word_num == 0:
                    plabels[sample_num][word_num] = 0
                    continue
                if sample[word_num - 1] != 1:
                    plabels[sample_num][word_num] = 0
        true_labels.extend(batch_labels)
        predicted_labels.extend(plabels)
        all_indices.extend(indices)
    samples = [samples[index] for index in all_indices]
    calc_metrics(true_labels, predicted_labels, samples)
    show_box_markup(samples[0].text, get_spans(predicted_labels[0], samples[0].tokens), palette=palette(PER=BLUE, ORG=RED, LOC=GREEN))

In [None]:
predict(model, test)

1 Precision: 0.6549391069012178, 1 Recall: 0.660300136425648
Exact: 467, partial: 569, missing: 430, spurius: 258


## Контекстная модель: LSTM 


In [None]:
import torch
from torch import nn

class LstmModel(nn.Module):
    def __init__(self, char_set_size, char_embedding_dim=4, classes_count=3, lstm_embedding_dim=8, char_max_seq_len=40):
        super().__init__()
        
        self.embeddings_layer = nn.Embedding(char_set_size, char_embedding_dim)
        self.dropout = nn.Dropout(0.4)
        self.lstm_layer = nn.LSTM(char_embedding_dim * char_max_seq_len, lstm_embedding_dim // 2, batch_first=True, bidirectional=True)
        self.out_layer = nn.Linear(lstm_embedding_dim, classes_count)

    def forward(self, inputs):
        batch_size = inputs.size(0)
        seq_len = inputs.size(1)
        projections = self.embeddings_layer.forward(inputs)
        projections = projections.reshape(projections.size(0), projections.size(1), -1)
        output, _= self.lstm_layer(projections)
        output = self.dropout(output)
        output = self.out_layer.forward(output)
        return output

model = LstmModel(len(char_set))
train_gen_model(model, train, val, epochs_count=50, early_stopping=False, lr=0.02)

Trainable params: 6011
Epoch = 0, Avg Train Loss = 0.0123, Avg val loss = 0.2192, Time = 5.31s
Epoch = 1, Avg Train Loss = 0.0057, Avg val loss = 0.2011, Time = 5.28s
Epoch = 2, Avg Train Loss = 0.0054, Avg val loss = 0.2041, Time = 6.14s
Epoch = 3, Avg Train Loss = 0.0052, Avg val loss = 0.1938, Time = 5.45s
Epoch = 4, Avg Train Loss = 0.0051, Avg val loss = 0.1922, Time = 5.34s
Epoch = 5, Avg Train Loss = 0.0049, Avg val loss = 0.1863, Time = 5.28s
Epoch = 6, Avg Train Loss = 0.0045, Avg val loss = 0.1745, Time = 5.31s
Epoch = 7, Avg Train Loss = 0.0042, Avg val loss = 0.1611, Time = 6.34s
Epoch = 8, Avg Train Loss = 0.0040, Avg val loss = 0.1491, Time = 5.39s
Epoch = 9, Avg Train Loss = 0.0040, Avg val loss = 0.1464, Time = 5.26s
Epoch = 10, Avg Train Loss = 0.0039, Avg val loss = 0.1431, Time = 5.29s
Epoch = 11, Avg Train Loss = 0.0039, Avg val loss = 0.1463, Time = 5.30s
Epoch = 12, Avg Train Loss = 0.0039, Avg val loss = 0.1388, Time = 5.28s
Epoch = 13, Avg Train Loss = 0.0039, A

In [None]:
  predict(model, test)

1 Precision: 0.8445945945945946, 1 Recall: 0.767394270122783
Exact: 810, partial: 366, missing: 290, spurius: 127


## Контекстная модель: LSTM над CharFF

полносвзяный слой с активацией над конкатенацией символьных эмбедов

In [None]:
from torch import nn

class CharFFLstmModel(nn.Module):
    def __init__(self, char_set_size, char_embedding_dim=4, classes_count=3, word_embedding_dim=16, lstm_embedding_dim=16, char_max_seq_len=40):
        super().__init__()
        
        self.embeddings_layer = nn.Embedding(char_set_size, char_embedding_dim)
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(char_embedding_dim * char_max_seq_len, word_embedding_dim)
        self.relu = nn.ReLU()
        self.lstm_layer = nn.LSTM(word_embedding_dim, lstm_embedding_dim // 2, batch_first=True, bidirectional=True)
        self.out_layer = nn.Linear(lstm_embedding_dim, classes_count)

    def forward(self, inputs):
        projections = self.embeddings_layer.forward(inputs)
        projections = projections.reshape(projections.size(0), projections.size(1), -1)
        projections = self.relu(self.linear(projections))
        projections = self.dropout(projections)
        output, _= self.lstm_layer(projections)
        output = self.dropout(output)
        output = self.out_layer.forward(output)
        return output

model = CharFFLstmModel(len(char_set))
train_gen_model(model, train, val, epochs_count=50, early_stopping=False, lr=0.03)

Trainable params: 4963
Epoch = 0, Avg Train Loss = 0.0065, Avg val loss = 0.1987, Time = 6.36s
Epoch = 1, Avg Train Loss = 0.0048, Avg val loss = 0.1988, Time = 5.26s
Epoch = 2, Avg Train Loss = 0.0047, Avg val loss = 0.1956, Time = 5.29s
Epoch = 3, Avg Train Loss = 0.0047, Avg val loss = 0.1946, Time = 5.26s
Epoch = 4, Avg Train Loss = 0.0046, Avg val loss = 0.1968, Time = 5.25s
Epoch = 5, Avg Train Loss = 0.0043, Avg val loss = 0.1778, Time = 5.31s
Epoch = 6, Avg Train Loss = 0.0032, Avg val loss = 0.1118, Time = 5.25s
Epoch = 7, Avg Train Loss = 0.0023, Avg val loss = 0.0905, Time = 5.25s
Epoch = 8, Avg Train Loss = 0.0020, Avg val loss = 0.0819, Time = 5.27s
Epoch = 9, Avg Train Loss = 0.0018, Avg val loss = 0.0730, Time = 6.15s
Epoch = 10, Avg Train Loss = 0.0015, Avg val loss = 0.0497, Time = 5.30s
Epoch = 11, Avg Train Loss = 0.0012, Avg val loss = 0.0430, Time = 5.35s
Epoch = 12, Avg Train Loss = 0.0011, Avg val loss = 0.0387, Time = 5.26s
Epoch = 13, Avg Train Loss = 0.0010, A

In [None]:
predict(model, test)

1 Precision: 0.9060955518945635, 1 Recall: 0.7503410641200545
Exact: 826, partial: 298, missing: 342, spurius: 89
