<h1><center>Извлечение именованных сущностей </center></h1>

<h3><center>(Named Entity Recognition, NER)</center></h3>

- Spacy, Natasha
- CNN-biLSTM-CRF

### Spacy

In [5]:
import spacy

In [6]:
#!python3 -m spacy download en_core_web_sm

In [11]:
nlp_eng = spacy.load("en_core_web_sm")

text = "HSE opens a building in Moscow worth 1 million rubles"

for ent in nlp_eng(text).ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_, sep = '\t\t')

HSE		0		3		ORG
Moscow		24		30		GPE
1 million rubles		37		53		MONEY


In [24]:
doc = nlp_eng("Kate studies computer science in Higher School of Economics")

ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(*ents)
print()

for i in range(len(doc)):
    print(*[doc[i].text, doc[i].ent_iob_, doc[i].ent_type_], sep='\t')


('Kate', 0, 4, 'PERSON') ('Higher School of Economics', 33, 59, 'ORG')

Kate	B	PERSON
studies	O	
computer	O	
science	O	
in	O	
Higher	B	ORG
School	I	ORG
of	I	ORG
Economics	I	ORG


In [10]:
from spacy import displacy

text = "In September 1972, Jobs enrolled at Reed College in Portland, Oregon."

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
displacy.serve(doc, style="ent")

  "__main__", mod_spec)



Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


На данный момент нет модели для русского языка :(

### Natasha

In [18]:
import natasha
from natasha import Doc, NewsEmbedding, NewsNERTagger, MorphVocab

In [16]:
emb = NewsEmbedding()

ner_tagger = NewsNERTagger(emb)

In [17]:
text = 'В феврале 1974 года Стив Джобс устроился техником в молодую компанию Атари в Лос-Гатосе (Калифорния)'
markup = ner_tagger(text)
markup.print()

В феврале 1974 года Стив Джобс устроился техником в молодую компанию 
                    PER───────                                       
Атари в Лос-Гатосе (Калифорния)
PER──   LOC───────  LOC─────── 


Есть отдельные парсеры для разных типов сущностей, они все являются обертками над парсером Yargy.

In [21]:
morph_vocab = MorphVocab() #обертка для Pymorphy2

morph_vocab('стекло')

[MorphForm(normal='стекло', pos='NOUN', feats={'Animacy': 'Inan', 'Gender': 'Neut', 'Number': 'Sing', 'Case': 'Nom'}),
 MorphForm(normal='стекло', pos='NOUN', feats={'Animacy': 'Inan', 'Gender': 'Neut', 'Number': 'Sing', 'Case': 'Acc'}),
 MorphForm(normal='стечь', pos='VERB', feats={'VerbForm': 'Fin', 'Aspect': 'Perf', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'Mood': 'Ind'})]

Парсер для имен (согласно документации, его лучше применять к спанам текста):

In [23]:
from natasha import NamesExtractor

names_extractor = NamesExtractor(morph_vocab)

text = 'Генеральным директором Pixar является Эд Кэтмелл, а креативный отдел возглавляет Джон Лассетер'
list(names_extractor(text))

[Match(
     start=12,
     stop=22,
     fact=Name(
         first=None,
         last='директором',
         middle=None
     )
 ),
 Match(
     start=38,
     stop=48,
     fact=Name(
         first='Эд',
         last='Кэтмелл',
         middle=None
     )
 ),
 Match(
     start=50,
     stop=51,
     fact=Name(
         first=None,
         last='а',
         middle=None
     )
 ),
 Match(
     start=52,
     stop=62,
     fact=Name(
         first=None,
         last='креативный',
         middle=None
     )
 ),
 Match(
     start=81,
     stop=94,
     fact=Name(
         first='Джон',
         last='Лассетер',
         middle=None
     )
 )]

In [26]:
text = [
    'генеральный директор Эд Кэтмелл',
    'Джон Лассетер',
    'А. С. Пушкин',
    'Лермонтов'
]

for line in text:
    print(names_extractor.find(line))

Match(start=12, stop=20, fact=Name(first=None, last='директор', middle=None))
Match(start=0, stop=13, fact=Name(first='Джон', last='Лассетер', middle=None))
Match(start=0, stop=12, fact=Name(first='А', last='Пушкин', middle='С'))
Match(start=0, stop=9, fact=Name(first=None, last='Лермонтов', middle=None))


Даты:

In [31]:
from natasha import DatesExtractor

dates_extractor = DatesExtractor(morph_vocab)

text = '24.01.2017, 2015 год, 2014 г, 1 апреля, май 2017 г., 9 мая 2017 года'
print(*list(dates_extractor(text)), sep='\n')

Match(start=0, stop=10, fact=Date(year=2017, month=1, day=24))
Match(start=12, stop=20, fact=Date(year=2015, month=None, day=None))
Match(start=22, stop=28, fact=Date(year=2014, month=None, day=None))
Match(start=30, stop=38, fact=Date(year=None, month=4, day=1))
Match(start=40, stop=51, fact=Date(year=2017, month=5, day=None))
Match(start=53, stop=68, fact=Date(year=2017, month=5, day=9))


Деньги:

In [41]:
from natasha import MoneyExtractor

money_extractor = MoneyExtractor(morph_vocab)

text = "$ 23, 100 рублей, 1,565,321 долларов, 34 тыс. евро, тринадцать рублей тридцать две копейки, 13 руб. 32 коп."
print(*list(money_extractor(text)), sep='\n')

Match(start=2, stop=16, fact=Money(amount=23100, currency='RUB'))
Match(start=18, stop=36, fact=Money(amount=1565321, currency='USD'))
Match(start=38, stop=50, fact=Money(amount=34000, currency='EUR'))
Match(start=92, stop=107, fact=Money(amount=13.32, currency='RUB'))


Адреса:

In [49]:
from natasha import AddrExtractor

addr_extractor = AddrExtractor(morph_vocab)

lines = [
    'Россия, Москва, Тверская улица, дом 5, корпус 3',
    '108845, РФ, Приморский край, г. Находка, ул. Добролюбова, 18',
    'поселок Солнченый, ул. Никитская, дом 2'
]
for line in lines:
    print(addr_extractor.find(line), end='\n\n')


Match(start=0, stop=47, fact=Addr(parts=[AddrPart(value='Россия', type='страна'), AddrPart(value='Москва', type=None), AddrPart(value='Тверская', type='улица'), AddrPart(value='5', type='дом'), AddrPart(value='3', type='корпус')]))

Match(start=0, stop=56, fact=Addr(parts=[AddrPart(value='108845', type='индекс'), AddrPart(value='РФ', type='страна'), AddrPart(value='Приморский', type='край'), AddrPart(value='Находка', type='город'), AddrPart(value='Добролюбова', type='улица')]))

Match(start=0, stop=39, fact=Addr(parts=[AddrPart(value='Солнченый', type='посёлок'), AddrPart(value='Никитская', type='улица'), AddrPart(value='2', type='дом')]))



### CNN-biLSTM-CRF

Мы воспользуемся кодом и предобученными моделями отсюда: https://github.com/jayavardhanr/End-to-end-Sequence-Labeling-via-Bi-directional-LSTM-CNNs-CRF-Tutorial

В репозитории воспроизведена архитектура BiLSTM-CNN-CRF, модель обучена на данных соревнования CoNLL 2003 shared task (англоязычный корпус).

В данных размечены четыре типа сущностей:

- (PER) PERSON

- (LOC) LOCATION

- (ORG) ORGANIZATION

- (MISC) MISCELLANEOUS

In [12]:
!git clone https://github.com/jayavardhanr/End-to-end-Sequence-Labeling-via-Bi-directional-LSTM-CNNs-CRF-Tutorial

Cloning into 'End-to-end-Sequence-Labeling-via-Bi-directional-LSTM-CNNs-CRF-Tutorial'...
remote: Enumerating objects: 57, done.[K
remote: Total 57 (delta 0), reused 0 (delta 0), pack-reused 57[K
Unpacking objects: 100% (57/57), done.


Нам понадобятся предобученные вектора слов, воспользуемся векторами glove:

In [None]:
! wget http://nlp.stanford.edu/data/glove.6B.zip && unzip glove.6B.zip

In [13]:
from __future__ import print_function
from collections import OrderedDict

import torch
import torch.nn as nn
from torch.nn import init
from torch.autograd import Variable
from torch import autograd

import time
import _pickle as cPickle

import urllib
import matplotlib.pyplot as plt

import os
import sys
import codecs
import re
import numpy as np

Параметры для модели:

In [29]:

# Experiment configuration shared by the data loading, model and training cells below.
parameters = OrderedDict()
parameters['train'] = "./End-to-end-Sequence-Labeling-via-Bi-directional-LSTM-CNNs-CRF-Tutorial/data/eng.train" # path to the training set
parameters['dev'] = "./End-to-end-Sequence-Labeling-via-Bi-directional-LSTM-CNNs-CRF-Tutorial/data/eng.testa" # path to the development (validation) set
parameters['test'] = "./End-to-end-Sequence-Labeling-via-Bi-directional-LSTM-CNNs-CRF-Tutorial/data/eng.testb" # path to the test set
parameters['tag_scheme'] = "BIOES" # target tagging scheme ("BIOES" is the value used throughout this notebook)
parameters['lower'] = True # lowercase every token before the vocabulary lookup
parameters['zeros'] =  True # replace every digit with 0
parameters['char_dim'] = 30 # character embedding size
parameters['word_dim'] = 100 # word embedding size (GloVe vectors in this notebook)
parameters['word_lstm_dim'] = 200 # hidden size of the word-level LSTM
parameters['word_bidirect'] = True # use a bidirectional LSTM
parameters['embedding_path'] = "glove.6B.100d.txt" # path to the pre-trained word vectors
parameters['all_emb'] = 1 # load all embeddings
parameters['crf'] =1 # use a CRF layer (0 to disable)
parameters['dropout'] = 0.5 # dropout probability
parameters['epoch'] =  50 # number of training epochs
parameters['weights'] = "" # path to pre-trained weights (left empty here)
parameters['name'] = "self-trained-model" # model name, appended to models_path below
parameters['gradient_clip'] = 5.0
parameters['char_mode'] = "CNN"
models_path = "./End-to-end-Sequence-Labeling-via-Bi-directional-LSTM-CNNs-CRF-Tutorial/models/" # directory holding the models

# GPU is used whenever CUDA is available
parameters['use_gpu'] = torch.cuda.is_available() 
use_gpu = parameters['use_gpu']

# Checkpoint that the model-loading cell restores (it is downloaded there if missing).
parameters['reload'] = "./End-to-end-Sequence-Labeling-via-Bi-directional-LSTM-CNNs-CRF-Tutorial/models/pre-trained-model" 

# Artificial tags marking the start and end of a tag sequence for the CRF.
START_TAG = '<START>'
STOP_TAG = '<STOP>'
mapping_file = './data/mapping.pkl'

name = parameters['name']
model_name = models_path + name 

# Make sure the model directory exists before anything is written into it.
if not os.path.exists(models_path):
    os.makedirs(models_path)

Загружаем и предобрабатываем данные: 

In [17]:
def zero_digits(s):
    """
    Return ``s`` with every digit character replaced by "0".

    The pattern is a raw string: in a plain string literal ``\\d`` is an
    invalid escape sequence, which Python reports with a warning.
    """
    return re.sub(r'\d', '0', s)

def load_sentences(path, zeros):
    """
    Read a whitespace-separated, blank-line-delimited corpus file.

    Every non-empty line is split on whitespace into a list of fields (at
    least two are required). Blank lines end the current sentence. Sentences
    whose first field contains "DOCSTART" are dropped.

    Args:
        path: path to a UTF-8 encoded file.
        zeros: if true, each line is passed through ``zero_digits`` before
            it is split.

    Returns:
        A list of sentences; each sentence is a list of field lists.
    """
    sentences = []
    sentence = []
    # The context manager closes the file even when a malformed line trips
    # the assertion below.
    with codecs.open(path, 'r', 'utf8') as handle:
        for line in handle:
            line = line.rstrip()
            if zeros:
                line = zero_digits(line)
            if line:
                word = line.split()
                assert len(word) >= 2
                sentence.append(word)
                continue
            # Blank line: close the sentence collected so far, if any.
            if sentence:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
    # The file may end without a trailing blank line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences

In [18]:
train_sentences = load_sentences(parameters['train'], parameters['zeros'])
test_sentences = load_sentences(parameters['test'], parameters['zeros'])
dev_sentences = load_sentences(parameters['dev'], parameters['zeros'])

In [22]:
train_sentences[0]

[['EU', 'NNP', 'I-NP', 'I-ORG'],
 ['rejects', 'VBZ', 'I-VP', 'O'],
 ['German', 'JJ', 'I-NP', 'I-MISC'],
 ['call', 'NN', 'I-NP', 'O'],
 ['to', 'TO', 'I-VP', 'O'],
 ['boycott', 'VB', 'I-VP', 'O'],
 ['British', 'JJ', 'I-NP', 'I-MISC'],
 ['lamb', 'NN', 'I-NP', 'O'],
 ['.', '.', 'O', 'O']]

Конвертируем BIO разметку в BIOES:

B, I, E - если сущность состоит из более чем одного токена, то у первого токена будет тип B-{TYPE}, у последнего - E-{TYPE}, у всех остальных - I-{TYPE}. 

O - токен не является сущностью

S - токен является сущностью и состоит только из этого токена

Пример BIOES-нотации:

ФКН : B-ORG
НИУ : I-ORG
ВШЭ : E-ORG 
открывает : O 
корпус : O
в : O 
Москве : S-LOC 
. : O 

In [27]:
def iob2(tags):
    """
    Validate an IOB tag list and normalise it to IOB2 in place.

    An "I-X" tag that opens a chunk (first position, after "O", or after a
    tag with a different suffix) is rewritten to "B-X".

    Returns:
        False as soon as a tag other than "O", "B-..." or "I-..." (with
        exactly one hyphen) is met; tags before it may already have been
        rewritten. True otherwise.
    """
    previous = 'O'
    for position, current in enumerate(tags):
        if current != 'O':
            parts = current.split('-')
            if len(parts) != 2 or parts[0] not in ('I', 'B'):
                return False
            if parts[0] == 'I':
                opens_chunk = previous == 'O' or previous[1:] != current[1:]
                if opens_chunk:
                    tags[position] = 'B' + current[1:]
        previous = tags[position]
    return True

def iob_iobes(tags):
    """
    Convert a BIO tag list into a new BIOES tag list.

    A "B-" tag not followed by an "I-" tag becomes "S-"; an "I-" tag not
    followed by an "I-" tag becomes "E-". Only the prefix of the next tag
    is inspected.

    Raises:
        Exception: if a tag is neither "O" nor prefixed with "B" or "I".
    """
    converted = []
    last = len(tags) - 1
    for position, tag in enumerate(tags):
        if tag == 'O':
            converted.append(tag)
            continue
        prefix = tag.split('-')[0]
        if prefix not in ('B', 'I'):
            raise Exception('Invalid IOB format!')
        continues = position < last and tags[position + 1].split('-')[0] == 'I'
        if continues:
            converted.append(tag)
        elif prefix == 'B':
            converted.append(tag.replace('B-', 'S-'))
        else:
            converted.append(tag.replace('I-', 'E-'))
    return converted

def update_tag_scheme(sentences, tag_scheme):
    """
    Validate the tags of every sentence and rewrite them in place.

    Args:
        sentences: list of sentences; each sentence is a list of token rows
            whose last element is the tag. The rows are mutated.
        tag_scheme: 'BIOES' converts the tags with ``iob_iobes``; 'BIO'
            keeps the IOB2-normalised tags produced by ``iob2``.

    Raises:
        Exception: if a sentence is not valid IOB, or if ``tag_scheme`` is
            neither 'BIO' nor 'BIOES'.
    """
    for i, s in enumerate(sentences):
        tags = [w[-1] for w in s]
        # iob2 also normalises `tags` to IOB2 as a side effect.
        if not iob2(tags):
            s_str = '\n'.join(' '.join(w) for w in s)
            raise Exception('Sentences should be given in BIO format! ' +
                            'Please check sentence %i:\n%s' % (i, s_str))
        if tag_scheme == 'BIOES':
            new_tags = iob_iobes(tags)
        elif tag_scheme == 'BIO':
            new_tags = tags
        else:
            raise Exception('Wrong tagging scheme!')
        for word, new_tag in zip(s, new_tags):
            word[-1] = new_tag

In [30]:
update_tag_scheme(train_sentences, parameters['tag_scheme'])
update_tag_scheme(dev_sentences, parameters['tag_scheme'])
update_tag_scheme(test_sentences, parameters['tag_scheme'])

Создадим вспомогательные словари (маппинги)

слово -> ID

символ -> ID

тег -> ID

In [34]:
def create_dico(item_list):
    """
    Count how often each item occurs across a list of iterables.

    Returns a plain dict mapping item -> number of occurrences.
    """
    assert type(item_list) is list
    counts = {}
    for group in item_list:
        for element in group:
            counts[element] = counts.get(element, 0) + 1
    return counts

def create_mapping(dico):
    """
    Build item <-> id mappings from a frequency dict.

    Ids are assigned by decreasing frequency; ties are broken by the item's
    own ordering. Returns ``(item_to_id, id_to_item)``.
    """
    ranked = sorted(dico, key=lambda item: (-dico[item], item))
    id_to_item = dict(enumerate(ranked))
    item_to_id = {item: idx for idx, item in enumerate(ranked)}
    return item_to_id, id_to_item

def word_mapping(sentences, lower):
    """
    Build the word vocabulary, ordered by frequency.

    The first field of every token row is taken as the word (lowercased
    when ``lower`` is true). '<UNK>' is added with a very large count so
    that it is ranked first.

    Returns ``(dico, word_to_id, id_to_word)``.
    """
    words = []
    for s in sentences:
        tokens = [row[0] for row in s]
        if lower:
            tokens = [token.lower() for token in tokens]
        words.append(tokens)
    dico = create_dico(words)
    # Placeholder entry for out-of-vocabulary words.
    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(dico)
    print(f"{len(dico)} уникальных слов")
    return dico, word_to_id, id_to_word

def char_mapping(sentences):
    """
    Build the character vocabulary, ordered by frequency.

    The words of each sentence (first field of every row) are joined into
    one string, and the characters of those strings are counted.

    Returns ``(dico, char_to_id, id_to_char)``.
    """
    chars = []
    for s in sentences:
        chars.append("".join(row[0] for row in s))
    dico = create_dico(chars)
    char_to_id, id_to_char = create_mapping(dico)
    print(f"{len(dico)} уникальных символов" )
    return dico, char_to_id, id_to_char

def tag_mapping(sentences):
    """
    Build the tag vocabulary, ordered by frequency.

    The last field of every token row is taken as the tag. START_TAG and
    STOP_TAG are added with negative counts, so they are ranked after every
    real tag.

    Returns ``(dico, tag_to_id, id_to_tag)``.
    """
    tags = []
    for s in sentences:
        tags.append([row[-1] for row in s])
    dico = create_dico(tags)
    dico[START_TAG] = -1
    dico[STOP_TAG] = -2
    tag_to_id, id_to_tag = create_mapping(dico)
    print(f"{len(dico)} уникальных тегов сущностей")
    return dico, tag_to_id, id_to_tag

In [35]:
dico_words,word_to_id,id_to_word = word_mapping(train_sentences, parameters['lower'])
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

17493 уникальных слов
75 уникальных символов
19 уникальных тегов сущностей


Преобразуем наши данные в соответствии с этим маппингом:

In [36]:
def lower_case(x,lower=False):
    """Return ``x.lower()`` when ``lower`` is true, otherwise ``x`` unchanged."""
    return x.lower() if lower else x

In [37]:
def prepare_dataset(sentences, word_to_id, char_to_id, tag_to_id, lower=False):
    """
    Convert sentences into index-based training examples.

    For every sentence a dict is produced with:
        'str_words': the original words (first field of each row),
        'words': word ids, with '<UNK>' for words missing from ``word_to_id``
            (the lookup uses ``lower_case(word, lower)``),
        'chars': per-word lists of character ids; characters missing from
            ``char_to_id`` are skipped,
        'tags': tag ids taken from the last field of each row.
    """
    data = []
    for s in sentences:
        str_words, words, chars, tags = [], [], [], []
        for row in s:
            token = row[0]
            key = lower_case(token, lower)
            if key not in word_to_id:
                key = '<UNK>'
            str_words.append(token)
            words.append(word_to_id[key])
            chars.append([char_to_id[c] for c in token if c in char_to_id])
            tags.append(tag_to_id[row[-1]])
        data.append({
            'str_words': str_words,
            'words': words,
            'chars': chars,
            'tags': tags,
        })
    return data

train_data = prepare_dataset(
    train_sentences, word_to_id, char_to_id, tag_to_id, parameters['lower']
)
dev_data = prepare_dataset(
    dev_sentences, word_to_id, char_to_id, tag_to_id, parameters['lower']
)
test_data = prepare_dataset(
    test_sentences, word_to_id, char_to_id, tag_to_id, parameters['lower']
)
print(f"{len(train_data)} / {len(dev_data)} / {len(test_data)} предложений в train / dev / test выборках")


14041 / 3250 / 3453 предложений в train / dev / test выборках


Загружаем векторы слов GloVe:

In [38]:
# Read the pre-trained vectors: one "<word> <v1> ... <vN>" entry per line.
all_word_embeds = {}
with codecs.open(parameters['embedding_path'], 'r', 'utf-8') as embedding_file:
    for line in embedding_file:
        s = line.strip().split()
        # Skip lines whose field count does not match the configured dimension.
        if len(s) == parameters['word_dim'] + 1:
            all_word_embeds[s[0]] = np.array([float(value) for value in s[1:]])

# Words without a pre-trained vector keep this random initialisation.
word_embeds = np.random.uniform(-np.sqrt(0.06), np.sqrt(0.06), (len(word_to_id), parameters['word_dim']))

for w in word_to_id:
    if w in all_word_embeds:
        word_embeds[word_to_id[w]] = all_word_embeds[w]
    elif w.lower() in all_word_embeds:
        word_embeds[word_to_id[w]] = all_word_embeds[w.lower()]

print(f'Загрузили {len(all_word_embeds)} векторов слов')

Загрузили 400000 векторов слов


**Модель**

Для инициализации слоя эмбеддингов используем равномерное распределение на отрезке от $-\sqrt{\frac{3}{V}}$ до $+\sqrt{\frac{3}{V}}$, где $V$ — размерность пространства эмбеддингов.

In [39]:
def init_embedding(input_embedding):
    """
    Fill an embedding weight tensor in place with uniform values.

    Values are drawn from U(-b, b) with b = sqrt(3 / V), where V is the
    size of the tensor's second dimension.
    """
    bias = np.sqrt(3.0 / input_embedding.size(1))
    # `uniform_` is the in-place initialiser; `nn.init.uniform` is deprecated.
    nn.init.uniform_(input_embedding, -bias, bias)

In [40]:
def init_linear(input_linear):
    """
    Initialise a linear layer in place.

    The weight is drawn from U(-b, b) with b = sqrt(6 / (rows + cols)) of
    the weight matrix; the bias, when the layer has one, is set to zero.
    """
    bias = np.sqrt(6.0 / (input_linear.weight.size(0) + input_linear.weight.size(1)))
    # `uniform_` is the in-place initialiser; `nn.init.uniform` is deprecated.
    nn.init.uniform_(input_linear.weight, -bias, bias)
    if input_linear.bias is not None:
        input_linear.bias.data.zero_()

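Для инициализации весов LSTM используем равномерное распределение на отрезке от $-\sqrt{\frac{6}{r+c}}$ до $+\sqrt{\frac{6}{r+c}}$, где $r$ и $c$ — число строк и столбцов матрицы весов (в коде ниже число строк делится на 4, так как в одной матрице хранятся веса четырёх гейтов LSTM).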

In [41]:
def init_lstm(input_lstm):
    """
    Initialise every weight and bias of an ``nn.LSTM`` in place.

    Weights (input-hidden and hidden-hidden, every layer and direction) are
    drawn from U(-b, b) with b = sqrt(6 / (rows / 4 + cols)). When the
    module has biases, each bias vector is zeroed and the slice
    [hidden_size, 2 * hidden_size) is then set to 1; per the PyTorch LSTM
    parameter layout that slice is the forget gate.

    Parameters are visited in the same order as before (forward direction
    for all layers, then reverse), so results under a fixed seed do not
    change.
    """
    suffixes = ['', '_reverse'] if input_lstm.bidirectional else ['']
    layers = range(input_lstm.num_layers)

    for suffix in suffixes:
        for ind in layers:
            for kind in ('weight_ih', 'weight_hh'):
                # getattr replaces the former eval()-based attribute lookup.
                weight = getattr(input_lstm, f'{kind}_l{ind}{suffix}')
                sampling_range = np.sqrt(6.0 / (weight.size(0) / 4 + weight.size(1)))
                nn.init.uniform_(weight, -sampling_range, sampling_range)

    if input_lstm.bias:
        hidden = input_lstm.hidden_size
        for suffix in suffixes:
            for ind in layers:
                for kind in ('bias_ih', 'bias_hh'):
                    bias = getattr(input_lstm, f'{kind}_l{ind}{suffix}')
                    bias.data.zero_()
                    bias.data[hidden: 2 * hidden] = 1

**CRF**

Пусть $x$ - входная последовательность токенов, $y$ - последовательность тегов, тогда условная вероятность

$$P(y|x) = \frac{\exp(\text{Score}(x, y))}{\sum_{y'} \exp(\text{Score}(x, y'))}$$

где 

$$\text{Score}(x,y) = \sum_i \log \psi_i(x,y)$$

Вспомогательные функции:

In [44]:
def log_sum_exp(vec):
    """
    Compute log(sum(exp(vec))) with the maximum subtracted first.

    ``vec`` is indexed as ``vec[0, k]``, i.e. it is used as a tensor with a
    single row.
    """
    peak = vec[0, argmax(vec)]
    shifted = vec - peak.view(1, -1).expand(1, vec.size(1))
    return peak + torch.log(torch.exp(shifted).sum())
    
def argmax(vec):
    """Return, as a Python scalar, the index of the maximum along dim 1 (first row)."""
    indices = torch.max(vec, 1)[1]
    return to_scalar(indices)

def to_scalar(var):
    """Return the first element of ``var`` (flattened) as a Python number."""
    flat = var.data.view(-1)
    return flat[0].item()


def score_sentences(self, feats, tags):
    """
    Score a given tag sequence under the CRF.

    The score is the sum of ``self.transitions[next_tag, previous_tag]``
    over the path START -> tags -> STOP plus the sum of ``feats[i, tags[i]]``
    over all positions.

    ``tags`` is concatenated with freshly built LongTensors, so it is
    expected to be a LongTensor on the device selected by ``self.use_gpu``.
    """
    start_id = self.tag_to_ix[START_TAG]
    stop_id = self.tag_to_ix[STOP_TAG]

    positions = torch.LongTensor(range(feats.size()[0]))
    if self.use_gpu:
        positions = positions.cuda()
        long_tensor = torch.cuda.LongTensor
    else:
        long_tensor = torch.LongTensor

    # previous_tags[k] precedes next_tags[k] on the padded path.
    previous_tags = torch.cat([long_tensor([start_id]), tags])
    next_tags = torch.cat([tags, long_tensor([stop_id])])

    transition_score = torch.sum(self.transitions[next_tags, previous_tags])
    emission_score = torch.sum(feats[positions, tags])
    return transition_score + emission_score

Ниже реализация алгоритма прямого прохода и алгоритма Витерби для CRF. Алгоритм Витерби - это динамический алгоритм для поиска наиболее вероятной последовательности состояний (в нашем случае - тегов последовательности). 
 

$$
\begin{align*}
\tilde{s}_t(y_t) &= \operatorname{argmax}_{y_t, \ldots, y_m} C(y_t, \ldots, y_m)\\
            &= \operatorname{argmax}_{y_{t+1}} s_t [y_t] + T[y_{t}, y_{t+1}] + \tilde{s}_{t+1}(y_{t+1})
\end{align*}
$$

Вероятность последовательности тегов:

$$ \mathbb{P}(y_1, \ldots, y_m) = \frac{e^{C(y_1, \ldots, y_m)}}{Z} $$

In [45]:
def forward_alg(self, feats):
    """
    CRF forward pass: accumulate log-sum-exp path scores over ``feats``.

    Iterates over ``feats`` one step at a time and returns the value of
    ``log_sum_exp`` over the final scores plus the transition scores into
    STOP_TAG (the log of the normaliser in the CRF probability).

    Assumes ``feats`` is (sequence length, tagset_size) and
    ``self.transitions`` is (tagset_size, tagset_size) indexed
    [next_tag, previous_tag] -- TODO confirm against the model class.
    """
    
    # Every tag starts at -10000 (effectively impossible) except START_TAG.
    init_alphas = torch.Tensor(1, self.tagset_size).fill_(-10000.)
    init_alphas[0][self.tag_to_ix[START_TAG]] = 0.
    
    forward_var = autograd.Variable(init_alphas)
    if self.use_gpu:
        forward_var = forward_var.cuda()
        
    for feat in feats:
        # Column vector, so the emission score is broadcast across each row.
        emit_score = feat.view(-1, 1)
        # Row-vector forward_var + square transitions + column emit_score.
        tag_var = forward_var + self.transitions + emit_score
        # Subtract the row maximum before exp() for numerical stability.
        max_tag_var, _ = torch.max(tag_var, dim=1)
        tag_var = tag_var - max_tag_var.view(-1, 1)
        # Back to a single-row tensor for the next step.
        forward_var = max_tag_var + torch.log(torch.sum(torch.exp(tag_var), dim=1)).view(1, -1)
    terminal_var = (forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]).view(1, -1)
    alpha = log_sum_exp(terminal_var)
    return alpha

In [46]:
def viterbi_algo(self, feats):
    """
    Viterbi decoding: find the highest-scoring tag path for ``feats``.

    Returns:
        (path_score, best_path): the score of the best final tag and the
        list of tag ids, one per step of ``feats``.

    Assumes ``feats`` is (sequence length, tagset_size) -- TODO confirm.
    The per-step scores go through numpy, so no gradient flows through
    this function.
    """
    # backpointers[t][tag] = best previous tag for `tag` at step t.
    backpointers = []

    # Every tag starts at -10000 (effectively impossible) except START_TAG.
    init_vvars = torch.Tensor(1, self.tagset_size).fill_(-10000.)
    init_vvars[0][self.tag_to_ix[START_TAG]] = 0
    
    forward_var = Variable(init_vvars)
    if self.use_gpu:
        forward_var = forward_var.cuda()
    for feat in feats:
        # Entry [i, j]: score of being in tag j so far, then moving to tag i.
        next_tag_var = forward_var.view(1, -1).expand(self.tagset_size, self.tagset_size) + self.transitions
        _, bptrs_t = torch.max(next_tag_var, dim=1)
        bptrs_t = bptrs_t.squeeze().data.cpu().numpy() 
        next_tag_var = next_tag_var.data.cpu().numpy() 
        # Pick, for every tag, the score of its best predecessor.
        viterbivars_t = next_tag_var[range(len(bptrs_t)), bptrs_t] 
        viterbivars_t = Variable(torch.FloatTensor(viterbivars_t))
        if self.use_gpu:
            viterbivars_t = viterbivars_t.cuda()

        # Emission scores are added after the maximisation over predecessors.
        forward_var = viterbivars_t + feat
        backpointers.append(bptrs_t)

    terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
    # START and STOP can never be the last real tag of a path.
    terminal_var.data[self.tag_to_ix[STOP_TAG]] = -10000.
    terminal_var.data[self.tag_to_ix[START_TAG]] = -10000.
    best_tag_id = argmax(terminal_var.unsqueeze(0))
    path_score = terminal_var[best_tag_id]
    
    # Walk the backpointers from the last step to the first.
    best_path = [best_tag_id]
    for bptrs_t in reversed(backpointers):
        best_tag_id = bptrs_t[best_tag_id]
        best_path.append(best_tag_id)

    # The last element collected is the tag preceding the first step; it is
    # discarded (presumably START_TAG -- it is not checked here).
    start = best_path.pop()
    best_path.reverse()
    return path_score, best_path

In [47]:
def forward_calc(self, sentence, chars, chars2_length, d):
    """
    Predict tags for one sentence.

    Computes the per-token tag scores with ``self._get_lstm_features`` and
    decodes them: with ``self.viterbi_decode`` when ``self.use_crf`` is
    true, otherwise by taking the maximum over dim 1.

    Returns ``(score, tag_seq)``.
    """
    emissions = self._get_lstm_features(sentence, chars, chars2_length, d)
    if not self.use_crf:
        best_scores, best_tags = torch.max(emissions, 1)
        return best_scores, list(best_tags.cpu().data)
    score, tag_seq = self.viterbi_decode(emissions)
    return score, tag_seq


Теперь можем переходить к непосредственно модели:

1. На вход модели подаются токены; из их символов с помощью CNN получаем символьные эмбеддинги.
2. Конкатенируем символьные эмбеддинги с glove векторами слов, подаем на вход BiLSTM.
3. Выход BiLSTM слоя пропускаем через линейный слой, из пространства выходных векторов BiLSTM получаем вектор пространства тегов.

In [48]:
def get_lstm_features(self, sentence, chars2, chars2_length, d):
    """
    Compute per-token tag scores for one sentence.

    Character-level representations (from a char LSTM or a char CNN,
    depending on ``self.char_mode``) are concatenated with the word
    embeddings, passed through dropout, the word-level LSTM, dropout again
    and finally the ``hidden2tag`` linear layer.

    Args:
        sentence: word ids; its length fixes the first dimension of the output.
        chars2: character ids per word (presumably padded to a common
            length -- TODO confirm against the caller).
        chars2_length: word lengths, used only to pack sequences in 'LSTM' mode.
        d: index map used only in 'LSTM' mode to reorder rows (presumably
            from length-sorted order back to sentence order -- TODO confirm).

    Returns:
        The output of ``self.hidden2tag``, one row per token.
    """
    
    if self.char_mode == 'LSTM':
        
            chars_embeds = self.char_embeds(chars2).transpose(0, 1)
            
            packed = torch.nn.utils.rnn.pack_padded_sequence(chars_embeds, chars2_length)
            
            lstm_out, _ = self.char_lstm(packed)
            
            outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(lstm_out)
            
            outputs = outputs.transpose(0, 1)
            
            chars_embeds_temp = Variable(torch.FloatTensor(torch.zeros((outputs.size(0), outputs.size(2)))))
            
            if self.use_gpu:
                chars_embeds_temp = chars_embeds_temp.cuda()
            
            # For each word: first char_lstm_dim features at its last valid
            # step, joined with the remaining features at step 0.
            # self.char_lstm_dim must be set on the instance by the constructor.
            for i, index in enumerate(output_lengths):
                chars_embeds_temp[i] = torch.cat((outputs[i, index-1, :self.char_lstm_dim], outputs[i, 0, self.char_lstm_dim:]))
            
            chars_embeds = chars_embeds_temp.clone()
            
            # Move row i to position d[i].
            for i in range(chars_embeds.size(0)):
                chars_embeds[d[i]] = chars_embeds_temp[i]
    
    
    if self.char_mode == 'CNN':
        # Add a channel dimension for Conv2d.
        chars_embeds = self.char_embeds(chars2).unsqueeze(1)

        chars_cnn_out3 = self.char_cnn3(chars_embeds)
        # Max-pool over the whole of dim 2, leaving one vector of
        # out_channels values per word.
        chars_embeds = nn.functional.max_pool2d(chars_cnn_out3,
                                             kernel_size=(chars_cnn_out3.size(2), 1)).view(chars_cnn_out3.size(0), self.out_channels)

    embeds = self.word_embeds(sentence)

    # Join word and character representations along the feature dimension.
    embeds = torch.cat((embeds, chars_embeds), 1)

    # Insert a dimension of size 1 at position 1 before the LSTM.
    embeds = embeds.unsqueeze(1)

    embeds = self.dropout(embeds)

    lstm_out, _ = self.lstm(embeds)

    # Drop the inserted dimension: one row of hidden_dim*2 features per token.
    lstm_out = lstm_out.view(len(sentence), self.hidden_dim*2)

    lstm_out = self.dropout(lstm_out)

    lstm_feats = self.hidden2tag(lstm_out)
    
    return lstm_feats

In [49]:
def get_neg_log_likelihood(self, sentence, tags, chars2, chars2_length, d):
    """
    Compute the training loss for one sentence.

    With ``self.use_crf`` the loss is ``self._forward_alg(feats)`` minus
    ``self._score_sentence(feats, tags)``; otherwise it is the
    cross-entropy between the tag scores and ``tags``.
    """
    emissions = self._get_lstm_features(sentence, chars2, chars2_length, d)

    if not self.use_crf:
        return nn.functional.cross_entropy(emissions, Variable(tags))

    log_partition = self._forward_alg(emissions)
    gold_path_score = self._score_sentence(emissions, tags)
    return log_partition - gold_path_score

In [50]:
class BiLSTM_CRF(nn.Module):
    """
    Sequence tagger: character CNN/LSTM + word embeddings -> BiLSTM -> linear
    layer, optionally followed by a CRF transition matrix.

    The forward pass, loss and CRF routines are module-level functions
    attached as methods at the bottom of the class body.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim,
                 char_to_ix=None, pre_word_embeds=None, char_out_dimension=25,
                 char_embedding_dim=25, use_gpu=False, use_crf=True,
                 char_mode='CNN', char_lstm_dim=25):
        '''
        Args:
            vocab_size: number of words in the vocabulary.
            tag_to_ix: dict mapping tag -> id (must contain START_TAG and
                STOP_TAG when use_crf is true).
            embedding_dim: word embedding size.
            hidden_dim: hidden size of the word-level LSTM (per direction).
            char_to_ix: dict mapping character -> id.
            pre_word_embeds: pre-trained word embedding matrix used as the
                initial weight of the word embedding layer, or None.
            char_out_dimension: number of output channels of the char CNN.
            char_embedding_dim: character embedding size.
            use_gpu: whether tensors created inside the model go to CUDA.
            use_crf: whether to create the CRF transition matrix.
            char_mode: 'CNN' or 'LSTM' character encoder.
            char_lstm_dim: hidden size (per direction) of the char LSTM;
                used only when char_mode == 'LSTM'. It used to be an
                undefined name, which made 'LSTM' mode fail with NameError.
        '''

        super().__init__()

        self.use_gpu = use_gpu
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.use_crf = use_crf
        self.tagset_size = len(tag_to_ix)
        self.out_channels = char_out_dimension
        self.char_mode = char_mode
        # Read by get_lstm_features when char_mode == 'LSTM'.
        self.char_lstm_dim = char_lstm_dim

        if char_embedding_dim is not None:
            self.char_embedding_dim = char_embedding_dim

            self.char_embeds = nn.Embedding(len(char_to_ix), char_embedding_dim)
            init_embedding(self.char_embeds.weight)

            if self.char_mode == 'LSTM':
                self.char_lstm = nn.LSTM(char_embedding_dim, char_lstm_dim, num_layers=1, bidirectional=True)
                init_lstm(self.char_lstm)

            if self.char_mode == 'CNN':
                # Kernel spans 3 characters and the full embedding width.
                self.char_cnn3 = nn.Conv2d(in_channels=1, out_channels=self.out_channels, kernel_size=(3, char_embedding_dim), padding=(2,0))

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        if pre_word_embeds is not None:
            self.pre_word_embeds = True
            self.word_embeds.weight = nn.Parameter(torch.FloatTensor(pre_word_embeds))
        else:
            self.pre_word_embeds = False

        # Dropout probability comes from the notebook-level configuration.
        self.dropout = nn.Dropout(parameters['dropout'])

        # The word LSTM input is the word embedding plus the char representation.
        if self.char_mode == 'LSTM':
            self.lstm = nn.LSTM(embedding_dim+char_lstm_dim*2, hidden_dim, bidirectional=True)
        if self.char_mode == 'CNN':
            self.lstm = nn.LSTM(embedding_dim+self.out_channels, hidden_dim, bidirectional=True)

        init_lstm(self.lstm)

        self.hidden2tag = nn.Linear(hidden_dim*2, self.tagset_size)

        init_linear(self.hidden2tag)

        if self.use_crf:
            self.transitions = nn.Parameter(
                torch.zeros(self.tagset_size, self.tagset_size))

            # The START row and the STOP column are set to -10000, which
            # makes those transitions effectively impossible.
            self.transitions.data[tag_to_ix[START_TAG], :] = -10000
            self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

    _score_sentence = score_sentences
    _get_lstm_features = get_lstm_features
    _forward_alg = forward_alg
    viterbi_decode = viterbi_algo
    neg_log_likelihood = get_neg_log_likelihood
    forward = forward_calc

In [53]:
model = BiLSTM_CRF(vocab_size=len(word_to_id),
                   tag_to_ix=tag_to_id,
                   embedding_dim=parameters['word_dim'],
                   hidden_dim=parameters['word_lstm_dim'],
                   use_gpu=use_gpu,
                   char_to_ix=char_to_id,
                   pre_word_embeds=word_embeds,
                   use_crf=parameters['crf'],
                   char_mode=parameters['char_mode'])

  This is separate from the ipykernel package so we can avoid doing imports until
  if __name__ == '__main__':
  del sys.path[0]
  after removing the cwd from sys.path.


Загружаем предобученную модель:

In [54]:
# `import urllib` alone does not load the `request` submodule; import it
# explicitly so urlretrieve is always available.
import urllib.request

if parameters['reload']:
    # Download the checkpoint only when it is not already on disk.
    if not os.path.exists(parameters['reload']):
        model_url="https://github.com/TheAnig/NER-LSTM-CNN-Pytorch/raw/master/trained-model-cpu"
        urllib.request.urlretrieve(model_url, parameters['reload'])
    # map_location keeps loading robust on CPU-only machines; the weights
    # are copied into the model's own parameters by load_state_dict.
    model.load_state_dict(torch.load(parameters['reload'], map_location='cpu'))
    print("модель загружена:", parameters['reload'])

if use_gpu:
    model.cuda()

модель загружена: ./End-to-end-Sequence-Labeling-via-Bi-directional-LSTM-CNNs-CRF-Tutorial/models/pre-trained-model


In [55]:
# Optimiser settings: SGD with momentum over all model parameters.
learning_rate = 0.015
momentum = 0.9
number_of_epochs = parameters['epoch'] 
decay_rate = 0.05  # presumably the learning-rate decay used by the training loop -- TODO confirm
gradient_clip = parameters['gradient_clip']
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

# Bookkeeping variables (presumably consumed by the training loop further down).
losses = [] 
loss = 0.0 
best_dev_F = -1.0 
best_test_F = -1.0 
best_train_F = -1.0
all_F = [[0, 0, 0]]
eval_every = len(train_data)  # number of training sentences, i.e. one full pass
plot_every = 2000 
count = 0 

Еще немного вспомогательных функций:

In [56]:
def get_chunk_type(tok, idx_to_tag):
    """
    Split the tag name of ``tok`` into its class and its type.

    Example: a tag named "B-PER" gives ("B", "PER"); a tag without a
    hyphen, such as "O", gives ("O", "O").
    """
    pieces = idx_to_tag[tok].split('-')
    return pieces[0], pieces[-1]

In [57]:
def get_chunks(seq, tags):
    """
    Group a sequence of tag ids into entity spans.

    Args:
        seq: sequence of tag ids, e.g. [4, 4, 0, 0, ...]
        tags: mapping from tag name to tag id; must contain the key "O".

    Returns:
        list of (chunk_type, chunk_start, chunk_end) tuples, where
        chunk_end is exclusive.

    Example:
        seq = [4, 5, 0, 3]
        tags = {"O": 0, "B-PER": 4, "I-PER": 5, "B-LOC": 3}
        result = [("PER", 0, 2), ("LOC", 3, 4)]
    """
    outside = tags["O"]
    id_to_name = {tag_id: name for name, tag_id in tags.items()}

    found = []
    open_type, open_start = None, None

    for pos, tag_id in enumerate(seq):
        if tag_id == outside:
            # An "O" tag closes whatever span is currently open.
            if open_type is not None:
                found.append((open_type, open_start, pos))
                open_type, open_start = None, None
            continue

        prefix, entity = get_chunk_type(tag_id, id_to_name)
        if open_type is None:
            # First entity token after "O" (or at the start of the sequence).
            open_type, open_start = entity, pos
        elif entity != open_type or prefix == "B":
            # Different entity type or an explicit "B": close the open span
            # and start a new one at this position.
            found.append((open_type, open_start, pos))
            open_type, open_start = entity, pos

    # A span still open at the end runs to the end of the sequence.
    if open_type is not None:
        found.append((open_type, open_start, len(seq)))

    return found

In [58]:
def _build_char_inputs(chars2, char_mode):
    """
    Pad the per-word character ids of one sentence into a LongTensor.

    Args:
        chars2: list with one list of character ids per word.
        char_mode: 'LSTM' or 'CNN'.

    Returns:
        (chars2_mask, chars2_length, d):
        chars2_mask is a zero-padded LongTensor with one row per word;
        chars2_length lists the unpadded row lengths. In 'LSTM' mode the
        rows are sorted by decreasing length and d maps each sorted row
        index to the word's original index; in 'CNN' mode the rows keep
        their order and d is empty.

    Raises:
        ValueError: if char_mode is neither 'LSTM' nor 'CNN'.
    """
    if char_mode == 'LSTM':
        rows = sorted(chars2, key=len, reverse=True)
        d = {}
        for i, ci in enumerate(chars2):
            for j, cj in enumerate(rows):
                if ci == cj and j not in d and i not in d.values():
                    d[j] = i
                    # i is now assigned, so no later j can match it.
                    break
    elif char_mode == 'CNN':
        rows = chars2
        d = {}
    else:
        raise ValueError("unsupported char_mode: {!r}".format(char_mode))

    chars2_length = [len(c) for c in rows]
    chars2_mask = np.zeros((len(rows), max(chars2_length)), dtype='int')
    for i, c in enumerate(rows):
        chars2_mask[i, :chars2_length[i]] = c
    return Variable(torch.LongTensor(chars2_mask)), chars2_length, d


def evaluating(model, datas, best_F,dataset="Train"):
    '''
    Compute the chunk-level F1 score of ``model`` on ``datas``.

    Args:
        model: callable returning a pair whose second item is the predicted
            tag-id sequence for one sentence.
        datas: iterable of dicts with the keys 'words', 'chars' and 'tags'.
        best_F: best F1 score seen so far for this dataset.
        dataset: label used in the printed report.

    Returns:
        (best_F, new_F, save): the possibly updated best score, the score
        from this call, and True when this call improved on ``best_F``.

    Reads the module-level ``parameters``, ``use_gpu`` and ``tag_to_id``.
    '''
    save = False
    correct_preds, total_correct, total_preds = 0., 0., 0.

    # Evaluation never back-propagates, so skip autograd bookkeeping.
    with torch.no_grad():
        for data in datas:
            chars2_mask, chars2_length, d = _build_char_inputs(
                data['chars'], parameters['char_mode'])
            dwords = Variable(torch.LongTensor(data['words']))

            if use_gpu:
                _, predicted_id = model(dwords.cuda(), chars2_mask.cuda(), chars2_length, d)
            else:
                _, predicted_id = model(dwords, chars2_mask, chars2_length, d)

            lab_chunks = set(get_chunks(data['tags'], tag_to_id))
            lab_pred_chunks = set(get_chunks(predicted_id, tag_to_id))

            correct_preds += len(lab_chunks & lab_pred_chunks)
            total_preds   += len(lab_pred_chunks)
            total_correct += len(lab_chunks)

    # F1 score; each ratio is guarded so an all-miss run scores 0 instead
    # of dividing by zero.
    p   = correct_preds / total_preds if correct_preds > 0 else 0
    r   = correct_preds / total_correct if correct_preds > 0 else 0
    new_F  = 2 * p * r / (p + r) if correct_preds > 0 else 0

    print("{}: new_F: {} best_F: {} ".format(dataset,new_F,best_F))

    if new_F>best_F:
        best_F=new_F
        save=True

    return best_F, new_F, save

In [59]:
# learning rate decay

def adjust_learning_rate(optimizer, lr):
    """Set the learning rate of every parameter group of ``optimizer`` to ``lr``."""
    for group in optimizer.param_groups:
        group.update(lr=lr)

**Обучение модели.**

Мы загрузим предобученную модель, а код ниже позволит обучить модель с нуля и сохранить.

In [60]:
# Uncomment the next line to train from scratch instead of using the
# pre-trained checkpoint.
#parameters['reload']=False

if not parameters['reload']:
    tr = time.time()
    model.train(True)
    # NOTE(review): range(1, N) performs N - 1 passes over the data; confirm
    # that this is the intended number of epochs.
    for epoch in range(1,number_of_epochs):
        # Visit the training sentences in a fresh random order each epoch.
        for index in np.random.permutation(len(train_data)):
            count += 1
            data = train_data[index]

            model.zero_grad()

            sentence_in = Variable(torch.LongTensor(data['words']))
            tags = data['tags']
            chars2 = data['chars']

            if parameters['char_mode'] == 'LSTM':
                # Sort words by decreasing length and remember, for every
                # sorted position j, the original word position in d.
                chars2_sorted = sorted(chars2, key=lambda p: len(p), reverse=True)
                d = {}
                for word_idx, ci in enumerate(chars2):
                    for j, cj in enumerate(chars2_sorted):
                        if ci == cj and j not in d and word_idx not in d.values():
                            d[j] = word_idx
                            # word_idx is assigned; no later j can match it.
                            break
                chars2_length = [len(c) for c in chars2_sorted]
                char_maxl = max(chars2_length)
                chars2_mask = np.zeros((len(chars2_sorted), char_maxl), dtype='int')
                for row, c in enumerate(chars2_sorted):
                    chars2_mask[row, :chars2_length[row]] = c
                chars2_mask = Variable(torch.LongTensor(chars2_mask))

            if parameters['char_mode'] == 'CNN':
                # Words keep their order, so the index map stays empty.
                d = {}

                chars2_length = [len(c) for c in chars2]
                char_maxl = max(chars2_length)
                chars2_mask = np.zeros((len(chars2_length), char_maxl), dtype='int')
                for row, c in enumerate(chars2):
                    chars2_mask[row, :chars2_length[row]] = c
                chars2_mask = Variable(torch.LongTensor(chars2_mask))

            targets = torch.LongTensor(tags)

            if use_gpu:
                neg_log_likelihood = model.neg_log_likelihood(sentence_in.cuda(), targets.cuda(), chars2_mask.cuda(), chars2_length, d)
            else:
                neg_log_likelihood = model.neg_log_likelihood(sentence_in, targets, chars2_mask, chars2_length, d)
            # .item() reads a one-element tensor of any shape, whereas
            # .data[0] fails on 0-dim tensors in current PyTorch.
            loss += neg_log_likelihood.item() / len(data['words'])
            neg_log_likelihood.backward()

            # clip_grad_norm_ is the in-place successor of the deprecated
            # clip_grad_norm.
            torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)
            optimizer.step()

            if count % plot_every == 0:
                loss /= plot_every
                print(count, ': ', loss)
                # The first averaged value is stored twice.
                if not losses:
                    losses.append(loss)
                losses.append(loss)
                loss = 0.0

            # Evaluate every 4th epoch during the first 20 epochs, then every epoch.
            # NOTE(review): neither branch fires at exactly
            # count == eval_every * 20; confirm whether that gap is intended.
            if (count % eval_every == 0 and count > eval_every * 20) or \
                    (count % (eval_every * 4) == 0 and count < eval_every * 20):
                model.train(False)
                best_train_F, new_train_F, _ = evaluating(model, train_data, best_train_F,"Train")
                best_dev_F, new_dev_F, save = evaluating(model, dev_data, best_dev_F,"Dev")
                if save:
                    # Keep the weights that score best on the dev set.
                    print("Saving Model to ", model_name)
                    torch.save(model.state_dict(), model_name)
                best_test_F, new_test_F, _ = evaluating(model, test_data, best_test_F,"Test")

                all_F.append([new_train_F, new_dev_F, new_test_F])
                model.train(True)

            # Decay the learning rate once per pass over the training data.
            if count % len(train_data) == 0:
                adjust_learning_rate(optimizer, lr=learning_rate/(1+decay_rate*count/len(train_data)))

    print(time.time() - tr)
    plt.plot(losses)
    plt.show()

# After training, load the checkpoint saved at model_name.
if not parameters['reload']:
    model.load_state_dict(torch.load(model_name))

Посмотрим на модель на примерах:

In [66]:
model_testing_sentences = ['Higher School of Economics opens a building in Moscow ',
                           'Steve Jobs is the co-creator of the Macintosh, iPod, iPhone, iPad, and first Apple Stores']


# parameters
lower=parameters['lower']

# preprocessing: whitespace tokenisation, then word and character ids
final_test_data = []
for sentence in model_testing_sentences:
    str_words = sentence.split()
    # Normalise each word once and fall back to '<UNK>' for unknown words.
    normalized = [lower_case(w, lower) for w in str_words]
    words = [word_to_id[w if w in word_to_id else '<UNK>'] for w in normalized]

    # Characters missing from char_to_id are dropped.
    chars = [[char_to_id[c] for c in w if c in char_to_id] for w in str_words]

    final_test_data.append({
        'str_words': str_words,
        'words': words,
        'chars': chars,
    })

# model prediction
predictions = []
print("word : tag \n")
for data in final_test_data:
    words = data['str_words']
    chars2 = data['chars']

    # NOTE(review): inputs are always prepared the way the 'CNN' char mode
    # does it elsewhere in this file (original word order, empty index map);
    # confirm parameters['char_mode'] is 'CNN' before running this cell.
    d = {}

    chars2_length = [len(c) for c in chars2]
    char_maxl = max(chars2_length)
    chars2_mask = np.zeros((len(chars2_length), char_maxl), dtype='int')
    for i, c in enumerate(chars2):
        chars2_mask[i, :chars2_length[i]] = c
    chars2_mask = Variable(torch.LongTensor(chars2_mask))

    dwords = Variable(torch.LongTensor(data['words']))

    # Inference only: no gradients are needed.
    with torch.no_grad():
        if use_gpu:
            _, predicted_id = model(dwords.cuda(), chars2_mask.cuda(), chars2_length, d)
        else:
            _, predicted_id = model(dwords, chars2_mask, chars2_length, d)

    pred_chunks = get_chunks(predicted_id,tag_to_id)
    temp_list_tags=['O']*len(words)
    # Label every token of a chunk, not only its first one, so multi-word
    # entities are shown in full.
    for chunk_type, chunk_start, chunk_end in pred_chunks:
        for pos in range(chunk_start, chunk_end):
            temp_list_tags[pos] = chunk_type
    predictions.append(temp_list_tags)

    for word,tag in zip(words,temp_list_tags):
        print(word,':',tag)
    print('\n')
    print('\n')

word : tag 

Higher : O
School : O
of : O
Economics : O
opens : O
a : O
building : O
in : O
Moscow : LOC


Steve : ORG
Jobs : O
is : O
the : O
co-creator : O
of : O
the : O
Macintosh, : MISC
iPod, : O
iPhone, : O
iPad, : O
and : O
first : O
Apple : ORG
Stores : O


