# NER BERT Tutorial

This IPython notebook contains an example of how to use BERT embeddings to perform Named Entity Recognition. Note that the input is an already tokenized document. However, this requirement can easily be removed if a tokenizer is available.

Details of the model can be found in:

Moreno, José G., et al. "TLR at BSNLP2019: A Multilingual Named Entity Recognition System." Proceedings of the 7th Workshop on Balto-Slavic Natural Language Processing. 2019.

This code is based on https://github.com/UKPLab/elmo-bilstm-cnn-crf

# Basics

In [1]:
#import nltk
import sys
import torch
from util.preprocessing import addCharInformation, createMatrices, addCasingInformation, readCoNLL
from neuralnets.BERTBiLSTM import BERTBiLSTM
from neuralnets.BERTWordEmbeddings import BERTWordEmbeddings
from IPython.core.display import display, HTML
from ipywidgets import interact, widgets, Layout

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def displayEntitiesInHTML(sentences,tags,debug=False):
    """Show tagged sentences in the notebook as colour-coded HTML buttons.

    Every token tagged 'O' becomes its own button. A run of consecutive tokens
    that share the same entity type is rendered as ONE button that ends with
    the entity type as a small bold label. Sentences are separated by a
    horizontal rule.

    Args:
        sentences: sequence of dicts; each dict holds the sentence's token
            strings under the key 'tokens'.
        tags: dict mapping a model name to a list with, per sentence, one tag
            string per token (e.g. 'B-PER', 'I-MISC', 'O'). Only the first
            model in the dict is rendered.
        debug: when True, the generated HTML body is also printed to stdout
            before it is displayed.

    Returns:
        None. The HTML is sent to the notebook through display(HTML(...)).

    Raises:
        IndexError: if `tags` is empty or holds fewer tags than tokens.
    """
    # Function-scope import so the cell needs no extra top-level import.
    from html import escape

    pageHead = ('<html>'
        '<head>'
        '<style>'
        'hr {border-width: 10px;}'
        '.button {background-color: #FFFFFF;border: none;color: white;padding: 10px;text-align: center;text-decoration: none;display: inline-block;font-size: 16px;margin: 4px 2px;cursor: pointer;border-radius: 8px;}'
        '.ORG {background-color: #4CAF50;}'
        '.PER {background-color: #008CBA;}'
        '.LOC {background-color: #f44336;}'
        '.MISC {background-color: #e7e7e7; color: black;}'
        '.ISC {background-color: #e7e7e7; color: black;}'
        '.EVT {background-color: #555555;}'
        '.PRO {background-color: #555555;}'
        '.O {color: black;}'
        '</style>'
        '</head>'
        '<body>')
    pageTail = '</body></html>'

    def entityType(tag):
        """Drop a one-letter scheme prefix ('B-', 'I-', ...) from a tag."""
        # The same rule is used for the current, previous and next tag, so
        # entity types of any length (PER, MISC, ...) compare consistently.
        return tag[2:] if len(tag) > 2 and tag[1] == '-' else tag

    modelName = list(tags.keys())[0]
    chunks = []
    for sentenceIdx, sentence in enumerate(sentences):
        tokens = sentence['tokens']
        sentenceTags = tags[modelName][sentenceIdx]
        types = [entityType(sentenceTags[tokenIdx]) for tokenIdx in range(len(tokens))]
        lastIdx = len(tokens) - 1
        for tokenIdx, token in enumerate(tokens):
            entity = types[tokenIdx]
            # Tokens are raw text: escape them so '<' or '&' cannot break the markup.
            text = escape(token, quote=False)
            opening = '<button class="button '+entity+'">'
            if entity == 'O':
                chunks.append(opening+text+'</button>')
                continue
            startsEntity = tokenIdx == 0 or types[tokenIdx-1] != entity
            endsEntity = tokenIdx == lastIdx or types[tokenIdx+1] != entity
            # A new entity opens a button; a continuation is joined with a space.
            piece = (opening if startsEntity else ' ')+text
            if endsEntity:
                # The last token of an entity carries the label and closes the button.
                piece += '&nbsp;&nbsp;&nbsp;&nbsp;<small><b>'+entity+'</b></small></button>'
            chunks.append(piece)
        chunks.append('<br><br><hr><br><br>')
    lines = ''.join(chunks)
    if debug:
        print(lines)
    display(HTML(pageHead+lines+pageTail))

In [3]:
# CUDA device index to use for BERT: 0 when CUDA is available, -1 for CPU.
bert_cuda_device = 0 if torch.cuda.is_available() else -1

# Model load

In [4]:
# Path of the trained NER model file; it is handed to BERTBiLSTM.loadModel in a later cell.
# NOTE(review): the numbers in the file name look like scores and an epoch index -- confirm.
modelPath = "models/conll2003_ner_0.9250_0.8854_13.h5"


In [5]:
# :: embeddings file
# Word-embedding file passed to BERTBiLSTM.loadModel as its second argument.
embeddings_file = 'embeddings/komninos_english_embeddings.gz'

In [6]:
# :: Load the model ::
# Loads the tagger stored at modelPath together with the word embeddings in
# embeddings_file; the configuration dict in the cell output is printed by this call.
# NOTE(review): bert_cuda_device selected earlier is not passed here, and the
# printed configuration reports 'bert_cuda_device': -1 -- confirm how the
# device is meant to reach the model.
lstmModel = BERTBiLSTM.loadModel(modelPath,embeddings_file)

{'dropout': [0.5, 0.5], 'classifier': ['CRF'], 'LSTM-Size': [100, 100], 'customClassifier': {}, 'optimizer': 'adam', 'charEmbeddings': 'CNN', 'charEmbeddingsSize': 30, 'charFilterSize': 30, 'charFilterLength': 3, 'charLSTMSize': 25, 'maxCharLength': 25, 'useTaskIdentifier': False, 'clipvalue': 0, 'clipnorm': 1, 'earlyStopping': 5, 'miniBatchSize': 32, 'featureNames': ['casing', 'tokens_embeddings', 'bert_embeddings', 'characters'], 'addFeatureDimensions': 10, 'embeddingsConfig': {'embeddings_path': 'embeddings/komninos_english_embeddings.gz', 'bert_mode': 'average', 'bert_path': '/data6T/Datasets/BERT/cased_L-12_H-768_A-12/', 'bert_n_layers': 12, 'bert_cuda_device': -1}}


# Example 1

In [7]:
# :: Prepare the input ::
# Read the sentences from input.conll; column 0 is stored under the 'tokens' key.
sentences = readCoNLL("input.conll", {0: "tokens"})

# Echo every sentence on its own line as " - " followed by its space-joined tokens.
print(*(" - " + " ".join(sentence['tokens']) for sentence in sentences), sep="\n")

# Presumably these enrich `sentences` in place with character and casing
# features (their return values are not used) -- verify in util.preprocessing.
addCharInformation(sentences)
addCasingInformation(sentences)


['Rockwell']
['International']
['Corp.']
["'s"]
['Tulsa']
['unit']
['said']
['it']
['signed']
['a']
['tentative']
['agreement']
['extending']
['its']
['contract']
['with']
['Boeing']
['Co.']
['to']
['provide']
['structural']
['parts']
['for']
['Boeing']
["'s"]
['747']
['jetliners']
['.']
['He']
['reckons']
['the']
['current']
['account']
['deficit']
['will']
['narrow']
['to']
['only']
['1.8']
['billion']
['in']
['September']
['.']
['The']
['group']
[',']
['crossing']
['at']
['Al']
['Yarubiyah']
[',']
['Syria']
[',']
['was']
['transferred']
['to']
['the']
['U.N.']
['refugee']
['agency']
["'s"]
['camp']
['at']
['El']
['Hol']
[',']
['100']
['kilometers']
['(']
['60']
['miles']
[')']
['to']
['the']
['west']
[',']
['agency']
['spokesman']
['Peter']
['Kessler']
['said']
['in']
['Amman']
['.']
 - Rockwell International Corp. 's Tulsa unit said it signed a tentative agreement extending its contract with Boeing Co. to provide structural parts for Boeing 's 747 jetliners .
 - He reckons the curr

In [8]:

# :: Map casing and character information to integer indices ::
# Uses the mappings stored on the loaded model.
# NOTE(review): the meaning of the third positional argument (True) is not
# visible here -- confirm against createMatrices.
dataMatrix = createMatrices(sentences, lstmModel.mappings, True)


# :: Tag the input ::
# `tags` is consumed by displayEntitiesInHTML, which indexes it as
# tags[modelName][sentenceIdx][tokenIdx].
tags = lstmModel.tagSentences(dataMatrix)

In [9]:
# Render the tagged CoNLL sentences as colour-coded entity buttons.
displayEntitiesInHTML(sentences,tags)

# Example 2

In [10]:
# Free-text input area: paste the text to be tagged here before running the next cells.
# Presumably the example text was taken from:
#https://www.theguardian.com/politics/2019/dec/10/final-scramble-for-votes-in-most-important-election-in-a-generation
# NOTE(review): the following cells read text.value, so their results depend on
# what is typed into the widget; after a fresh "Run All" the value is 'Hello World'.
text = widgets.Textarea(
    value='Hello World',
    placeholder='Type something',
    description='String:',
    disabled=False,
    layout=Layout(width='80%', height='200px')
)
display(text)


Textarea(value='Hello World', description='String:', layout=Layout(height='200px', width='80%'), placeholder='…

In [11]:
# Echo the current content of the text area so the tagged input is recorded in the output.
print(text.value)

Boris Johnson and Jeremy Corbyn are to embark on a final frantic 24 hours of campaigning as both teams insist the election remains closely fought and that polls giving the Conservatives a lead could be wrong.

Both Labour and the Conservatives have branded Thursday ’s vote the “most important in a generation” as the two sides have vastly different plans for Brexit and spending on public services.

The prime minister was set to crisscross the country from Yorkshire to the Midlands, Wales and London on Wednesday , sending out his core message that the Conservatives need only another 12 seats to win a majority.


In [12]:
# One sentence per line of the text area; tokens are the whitespace-separated
# words of the line. Lines that contain no token at all (empty, or only
# whitespace such as a stray "\r" from pasted Windows line endings) are skipped,
# so no sentence with an empty token list is passed on to feature extraction.
sentences = [{'tokens':lineTokens}
             for lineTokens in (line.split() for line in text.value.split("\n"))
             if lineTokens]
addCharInformation(sentences)
addCasingInformation(sentences)

# :: Map casing and character information to integer indices ::
dataMatrix = createMatrices(sentences, lstmModel.mappings, True)


# :: Tag the input ::
tags = lstmModel.tagSentences(dataMatrix)

In [13]:
# Render the tagged text-area sentences as colour-coded entity buttons.
displayEntitiesInHTML(sentences,tags)

# Example 3 - Croatian

In [14]:
# Model file for the Croatian example; rebinding modelPath, embeddings_file and
# lstmModel replaces the values used by the previous examples.
modelPath = "models/zagreb/282NER_hr_0.8636_0.8619_24.h5"

# :: embeddings file
embeddings_file = 'embeddings/fastText157/cc.hr.300.vec.gz.top1.gz'

# :: Load the model ::
# NOTE(review): `use_fastext` is the keyword as spelled in this call; presumably
# it switches the loader to fastText-format embeddings -- confirm in BERTBiLSTM.loadModel.
lstmModel = BERTBiLSTM.loadModel(modelPath,embeddings_file,use_fastext = True)

{'dropout': [0.5, 0.5], 'classifier': ['CRF'], 'LSTM-Size': [100, 100], 'customClassifier': {}, 'optimizer': 'adam', 'charEmbeddings': 'CNN', 'charEmbeddingsSize': 30, 'charFilterSize': 30, 'charFilterLength': 3, 'charLSTMSize': 25, 'maxCharLength': 25, 'useTaskIdentifier': False, 'clipvalue': 0, 'clipnorm': 1, 'earlyStopping': 5, 'miniBatchSize': 32, 'featureNames': ['casing', 'tokens_embeddings', 'bert_embeddings', 'characters'], 'addFeatureDimensions': 10, 'embeddingsConfig': {'embeddings_path': 'embeddings/fastText157/cc.hr.300.vec.gz.top1.gz', 'bert_mode': 'average', 'bert_path': '/users/iris/jmoreno/Projets/BERTexp/multi_cased_L-12_H-768_A-12/', 'bert_n_layers': 2, 'bert_cuda_device': -1}}


In [20]:
# Presumably the Croatian example text was taken from:
#https://euractiv.jutarnji.hr/aktualno/neovisni-odvjetnik-zakljucio-da-sud-eu-nije-nadlezan-za-slovensku-tuzbu-protiv-hrvatske/9731425/

# Re-display the Textarea widget created for Example 2; replace its content
# with the Croatian text before running the next cells.
# NOTE(review): the following cells depend on the widget's interactive state.
display(text)


Textarea(value='Svojim današnjim mišljenjem nezavisni odvjetnik Priit Pikamäe Sudu predlaže da se proglasi nen…

In [16]:
# Echo the current content of the text area so the tagged input is recorded in the output.
print(text.value)

Svojim današnjim mišljenjem nezavisni odvjetnik Priit Pikamäe Sudu predlaže da se proglasi nenadležnim za odlučivanje o tužbi koju je podnijela Slovenija.

Situacija je sada jasnija i veliki su izgledi da se Sud EU proglasi nenadležnim za tužbu Republike Slovenije protiv Hrvatske zbog neprovedbe arbitražnog postupka. Dosadašnja praksa pokazuje da u predmetima koji se vode pred Velikim vijećem Suda EU-a, a to je slučaj i s ovom slovenskom tužbom, Sud slijedi mišljenje nezavisnog odvjetnika u oko polovici slučajeva.


In [17]:
# One sentence per line of the text area; tokens are the whitespace-separated
# words of the line. Lines that contain no token at all (empty, or only
# whitespace such as a stray "\r" from pasted Windows line endings) are skipped,
# so no sentence with an empty token list is passed on to feature extraction.
sentences = [{'tokens':lineTokens}
             for lineTokens in (line.split() for line in text.value.split("\n"))
             if lineTokens]
addCharInformation(sentences)
addCasingInformation(sentences)

# :: Map casing and character information to integer indices ::
dataMatrix = createMatrices(sentences, lstmModel.mappings, True)


# :: Tag the input ::
tags = lstmModel.tagSentences(dataMatrix)




In [18]:
# Render the tagged Croatian sentences as colour-coded entity buttons.
displayEntitiesInHTML(sentences,tags)

In [19]:
#English translation of the Croatian example text:
#In his opinion delivered today, Advocate General Priit Pikamäe proposes that the Court declare that it lacks 
#jurisdiction to rule on the action brought by Slovenia.

#The situation is now clearer and there is a strong chance that the EU Court will declare that it lacks jurisdiction 
#over the action brought by the Republic of Slovenia against Croatia for failure to carry out the arbitration procedure. 
#Practice so far shows that in cases heard before the Grand Chamber of the EU Court, which is also the case with 
#this Slovenian action, the Court follows the opinion of the Advocate General in about half of the cases.

