In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('../data/ner_dataset.csv', encoding='latin1')

In [3]:
# fill NaN
data = data.fillna(method='ffill')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [4]:
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


Essential info about entities:

    geo = Geographical Entity
    org = Organization
    per = Person
    gpe = Geopolitical Entity
    tim = Time indicator
    art = Artifact
    eve = Event
    nat = Natural Phenomenon

In [5]:
data.Tag.value_counts()

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

In [6]:
words = list(set(data['Word'].values))
n_words = len(words)
n_words

35178

In [7]:
tags = list(set(data["Tag"].values))
n_tags = len(tags)
n_tags

17

In [116]:
tags

['I-nat',
 'B-geo',
 'B-per',
 'I-art',
 'I-per',
 'B-art',
 'I-gpe',
 'B-eve',
 'O',
 'B-nat',
 'I-org',
 'I-eve',
 'I-tim',
 'B-org',
 'I-geo',
 'B-gpe',
 'B-tim']

So we have 47959 sentences containing 35178 different words with 17 different tags. We use the SentenceGetter class from last post to retrieve sentences with their labels.

In [8]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [9]:
getter = SentenceGetter(data)

In [10]:
sent = getter.get_next()

In [11]:
sent

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [12]:
# get all sentences
sentences = getter.sentences

# Prepare the data



In [99]:
max_len = 75
word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx['<pad>'] = 0

In [59]:
word2idx["Obama"]

12571

In [60]:
tag2idx

{'B-art': 6,
 'B-eve': 8,
 'B-geo': 2,
 'B-gpe': 16,
 'B-nat': 10,
 'B-org': 14,
 'B-per': 3,
 'B-tim': 17,
 'I-art': 4,
 'I-eve': 12,
 'I-geo': 15,
 'I-gpe': 7,
 'I-nat': 1,
 'I-org': 11,
 'I-per': 5,
 'I-tim': 13,
 'O': 9}

Now we map the senctences to a sequence of numbers and then pad the sequence. Note that we increased the index of the words by one to use zero as a padding value. This is done because we want to use the mask_zeor parameter of the embedding layer to ignore inputs with value zero.

In [61]:
from keras.preprocessing.sequence import pad_sequences

In [62]:
# example 0
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [63]:
X = [[word2idx[w[0]] for w in s] for s in sentences]

In [64]:
X[0]

[29812,
 9447,
 13560,
 34345,
 13717,
 33694,
 19520,
 22338,
 35041,
 32951,
 10730,
 23113,
 10193,
 25217,
 23956,
 32951,
 14392,
 9447,
 33986,
 12798,
 11146,
 32466,
 981,
 16279]

word start with 1, we preserve 0 for pad

In [65]:
for i, item in enumerate(word2idx.items()):
    print(item)
    if i > 3:
        break

('Crescent', 1)
('meteorologists', 2)
('Martyrs', 3)
('soldiers', 4)
('legislator', 5)


In [66]:
len(word2idx)

35178

In [67]:
# here the value should be 0
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=0)

In [68]:
X[0]

array([29812,  9447, 13560, 34345, 13717, 33694, 19520, 22338, 35041,
       32951, 10730, 23113, 10193, 25217, 23956, 32951, 14392,  9447,
       33986, 12798, 11146, 32466,   981, 16279,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0], dtype=int32)

And we need to do the same for our tag sequence.

In [69]:
tag2idx

{'B-art': 6,
 'B-eve': 8,
 'B-geo': 2,
 'B-gpe': 16,
 'B-nat': 10,
 'B-org': 14,
 'B-per': 3,
 'B-tim': 17,
 'I-art': 4,
 'I-eve': 12,
 'I-geo': 15,
 'I-gpe': 7,
 'I-nat': 1,
 'I-org': 11,
 'I-per': 5,
 'I-tim': 13,
 'O': 9}

In [70]:
y = [[tag2idx[w[2]] for w in s] for s in sentences]

In [71]:
y[0]

[9, 9, 9, 9, 9, 9, 2, 9, 9, 9, 9, 9, 2, 9, 9, 9, 9, 9, 16, 9, 9, 9, 9, 9]

In [72]:
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=0)

In [73]:
y[0]

array([ 9,  9,  9,  9,  9,  9,  2,  9,  9,  9,  9,  9,  2,  9,  9,  9,  9,
        9, 16,  9,  9,  9,  9,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0], dtype=int32)

For training the network we also need to change the labels y to categorial.

    y = (batch_size, max_length, one-hot NER token index)

In [75]:
from keras.utils import to_categorical
y = [to_categorical(i, num_classes=n_tags+1) for i in y] 

In [77]:
print(len(y))
print(y[0].shape)

47959
(75, 18)


In [78]:
from sklearn.model_selection import train_test_split

In [79]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)

# Setup the CRF-LSTM

Now we can fit a LSTM-CRF network with an embedding layer.

In [80]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF

In [81]:
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words + 1, output_dim=20,
                  input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = TimeDistributed(Dense(50, activation="tanh"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(n_tags+1)  # CRF layer
out = crf(model)  # output

In [82]:
model = Model(input, out)
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 75)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 75, 20)            703580    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 75, 100)           28400     
_________________________________________________________________
time_distributed_3 (TimeDist (None, 75, 50)            5050      
_________________________________________________________________
crf_3 (CRF)                  (None, 75, 18)            1278      
Total params: 738,308
Trainable params: 738,308
Non-trainable params: 0
_________________________________________________________________


In [84]:
history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=1,
                    validation_split=0.1, verbose=1)

Train on 38846 samples, validate on 4317 samples
Epoch 1/1


In [85]:
hist = pd.DataFrame(history.history)

In [87]:
# import matplotlib.pyplot as plt
# %matplotlib inline
# plt.style.use("ggplot")
# plt.figure(figsize=(12,12))
# plt.plot(hist["acc"])
# plt.plot(hist["val_acc"])
# plt.show()

# Now look at some predictions.

pick a sentence from X_test

In [100]:
idx2word = {value: key for key, value in word2idx.items()}
idx2tag = {value: key for key, value in tag2idx.items()}

In [112]:
i = 1927

print(X_te[i])
print(' '.join([idx2word[inx] for inx in X_te[i] if inx!=0]))

[17651 35053  5672  7429  7598 22129 22650  9447 25786  3175 26655 15649
 30218 24940 32951 34535 29926 23113 32951 28112 11174  9447 31687 16279
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0]
Nigerian health officials say a deadly strain of bird flu has been detected for the first time in the southwestern state of Ogun .


In [101]:
tag2idx

{'<pad>': 0,
 'B-art': 6,
 'B-eve': 8,
 'B-geo': 2,
 'B-gpe': 16,
 'B-nat': 10,
 'B-org': 14,
 'B-per': 3,
 'B-tim': 17,
 'I-art': 4,
 'I-eve': 12,
 'I-geo': 15,
 'I-gpe': 7,
 'I-nat': 1,
 'I-org': 11,
 'I-per': 5,
 'I-tim': 13,
 'O': 9}

In [104]:
np.argmax(y_te[i], -1)

array([16,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  2,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0])

In [108]:
sample_tag = [idx2tag[inx] for inx in np.argmax(y_te[i], -1) if inx!=0]
print(len(sample_tag))
print(sample_tag)

24
['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O']


In [109]:
y_te[i].shape

(75, 18)

In [115]:
i = 1927
p = model.predict(np.array([X_te[i]]))
p = np.argmax(p, axis=-1)
print(p.shape)
print(p)

(1, 75)
[[16  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0]]


Here `words[w-1], tags[t-1], tags[pred-1]` all subtract 1, because we use PAD as the 0 in `words` and `tags`

In [118]:
i = 1927
p = model.predict(np.array([X_te[i]]))
p = np.argmax(p, axis=-1)
true = np.argmax(y_te[i], -1)

print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(X_te[i], true, p[0]):
    if w != 0:
        print("{:15}: {:5} {}".format(words[w-1], tags[t-1], tags[pred-1]))

Word           ||True ||Pred
Nigerian       : B-gpe B-gpe
health         : O     O
officials      : O     O
say            : O     O
a              : O     O
deadly         : O     O
strain         : O     O
of             : O     O
bird           : O     O
flu            : O     O
has            : O     O
been           : O     O
detected       : O     O
for            : O     O
the            : O     O
first          : O     O
time           : O     O
in             : O     O
the            : O     O
southwestern   : O     O
state          : O     O
of             : O     O
Ogun           : B-geo O
.              : O     O


In [120]:
p_all = model.predict(np.array(X_te))
p_all.shape

(4796, 75, 18)

In [121]:
p_all= np.argmax(p_all, axis=-1)
p_all.shape 

(4796, 75)

delete the padding index 

In [132]:
p_all_tags = [[idx2tag[idx] for idx in s if idx!=0] for s in p_all]

In [147]:
print(p_all_tags[i])

['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [124]:
true_all = np.argmax(y_te, -1)
true_all.shape

(4796, 75)

delete the padding index 0

In [134]:
true_all_tags = [[idx2tag[idx] for idx in s if idx!=0] for s in true_all]

In [139]:
true_all_tags[i]

['B-gpe',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-geo',
 'O']

In [136]:
from seqeval.metrics import f1_score

In [140]:
f1_score(true_all_tags, p_all_tags)

0.7338019284491304

## Test on new samples

In [141]:
test_sentence = ["Hawking", "was", "a", "Fellow", "of", "the", "Royal", "Society", ",", "a", "lifetime", "member",
                 "of", "the", "Pontifical", "Academy", "of", "Sciences", ",", "and", "a", "recipient", "of",
                 "the", "Presidential", "Medal", "of", "Freedom", ",", "the", "highest", "civilian", "award",
                 "in", "the", "United", "States", "."]

In [142]:
[word2idx.get(w, 0) for w in test_sentence]

[0,
 32806,
 7598,
 7246,
 9447,
 32951,
 26891,
 9829,
 33065,
 7598,
 28073,
 26726,
 9447,
 32951,
 1088,
 10110,
 9447,
 23661,
 33065,
 25217,
 7598,
 33605,
 9447,
 32951,
 6852,
 0,
 9447,
 1839,
 33065,
 32951,
 14377,
 14536,
 9700,
 23113,
 32951,
 13595,
 5688,
 16279]

In [143]:
x_test_sent = pad_sequences(sequences=[[word2idx.get(w, 0) for w in test_sentence]],
                            padding="post", value=0, maxlen=max_len)

In [145]:
p = model.predict(np.array([x_test_sent[0]]))
p = np.argmax(p, axis=-1)
print("{:15}||{}".format("Word", "Prediction"))
print(30 * "=")
for w, pred in zip(test_sentence, p[0]):
    print("{:15}: {:5}".format(w, tags[pred-1]))

Word           ||Prediction
Hawking        : B-tim
was            : O    
a              : O    
Fellow         : O    
of             : O    
the            : O    
Royal          : B-org
Society        : I-org
,              : O    
a              : O    
lifetime       : O    
member         : O    
of             : O    
the            : O    
Pontifical     : B-org
Academy        : I-org
of             : O    
Sciences       : B-geo
,              : O    
and            : O    
a              : O    
recipient      : O    
of             : O    
the            : O    
Presidential   : O    
Medal          : B-tim
of             : O    
Freedom        : B-geo
,              : O    
the            : O    
highest        : O    
civilian       : O    
award          : O    
in             : O    
the            : O    
United         : B-geo
States         : I-geo
.              : O    
