In [3]:
import nltk
nltk.download('treebank')
 
tagged_sentences = nltk.corpus.treebank.tagged_sents()
 
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.treebank.tagged_words()))

[nltk_data] Downloading package treebank to /root/nltk_data...


[nltk_data]   Unzipping corpora/treebank.zip.
[(u'Pierre', u'NNP'), (u'Vinken', u'NNP'), (u',', u','), (u'61', u'CD'), (u'years', u'NNS'), (u'old', u'JJ'), (u',', u','), (u'will', u'MD'), (u'join', u'VB'), (u'the', u'DT'), (u'board', u'NN'), (u'as', u'IN'), (u'a', u'DT'), (u'nonexecutive', u'JJ'), (u'director', u'NN'), (u'Nov.', u'NNP'), (u'29', u'CD'), (u'.', u'.')]


('Tagged sentences: ', 3914)


('Tagged words:', 100676)


Let’s restructure the data a bit. Let’s separate the words from the tags.

In [12]:
import numpy as np
 
sentences, sentence_tags =[], [] 
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))
 
# Let's see how a sequence looks
 
print(sentences[5])
print(sentence_tags[5])

[u'Lorillard' u'Inc.' u',' u'the' u'unit' u'of' u'New' u'York-based'
 u'Loews' u'Corp.' u'that' u'*T*-2' u'makes' u'Kent' u'cigarettes' u','
 u'stopped' u'using' u'crocidolite' u'in' u'its' u'Micronite' u'cigarette'
 u'filters' u'in' u'1956' u'.']
[u'NNP' u'NNP' u',' u'DT' u'NN' u'IN' u'JJ' u'JJ' u'NNP' u'NNP' u'WDT'
 u'-NONE-' u'VBZ' u'NNP' u'NNS' u',' u'VBD' u'VBG' u'NN' u'IN' u'PRP$'
 u'NN' u'NN' u'NNS' u'IN' u'CD' u'.']


As always, before training a model, we need to split the data in training and testing data. As usual, let’s use the train_test_split function from Scikit-Learn:

In [13]:
from sklearn.model_selection import train_test_split
 
 
(train_sentences, 
 test_sentences, 
 train_tags, 
 test_tags) = train_test_split(sentences, sentence_tags, test_size=0.2)

Keras also needs to work with numbers, not with words (or tags). Let’s assign to each word (and tag) a unique integer. We’re computing a set of unique words (and tags) then transforming it in a list and indexing them in a dictionary. These dictionaries are the word vocabulary and the tag vocabulary. We’ll also add a special value for padding the sequences (more on that later), and another one for unknown words (OOV – Out Of Vocabulary).

In [14]:
words, tags = set([]), set([])
 
for s in train_sentences:
    for w in s:
        words.add(w.lower())
 
for ts in train_tags:
    for t in ts:
        tags.add(t)
 
word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
 
tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0  # The special value used to padding


Let’s now convert the word dataset to integer dataset, both the words and the tags.

In [15]:
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []
 
for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    train_sentences_X.append(s_int)
 
for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    test_sentences_X.append(s_int)
 
for s in train_tags:
    train_tags_y.append([tag2index[t] for t in s])
 
for s in test_tags:
    test_tags_y.append([tag2index[t] for t in s])
 
print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[3851, 3060, 6428, 5417, 3486, 5238, 8848, 4883, 5238, 5983, 6026, 2131, 9940, 3060, 1684, 2842, 5238, 3390, 5539, 3060, 1586, 5238, 7450, 5238, 8777, 8568, 3664, 9886, 33, 9317, 4578, 2588, 5500, 3060, 9891, 450]
[5015, 1531, 5417, 3486, 5946, 3060, 3660, 5238, 8568, 3676, 727, 7522, 8267, 192, 1, 5500, 5464, 2802, 5238, 3060, 1354, 7788, 3676, 1274, 1, 1, 1386, 5500, 2646, 450, 619]
[24, 12, 26, 3, 37, 5, 37, 16, 5, 16, 35, 40, 27, 12, 26, 25, 5, 22, 22, 12, 25, 5, 25, 5, 25, 29, 25, 10, 35, 7, 34, 35, 37, 12, 16, 19]
[21, 22, 3, 37, 37, 12, 25, 5, 29, 35, 4, 2, 12, 9, 16, 37, 16, 25, 5, 12, 16, 11, 35, 20, 27, 9, 25, 37, 39, 19, 6]


Keras can only deal with fixed size sequences. We will pad to the right all the sequences with a special value (0 as the index and “-PAD-“` as the corresponding word/tag) to the length of the longest sequence in the dataset. Let’s compute the maximum length of all the sequences.

In [16]:
MAX_LENGTH = len(max(train_sentences_X, key=len))
print(MAX_LENGTH)

271


Now we can use Keras’s convenient pad_sequences utility function:

In [17]:
from keras.preprocessing.sequence import pad_sequences
 
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')
 
print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[3851 3060 6428 5417 3486 5238 8848 4883 5238 5983 6026 2131 9940 3060
 1684 2842 5238 3390 5539 3060 1586 5238 7450 5238 8777 8568 3664 9886
   33 9317 4578 2588 5500 3060 9891  450    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [18]:
from keras import backend as K
 
def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy
 

Network architecture
Let’s now define the model. Here’s what we need to have in mind:

We’ll need an embedding layer that computes a word vector model for our words. Remember that in the Word Embeddings Guide we’ve mentioned that this is one of the methods of computing a word embeddings model.
We’ll need an LSTM layer with a Bidirectional modifier. bidirectional modifier inputs to the LSTM the next values in the sequence, not just the previous.
We need to set the return_sequences=True parameter so that the LSTM outputs a sequence, not only the final value.
After the LSTM Layer we need a Dense Layer (or fully-connected layer) that picks the appropriate POS tag. Since this dense layer needs to run on each element of the sequence, we need to add the TimeDistributed modifier.

In [22]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
 
 
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])
 
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 271, 128)          1306496   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 271, 512)          788480    
_________________________________________________________________
time_distributed_4 (TimeDist (None, 271, 47)           24111     
_________________________________________________________________
activation_4 (Activation)    (None, 271, 47)           0         
Total params: 2,119,087
Trainable params: 2,119,087
Non-trainable params: 0
_________________________________________________________________


There’s one more thing to do before training. We need to transform the sequences of tags to sequences of One-Hot Encoded tags. This is what the Dense Layer outputs. Here’s a function that does that:

In [23]:
def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)
 

In [10]:
cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))
print(cat_train_tags_y[0])


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


training the model

In [24]:
model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=5, validation_split=0.2)

Train on 2504 samples, validate on 627 samples
Epoch 1/5


 128/2504 [>.............................] - ETA: 1:34 - loss: 3.8561 - acc: 0.0020

 256/2504 [==>...........................] - ETA: 1:07 - loss: 3.7811 - acc: 0.4522

 384/2504 [===>..........................] - ETA: 56s - loss: 3.7010 - acc: 0.6031 

 512/2504 [=====>........................] - ETA: 49s - loss: 3.6043 - acc: 0.6790

































Epoch 2/5


 128/2504 [>.............................] - ETA: 45s - loss: 0.3842 - acc: 0.9101

 256/2504 [==>...........................] - ETA: 43s - loss: 0.3912 - acc: 0.9082

 384/2504 [===>..........................] - ETA: 40s - loss: 0.3858 - acc: 0.9090

 512/2504 [=====>........................] - ETA: 38s - loss: 0.3908 - acc: 0.9069

































Epoch 3/5


 128/2504 [>.............................] - ETA: 46s - loss: 0.3341 - acc: 0.9000

 256/2504 [==>...........................] - ETA: 44s - loss: 0.3290 - acc: 0.9024

 384/2504 [===>..........................] - ETA: 41s - loss: 0.3328 - acc: 0.9014

 512/2504 [=====>........................] - ETA: 39s - loss: 0.3254 - acc: 0.9035

































Epoch 4/5


 128/2504 [>.............................] - ETA: 46s - loss: 0.3275 - acc: 0.9108

 256/2504 [==>...........................] - ETA: 43s - loss: 0.3146 - acc: 0.9144

 384/2504 [===>..........................] - ETA: 41s - loss: 0.3082 - acc: 0.9166

 512/2504 [=====>........................] - ETA: 38s - loss: 0.3103 - acc: 0.9159

































Epoch 5/5


 128/2504 [>.............................] - ETA: 46s - loss: 0.3091 - acc: 0.9141

 256/2504 [==>...........................] - ETA: 43s - loss: 0.3012 - acc: 0.9165

 384/2504 [===>..........................] - ETA: 41s - loss: 0.3039 - acc: 0.9157

 512/2504 [=====>........................] - ETA: 38s - loss: 0.3018 - acc: 0.9163

































<keras.callbacks.History at 0x7f3413593810>

save the model

In [25]:
model.save("tagger.h5")

Measuring model accuracy by tag 

In [50]:
from sklearn.metrics import classification_report

target_names = list(tags)
y_preds  = model.predict(test_sentences_X)

#print(target_names)
print(y_predict)
#print(test_tags_y)

classif_report = classification_report(y_true=test_tags_y, y_pred=y_predict, target_names=target_names)

print(classif_report)

[[[2.24509202e-02 8.63002893e-03 1.41064525e-02 ... 3.97340022e-03
   6.29566901e-04 5.82579931e-04]
  [1.94893535e-02 8.47794581e-03 1.38734309e-02 ... 3.96054517e-03
   6.22523134e-04 5.72526595e-04]
  [1.77739691e-02 8.37009307e-03 1.38596324e-02 ... 3.93461762e-03
   6.14880526e-04 5.67297277e-04]
  ...
  [9.99979019e-01 2.19684210e-07 5.32118207e-08 ... 1.26722213e-07
   2.05008632e-08 6.10532851e-08]
  [9.99975204e-01 3.23417851e-07 7.22309252e-08 ... 2.00304228e-07
   3.65859378e-08 1.07884581e-07]
  [9.99970198e-01 4.67980186e-07 9.76776349e-08 ... 3.07132609e-07
   6.07737860e-08 1.78043493e-07]]

 [[2.13194247e-02 8.32926389e-03 1.39857875e-02 ... 3.84037662e-03
   6.01661042e-04 5.61314577e-04]
  [1.92499589e-02 8.23348947e-03 1.38821164e-02 ... 3.74356564e-03
   5.95647201e-04 5.57192310e-04]
  [2.24350393e-02 8.34710523e-03 1.42755844e-02 ... 3.76671320e-03
   6.02194632e-04 5.70853997e-04]
  ...
  [9.99979019e-01 2.19684210e-07 5.32118207e-08 ... 1.26722213e-07
   2.05008

ValueError: Classification metrics can't handle a mix of multiclass-multioutput and unknown targets

In [44]:
from sklearn.metrics import classification_report
# Our target names are our label encoded targets
target_names = ["PRP$", "VBG", "VBD", "VBP", "WDT", "JJ", "WP", "VBZ", "DT", "RP", "NN", "FW", "POS", ".", "TO", "PRP", "RB", "-LRB-", "NNS", "NNP", "VB", "WRB", "CC", "LS", "PDT", "RBS", "RBR", "VBN", "-NONE-", "EX", "IN", "WP$", "CD", "MD", "NNPS", "-RRB-", "JJS", "JJR", "SYM", "H"]#list(tags)
#print(test_tags_y)
print(target_names)
y_preds  = model.predict(test_sentences_X)
#Compute classification report
classif_report = classification_report(y_true=test_tags_y, y_pred=y_predict, target_names=target_names)
print(classif_report)

['PRP$', 'VBG', 'VBD', 'VBP', 'WDT', 'JJ', 'WP', 'VBZ', 'DT', 'RP', 'NN', 'FW', 'POS', '.', 'TO', 'PRP', 'RB', '-LRB-', 'NNS', 'NNP', 'VB', 'WRB', 'CC', 'LS', 'PDT', 'RBS', 'RBR', 'VBN', '-NONE-', 'EX', 'IN', 'WP$', 'CD', 'MD', 'NNPS', '-RRB-', 'JJS', 'JJR', 'SYM', 'H']


ValueError: Classification metrics can't handle a mix of multiclass-multioutput and unknown targets

In [52]:
from sklearn.metrics import classification_report
y_true = [0.0, 1.0, 2.0, 2.0, 2.0, 3.0]
y_pred = [0.1, 0.5, 2.2, 2.5, 1.7, 3.3]
target_names = ['class 0', 'class 1', 'class 2', 'class 3']
print(classification_report(y_true, y_pred, target_names=target_names))



ValueError: Classification metrics can't handle a mix of multiclass and continuous targets