<b>Task:</b>
- Design a POS tagging neural network (POS tagging or part-of-speech tagging - частеречная разметка или автоматическая морфологическая разметка)

In [1]:
import warnings
warnings.filterwarnings('ignore')

import nltk
import sys
import numpy as np

In [2]:
# Step 1. load and split data
nltk.download('brown')
nltk.download('universal_tagset')
data = nltk.corpus.brown.tagged_sents(tagset='universal')
all_tags = ['#EOS#','#UNK#','ADV', 'NOUN', 'ADP', 'PRON', 'DET', '.', 'PRT', 'VERB', 'X', 'NUM', 'CONJ', 'ADJ']

data = np.array([ [(word.lower(),tag) for word,tag in sentence] for sentence in data ]) # work for numpy == 1.23.5

[nltk_data] Error loading brown: <urlopen error Tunnel connection
[nltk_data]     failed: 407 Proxy authentication required>
[nltk_data] Error loading universal_tagset: <urlopen error Tunnel
[nltk_data]     connection failed: 407 Proxy authentication required>


In [3]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data,test_size=0.25,random_state=42)

In [4]:
# Showing data
from IPython.display import HTML, display
def draw(sentence):
    words,tags = zip(*sentence)
    display(HTML('<table><tr>{tags}</tr>{words}<tr></table>'.format(
                words = '<td>{}</td>'.format('</td><td>'.join(words)),
                tags = '<td>{}</td>'.format('</td><td>'.join(tags)))))


draw(data[11])
draw(data[10])
draw(data[7])

0,1,2,3,4,5,6,7,8,9,10,11,12,13
NOUN,ADP,NOUN,NOUN,NOUN,NOUN,VERB,ADV,VERB,ADP,DET,ADJ,NOUN,.
,,,,,,,,,,,,,


0,1,2,3,4,5,6,7,8,9,10,11,12,13
PRON,VERB,ADP,DET,NOUN,.,VERB,NOUN,PRT,VERB,.,DET,NOUN,.
,,,,,,,,,,,,,


0,1
NOUN,VERB
,


In [5]:
# Step 2. Building vocabularies
from collections import Counter
word_counts = Counter()
for sentence in data:
    words,tags = zip(*sentence)
    word_counts.update(words)

    
    #EOS - "end of sentence"
    #UNK - "unknown token"
all_words = ['#EOS#','#UNK#'] + list(list(zip(*word_counts.most_common(10000)))[0])

#let's measure what fraction of data words are in the dictionary
print("Coverage = %.5f" % (float(sum(word_counts[w] for w in all_words)) / sum(word_counts.values())))

all_words[:10]

Coverage = 0.92876


['#EOS#', '#UNK#', 'the', ',', '.', 'of', 'and', 'to', 'a', 'in']

In [6]:
# Step 3. Creating collections
from collections import defaultdict
word_to_id = defaultdict(lambda:1, { word: i for i, word in enumerate(all_words) }) # FOR  defaultdict['ANY_NEW_WORD'] returns 1
tag_to_id = { tag: i for i, tag in enumerate(all_tags)}

In [7]:
# getting a tensor from dataset by transforming each token into id
def to_matrix(lines, token_to_id, max_len=None, pad=0, dtype='int32', time_major=False):
    """
    Converts a list of names into rnn-digestable matrix with paddings added after the end
    input:
    lines - dataset (words or tags),
    token_to_id - collection
    max_len - maximum lenght of rows
    pad - value for filling
    dtype - dtype
    time_major - for transpose?
    return: tensor
    """

    max_len = max_len or max(map(len,lines))
    matrix = np.empty([len(lines), max_len],dtype)  # Return a new array of given shape and type, without initializing entries.
    matrix.fill(pad)  # Fill the array with a scalar value.

    for i in range(len(lines)):
        line_ix = list(map(token_to_id.__getitem__,lines[i]))[:max_len] # getting a indexes of each word in a set
        matrix[i,:len(line_ix)] = line_ix  # rewriting ones in matrix 

    return matrix.T if time_major else matrix


batch_words, batch_tags = zip(*[zip(*sentence) for sentence in data[-3:]])


# checking a work of function
print("Word ids:")
print(to_matrix(batch_words, word_to_id))
print("Tag ids:")
print(to_matrix(batch_tags, tag_to_id))

Word ids:
[[   2 3057    5    2 2238 1334 4238 2454    3    6   19   26 1070   69
     8 2088    6    3    1    3  266   65  342    2    1    3    2  315
     1    9   87  216 3322   69 1558    4    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]
 [  45   12    8  511 8419    6   60 3246   39    2    1    1    3    2
   845    1    3    1    3   10 9910    2    1 3470    9   43    1    1
     3    6    2 1046  385   73 4562    3    9    2    1    1 3250    3
    12   10    2  861 5240   12    8 8936  121    1    4]
 [  33   64   26   12  445    7 7346    9    8 3337    3    1 2811    3
     2  463  572    2    1    1 1649   12    1    4    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]]
Tag ids:
[[ 6  3  4  6  3  3  9  9  7 12  4  5  9  4  6  3 12  7  9  7  9  8  4  6
   3  7  6 13  3  4  6  3  9  4  3  7  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0

In [8]:
# Step 4. Build model

import keras
import keras.layers as L


# setting seed for reproducibility
keras.utils.set_random_seed(42)

model = keras.models.Sequential()
model.add(L.InputLayer([None],dtype='int32'))
model.add(L.Embedding(len(all_words),50))
model.add(L.SimpleRNN(64,return_sequences=True)) # Fully-connected RNN where the output is to be fed back to input.

#add top layer that predicts tag probabilities
stepwise_dense = L.Dense(len(all_tags),activation='softmax')
stepwise_dense = L.TimeDistributed(stepwise_dense) #This wrapper allows to apply a layer to every temporal slice of an input.
"""
По умолчанию keras.layers.Dense будет применяться один раз ко всем согласованным шагам времени (Dense would apply once to all time-steps concatenated). 
Мы используем keras.layers.TimeDistributed для изменения Dense слоя таким образом, 
чтобы он применялся как по пакетной, так и по временной оси.
"""

model.add(stepwise_dense)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 50)          500100    
                                                                 
 simple_rnn (SimpleRNN)      (None, None, 64)          7360      
                                                                 
 time_distributed (TimeDist  (None, None, 14)          910       
 ributed)                                                        
                                                                 
Total params: 508370 (1.94 MB)
Trainable params: 508370 (1.94 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
# Step 5. Creating generator fun
import tensorflow
from tensorflow.keras.utils import to_categorical

BATCH_SIZE=32
def generate_batches(sentences,batch_size=BATCH_SIZE,max_len=None,pad=0):
    assert isinstance(sentences,np.ndarray),"Make sure sentences is q numpy array"

    while True:
        indices = np.random.permutation(np.arange(len(sentences))) # mixing up of indexes
        for start in range(0,len(indices)-1,batch_size):
            batch_indices = indices[start:start+batch_size]
            batch_words,batch_tags = [],[]
            for sent in sentences[batch_indices]:
                words,tags = zip(*sent)
                batch_words.append(words)
                batch_tags.append(tags)

            batch_words = to_matrix(batch_words,word_to_id,max_len,pad)
            batch_tags = to_matrix(batch_tags,tag_to_id,max_len,pad)

            batch_tags_1hot = to_categorical(batch_tags,len(all_tags)).reshape(batch_tags.shape+(-1,))
            yield batch_words,batch_tags_1hot

In [10]:
# Step 6. Callbacks
def compute_test_accuracy(model):
    test_words,test_tags = zip(*[zip(*sentence) for sentence in test_data])
    test_words,test_tags = to_matrix(test_words,word_to_id),to_matrix(test_tags,tag_to_id)

    #predict tag probabilities of shape [batch,time,n_tags]
    predicted_tag_probabilities = model.predict(test_words,verbose=1)
    predicted_tags = predicted_tag_probabilities.argmax(axis=-1)

    #compute accurary excluding padding
    numerator = np.sum(np.logical_and((predicted_tags == test_tags),(test_words != 0)))
    denominator = np.sum(test_words != 0)
    return float(numerator)/denominator


class EvaluateAccuracy(keras.callbacks.Callback):
    def on_epoch_end(self,epoch,logs=None):
        sys.stdout.flush()
        print("\nMeasuring validation accuracy...")
        acc = compute_test_accuracy(self.model)
        print("\nValidation accuracy: %.5f\n"%acc)
        sys.stdout.flush()

In [11]:
# launching a model
model.compile('adam','categorical_crossentropy')

model.fit_generator(generator=generate_batches(train_data),steps_per_epoch=len(train_data)/BATCH_SIZE,
                    callbacks=[EvaluateAccuracy()], epochs=5,)

Epoch 1/5
Measuring validation accuracy...

Validation accuracy: 0.94055

Epoch 2/5
Measuring validation accuracy...

Validation accuracy: 0.94564

Epoch 3/5
Measuring validation accuracy...

Validation accuracy: 0.94656

Epoch 4/5
Measuring validation accuracy...

Validation accuracy: 0.94655

Epoch 5/5
Measuring validation accuracy...

Validation accuracy: 0.94659



<keras.src.callbacks.History at 0x2e2ccf796a0>

In [12]:
acc = compute_test_accuracy(model)
print("Final accuracy: %.5f"%acc)

assert acc>0.94, "Keras has gone on a rampage again, please contact course staff."

Final accuracy: 0.94659


In [13]:
# Step 7.1. Bidirectional RNN - first way MANUAL

inputs = keras.layers.Input([None], dtype='int32')  # there is Input(return Tensor) insted of InputLayer (return InputLayer)


embedding1 = keras.layers.Embedding(len(all_words),50, name='embeddings')(inputs)
srnn1 = keras.layers.SimpleRNN(units=64,  name='srnn1', return_sequences=True)(embedding1)  #activation='relu',

#embedding2 = keras.layers.Embedding(len(all_words),50, name='embeddings2')(inputs)
srnn2 = keras.layers.SimpleRNN(units=64,  name='srnn2', return_sequences=True, go_backwards=True)(embedding1) #go_backwards = True then return reversed sequence

concatenated = keras.layers.Concatenate(name='concatenate')([srnn1, srnn2])

outputs = keras.layers.Dense(len(all_tags), activation='softmax', name='dense')
outputs = keras.layers.TimeDistributed(outputs, name='timedistributed')(concatenated)

model = keras.models.Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embeddings (Embedding)      (None, None, 50)             500100    ['input_2[0][0]']             
                                                                                                  
 srnn1 (SimpleRNN)           (None, None, 64)             7360      ['embeddings[0][0]']          
                                                                                                  
 srnn2 (SimpleRNN)           (None, None, 64)             7360      ['embeddings[0][0]']          
                                                                                              

In [14]:
model.compile('adam','categorical_crossentropy')

model.fit_generator(generator=generate_batches(train_data),steps_per_epoch=len(train_data)/BATCH_SIZE,
                    callbacks=[EvaluateAccuracy()], epochs=5,)

Epoch 1/5
Measuring validation accuracy...

Validation accuracy: 0.94112

Epoch 2/5
Measuring validation accuracy...

Validation accuracy: 0.94479

Epoch 3/5
Measuring validation accuracy...

Validation accuracy: 0.94545

Epoch 4/5
Measuring validation accuracy...

Validation accuracy: 0.94519

Epoch 5/5
Measuring validation accuracy...

Validation accuracy: 0.94573



<keras.src.callbacks.History at 0x2e2d1229d90>

In [15]:
# Step 7.2. Bidirectional RNN - second way keras.layers.Bidirectional

model = keras.models.Sequential()
model.add(keras.layers.InputLayer([None], dtype='int32'))
model.add(keras.layers.Embedding(len(all_words), 50))
model.add(keras.layers.Bidirectional(keras.layers.SimpleRNN(units=64, return_sequences=True)))

stepwise_dense = keras.layers.Dense(len(all_tags), activation='softmax')
stepwise_dense = keras.layers.TimeDistributed(stepwise_dense)

model.add(stepwise_dense)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 50)          500100    
                                                                 
 bidirectional (Bidirection  (None, None, 128)         14720     
 al)                                                             
                                                                 
 time_distributed_1 (TimeDi  (None, None, 14)          1806      
 stributed)                                                      
                                                                 
Total params: 516626 (1.97 MB)
Trainable params: 516626 (1.97 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [16]:
# compiling and getting accuracy
model.compile('adam','categorical_crossentropy')

model.fit_generator(generator=generate_batches(train_data),steps_per_epoch=len(train_data)/BATCH_SIZE,
                    callbacks=[EvaluateAccuracy()], epochs=5,)


# test
acc = compute_test_accuracy(model)
print("\nFinal accuracy: %.5f"%acc)

assert acc>0.96, "Bidirectional RNNs are better than this!"
print("Well done!")

Epoch 1/5
Measuring validation accuracy...

Validation accuracy: 0.95631

Epoch 2/5
Measuring validation accuracy...

Validation accuracy: 0.96086

Epoch 3/5
Measuring validation accuracy...

Validation accuracy: 0.96279

Epoch 4/5
Measuring validation accuracy...

Validation accuracy: 0.96247

Epoch 5/5
Measuring validation accuracy...

Validation accuracy: 0.96145


Final accuracy: 0.96145
Well done!


In [17]:
# Step 8. Experiments
# Step 8.1. LSTM experiments
model = keras.models.Sequential()
model.add(keras.layers.InputLayer([None], dtype='int32'))
model.add(keras.layers.Embedding(len(all_words), 50))
model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=32, return_sequences=True)))  

stepwise_dense = keras.layers.Dense(len(all_tags), activation='softmax')
stepwise_dense = keras.layers.TimeDistributed(stepwise_dense)

model.add(stepwise_dense)

model.compile('adam','categorical_crossentropy')

model.fit_generator(generator=generate_batches(train_data),steps_per_epoch=len(train_data)/BATCH_SIZE,
                    callbacks=[EvaluateAccuracy()], epochs=5,)

# getting accuracy
acc = compute_test_accuracy(model)
print("\nFinal accuracy: %.5f"%acc)

Epoch 1/5
Measuring validation accuracy...

Validation accuracy: 0.95212

Epoch 2/5
Measuring validation accuracy...

Validation accuracy: 0.96006

Epoch 3/5
Measuring validation accuracy...

Validation accuracy: 0.96329

Epoch 4/5
Measuring validation accuracy...

Validation accuracy: 0.96469

Epoch 5/5
Measuring validation accuracy...

Validation accuracy: 0.96519


Final accuracy: 0.96519


In [18]:
# Step 8.2. GRU experiments
model = keras.models.Sequential()
model.add(keras.layers.InputLayer([None], dtype='int32'))
model.add(keras.layers.Embedding(len(all_words), 50))
model.add(keras.layers.Bidirectional(keras.layers.GRU(units=64, activation='relu', return_sequences=True))) 

stepwise_dense = keras.layers.Dense(len(all_tags), activation='softmax')
stepwise_dense = keras.layers.TimeDistributed(stepwise_dense)

model.add(stepwise_dense)

model.compile('adam','categorical_crossentropy')

model.fit_generator(generator=generate_batches(train_data),steps_per_epoch=len(train_data)/BATCH_SIZE,
                    callbacks=[EvaluateAccuracy()], epochs=5,)

# getting accuracy
acc = compute_test_accuracy(model)
print("\nFinal accuracy: %.5f"%acc)

Epoch 1/5
Measuring validation accuracy...

Validation accuracy: 0.95642

Epoch 2/5
Measuring validation accuracy...

Validation accuracy: 0.96065

Epoch 3/5
Measuring validation accuracy...

Validation accuracy: 0.96189

Epoch 4/5
Measuring validation accuracy...

Validation accuracy: 0.96372

Epoch 5/5
Measuring validation accuracy...

Validation accuracy: 0.96406


Final accuracy: 0.96406


In [19]:
# Step 8.3. More layers
model = keras.models.Sequential()
model.add(keras.layers.InputLayer([None], dtype='int32'))


model.add(keras.layers.Embedding(len(all_words), 50))
model.add(keras.layers.Conv1D(filters=16, kernel_size=4, activation='relu', padding='same'))
#model.add(keras.layers.MaxPooling1D(pool_size=1, padding='same'))
model.add(keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu', padding='same'))
model.add(keras.layers.Bidirectional(keras.layers.GRU(units=64, activation='relu', return_sequences=True))) 

stepwise_dense = keras.layers.Dense(len(all_tags), activation='softmax')
stepwise_dense = keras.layers.TimeDistributed(stepwise_dense)

model.add(stepwise_dense)
model.summary()


model.compile('adam','categorical_crossentropy')

model.fit_generator(generator=generate_batches(train_data),steps_per_epoch=len(train_data)/BATCH_SIZE,
                    callbacks=[EvaluateAccuracy()], epochs=10,)

# getting accuracy
#acc = compute_test_accuracy(model)
#print("\nFinal accuracy: %.5f"%acc)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 50)          500100    
                                                                 
 conv1d (Conv1D)             (None, None, 16)          3216      
                                                                 
 conv1d_1 (Conv1D)           (None, None, 32)          1568      
                                                                 
 bidirectional_3 (Bidirecti  (None, None, 128)         37632     
 onal)                                                           
                                                                 
 time_distributed_4 (TimeDi  (None, None, 14)          1806      
 stributed)                                                      
                                                                 
Total params: 544322 (2.08 MB)
Trainable params: 54432

<keras.src.callbacks.History at 0x2e2d32e15b0>

In [20]:
# Step 8.4. Recurrent_dropout + clipnorm
model = keras.models.Sequential()
model.add(keras.layers.InputLayer([None], dtype='int32'))
model.add(keras.layers.Embedding(len(all_words), 50))
model.add(keras.layers.Bidirectional(keras.layers.GRU(units=64, activation='relu', recurrent_dropout=0.5, return_sequences=True))) 

stepwise_dense = keras.layers.Dense(len(all_tags), activation='softmax')
stepwise_dense = keras.layers.TimeDistributed(stepwise_dense)

model.add(stepwise_dense)

model.compile(keras.optimizers.Adam(clipnorm=1.0),'categorical_crossentropy')

model.fit_generator(generator=generate_batches(train_data),steps_per_epoch=len(train_data)/BATCH_SIZE,
                    callbacks=[EvaluateAccuracy()], epochs=5,)

# getting accuracy
acc = compute_test_accuracy(model)
print("\nFinal accuracy: %.5f"%acc)

Epoch 1/5
Measuring validation accuracy...

Validation accuracy: 0.95383

Epoch 2/5
Measuring validation accuracy...

Validation accuracy: 0.95959

Epoch 3/5
Measuring validation accuracy...

Validation accuracy: 0.96247

Epoch 4/5
Measuring validation accuracy...

Validation accuracy: 0.96466

Epoch 5/5
Measuring validation accuracy...

Validation accuracy: 0.96554


Final accuracy: 0.96554


<b>Conclusion:</b>

| method          | accuracy |
|-----------------|----------|
|SimpleRNN        |  96%     |
|LSTM             |  96%     |
|GRU              |  96%     |
|More layers      |  96%     |
|recurrent_dropout|  96%     |