# Setting up the Notebook

In [1]:
#Import the necessary modules
import numpy as np, math, string, pandas as pan, time
import tensorflow as tf
from tensorflow import keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Dense, Embedding, TimeDistributed
from keras_preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score


Another good tool: https://towardsdatascience.com/named-entity-recognition-ner-using-keras-bidirectional-lstm-28cd3f301f54

# Parameters and Loading Functions

In [2]:

#Parameters
np.random.seed(2018); learning_rate = 0.01; momentum = 0.9
activation = 'selu'; out_act = 'softmax'; opt = 'adam'
n_units = 1000; batch_size = 32; punctuation = set(string.punctuation)
input_shape = 75; epochs = 1; validation_split = 0.10; output_dim = 20


In [3]:
 
def load_data():
    text_data = open('Data/train.txt', 'r').readlines()
    text_data = [text_data[k].replace('\t', ' ').split() for k in range(0, len(text_data))]
    print('Text example after being split up is a list of words, then parts-of-speech, then the tag', text_data[0:3], '\n')
    index = range(0, len(text_data), 3)
    
    #Transforming data to matrix format for neural network
    input_data =  list()
    for i in range(1, len(index)-1):
        rows = text_data[index[i-1]:index[i]]
        sentence_no = np.array([i for i in np.repeat(i, len(rows[0]))], dtype=str)
        rows.append(np.array(sentence_no))
        rows = np.array(rows).T
        input_data.append(rows)
    
    input_data = pan.DataFrame(np.concatenate([input_data[j] for j in range(0,len(input_data))]), 
                           columns=['word', 'pos', 'tag', 'sent_no'])
    print('input_data as a dataframe/matrix', input_data.head(), '\n')
    
    labels, vocabulary = list(set(input_data['tag'].values)), list(set(input_data['word'].values))
    vocabulary.append('endpad'); vocab_size = len(vocabulary); label_size = len(labels)
    
    aggregate_function = lambda input: [(word, pos, label) for word, pos, label in zip(input['word'].values.tolist(),
                                                      input['pos'].values.tolist(),
                                                       input['tag'].values.tolist())]
                           
    grouped_input_data= input_data.groupby('sent_no').apply(aggregate_function)
    print('grouped_input_data', grouped_input_data[0:5], '\n')
    sentences = [sentence for sentence in grouped_input_data]
    word_dictionary = {word: i for i, word in enumerate(vocabulary)}
    label_dictionary = {label: i for i, label in enumerate(labels)}
    output_dictionary = {i: labels for i, labels in enumerate(labels)}
    x = [[word_dictionary[word[0]] for word in sent] for sent in sentences]    
    x = pad_sequences(maxlen=input_shape, sequences=x, padding='post', value=0)
    y = [[label_dictionary[word[2]] for word in sent] for sent in sentences]  
    y = pad_sequences(maxlen=input_shape, sequences=y, padding='post', value=0)
    y = [np_utils.to_categorical(label, num_classes=label_size) for label in y]            
    return x, y, output_dictionary, vocab_size, label_size


## Outputs of the above functions for clarity

In [4]:
x, y, output_dictionary, vocab_size, label_size = load_data()

print('output_dict', output_dictionary, '\n')
print('vocab size', vocab_size, '\n')
print('label size', label_size, '\n')
print('x', x[0], '\n')
print('x.shape', x.shape, '\n')
print('y.shape', len(y[0]), '\n')
print('y', y[0], '\n')

Text example after being split up is a list of words, then parts-of-speech, then the tag [['played', 'on', 'Monday', '(', 'home', 'team', 'in', 'CAPS', ')', ':'], ['VBD', 'IN', 'NNP', '(', 'NN', 'NN', 'IN', 'NNP', ')', ':'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']] 

input_data as a dataframe/matrix      word  pos tag sent_no
0  played  VBD   O       1
1      on   IN   O       1
2  Monday  NNP   O       1
3       (    (   O       1
4    home   NN   O       1 

grouped_input_data sent_no
1        [(played, VBD, O), (on, IN, O), (Monday, NNP, ...
10       [(SAN, NNP, B-ORG), (FRANCISCO, NNP, I-ORG), (...
100      [(VfB, NNP, B-ORG), (Stuttgart, NNP, I-ORG), (...
1000     [(Results, NNS, O), (of, IN, O), (Major, NNP, ...
10000    [(Hartlepool, NNP, B-ORG), (2, CD, O), (Fulham...
dtype: object 

output_dict {0: 'sO', 1: 'B-LOC', 2: 'I-ORG', 3: 'I-LOC', 4: 'B-MISC', 5: 'I-MISC', 6: 'B-PER', 7: 'I-PER', 8: 'O', 9: 'B-ORG'} 

vocab size 24340 

label size 10 

x [ 6397 23757 10339 

So, each instance of x is a *padded* sentence of integers corresponding to the words. Each y is a $75 \times 10$ matrix, where each of the 75 lists of length 10 is one-hot encoding of the tag.

# Bidirectional LSTM

## Defining and Training

In [5]:
    
def train_brnn_keras():

    train_end = int(math.floor(len(x)*.80))
    train_x, train_y = x[0:train_end] , np.array(y[0:train_end])
    test_x, test_y = x[train_end:], np.array(y[train_end:])
    
    def create_brnn():
        model = Sequential()
        # The Embedding is to transform the input into the correct shape (batch_size, sent_length, output_dim)
        model.add(Embedding(input_dim=vocab_size+1, output_dim=output_dim,
                            input_length=input_shape, mask_zero=True))
        model.add(Bidirectional(LSTM(units=n_units, activation=activation,
                                     return_sequences=True)))
        # The TimeDistributed is because we are doing a Many-to-Many input and output scheme. So, this will provide
        # an output (a one-hot encoding of the tag) for each integer (representing a word in the sentence)
        model.add(TimeDistributed(Dense(label_size, activation=out_act)))
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        model.summary()
        return model

    lstm_model = create_brnn()
    lstm_model.fit(train_x, train_y, epochs=epochs, shuffle=True, batch_size=batch_size, verbose=1)

    for start, end in zip(range(0, 10, 1), range(1, 11, 1)):
        y_predict = lstm_model.predict(test_x[start:end])
        input_sequences, output_sequences =  [], []
        for i in range(0, len(y_predict[0])): 
            output_sequences.append(np.argmax(y_predict[0][i]))
            input_sequences.append(np.argmax(test_y[start][0]))
        
        print('Test Accuracy: ' + str(lstm_model.evaluate(test_x[start:end], test_y[start:end])))
        output_sequences = ' '.join([output_dictionary[key] for key in output_sequences]).split()
        input_sequences = ' '.join([output_dictionary[key] for key in input_sequences]).split()
        output_input_comparison = pan.DataFrame([output_sequences, input_sequences]).T
        print(output_input_comparison)

In [6]:
lstm_model = train_brnn_keras()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 75, 20)            486820    
                                                                 
 bidirectional (Bidirectiona  (None, 75, 2000)         8168000   
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 75, 10)           20010     
 ibuted)                                                         
                                                                 
Total params: 8,674,830
Trainable params: 8,674,830
Non-trainable params: 0
_________________________________________________________________
Test Accuracy: [0.00014949627802707255, 1.0]
    0  1
0   O  O
1   O  O
2   O  O
3   O  O
4   O  O
.. .. ..
70  O  O
71  O  O
72  O  O
73  O  O
74  O  O

[75 rows x 2 columns]
Te