In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# **References** - 
1. https://towardsdatascience.com/named-entity-recognition-ner-using-keras-bidirectional-lstm-28cd3f301f54

2. https://www.kaggle.com/dvircohen0/ner-lstm/comments?select=ner_dataset.csv

# **Load and Prepare the data**

In [2]:
import pandas as pd
data = pd.read_csv('/content/gdrive/MyDrive/Sequence Learning Assignments/ner_dataset.csv', encoding = "ISO-8859-1")
print(data.shape)
print(data.head(30))

(1048575, 4)
     Sentence #           Word  POS    Tag
0   Sentence: 1      Thousands  NNS      O
1           NaN             of   IN      O
2           NaN  demonstrators  NNS      O
3           NaN           have  VBP      O
4           NaN        marched  VBN      O
5           NaN        through   IN      O
6           NaN         London  NNP  B-geo
7           NaN             to   TO      O
8           NaN        protest   VB      O
9           NaN            the   DT      O
10          NaN            war   NN      O
11          NaN             in   IN      O
12          NaN           Iraq  NNP  B-geo
13          NaN            and   CC      O
14          NaN         demand   VB      O
15          NaN            the   DT      O
16          NaN     withdrawal   NN      O
17          NaN             of   IN      O
18          NaN        British   JJ  B-gpe
19          NaN         troops  NNS      O
20          NaN           from   IN      O
21          NaN           that   DT      

In [3]:
from itertools import chain
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok


token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

In [4]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,25761,0
1,,of,IN,O,3981,0
2,,demonstrators,NNS,O,1004,0
3,,have,VBP,O,32966,0
4,,marched,VBN,O,8056,0


In [5]:
data.isnull().sum()

Sentence #    1000616
Word                0
POS                 0
Tag                 0
Word_idx            0
Tag_idx             0
dtype: int64

In [6]:
data_fillna = data.fillna(method='ffill')

In [7]:
data_fillna.isnull().sum()

Sentence #    0
Word          0
POS           0
Tag           0
Word_idx      0
Tag_idx       0
dtype: int64

In [10]:
data_group = data_fillna.groupby(['Sentence #'],as_index=False)[['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']].agg(lambda x: list(x))
# Visualise data
data_group.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[25761, 3981, 1004, 32966, 8056, 18956, 13172,...","[0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 0, 0, ..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[1993, 8826, 19590, 1112, 21131, 268, 13155, 1...","[14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[14144, 33961, 21548, 18153, 14726, 6544, 6066...","[0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 11,..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]","[22432, 32008, 14811, 15235, 32549, 29261, 242...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...","[20411, 580, 23403, 32424, 11771, 6639, 222, 3...","[4, 0, 0, 16, 6, 0, 1, 0, 4, 0, 14, 0, 14, 0, ..."


In [12]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical



def get_pad_train_test_val(data_group, data):

    #get max token and tag length
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))

    #Pad tokens (X var)    
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    
    #Split train, test and validation set
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_,tags_,test_size = 0.25,train_size =0.75, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )
    
    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(data_group, data)

train_tokens length: 32372 
train_tokens length: 32372 
test_tokens length: 4796 
test_tags: 4796 
val_tokens: 10791 
val_tags: 10791


In [26]:
print(train_tokens[0])

[15049 15237 11884 19223 10834 15956 31562 10693  7306 30940 15235 19886
  1302   268 16016 31562 18832 20486  5704 10578 17173 27145  7481 29033
 21891 23177   268 17317  6066 14926 18813 35177 35177 35177 35177 35177
 35177 35177 35177 35177 35177 35177 35177 35177 35177 35177 35177 35177
 35177 35177 35177 35177 35177 35177 35177 35177 35177 35177 35177 35177
 35177 35177 35177 35177 35177 35177 35177 35177 35177 35177 35177 35177
 35177 35177 35177 35177 35177 35177 35177 35177 35177 35177 35177 35177
 35177 35177 35177 35177 35177 35177 35177 35177 35177 35177 35177 35177
 35177 35177 35177 35177 35177 35177 35177 35177]


# **Prepare the input parameters for Embedding Layer**

In [13]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model

from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

input_dim = len(list(set(data['Word'].to_list())))+1
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)
print('input_dim: ', input_dim, '\noutput_dim: ', output_dim, '\ninput_length: ', input_length, '\nn_tags: ', n_tags)

input_dim:  35179 
output_dim:  64 
input_length:  104 
n_tags:  17


# **Build Model**

In [14]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

In [19]:
def get_bilstm_lstm_model():
  model = Sequential()
  
  # Add Embedding layer
  model.add(Embedding(input_dim=input_dim, output_dim=140, input_length=140))
  model.add(Dropout(0.2))
  
  #Add bidirectional LSTM
  model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.1), merge_mode = 'concat'))
  
  # Add timeDistributed Layer
  model.add(TimeDistributed(Dense(n_tags, activation="softmax")))
  
  #Optimiser
  # adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)
  
  # Compile model
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()
  
  
  return model

In [20]:
def train_model(X, y, model):
    loss = list()
    for i in range(25):
        # fit model for one epoch on this sequence
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

In [21]:
results = pd.DataFrame()
model_bilstm_lstm = get_bilstm_lstm_model()
plot_model(model_bilstm_lstm)
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 140, 140)          4925060   
_________________________________________________________________
dropout_1 (Dropout)          (None, 140, 140)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 140, 128)          104960    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 140, 17)           2193      
Total params: 5,032,213
Trainable params: 5,032,213
Non-trainable params: 0
_________________________________________________________________


In [22]:
model_bilstm_lstm.save('/content/gdrive/MyDrive/Sequence Learning Assignments/NER_model.h5')

In [23]:
from tensorflow import keras
NER_model =keras.models.load_model('/content/gdrive/MyDrive/Sequence Learning Assignments/NER_model.h5')



In [24]:
NER_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 140, 140)          4925060   
_________________________________________________________________
dropout_1 (Dropout)          (None, 140, 140)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 140, 128)          104960    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 140, 17)           2193      
Total params: 5,032,213
Trainable params: 5,032,213
Non-trainable params: 0
_________________________________________________________________
