In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Load the token-level NER corpus (columns shown in the preview below:
# 'Sentence #', 'Word', 'POS', 'Tag').
data = pd.read_csv('/content/ner_dataset.csv', encoding = 'latin1')
# Forward-fill missing cells in every column.
# NOTE(review): presumably only 'Sentence #' is sparse (set on the first token
# of each sentence) -- confirm against the raw file.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')`, and
# rebinding instead of `inplace=True` keeps the cell safe to re-run.
data = data.ffill()
# Preview the first and last five rows. `DataFrame.append` was removed in
# pandas 2.0; `pd.concat` is the supported equivalent.
pd.concat([data.head(), data.tail()])

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [None]:
# The POS column is not used by any later cell. Rebinding (rather than
# dropping in place) together with errors='ignore' makes this cell idempotent:
# re-running it no longer raises KeyError because 'POS' is already gone.
data = data.drop(columns = ['POS'], errors = 'ignore')

In [None]:
# Distinct words and tags in order of first appearance.
# `list(set(...))` yields an order that depends on per-process string hashing,
# so the word/tag ids derived from these lists changed between kernel
# sessions; `Series.unique()` returns the same distinct values in a
# deterministic order.
words = data['Word'].unique().tolist()
num_words = len(words)
tags = data['Tag'].unique().tolist()
n_tags = len(tags)
print(num_words, n_tags)

35178 17


In [None]:
class Grouping:
    """Collect the rows of a token table into per-sentence (word, tag) lists.

    Attributes:
        data: the frame passed to the constructor (kept by reference).
        empty: initialised to False; not read anywhere inside this class.
        group: result of grouping `data` on 'Sentence #' and applying the
            pairing helper to each group.
        sentence: the values of `group` as a plain list, in the order the
            grouping yields them.
    """

    @staticmethod
    def _pair_words_with_tags(rows):
        """Return the (word, tag) tuples for the rows of one sentence."""
        return list(zip(rows['Word'].tolist(), rows['Tag'].tolist()))

    def __init__(self, data):
        """Group `data` (needs 'Sentence #', 'Word' and 'Tag' columns)."""
        self.data = data
        self.empty = False
        per_sentence = self.data.groupby('Sentence #')
        self.group = per_sentence.apply(self._pair_words_with_tags)
        self.sentence = list(self.group)


In [None]:
# Group the token table by sentence once; `sentences` is the list of
# per-sentence [(word, tag), ...] lists consumed by the encoding cell below.
getter = Grouping(data)
sentences = getter.sentence

In [None]:
# Word ids start at 1 and follow the order of `words`, so id 0 is never
# assigned to a vocabulary entry.
word2idx = dict(zip(words, range(1, len(words) + 1)))
# Tag ids start at 0 and follow the order of `tags`.
tag2idx = dict(zip(tags, range(len(tags))))

In [None]:
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
# Every sentence is padded (at the end) to this many tokens.
max_len = 140

# Encode each sentence as a list of word ids.
X = [[word2idx[word] for word, _ in s] for s in sentences]
# Pad with id 0. word2idx hands out ids 1..num_words, so 0 is the only id that
# does not belong to a real word; the previous pad value (num_words - 1) was
# the id of an actual vocabulary entry, making padding indistinguishable from
# that word.
X = pad_sequences(X, maxlen = max_len, padding = 'post', value = 0)

# Encode the tags the same way; padded positions are labelled 'O'.
y = [[tag2idx[tag] for _, tag in s] for s in sentences]
y = pad_sequences(y, maxlen = max_len, padding = 'post', value = tag2idx['O'])
# One-hot encode each padded tag sequence (one array per sentence).
y = [to_categorical(i, num_classes = n_tags) for i in y]


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the sentences for testing; random_state fixes the shuffle so
# the split is the same on every run for the same X and y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 44)

In [None]:
input_dim = num_words + 1 # word2idx assigns ids 1..num_words, so the embedding table needs num_words + 1 rows to cover ids 0..num_words.
# Size of each embedding vector.
# NOTE(review): this is tied to max_len, but embedding size and sequence length
# are independent quantities -- confirm the coupling is intentional.
output_dim = max_len
# Number of token ids per input sequence.
input_length = max_len

In [None]:
from tensorflow.keras import Model, Sequential, Input
from tensorflow.keras.layers import Embedding, Dense, LSTM,Dropout
from tensorflow.keras.layers import Bidirectional, TimeDistributed, SpatialDropout1D

In [None]:
# BiLSTM tagger built with the functional API:
# token ids -> embeddings -> dropout -> bidirectional LSTM -> per-token softmax.
# The input tensor is named `inputs` so the builtin `input()` is no longer
# shadowed, and its length comes from `max_len` instead of a second literal 140.
inputs = Input(shape = (max_len,))
# Intermediate tensors use `x`; `model` is bound only once, to the final Model,
# so the name never refers to a half-built layer output.
x = Embedding(input_dim = input_dim, output_dim = output_dim, input_length = input_length)(inputs)
x = Dropout(0.2)(x)
# return_sequences=True keeps one output vector per token for the tagger head.
x = Bidirectional(LSTM(units = 100, recurrent_dropout = 0.1, return_sequences = True))(x)
# One softmax over the n_tags classes at every time step.
out = TimeDistributed(Dense(units = n_tags, activation = 'softmax'))(x)
model = Model(inputs, out)

In [None]:
# Targets are one-hot tag vectors per token, hence categorical cross-entropy.
# NOTE(review): 'accuracy' is averaged over all max_len positions, including
# the padded ones (which are labelled 'O' when y is padded), so it likely
# overstates tagging quality on real tokens.
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 140)]             0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 140, 140)          4925060   
_________________________________________________________________
dropout_6 (Dropout)          (None, 140, 140)          0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 140, 200)          192800    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 140, 17)           3417      
Total params: 5,121,277
Trainable params: 5,121,277
Non-trainable params: 0
_________________________________________________________________


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once validation accuracy has not improved for 2 consecutive epochs and
# roll back to the best weights seen. `fit` documents `callbacks` as a list of
# callback instances, so the single callback is wrapped in one.
callbacks = [EarlyStopping(monitor = 'val_accuracy', patience = 2, verbose = 1, mode = 'max', restore_best_weights = True)]

# y_train is a list of per-sentence one-hot arrays; stack it into one array for
# Keras. validation_split holds out 10% of the training data for monitoring.
history = model.fit(X_train, np.array(y_train), batch_size = 30, validation_split = 0.1, epochs = 20, callbacks = callbacks, verbose = 1)


Epoch 1/2

In [None]:
# Loss and accuracy on the held-out sentences; y_test is a list of per-sentence
# one-hot arrays, so it is stacked into a single array first.
model.evaluate(X_test, np.array(y_test))

In [None]:
# Show true vs. predicted tags for one randomly chosen test sentence.
i = np.random.randint(0, X_test.shape[0])
# predict expects a batch, so wrap the single sequence in an outer array.
p = model.predict(np.array([X_test[i]]))
p = np.argmax(p, axis = -1)
# Decode only the chosen sample's one-hot labels; the previous version stacked
# and arg-maxed the entire test set just to index a single row.
y_true = np.argmax(y_test[i], axis = -1)
print("{:15}{:5}\t {} \n".format("Word","True","Pred"))
print("-"*30)
for w, true, pred in zip(X_test[i], y_true, p[0]):
    # word2idx starts at 1, so id 0 never denotes a vocabulary word; without
    # this guard words[0 - 1] would silently print the last word in the list.
    if w == 0:
        continue
    # Word ids are 1-based (see word2idx), hence the -1 when indexing `words`.
    print("{:15}{}\t{}".format(words[w-1],tags[true],tags[pred]))