# EXPERIMENT NO 8

## NAME ENTITY RECOGNITION

### IMPORTING LIBRARIES

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### READING THE DATASET

In [2]:
data = pd.read_csv("ner_dataset.csv", encoding="latin1")
data = data.fillna(method="ffill")
data.head(20)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


*Essential info about tagged entities*:
- geo = Geographical Entity
- org = Organization
- per = Person
- gpe = Geopolitical Entity
- tim = Time indicator
- art = Artifact
- eve = Event
- nat = Natural Phenomenon

In [4]:
print("Unique words in corpus:", data['Word'].nunique())
print("Unique tags in corpus:", data['Tag'].nunique())

Unique words in corpus: 35178
Unique tags in corpus: 17


In [3]:
words = list(set(data["Word"].values))
words.append("ENDPAD")
num_words = len(words)

In [5]:
tags = list(set(data["Tag"].values))
num_tags = len(tags)

### CLASS TO MERGE SENTENCE &CORRESPONDING TAGS

In [6]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            return [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(
            ), s["POS"].values.tolist(), s["Tag"].values.tolist())]

        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [7]:
getter = SentenceGetter(data)
sentences = getter.sentences

#### TEST FOR A SENTENCE

In [8]:
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

### MAPPING THE SENTENCE AND TAGS

In [9]:
word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

#### CHECKING EVERY WORD INDEXED OR NOT

In [10]:
word2idx

{'Bychkova': 1,
 'victorious': 2,
 'struggled': 3,
 'ex-President': 4,
 'Rehnquist': 5,
 'Uganda': 6,
 're-tried': 7,
 'Welfare': 8,
 'intent': 9,
 'stance': 10,
 'Depending': 11,
 'pro-rebel': 12,
 'Catalonians': 13,
 'stalls': 14,
 'gored': 15,
 'jeopardy': 16,
 'epitomizes': 17,
 'Bernie': 18,
 'Price': 19,
 'sets': 20,
 'Cayes': 21,
 'however': 22,
 "'T": 23,
 'immorality': 24,
 'climatic': 25,
 'touch': 26,
 'vivid': 27,
 'Shaffi': 28,
 'Sudan': 29,
 '3,600': 30,
 'carmaker': 31,
 'runway': 32,
 '(': 33,
 'due': 34,
 'Masorin': 35,
 'except': 36,
 'consultation': 37,
 'usually': 38,
 'rebuked': 39,
 'connect': 40,
 'defrauding': 41,
 'cease-fire': 42,
 'Husseindoust': 43,
 'hockey': 44,
 'hay': 45,
 'Elliott': 46,
 'genes': 47,
 'shareholders': 48,
 'foot': 49,
 'quails': 50,
 'decisions': 51,
 'Seche': 52,
 'Kane': 53,
 'representing': 54,
 'Nuristani': 55,
 'Monument': 56,
 'welcome': 57,
 'disappointment': 58,
 'Sunni-dominant': 59,
 'pegs': 60,
 'VanAllen': 61,
 'conspicuous':

### PADDING THE SENTENCE

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 50

X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len,
                  sequences=X,
                  padding="post",
                  value=num_words - 1)

y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len,
                  sequences=y,
                  padding="post",
                  value=tag2idx["O"])

### MODEL PRPARATION

In [13]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)

#### BUILDING THE MODEL (LSTM) AND COMPILING IT

In [14]:
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional

In [15]:
input_word = Input(shape=(max_len, ))
model = Embedding(input_dim=num_words, output_dim=50,
                  input_length=max_len)(input_word)
model = SpatialDropout1D(0.1)(model)
model = Bidirectional(
    LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(num_tags, activation="softmax"))(model)
model = Model(input_word, out)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 50, 50)            1758950   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 50, 50)            0         
_________________________________________________________________
bidirectional (Bidirectional (None, 50, 200)           120800    
_________________________________________________________________
time_distributed (TimeDistri (None, 50, 17)            3417      
Total params: 1,883,167
Trainable params: 1,883,167
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

#### TRAINING THE MODEL

In [17]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [19]:
model.evaluate(x_test, y_test)



[0.04835232572507321, 0.9855588]

In [None]:
chkpt = ModelCheckpoint("model_weights.h5",
                        monitor='val_loss',
                        verbose=1,
                        save_best_only=True,
                        save_weights_only=True,
                        mode='min')

early_stopping = EarlyStopping(monitor='val_accuracy', verbose=0, mode='max')

history = model.fit(x=x_train,
                    y=y_train,
                    validation_data=(x_test, y_test),
                    batch_size=32,
                    epochs=3,
                    verbose=1)

### MODEL EVALUATION

In [19]:
model.evaluate(x_test, y_test)



[0.04835232572507321, 0.9855588]

In [29]:
i = np.random.randint(0, x_test.shape[0])  # 659
p = model.predict(np.array([x_test[i]]))
p = np.argmax(p, axis=-1)
y_true = y_test[i]
print("{:15}{:5}\t {}\n".format("Word", "True", "Pred"))
print("-" * 30)
for w, true, pred in zip(x_test[i], y_true, p[0]):
    print("{:15}{}\t{}".format(words[w - 1], tags[true], tags[pred]))

Word           True 	 Pred

------------------------------
The            O	O
United         B-org	B-org
Nations        I-org	I-org
has            O	O
been           O	O
under          O	O
fire           O	O
for            O	O
failing        O	O
to             O	O
stop           O	O
ongoing        O	O
ethnic         O	O
violence       O	O
in             O	O
Ituri          B-geo	B-geo
.              O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surrey         O	O
Surre

# ---X---X---X---X---X---X---X---X---X---X---X---X---X---X---X---X---X---X---X---X---X---