In [1]:
import pandas as pd
from tensorflow import keras
import numpy as np

In [3]:
df = pd.read_csv('/content/ner_dataset.csv', encoding='unicode-escape')
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [4]:
tags= df.Tag.unique()
tags

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat', nan], dtype=object)

In [5]:
id2tag = dict(enumerate(tags))
tag2id = { v : k for k,v in id2tag.items() }

id2tag[0]

'O'

In [6]:
vocab = set(df['Word'].apply(lambda x: x.lower()))
id2word = { i+1 : v for i,v in enumerate(vocab) }
id2word[0] = '<UNK>'
vocab.add('<UNK>')
word2id = { v : k for k,v in id2word.items() }

In [7]:
X,Y = [], []
s,t = [], []
for i,row in df[['Sentence #', 'Word', 'Tag']].iterrows():
    if pd.isna(row['Sentence #']):
        s.append(row['Word'])
        t.append(row['Tag'])
    else:
        if len(s)>0:
            X.append(s)
            Y.append(t)
        s,t = [row['Word']], [row['Tag']]

X.append(s)
Y.append(t)

In [8]:
def vectorize(seq):
    return [word2id[x.lower()] for x in seq]

def tagify(seq):
    return [tag2id[x] for x in seq]

Xv = list(map(vectorize,X))
Yv = list(map(tagify,Y))

Xv[0], Yv[0]

([15746,
  1282,
  6558,
  1130,
  2432,
  13148,
  18731,
  102,
  25981,
  24364,
  100,
  18752,
  14301,
  8512,
  7772,
  24364,
  10426,
  1282,
  12596,
  22865,
  13560,
  7262,
  16358,
  2881],
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0])

In [9]:
X_data = keras.preprocessing.sequence.pad_sequences(Xv,padding='post')
Y_data = keras.preprocessing.sequence.pad_sequences(Yv,padding='post')

##Defining Token Classification Network

In [11]:


maxlen = X_data.shape[1]
vocab_size = len(vocab)
num_tags = len(tags)
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size, 300, input_length=maxlen),
    keras.layers.Bidirectional(keras.layers.LSTM(units=100, activation='tanh', return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.LSTM(units=100, activation='tanh', return_sequences=True)),
    keras.layers.TimeDistributed(keras.layers.Dense(num_tags, activation='softmax'))
])
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['acc'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 104, 300)          8073000   
                                                                 
 bidirectional (Bidirection  (None, 104, 200)          320800    
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 104, 200)          240800    
 onal)                                                           
                                                                 
 time_distributed (TimeDist  (None, 104, 18)           3618      
 ributed)                                                        
                                                                 
Total params: 8638218 (32.95 MB)
Trainable params: 8638218 (32.95 MB)
Non-trainable params: 0 (0.00 Byte)
________________

In [None]:
model.fit(X_data, Y_data)



##Testing the Result

In [None]:
sent = 'John Smith went to Paris to attend a conference in cancer development institute'
words = sent.lower().split()
v = keras.preprocessing.sequence.pad_sequences([[word2id[x] for x in words]],padding='post',maxlen=maxlen)
res = model(v)[0]

In [None]:
r = np.argmax(res.nupy(), axis=1)
for i,w in zip(r,words):
    print(f"{w} -> {id2tag[i]}")