# Name Entity Recognition using Deep Learning

* Upload the lab_resources and NERC_nn files to you Drive Account:
  * Lab_resource: https://www.cs.upc.edu/~turmo/mud/lab/lab_resources.zip
  * NERC_nn code: https://www.cs.upc.edu/~turmo/mud/lab/06-NERC-nn.zip
  
* Before running the code, ensure that your Google Colab is set to use GPU:
  * Edit → Notebook Settings
* Mount your Drive disk unit:
  * Left-side menu → Files → Mount drive (the icon that looks like a folder with the Drive logo).


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Define the paths to the data and utils in your Drive unit:

In [2]:
import os
utilsdir='/content/drive/MyDrive/06-NERC-nn 2/'

In [3]:
evaluatordir= os.path.join(utilsdir,'util/')
traindir=os.path.join(utilsdir,'data/train')
validationdir=os.path.join(utilsdir,'data/devel')
testdir=os.path.join(utilsdir,'data/test')
pretrained_model=os.path.join(utilsdir,'GoogleNews-vectors-negative300.bin')
modelname ='model'
outfile ='out.txt'

In [4]:
!pip install tensorflow-addons
import sys
sys.path.insert(1,utilsdir) # Path to the utils folder on your Google Drive disk
sys.path.insert(1,evaluatordir) # Path to the evaluator folder on your Google Drive disk

Collecting tensorflow-addons
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/611.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m327.7/611.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.23.0 typeguard-2.13.3


In [5]:
from contextlib import redirect_stdout

from tensorflow.keras import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, GRU, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, concatenate, Softmax
#from tensorflow_addons.text.crf_wrapper import CRFModelWrapper


#from codemaps_sufpref import *
#from codemaps_lc import *
#from codemaps_lcpos import *
#from codemaps_posNOlc import *
#from codemaps_NOlc_pos_len import *
#from codemaps_NOlc_pos_len_pct import *
#from codemaps_NOlc_pos_len_NOpct_cap import *
from codemaps_NOlc_pos_len_pct_caps_sufpref2 import *

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [6]:
import random
import numpy as np
import tensorflow as tf
import os
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)
os.environ['PYTHONHASHSEED'] = '0'

In [7]:
import gensim.downloader as api
from gensim.models import KeyedVectors

# Load Google News pre-trained word2vec embeddings
#word2vec_model = KeyedVectors.load_word2vec_format(pretrained_model, binary=True)


# Download the GloVe model
glove_model = api.load('glove-wiki-gigaword-300')



In [8]:
def create_embedding_matrix(word_index, pretrained_model, embedding_dim=300):
    embedding_matrix = np.zeros((len(word_index), embedding_dim))
    for word, i in word_index.items():
        try:
            embedding_vector = pretrained_model[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            pass  # Words not found in the embedding index will be all-zeros.
    return embedding_matrix

In [9]:
def build_network(codes) :

   word_index = codes.word_index
   # Create embedding matrix from pre-trained embeddings
   #embedding_matrix = create_embedding_matrix(word_index, word2vec_model, embedding_dim=300)
   embedding_matrix = create_embedding_matrix(word_index, glove_model, embedding_dim=300)

   # sizes
   n_words = codes.get_n_words()
   n_sufs = codes.get_n_sufs()
   n_prefs = codes.get_n_prefs()
   n_sufs2 = codes.get_n_sufs2()
   n_prefs2 = codes.get_n_prefs2()
   #n_lc_words = codes.get_n_lc_words()
   n_pos = codes.get_n_pos()
   n_len = codes.get_n_len()
   n_punct = codes.get_n_punct()
   n_caps = codes.get_n_cap()
   n_labels = codes.get_n_labels()
   max_len = codes.maxlen


   #####################################################
   # word embeddings pretrained model
   inptW = Input(shape=(max_len,))
   embW = Embedding(input_dim=n_words, output_dim=300, weights=[embedding_matrix],
                      input_length=max_len, mask_zero=False)(inptW)
   embW = Dropout(0.1)(embW)

   '''# word embeddings
   inptW = Input(shape=(max_len,))
   embW = Embedding(input_dim=n_words, output_dim=200,
                      input_length=max_len, mask_zero=False)(inptW)
   embW = Dropout(0.1)(embW)'''

   # suf embeddings
   inptS = Input(shape=(max_len,))
   embS = Embedding(input_dim=n_sufs, output_dim=200,
                      input_length=max_len, mask_zero=False)(inptS)
   embS = Dropout(0.1)(embS)

   # pref embeddings
   inptP = Input(shape=(max_len,))
   embP = Embedding(input_dim=n_prefs, output_dim=200,
                      input_length=max_len, mask_zero=False)(inptP)
   embP = Dropout(0.1)(embP)

   # suf embeddings
   inptS2 = Input(shape=(max_len,))
   embS2 = Embedding(input_dim=n_sufs2, output_dim=200,
                      input_length=max_len, mask_zero=False)(inptS2)
   embS2 = Dropout(0.1)(embS2)

   # pref embeddings
   inptP2 = Input(shape=(max_len,))
   embP2 = Embedding(input_dim=n_prefs2, output_dim=200,
                      input_length=max_len, mask_zero=False)(inptP2)
   embP2 = Dropout(0.1)(embP2)

   '''lc_index = codes.lc_index
   # Create embedding matrix from pre-trained embeddings
   #embedding_matrix = create_embedding_matrix(word_index, govel_model, embedding_dim=300)
   embedding_matrix_lc = create_embedding_matrix(lc_index, word2vec_model, embedding_dim=300)

   # lc embeddings pretrained model
   inptLC = Input(shape=(max_len,))
   embLC = Embedding(input_dim=n_lc_words, output_dim=300, weights=[embedding_matrix_lc],
                      input_length=max_len, mask_zero=False)(inptLC)
   embLC = Dropout(0.1)(embLC)'''

   '''# lc embeddings
   inptLC = Input(shape=(max_len,))
   embLC = Embedding(input_dim=n_lc_words, output_dim=200,
                      input_length=max_len, mask_zero=False)(inptLC)
   embLC = Dropout(0.1)(embLC)'''

   # postags embeddings
   inptPOS = Input(shape=(max_len,))
   embPOS = Embedding(input_dim=n_pos, output_dim=200,
                      input_length=max_len, mask_zero=False)(inptPOS)
   embPOS = Dropout(0.1)(embPOS)

   # lenths embeddings
   inptLen = Input(shape=(max_len,))
   embLen = Embedding(input_dim=n_len, output_dim=200,
                      input_length=max_len, mask_zero=False)(inptLen)
   embLen = Dropout(0.1)(embLen)

   # punctuation embeddings
   inptPct = Input(shape=(max_len,))
   embPct = Embedding(input_dim=n_punct, output_dim=200,
                      input_length=max_len, mask_zero=False)(inptPct)
   embPct = Dropout(0.1)(embPct)

   # caps embeddings
   inptCap = Input(shape=(max_len,))
   embCap = Embedding(input_dim=n_caps, output_dim=200,
                      input_length=max_len, mask_zero=False)(inptCap)
   embCap = Dropout(0.1)(embCap)

   ########################################################
   # model concatenation
   #model = concatenate([embW,embS,embP])
   #model = concatenate([embW,embS,embP,embLC])
   #model = concatenate([embW,embS,embP,embLC,embPOS])
   #model = concatenate([embW,embS,embP,embPOS])
   #model = concatenate([embW,embS,embP,embPOS,embLen])
   #model = concatenate([embW,embS,embP,embPOS,embLen,embPct])
   #model = concatenate([embW,embS,embP,embPOS,embLen,embPct,embCap])
   #model = concatenate([embW,embS,embP,embPOS,embLen,embCap])
   model = concatenate([embW,embS,embP,embS2,embP2,embPOS,embLen,embPct,embCap])

   y = Bidirectional(LSTM(units=500, return_sequences=True))(model)  #  biLSTM
   out = TimeDistributed(Dense(n_labels, activation=Softmax()))(y)

   return Model(
        #inputs=[inptW,inptS,inptP], outputs=out
        #inputs=[inptW,inptS,inptP,inptLC], outputs=out
        #inputs=[inptW,inptS,inptP,inptLC,inptPOS], outputs=out
        #inputs=[inptW,inptS,inptP,inptPOS], outputs=out
        #inputs=[inptW,inptS,inptP,inptPOS,inptLen], outputs=out
        #inputs=[inptW,inptS,inptP,inptPOS,inptLen,inptPct], outputs=out
        #inputs=[inptW,inptS,inptP,inptPOS,inptLen,inptPct,inptCap], outputs=out
        #inputs=[inptW,inptS,inptP,inptPOS,inptLen,inptCap], outputs=out
        inputs=[inptW,inptS,inptP,inptS2,inptP2,inptPOS,inptLen,inptPct,inptCap], outputs=out

    )


In [17]:


# load train and validation data
traindata = Dataset(traindir)
valdata = Dataset(validationdir)

# create indexes from training data
max_len = 300 ##
suf_len = 3 ##
pref_len = 3 ##
suf_len2 = 4 ##
pref_len2 = 5 ##
codes  = Codemaps(traindata, max_len, suf_len, pref_len, suf_len2, pref_len2)

# encode datasets
#[Xt,Xts,Xtp] = codes.encode_words(traindata)
#[Xt,Xts,Xtp,Xtlc] = codes.encode_words(traindata)
#[Xt,Xts,Xtp,Xtlc,Xtpos] = codes.encode_words(traindata)
#[Xt,Xts,Xtp,Xtpos] = codes.encode_words(traindata)
#[Xt,Xts,Xtp,Xtpos,Xtlen] = codes.encode_words(traindata)
#[Xt,Xts,Xtp,Xtpos,Xtlen,Xtpct] = codes.encode_words(traindata)
#[Xt,Xts,Xtp,Xtpos,Xtlen,Xtpct,Xtcap] = codes.encode_words(traindata)
#[Xt,Xts,Xtp,Xtpos,Xtlen,Xtcap] = codes.encode_words(traindata)
[Xt,Xts,Xtp,Xts2,Xtp2,Xtpos,Xtlen,Xtpct,Xtcap] = codes.encode_words(traindata)

Yt = codes.encode_labels(traindata)

#[Xv,Xvs,Xvp] = codes.encode_words(valdata)
#[Xv,Xvs,Xvp,Xvlc] = codes.encode_words(valdata)
#[Xv,Xvs,Xvp,Xvlc,Xvpos] = codes.encode_words(valdata)
#[Xv,Xvs,Xvp,Xvpos] = codes.encode_words(valdata)
#[Xv,Xvs,Xvp,Xvpos,Xvlen] = codes.encode_words(valdata)
#[Xv,Xvs,Xvp,Xvpos,Xvlen,Xvpct] = codes.encode_words(valdata)
#[Xv,Xvs,Xvp,Xvpos,Xvlen,Xvpct,Xvcap] = codes.encode_words(valdata)
#[Xv,Xvs,Xvp,Xvpos,Xvlen,Xvcap] = codes.encode_words(valdata)
[Xv,Xvs,Xvp,Xvs2,Xvp2,Xvpos,Xvlen,Xvpct,Xvcap] = codes.encode_words(valdata)
Yv = codes.encode_labels(valdata)

n_tags = codes.get_n_labels()
max_len = codes.maxlen

In [18]:
model = build_network(codes)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
#optimizer = tf.keras.optimizers.Nadam(learning_rate=0.002)

model.compile(optimizer=optimizer ,metrics=["accuracy"], loss="sparse_categorical_crossentropy")
model.build([(None,max_len),(None,max_len),(None,max_len),(None,max_len),(None,max_len),(None,max_len)])

with redirect_stdout(sys.stderr) :
   model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_10 (InputLayer)       [(None, 300)]                0         []                            
                                                                                                  
 input_11 (InputLayer)       [(None, 300)]                0         []                            
                                                                                                  
 input_12 (InputLayer)       [(None, 300)]                0         []                            
                                                                                                  
 input_13 (InputLayer)       [(None, 300)]                0         []                            
                                                                                            

In [None]:
## --------- MAIN PROGRAM -----------
## --
## -- Usage:  train.py ../data/Train ../data/Devel  modelname
## --

# train model
with redirect_stdout(sys.stderr) :
   #model.fit([Xt,Xts,Xtp], Yt, batch_size=32, epochs=10, validation_data=([Xv,Xvs,Xvp],Yv), verbose=1)
   #model.fit([Xt,Xts,Xtp,Xtlc], Yt, batch_size=32, epochs=10, validation_data=([Xv,Xvs,Xvp,Xvlc],Yv), verbose=1)
   #model.fit([Xt,Xts,Xtp,Xtlc,Xtpos], Yt, batch_size=32, epochs=10, validation_data=([Xv,Xvs,Xvp,Xvlc,Xvpos],Yv), verbose=1)
   #model.fit([Xt,Xts,Xtp,Xtpos], Yt, batch_size=32, epochs=10, validation_data=([Xv,Xvs,Xvp,Xvpos],Yv), verbose=1)
   #model.fit([Xt,Xts,Xtp,Xtpos,Xtlen], Yt, batch_size=32, epochs=10, validation_data=([Xv,Xvs,Xvp,Xvpos,Xvlen],Yv), verbose=1)
   #model.fit([Xt,Xts,Xtp,Xtpos,Xtlen,Xtpct,Xtcap], Yt, batch_size=32, epochs=10, validation_data=([Xv,Xvs,Xvp,Xvpos,Xvlen,Xvpct,Xvcap],Yv), verbose=1)
   #model.fit([Xt,Xts,Xtp,Xtpos,Xtlen,Xtcap], Yt, batch_size=32, epochs=10, validation_data=([Xv,Xvs,Xvp,Xvpos,Xvlen,Xvcap],Yv), verbose=1)
   model.fit([Xt,Xts,Xtp,Xts2,Xtp2,Xtpos,Xtlen,Xtpct,Xtcap], Yt, batch_size=32, epochs=10, validation_data=([Xv,Xvs,Xvp,Xvs2,Xvp2,Xvpos,Xvlen,Xvpct,Xvcap],Yv), verbose=1)


# save model and indexs
model.save(modelname)
#codes.save(modelname)
#save_model_and_indexs(model, idx, modelname)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Predict

In [None]:
#import sys
import evaluator

In [None]:
def output_entities(data, preds, outfile) :

   outf = open(outfile, 'w')
   for sid,tags in zip(data.sentence_ids(),preds) :
      inside = False
      for k in range(0,min(len(data.get_sentence(sid)),codes.maxlen)) :
         y = tags[k]
         token = data.get_sentence(sid)[k]

         if (y[0]=="B") :
             entity_form = token['form']
             entity_start = token['start']
             entity_end = token['end']
             entity_type = y[2:]
             inside = True
         elif (y[0]=="I" and inside) :
             entity_form += " "+token['form']
             entity_end = token['end']
         elif (y[0]=="O" and inside) :
             print(sid, str(entity_start)+"-"+str(entity_end), entity_form, entity_type, sep="|", file=outf)
             inside = False

      if inside : print(sid, str(entity_start)+"-"+str(entity_end), entity_form, entity_type, sep="|", file=outf)

   outf.close()

In [None]:
## --------- Evaluator -----------
def evaluation(datadir,outfile) :
   evaluator.evaluate("NER", datadir, outfile)


In [None]:
## --------- MAIN PROGRAM -----------
## --
## -- Usage:  baseline-NER.py target-dir
## --
## -- Extracts Drug NE from all XML files in target-dir
## --

datadir = validationdir

testdata = Dataset(datadir)

#[X,Xs,Xp] = codes.encode_words(testdata)
#[X,Xs,Xp,Xlc] = codes.encode_words(testdata)
#[X,Xs,Xp,Xlc,Xpos] = codes.encode_words(testdata)
#[X,Xs,Xp,Xpos] = codes.encode_words(testdata)
#[X,Xs,Xp,Xpos,Xlen] = codes.encode_words(testdata)
#[X,Xs,Xp,Xpos,Xlen,Xpct] = codes.encode_words(testdata)
#[X,Xs,Xp,Xpos,Xlen,Xpct,Xcap] = codes.encode_words(testdata)
#[X,Xs,Xp,Xpos,Xlen,Xcap] = codes.encode_words(testdata)
[X,Xs,Xp,Xs2,Xp2,Xpos,Xlen,Xpct,Xcap] = codes.encode_words(testdata)


#Y = model.predict([X,Xs,Xp])
#Y = model.predict([X,Xs,Xp,Xlc])
#Y = model.predict([X,Xs,Xp,Xlc,Xpos])
#Y = model.predict([X,Xs,Xp,Xpos])
#Y = model.predict([X,Xs,Xp,Xpos,Xlen])
#Y = model.predict([X,Xs,Xp,Xpos,Xlen,Xpct])
#Y = model.predict([X,Xs,Xp,Xpos,Xlen,Xpct,Xcap])
#Y = model.predict([X,Xs,Xp,Xpos,Xlen,Xcap])
Y = model.predict([X,Xs,Xp,Xs2,Xp2,Xpos,Xlen,Xpct,Xcap])


Y = [[codes.idx2label(np.argmax(w)) for w in s] for s in Y]

# extract entities
output_entities(testdata, Y, outfile)

# evaluate
evaluation(datadir,outfile)


                   tp	  fp	  fn	#pred	#exp	P	R	F1
------------------------------------------------------------------------------
brand             277	  32	  97	 309	 374	89.6%	74.1%	81.1%
drug             1682	 107	 224	1789	1906	94.0%	88.2%	91.0%
drug_n             10	  10	  35	  20	  45	50.0%	22.2%	30.8%
group             560	  82	 127	 642	 687	87.2%	81.5%	84.3%
------------------------------------------------------------------------------
M.avg            -	-	-	-	-	80.2%	66.5%	71.8%
------------------------------------------------------------------------------
m.avg            2529	 231	 483	2760	3012	91.6%	84.0%	87.6%
m.avg(no class)  2590	 170	 422	2760	3012	93.8%	86.0%	89.7%
