<a href="https://colab.research.google.com/github/AyushiKashyapp/NLP/blob/main/NamedEntityRecognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Named Entity Recognition (NER)

Named Entity Recognition is an NLP task that identifies named entities (for example people, organizations and locations) in a piece of text.


In [1]:
import pandas as pd
from google.colab import files

# Upload ner_dataset.csv from the local machine into the Colab runtime.
uploaded = files.upload()

# Read the uploaded CSV and preview the first rows.
data = pd.read_csv('ner_dataset.csv', encoding='unicode_escape')
data.head()

Saving ner_dataset.csv to ner_dataset.csv


Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


# Data Preparation for Neural Networks

Training a Neural Network for the task of Named Entity Recognition (NER).

To feed the data into a neural network, we first build mappings that convert tokens and tags to numerical indices (and back), which are used for both model training and prediction.

In [8]:
from itertools import chain
def get_dict_map(data, token_or_tag):
    """Build index <-> item mappings for the word or tag vocabulary.

    Args:
        data: DataFrame containing the 'Word' and 'Tag' columns.
        token_or_tag: 'token' to build the vocabulary from data['Word'];
            any other value builds it from data['Tag'].

    Returns:
        A tuple (tok2idx, idx2tok): tok2idx maps each unique item to an
        integer index, idx2tok is the inverse mapping.
    """
    column = 'Word' if token_or_tag == 'token' else 'Tag'

    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # indices are identical on every run. Iterating a set of strings instead
    # yields an order that depends on the interpreter's hash seed, which made
    # the mappings change between kernel restarts.
    vocab = list(dict.fromkeys(data[column].to_list()))

    idx2tok = dict(enumerate(vocab))  # Mapping indices to tokens.
    tok2idx = {tok: idx for idx, tok in idx2tok.items()}  # Mapping tokens to indices.
    return tok2idx, idx2tok

# Word vocabulary: word -> index and index -> word.
token2idx, idx2token = get_dict_map(data, token_or_tag='token')
# Tag vocabulary: tag -> index and index -> tag.
tag2idx, idx2tag = get_dict_map(data, token_or_tag='tag')

Transforming the columns in the data to extract the sequential data for our neural network.

In [9]:
# Mapping words and tags to indices.
data['Word_idx'] = data['Word'].map(token2idx)  # Convert words to their respective indices.
data['Tag_idx'] = data['Tag'].map(tag2idx)  # Convert tags to their respective indices.

# 'Sentence #' is only filled on the first row of each sentence, so forward
# fill propagates it to the remaining rows of that sentence.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
data_fillna = data.ffill(axis=0)

# Group the rows by 'Sentence #' and aggregate each column into one list per sentence.
data_group = data_fillna.groupby(
    ['Sentence #'], as_index=False
)[['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']].agg(lambda x: list(x))

The function below pads every sentence and then splits the data into training, validation and test sets. Padding is required because LSTM layers only accept sequences of the same length, so every sentence (represented as a list of integer indices) is padded to the length of the longest sentence.

In [11]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def get_pad_train_test_val(data_group, data):
    """Pad the indexed sentences and split them into train/validation/test sets.

    Args:
        data_group: DataFrame with one row per sentence and list-valued
            'Word_idx' and 'Tag_idx' columns.
        data: token-level DataFrame; its 'Word' column gives the vocabulary size.

    Returns:
        (train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags).
        The token splits come from the padded index array; the tag splits are
        lists of one-hot encoded arrays, one per sentence.

    Relies on the module-level ``tag2idx`` mapping for the padding tag ("O")
    and for the number of tag classes.
    """
    # Vocabulary size; its last index is used as the padding value below.
    n_token = len(set(data['Word'].to_list()))

    # Pad tokens (X var) to ensure they all have the same length.
    # NOTE(review): n_token - 1 is also the index of a real vocabulary word, so
    # padding is indistinguishable from that word. A dedicated pad index would
    # require the Embedding layer's input_dim to grow by one as well.
    tokens = data_group['Word_idx'].tolist()
    maxlen = max(len(s) for s in tokens)
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=n_token - 1)

    # Pad tags (y var) with the "O" tag to the same length, then one-hot encode them.
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]

    # Hold out 10% for test, then split the remainder 75/25 into train/validation.
    tokens_, test_tokens, tags_, test_tags = train_test_split(
        pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(
        tokens_, tags_, test_size=0.25, train_size=0.75, random_state=2020)

    # The second line previously repeated the train_tokens label and count;
    # it now reports the training tags.
    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tags length:', len(train_tags),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )

    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

# Build the padded train / validation / test splits used by the model below.
(
    train_tokens, val_tokens, test_tokens,
    train_tags, val_tags, test_tags,
) = get_pad_train_test_val(data_group, data)

train_tokens length: 32372 
train_tokens length: 32372 
test_tokens length: 4796 
test_tags: 4796 
val_tokens: 10791 
val_tags: 10791


Training Neural Network for NER

In [12]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
# Fix the NumPy and TensorFlow global random seeds.
np.random.seed(1)
tensorflow.random.set_seed(2)

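Model hyper-parameters: the vocabulary size (`input_dim`) and embedding dimension (`output_dim`) for the Embedding layer, the padded sequence length (`input_length`), and the number of tags (`n_tags`) predicted at each time step.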

In [13]:
# Number of distinct tags predicted at every time step.
n_tags = len(tag2idx)
# Size of the embedding vectors (also reused as the LSTM unit count).
output_dim = 64
# Vocabulary size: number of unique entries in the 'Word' column.
input_dim = len(set(data['Word'].to_list()))
# Length of the longest sentence, i.e. the padded sequence length.
input_length = max(len(sentence) for sentence in data_group['Word_idx'].tolist())

Helper function that builds and compiles the BiLSTM + LSTM model and prints a summary of its layers.

In [14]:
def get_bilstm_lstm_model():
  """Build, compile and summarize the Embedding -> BiLSTM -> LSTM -> Dense tagger.

  Reads the module-level ``input_dim``, ``output_dim``, ``input_length`` and
  ``n_tags`` values.

  Returns:
    The compiled Sequential model (its summary is printed as a side effect).
  """
  model = Sequential()

  # Embedding layer: converts input index sequences into dense vectors of fixed size.
  model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

  # Bidirectional LSTM: processes sequences in both forward and backward
  # directions and concatenates the two outputs.
  model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat'))

  # Second LSTM layer for further sequence processing.
  model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

  # TimeDistributed dense layer: classifies every time step into one of n_tags.
  # softmax (instead of relu) makes each time step output a probability
  # distribution over the tags, which is what categorical_crossentropy expects;
  # relu can emit all-zero rows and does not yield normalized class scores.
  model.add(TimeDistributed(Dense(n_tags, activation="softmax")))

  # Compile model
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()

  return model

Helper function to train the NER model.

In [17]:
def train_model(x, y, model, epochs=5, batch_size=1000, validation_split=0.2):
  """Train ``model`` one epoch at a time and collect the training loss.

  Args:
    x: training inputs passed to ``model.fit``.
    y: training targets passed to ``model.fit``.
    model: object exposing a Keras-style ``fit`` method.
    epochs: number of single-epoch ``fit`` calls to make (default 5).
    batch_size: batch size forwarded to ``fit`` (default 1000).
    validation_split: fraction of the data ``fit`` holds out for validation
      (default 0.2).

  Returns:
    List with the training loss of each epoch, in order.
  """
  loss = []
  for _ in range(epochs):
    hist = model.fit(x, y, batch_size=batch_size, verbose=1, epochs=1,
                     validation_split=validation_split)
    # Each fit call runs exactly one epoch, so its history holds a single loss value.
    loss.append(hist.history['loss'][0])

  return loss

In [18]:
# Build the model and draw its architecture diagram.
model_bilstm_lstm = get_bilstm_lstm_model()
plot_model(model_bilstm_lstm)

# Train the model and store the per-epoch training loss as a column of the results table.
results = pd.DataFrame()
epoch_losses = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)
results['with_add_lstm'] = epoch_losses

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 104, 64)           2251392   
                                                                 
 bidirectional_1 (Bidirecti  (None, 104, 128)          66048     
 onal)                                                           
                                                                 
 lstm_3 (LSTM)               (None, 104, 64)           49408     
                                                                 
 time_distributed_1 (TimeDi  (None, 104, 17)           1105      
 stributed)                                                      
                                                                 
Total params: 2367953 (9.03 MB)
Trainable params: 2367953 (9.03 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


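Testing NER on a sample text with spaCy's `en_core_web_sm` pipeline (note: this cell does not use the BiLSTM model trained above).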

In [19]:
import spacy
from spacy import displacy
# Load spaCy's 'en_core_web_sm' pipeline (independent of the Keras model trained above).
nlp = spacy.load('en_core_web_sm')
# Run the pipeline over a short multi-line sample text.
text = nlp('Hi, My name is Ayushi Kashyap \n I am from India \n I am currently studying MSc in Computer Science in University of Galway')
# Render the entities found in the text inline in the notebook output.
displacy.render(text, style = 'ent', jupyter = True)