# Named Entity Recognition (NER)
In this assignment we will perform NER using RNNs.
For the task, we will use the provided dataset which is already split into train/val/test sets. The dataset is tagged using BIO tagging scheme with a total of 17 different tags.
You need to perform the following:
- Read the dataset
- Encode the data as needed
- Create a model and train it using the train set and plot the loss and accuracy on the validation set
- Select the best performing model on the validation set to evalute your model on the test set.
- For this assignment you can show the performance using the accuracy metric (after delaing with padding, is used) and micro and macro F1-scores.

## Read the dataset

In [189]:
import os 
import numpy as np
import pandas as pd
import random as rnd
from tensorflow.keras import Model,Input
from tensorflow.keras.layers import LSTM,Embedding,Dense, TimeDistributed
import tensorflow as tf

In [166]:
def get_vocab(vocab_path, tags_path):
    vocab = {}
    with open(vocab_path) as f:
        for i, l in enumerate(f.read().splitlines()):
            vocab[l] = i  # to avoid the 0
        # loading tags (we require this to map tags to their indices)
    vocab['<PAD>'] = len(vocab) # 35180
    tag_map = {}
    with open(tags_path) as f:
        for i, t in enumerate(f.read().splitlines()):
            tag_map[t] = i 
    
    return vocab, tag_map

In [167]:
def get_params(vocab, tag_map, sentences_file, labels_file):
    sentences = []
    labels = []

    with open(sentences_file) as f:
        for sentence in f.read().splitlines():
            # replace each token by its index if it is in vocab
            # else use index of UNK_WORD
            s = [vocab[token] if token in vocab 
                 else vocab['UNK']
                 for token in sentence.split(' ')]
            sentences.append(s)

    with open(labels_file) as f:
        for sentence in f.read().splitlines():
            # replace each label by its index
            l = [tag_map[label] for label in sentence.split(' ')] # I added plus 1 here
            labels.append(l) 
    return sentences, labels, len(sentences)

In [168]:
vocab, tag_map = get_vocab('NER/words.txt', 'NER/tags.txt')
t_sentences, t_labels, t_size = get_params(vocab, tag_map, 'NER/train/sentences.txt', 'NER/train/labels.txt')
v_sentences, v_labels, v_size = get_params(vocab, tag_map, 'NER/validate/sentences.txt', 'NER/validate/labels.txt')
x_sentences, x_labels, x_size = get_params(vocab, tag_map, 'NER/test/sentences.txt', 'NER/test/labels.txt')

In [169]:
print(tag_map)

{'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-per': 3, 'I-geo': 4, 'B-org': 5, 'I-org': 6, 'B-tim': 7, 'B-art': 8, 'I-art': 9, 'I-per': 10, 'I-gpe': 11, 'I-tim': 12, 'B-nat': 13, 'B-eve': 14, 'I-eve': 15, 'I-nat': 16}


### Vocab mapping

In [170]:
# vocab translates from a word to a unique number
print('vocab["the"]:', vocab["the"])
# Pad token
print('padded token:', vocab['<PAD>'])

vocab["the"]: 9
padded token: 35179


In [171]:
def readfile(path):
    with open(path) as file:
        lines = file.readlines()
        sentences = [line.rstrip() for line in lines]
        return sentences

In [172]:
train_sen = readfile('NER/train/sentences.txt')

test_sen = readfile('NER/test/sentences.txt')
tokeniser= tf.keras.preprocessing.text.Tokenizer(lower=False,filters='')

tokeniser.fit_on_texts(test_sen)

val_sen = readfile('NER/validate/sentences.txt')

In [173]:
reverse_tag_map={v: k for k, v in tag_map.items()}

## Exploring information about the data

In [174]:
# Exploring information about the data
print('The number of outputs is tag_map', len(tag_map))
# The number of vocabulary tokens (including <PAD>)
g_vocab_size = len(vocab)
print(f"Num of vocabulary words: {g_vocab_size}")
print('The vocab size is', len(vocab))
print('The training size is', t_size)
print('The validation size is', v_size)
print('An example of the first sentence is', t_sentences[0])
print('An example of its corresponding label is', t_labels[0])

The number of outputs is tag_map 17
Num of vocabulary words: 35180
The vocab size is 35180
The training size is 33570
The validation size is 7194
An example of the first sentence is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 9, 15, 1, 16, 17, 18, 19, 20, 21]
An example of its corresponding label is [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0]


In [175]:
max_len=128
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

t_padded_sen=pad_sequences(maxlen=max_len,sequences=t_sentences,padding="post",value=0)
t_padded_tags=pad_sequences(maxlen=max_len,sequences=t_labels,padding="post",value=tag_map['O'])
v_padded_sen=pad_sequences(maxlen=max_len,sequences=v_sentences,padding="post",value=0)
v_padded_tags=pad_sequences(maxlen=max_len,sequences=v_labels,padding="post",value=tag_map['O'])
x_padded_sen=pad_sequences(maxlen=max_len,sequences=x_sentences,padding="post",value=0)
x_padded_tags=pad_sequences(maxlen=max_len,sequences=x_labels,padding="post",value=tag_map['O'])


In [176]:
t_target = [to_categorical(i,num_classes = len(tag_map)) for i in  t_padded_tags]
v_target = [to_categorical(i,num_classes = len(tag_map)) for i in  v_padded_tags]
x_target = [to_categorical(i,num_classes = len(tag_map)) for i in  x_padded_tags]

In [177]:
print("Input Train Data Shape ",t_padded_sen.shape)
print("Train Labels Length ",len(t_target))
print("Input Test Data Shape ",x_padded_sen.shape)
print("Test Labels Length ",len(x_target))

print("Input Validation Data Shape ",v_padded_sen.shape)
print("Validation Labels Length ",len(v_target))

Input Train Data Shape  (33570, 128)
Train Labels Length  33570
Input Test Data Shape  (7194, 128)
Test Labels Length  7194
Input Validation Data Shape  (7194, 128)
Validation Labels Length  7194


In [178]:
embedding_dim=128
vocab_size= t_size+x_size+v_size+1
lstm_units=128
max_len=128

input_word = Input(shape = (max_len,))
model = Embedding(input_dim = vocab_size+1,output_dim = embedding_dim,input_length = max_len)(input_word)

model = LSTM(units=embedding_dim,return_sequences=True)(model)
out = TimeDistributed(Dense(len(tag_map),activation = 'softmax'))(model)
model = Model(input_word,out)
model.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding_7 (Embedding)     (None, 128, 128)          6138880   
                                                                 
 lstm_7 (LSTM)               (None, 128, 128)          131584    
                                                                 
 time_distributed_7 (TimeDis  (None, 128, 17)          2193      
 tributed)                                                       
                                                                 
Total params: 6,272,657
Trainable params: 6,272,657
Non-trainable params: 0
_________________________________________________________________


In [179]:
model.compile(optimizer = 'adam',loss = 'categorical_crossentropy',metrics = ['accuracy'])

In [180]:
history = model.fit(t_padded_sen,np.array(t_target),validation_data=(v_padded_sen,np.array(v_target)),batch_size = 32,epochs = 1)



In [181]:
preds=model.predict(x_padded_sen)

In [184]:
def evaluatePredictions(test_data,preds,actual_preds):
    print("Shape of Test Data Array",test_data.shape)
    y_actual=np.argmax(np.array(actual_preds),axis=2)
    y_pred=np.argmax(preds,axis=2)
    num_test_data=test_data.shape[0]
    print("Number of Test Data Points ",num_test_data)
    data=pd.DataFrame()
    df_list=[]
    for i in range(num_test_data):
        test_str=list(test_data[i])
        df=pd.DataFrame()
        df['test_tokens']=test_str
        df['tokens']=df['test_tokens'].apply(lambda x:tokeniser.index_word[1] if x!=0 else '<PAD>')
        df['actual_target_index']=list(y_actual[i])
        df['pred_target_index']=list(y_pred[i])
        df['actual_target_tag']=df['actual_target_index'].apply(lambda x:reverse_tag_map[x])
        df['pred_target_tag']=df['pred_target_index'].apply(lambda x:reverse_tag_map[x])
        df['id']=i+1
        df_list.append(df)
    data=pd.concat(df_list)
    pred_data=data[data['tokens']!='<PAD>']
    accuracy=pred_data[pred_data['actual_target_tag']==pred_data['pred_target_tag']].shape[0]/pred_data.shape[0]
    
    
    return pred_data,accuracy

In [185]:
pred_data,accuracy=evaluatePredictions(x_padded_sen,preds,x_target)

Shape of Test Data Array (7194, 128)
Number of Test Data Points  7194


In [186]:
y_pred=pred_data['pred_target_tag'].tolist()
y_actual=pred_data['actual_target_tag'].tolist()

In [187]:
from sklearn.metrics import classification_report
print(classification_report(y_actual,y_pred))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        66
       B-eve       0.00      0.00      0.00        37
       B-geo       0.72      0.89      0.80      5567
       B-gpe       0.92      0.90      0.91      2492
       B-nat       0.00      0.00      0.00        33
       B-org       0.70      0.33      0.45      3058
       B-per       0.85      0.58      0.69      2478
       B-tim       0.88      0.74      0.80      3105
       I-art       0.00      0.00      0.00        35
       I-eve       0.00      0.00      0.00        33
       I-geo       0.70      0.61      0.65      1022
       I-gpe       0.00      0.00      0.00        38
       I-nat       0.00      0.00      0.00        10
       I-org       0.69      0.55      0.61      2579
       I-per       0.77      0.82      0.80      2521
       I-tim       0.80      0.29      0.43       987
           O       0.97      1.00      0.98    132160

    accuracy              