# BiLSTM Named Entity Recognition (NER) Model for CoNLL-2003 Dataset

In [13]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import json

## Load and Preprocess the CoNLL-2003 Dataset

In [14]:
def load_data(filepath):
    data = []
    sentence = []
    with open(filepath, 'r') as file:
        for line in file:
            if line.startswith('-DOCSTART-') or line == '\n':
                if sentence:
                    data.append(sentence)
                    sentence = []
                continue
            splits = line.split()
            sentence.append((splits[0], splits[1], splits[3]))
    if sentence:
        data.append(sentence)
    return data

train_data = load_data('dataset/train.txt')
val_data = load_data('dataset/valid.txt')
test_data = load_data('dataset/test.txt')

In [15]:
pd.DataFrame(train_data[4])

Unnamed: 0,0,1,2
0,Germany,NNP,B-LOC
1,'s,POS,O
2,representative,NN,O
3,to,TO,O
4,the,DT,O
5,European,NNP,B-ORG
6,Union,NNP,I-ORG
7,'s,POS,O
8,veterinary,JJ,O
9,committee,NN,O


## Extract Words and Tags from the Dataset

In [16]:

def extract_words_and_tags(data):
    words = list(set([word for sentence in data for word, _, _ in sentence]))
    tags = list(set([tag for sentence in data for _, _, tag in sentence]))
    return words, tags

words, tags = extract_words_and_tags(train_data + val_data + test_data)

word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx['UNK'] = 1  # Unknown words
word2idx['PAD'] = 0  # Padding
tag2idx = {t: i for i, t in enumerate(tags)}

idx2tag = {i: w for w, i in tag2idx.items()}

max_len = 75  # Maximum sequence length

## Preprocess the Data

In [20]:
def preprocess_data(data, word2idx: dict, tag2idx: dict, max_len: int):
    X = []
    y = []

    for sentence in data:
        sentence_words = []
        sentence_tags = []
        for word, _, tag in sentence:
            sentence_words.append(word2idx.get(word, 1))
            sentence_tags.append(tag2idx[tag])
        X.append(sentence_words)
        y.append(sentence_tags)
    
    X = pad_sequences(X, maxlen=max_len, padding='post')
    y = pad_sequences(y, maxlen=max_len, padding='post')
    y = [to_categorical(i, num_classes=len(tag2idx)) for i in y]
    
    return np.array(X), np.array(y)

X_train, y_train = preprocess_data(train_data, word2idx, tag2idx, max_len)
X_val, y_val = preprocess_data(val_data, word2idx, tag2idx, max_len)
X_test, y_test = preprocess_data(test_data, word2idx, tag2idx, max_len)
y_train[0]

array([[0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1.

## Define the BiLSTM Model

In [24]:
model = Sequential()
model.add(Embedding(input_dim=len(word2idx), output_dim=50, input_length=max_len))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
model.add(Dropout(0.1))
model.add(TimeDistributed(Dense(len(tag2idx), activation='softmax')))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])



## Train the Model

In [25]:
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=20, verbose=1)

Epoch 1/20
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 125ms/step - accuracy: 0.9414 - loss: 0.2796 - val_accuracy: 0.9755 - val_loss: 0.0816
Epoch 2/20
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 123ms/step - accuracy: 0.9824 - loss: 0.0561 - val_accuracy: 0.9860 - val_loss: 0.0467
Epoch 3/20
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 142ms/step - accuracy: 0.9935 - loss: 0.0232 - val_accuracy: 0.9900 - val_loss: 0.0369
Epoch 4/20
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 171ms/step - accuracy: 0.9975 - loss: 0.0104 - val_accuracy: 0.9908 - val_loss: 0.0345
Epoch 5/20
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 178ms/step - accuracy: 0.9987 - loss: 0.0057 - val_accuracy: 0.9911 - val_loss: 0.0347
Epoch 6/20
[1m368/439[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m11s[0m 162ms/step - accuracy: 0.9992 - loss: 0.0035

KeyboardInterrupt: 

## Evaluate the Model

In [None]:
model.evaluate(X_test, y_test)

[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step - accuracy: 0.9877 - loss: 0.0442


[0.04246332868933678, 0.9883927702903748]

## Save the Model

In [None]:
model.save("bilstm_ner_model.h5")



In [None]:
with open('word2idx.json', 'w') as json_file:
    json.dump(word2idx, json_file, indent=4)

idx2tag = {v: k for k, v in tag2idx.items()}

with open('idx2tag.json', 'w') as json_file:
    json.dump({int(k): v for k, v in idx2tag.items()}, json_file, indent=4)