# BiLSTM Named Entity Recognition (NER) Model for CoNLL-2003 Dataset

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import json

2024-05-27 16:44:12.695239: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-27 16:44:12.699727: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-27 16:44:12.763482: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load and Preprocess the CoNLL-2003 Dataset

In [3]:
# Load and preprocess the CoNLL-2003 dataset
def load_data(filepath):
    data = []
    sentence = []
    with open(filepath, 'r') as file:
        for line in file:
            if line.startswith('-DOCSTART-') or line == '\n':
                if sentence:
                    data.append(sentence)
                    sentence = []
                continue
            splits = line.split()
            sentence.append((splits[0], splits[1], splits[3]))  # Append word, POS, and NER tag
    if sentence:
        data.append(sentence)
    return data

train_data = load_data('dataset/train.txt')
val_data = load_data('dataset/valid.txt')
test_data = load_data('dataset/test.txt')

In [4]:
pd.DataFrame(train_data[4])

Unnamed: 0,0,1,2
0,Germany,NNP,B-LOC
1,'s,POS,O
2,representative,NN,O
3,to,TO,O
4,the,DT,O
5,European,NNP,B-ORG
6,Union,NNP,I-ORG
7,'s,POS,O
8,veterinary,JJ,O
9,committee,NN,O


## Extract Words and Tags from the Dataset

In [11]:

# Extract words and tags from the dataset
def extract_words_and_tags(data):
    words = list(set([word for sentence in data for word, _, _ in sentence]))
    tags = list(set([tag for sentence in data for _, _, tag in sentence]))
    return words, tags

words, tags = extract_words_and_tags(train_data + val_data + test_data)

word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx['UNK'] = 1  # Unknown words
word2idx['PAD'] = 0  # Padding
tag2idx = {t: i for i, t in enumerate(tags)}

idx2tag = {i: w for w, i in tag2idx.items()}

max_len = 75  # Maximum sequence length


## Preprocess the Data

In [30]:
def preprocess_data(data, word2idx: dict, tag2idx: dict, max_len: int):
    X = []
    y = []

    for sentence in data:
        sentence_words = []
        sentence_tags = []
        for word, _, tag in sentence:
            sentence_words.append(word2idx.get(word, 1))
            sentence_tags.append(tag2idx[tag])
        X.append(sentence_words)
        y.append(sentence_tags)
    
    X = pad_sequences(X, maxlen=max_len, padding='post')
    y = pad_sequences(y, maxlen=max_len, padding='post')
    y = [to_categorical(i, num_classes=len(tag2idx)) for i in y]
    
    return np.array(X), np.array(y)

X_train, y_train = preprocess_data(train_data, word2idx, tag2idx, max_len)
X_val, y_val = preprocess_data(val_data, word2idx, tag2idx, max_len)
X_test, y_test = preprocess_data(test_data, word2idx, tag2idx, max_len)
y_val[0][0]

array([0., 1., 0., 0., 0., 0., 0., 0., 0.])

## Define the BiLSTM Model

In [32]:
model = Sequential()
model.add(Embedding(input_dim=len(word2idx), output_dim=50, input_length=max_len))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
model.add(TimeDistributed(Dense(len(tag2idx), activation='softmax')))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

## Train the Model

In [33]:

history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=5, verbose=1)

Epoch 1/5
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 119ms/step - accuracy: 0.9302 - loss: 0.2861 - val_accuracy: 0.9740 - val_loss: 0.0836
Epoch 2/5
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 117ms/step - accuracy: 0.9825 - loss: 0.0567 - val_accuracy: 0.9865 - val_loss: 0.0479
Epoch 3/5
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 155ms/step - accuracy: 0.9942 - loss: 0.0223 - val_accuracy: 0.9901 - val_loss: 0.0365
Epoch 4/5
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 165ms/step - accuracy: 0.9974 - loss: 0.0107 - val_accuracy: 0.9904 - val_loss: 0.0349
Epoch 5/5
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 164ms/step - accuracy: 0.9985 - loss: 0.0061 - val_accuracy: 0.9913 - val_loss: 0.0326


## Evaluate the Model

In [34]:
model.evaluate(X_test, y_test)

[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step - accuracy: 0.9877 - loss: 0.0442


[0.04246332868933678, 0.9883927702903748]

## Save the Model

In [46]:
model.save("bilstm_ner_model.h5")



In [45]:
# Save the word2idx dictionary
with open('word2idx.json', 'w') as json_file:
    json.dump(word2idx, json_file, indent=4)

# Create idx2tag dictionary
idx2tag = {v: k for k, v in tag2idx.items()}

# Save the idx2tag dictionary with integer keys
with open('idx2tag.json', 'w') as json_file:
    json.dump({int(k): v for k, v in idx2tag.items()}, json_file, indent=4)