In [2]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, TimeDistributed
from sklearn.preprocessing import LabelEncoder

In [3]:
#Sample data
sentences = [
    "Barack Obama was born in Hawai",
    "Google is based in Mountain View"
]

labels = [
    ["PERSON", "PERSON", "O", "O", "O", "LOCATION"], 
    ["ORGANIZATION", "O", "O", "O", "LOCATION", "O"]
    
]

#TOKENIZING
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(sentences)
X = tokenizer.texts_to_sequences(sentences)

#PADDING
X = pad_sequences(X, padding='post')

#LABEL ENCODING
label_encoder = LabelEncoder()
label_encoder.fit(["O", "PERSON", "LOCATION", "ORGANIZATION"])

#Convert labels to NUMERICAL VALUES
Y = [label_encoder.transform(label) for label in labels]

#PADDING LABELS so that they match the input shape
y = pad_sequences(Y, padding='post', maxlen=X.shape[1])

#reshaping y to match the shape of the input sequences
y = np.expand_dims(y, -1)

In [4]:
#model definition
model = Sequential()

#Embedding layer: convert word indices to dense vectors
model.add(Embedding(input_dim = len(tokenizer.word_index) + 1, output_dim =50, input_length=X.shape[1]))

#RNN layer
model.add(SimpleRNN(units=50, return_sequences=True))

# Dropout layer to avoid overfitting
model.add(Dropout(0.1))

# TimeDistributed Dense layer for making predictions at each time step
model.add(TimeDistributed(Dense(len(label_encoder.classes_),activation='softmax')))

#Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])



In [7]:
# train the model
model.fit(np.array(X), np.array(y), epochs=10, batch_size=2)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.6667 - loss: 1.3262
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.6667 - loss: 1.3090
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.6667 - loss: 1.2736
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.8333 - loss: 1.2338
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step - accuracy: 0.9167 - loss: 1.1894
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.9167 - loss: 1.1667
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.8333 - loss: 1.1281
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - accuracy: 0.8333 - loss: 1.0959
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x1a9d8b52850>

In [6]:
#test with a sentence
test_sentence = ["Barack Obama went to Hawai"]
test_sequence = tokenizer.texts_to_sequences(test_sentence)
test_sequence = pad_sequences(test_sequence, padding='post', maxlen=X.shape[1])

#practise the ner labels
predictions = model.predict(test_sequence)

#decode predictions
decoded_predictions = label_encoder.inverse_transform(np.argmax(predictions, axis=-1)[0])

#display results
for word, label in zip(test_sentence[0].split(), decoded_predictions):
    print(f"Word: {word}, Predicted NER Label: {label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 493ms/step
Word: Barack, Predicted NER Label: ORGANIZATION
Word: Obama, Predicted NER Label: LOCATION
Word: went, Predicted NER Label: PERSON
Word: to, Predicted NER Label: ORGANIZATION
Word: Hawai, Predicted NER Label: ORGANIZATION
