In [6]:
pip install tensorflow

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, TimeDistributed
from sklearn.preprocessing import LabelEncoder

In [13]:
sentences = [ "Barack Obama was born in hawaii"
"Google is based in mountain view " ]

In [17]:


# Sample data
sentences = [
    "Barack Obama was born in Hawaii",
    "Google is based in Mountain View"
]

labels = [
    ["PERSON", "PERSON", "O", "O", "O", "LOCATION", "O"], # Labels for the first sentence
    ["ORGANIZATION", "O", "O", "O", "LOCATION", "O"] # Labels for the second sentence
]


In [19]:

###Tokenizing the sentences (converting words into integers)
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(sentences)
X = tokenizer.texts_to_sequences(sentences)

# Padding the sequences to have the same length
# Find the length of the longest sequence for padding
MAXLEN = max(len(s) for s in X) 
X = pad_sequences(X, padding='post', maxlen=MAXLEN)

# Encode the labels
label_encoder = LabelEncoder()
label_encoder.fit(["O", "PERSON", "LOCATION", "ORGANIZATION"])

# Convert labels to numerical values (e.g., O = 0, PERSON = 1)
# Note: The original image had a list comprehension error, this is the corrected version:
y = [label_encoder.transform(l) for l in labels]

# Pad the labels so that they match the shape of the input sequences (X)
# The maxlen should be the same as the padded input X
y = pad_sequences(y, padding='post', maxlen=X.shape[1]) 

# Reshape y to match the shape of the input sequence (for time-step labeling)
# Adds an extra dimension at the end: (batch_size, sequence_length, 1)
y = np.expand_dims(y, -1) 
# Expanding dimensions to match the model's output

print(f"X shape (padded sentences): {X.shape}")
print(f"y shape (padded labels): {y.shape}")
print("\nFirst padded and encoded sentence (X[0]):")
print(X[0])
print("\nFirst padded and encoded label sequence (y[0]):")
print(y[0].flatten())

X shape (padded sentences): (2, 6)
y shape (padded labels): (2, 6, 1)

First padded and encoded sentence (X[0]):
[2 3 4 5 1 6]

First padded and encoded label sequence (y[0]):
[3 1 1 1 0 1]


In [20]:
# Model definition
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dropout, TimeDistributed, Dense

# Assuming 'tokenizer', 'label_encoder', and 'X' were defined in the previous cell

model = Sequential()

# Embedding Layer: Convert words into dense vectors
# input_dim: Vocabulary size (number of unique tokens + 1 for padding/unknown)
# output_dim: Dimensionality of the dense embedding
# input_length: Length of the input sequences (MAXLEN from padding)
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, 
                    output_dim=50, 
                    input_length=X.shape[1]))

# Simple RNN layer
# units=50: Dimensionality of the output space (hidden state)
# return_sequences=True: Essential for sequence labeling, ensures output at every time step
model.add(SimpleRNN(units=50, return_sequences=True))

# Dropout to avoid overfitting
model.add(Dropout(0.1))

# TimeDistributed Dense layer for making predictions at each time step
# TimeDistributed applies the Dense layer independently to every time step in the sequence.
# units: Number of output classes (e.g., PERSON, LOCATION, O, etc.)
model.add(TimeDistributed(Dense(len(label_encoder.classes_), activation='softmax')))

# Compile the model
# loss='sparse_categorical_crossentropy': Used because the output labels (y) are integer-encoded (sparse).
model.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])

# Optional: Print the model summary to see the layers and parameter count
model.summary()



In [21]:
model.fit(X, y, epochs=3, batch_size=2)

Epoch 1/3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.2500 - loss: 1.4134
Epoch 2/3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 0.3333 - loss: 1.3860
Epoch 3/3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.4167 - loss: 1.3629


<keras.src.callbacks.history.History at 0x1e3215df620>

In [23]:
# Test with a new sentence
test_sentence = ["Barack Obama went to Hawaii"]

# Convert the test sentence to sequences using the fitted tokenizer
test_sequence = tokenizer.texts_to_sequences(test_sentence)

# Pad the test sequence to match the length used during training (X.shape[1] is MAXLEN)
test_sequence = pad_sequences(test_sequence, padding='post', maxlen=X.shape[1])

# Predicting the NER labels for the test sentence
predictions = model.predict(test_sequence)

# Decode predictions
# 1. np.argmax(predictions, axis=-1) gets the index of the highest probability for each time step.
# 2. [0] slices the result to get the predictions for the first (and only) sentence.
# 3. label_encoder.inverse_transform converts the numerical indices back to original string labels.
decoded_predictions = label_encoder.inverse_transform(np.argmax(predictions, axis=-1)[0])

# Display results
# test_sentence[0].split() breaks the sentence into words.
for word, label in zip(test_sentence[0].split(), decoded_predictions):
    print(f"Word: {word} - Predicted Label: {label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Word: Barack - Predicted Label: PERSON
Word: Obama - Predicted Label: LOCATION
Word: went - Predicted Label: LOCATION
Word: to - Predicted Label: O
Word: Hawaii - Predicted Label: O
