In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
tf.keras.utils.set_random_seed(33) # fix the global random seed so that repeated runs of the notebook give reproducible results


In [4]:
# function to load the data
def load_data(file_path, encoding='utf-8'):
    """Read a text file into a NumPy array holding one stripped line per entry.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale settings.

    Returns:
        np.ndarray with one element per line of the file, in file order, with
        leading and trailing whitespace (including the newline) removed.
        Blank lines are kept as empty strings so that line positions stay
        aligned between parallel files.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        # Iterate over the file object directly instead of materialising an
        # intermediate list with readlines().
        data = np.array([line.strip() for line in file])
    return data

In [8]:
# Load the train / validation / test splits. Each split has a sentences file
# and a labels file that are read line by line with load_data.
# DATA_DIR is the single place to edit when the data lives somewhere else.
DATA_DIR = '/content/drive/MyDrive'
train_sentences = load_data(f'{DATA_DIR}/train/sentences.txt')
train_labels = load_data(f'{DATA_DIR}/train/labels.txt')
val_sentences = load_data(f'{DATA_DIR}/val/sentences.txt')
val_labels = load_data(f'{DATA_DIR}/val/labels.txt')
test_sentences = load_data(f'{DATA_DIR}/test/sentences.txt')
test_labels = load_data(f'{DATA_DIR}/test/labels.txt')


In [9]:
def get_sentence_vectorizer(sentences):
    """Build a TextVectorization layer adapted to `sentences`.

    Args:
        sentences: Collection of sentence strings used to build the vocabulary.

    Returns:
        A tuple `(vectorizer, vocab)`: the adapted layer and the vocabulary
        list it reports through `get_vocabulary()`.
    """
    # Re-seed so the result does not depend on earlier random draws.
    tf.keras.utils.set_random_seed(33)
    # standardize=None: sentences are used as-is, without text standardization.
    # max_tokens=None:  the vocabulary size is not capped.
    # output_mode='int': every token is emitted as an integer id.
    vectorizer = tf.keras.layers.TextVectorization(
        standardize=None,
        max_tokens=None,
        output_mode='int',
    )
    # adapt() builds the vocabulary from the given sentences.
    vectorizer.adapt(sentences)
    return vectorizer, vectorizer.get_vocabulary()

# Build the vectorizer and its vocabulary from the training sentences; the same
# vectorizer is later applied to the validation and test sentences.
sentence_vectorizer, vocab = get_sentence_vectorizer(train_sentences) # the vocabulary is learned from the training split

In [11]:
# Collect every distinct tag that occurs in the labels, sorted alphabetically.
def get_tags(labels):
    """Return the sorted list of unique tags found in `labels`.

    Args:
        labels: Iterable of strings, each holding tags separated by a single
            space character.

    Returns:
        List of distinct tags in ascending (string) order.
    """
    unique_tags = {tag for line in labels for tag in line.split(" ")}
    return sorted(unique_tags)

# The tag inventory is derived from the training labels.
tags = get_tags(train_labels)


In [12]:
# Assign an integer id to every tag.
def make_tag_map(tags):
    """Map each tag to its position in `tags`.

    Args:
        tags: Iterable of tag strings.

    Returns:
        Dict from tag to its zero-based position. If a tag occurs more than
        once, the position of its last occurrence is kept.
    """
    mapping = {}
    for position, tag in enumerate(tags):
        mapping[tag] = position
    return mapping

# Tag -> integer id lookup used when encoding labels and decoding predictions.
tag_map = make_tag_map(tags)

In [13]:
# Encode every label line as a list of tag ids, then pad at the end ("post")
# with -1 so that all rows have the same length.
def label_vectorizer(labels, tag_map):
    """Convert space-separated tag strings into a padded array of tag ids.

    Args:
        labels: Iterable of strings, each holding tags separated by a single
            space character.
        tag_map: Dict from tag string to integer id.

    Returns:
        np.ndarray with one row per label string. Tags missing from `tag_map`
        are encoded as -1, and rows are padded at the end with -1.
    """
    encoded_rows = [
        [tag_map.get(tag, -1) for tag in line.split(" ")]
        for line in labels
    ]
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        encoded_rows,
        padding='post',
        value=-1,
    )
    return np.array(padded)


In [15]:
# Build a TensorFlow dataset of (sentence ids, label ids) pairs from raw text.
def generate_dataset(sentences, labels, sentence_vectorizer, tag_map):
    """Vectorize sentences and labels and wrap them in a tf.data.Dataset.

    Args:
        sentences: Collection of sentence strings.
        labels: Collection of space-separated tag strings, one per sentence.
        sentence_vectorizer: Callable that turns `sentences` into token ids.
        tag_map: Dict from tag string to integer id.

    Returns:
        tf.data.Dataset yielding one (sentence_ids, label_ids) pair per example.
    """
    features = sentence_vectorizer(sentences)
    targets = label_vectorizer(labels, tag_map=tag_map)
    example_pairs = (features, targets)
    return tf.data.Dataset.from_tensor_slices(example_pairs)

# Build one dataset per split. All three reuse the vectorizer and tag map that
# were derived from the training data.
train_dataset = generate_dataset(train_sentences, train_labels, sentence_vectorizer, tag_map)
val_dataset = generate_dataset(val_sentences, val_labels, sentence_vectorizer, tag_map)
test_dataset = generate_dataset(test_sentences, test_labels, sentence_vectorizer, tag_map)

# Number of entries in the vectorizer vocabulary.
# NOTE(review): not referenced by any of the cells shown here — confirm whether it is still needed.
g_vocab_size = len(vocab)


In [16]:
# Define the NER model: Embedding -> LSTM -> per-token Dense classifier.
def NER(len_tags, vocab_size, embedding_dim=50):
    """Create the sequence-tagging model.

    Args:
        len_tags: Number of output tags (units of the final Dense layer).
        vocab_size: Vocabulary size; the embedding table has `vocab_size + 1` rows.
        embedding_dim: Embedding size, also used as the number of LSTM units.

    Returns:
        An uncompiled tf.keras.Sequential model whose last layer applies
        log-softmax over the tags.
    """
    model = tf.keras.Sequential(name='sequential')
    stack = (
        # mask_zero=True makes id 0 a padding value that is masked downstream.
        tf.keras.layers.Embedding(input_dim=vocab_size + 1, output_dim=embedding_dim, mask_zero=True),
        # return_sequences=True keeps one output per time step (per token).
        tf.keras.layers.LSTM(units=embedding_dim, return_sequences=True),
        tf.keras.layers.Dense(units=len_tags, activation=tf.nn.log_softmax),
    )
    for layer in stack:
        model.add(layer)
    return model

In [24]:
# This function computes the masked loss by applying Sparse Categorical Crossentropy while ignoring the -1 values in y_true.
# It averages the loss over the valid entries only, so padded positions do not influence training.

def masked_loss(y_true, y_pred):
    """Sparse categorical cross-entropy averaged over the non-padding labels.

    Args:
        y_true: Integer tag ids, with -1 marking positions to ignore.
            Assumes shape (batch, seq) — TODO confirm against the caller.
        y_pred: Per-tag scores for every position.
            Assumes shape (batch, seq, num_tags) — TODO confirm against the caller.

    Returns:
        Scalar float32 tensor: the mean loss over all positions whose label is
        not -1, or 0.0 when there is no such position.
    """
    # Keep only the positions that carry a real label; -1 is not a valid class
    # id for the cross-entropy, so those entries are removed before it is called.
    valid_indices = tf.where(tf.not_equal(y_true, -1))
    y_true_valid = tf.gather_nd(y_true, valid_indices)
    y_pred_valid = tf.gather_nd(y_pred, valid_indices)

    # reduction='none' returns one loss value per valid position so that the
    # averaging below is done explicitly.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = tf.cast(loss_fn(y_true_valid, y_pred_valid), dtype=tf.float32)

    # A plain reduce_mean over an empty tensor is NaN; divide_no_nan returns 0
    # instead when there are no valid labels at all.
    num_valid = tf.cast(tf.size(loss), dtype=tf.float32)
    return tf.math.divide_no_nan(tf.reduce_sum(loss), num_valid)

In [25]:
# This function calculates the masked accuracy by comparing the true labels (y_true) with the predicted classes (y_pred) while ignoring the -1 values.
# It computes the ratio of correctly predicted labels to the total number of valid labels.

def masked_accuracy(y_true, y_pred):
    """Fraction of non-padding positions whose predicted tag equals the label.

    Args:
        y_true: Integer tag ids, with -1 marking positions to ignore.
        y_pred: Per-tag scores; the predicted tag is the argmax over the last axis.

    Returns:
        Scalar float32 tensor: correct predictions divided by the number of
        positions whose label is not -1, or 0.0 when there is no such position.
    """
    # 1.0 where the label is real, 0.0 where it is the -1 padding value.
    mask = tf.cast(tf.not_equal(y_true, -1), dtype=tf.float32)
    y_pred_class = tf.math.argmax(y_pred, axis=-1)
    # Bring both sides to the same integer dtype before comparing.
    y_true = tf.cast(y_true, tf.int32)
    y_pred_class = tf.cast(y_pred_class, tf.int32)
    matches_true_pred = tf.equal(y_true, y_pred_class)
    matches_true_pred = tf.cast(matches_true_pred, tf.float32)
    # Zero out matches at masked positions so they are not counted.
    matches_true_pred *= mask
    # divide_no_nan avoids a 0/0 = NaN result when every position is masked.
    return tf.math.divide_no_nan(tf.reduce_sum(matches_true_pred), tf.reduce_sum(mask))


In [26]:
# Create an NER model with the number of tags and vocabulary size.
model = NER(len(tag_map), len(vocab))
# Compile the model with the Adam optimizer (learning rate 0.01), the masked loss function, and the masked accuracy metric.
model.compile(optimizer=tf.keras.optimizers.Adam(0.01), loss=masked_loss, metrics=[masked_accuracy])
# NOTE(review): the model above is constructed before this re-seeding call, so
# this seed only affects what happens from here on — confirm that is intended.
tf.keras.utils.set_random_seed(33)  # re-seed right before training for reproducibility
BATCH_SIZE = 64 # number of examples per batch, used for both the training and the validation dataset

# NOTE(review): the Keras Model.fit documentation states that `shuffle` is ignored
# when the input is a tf.data.Dataset; if shuffling is wanted it presumably has to be
# done on the dataset itself (e.g. Dataset.shuffle) — confirm.
model.fit(train_dataset.batch(BATCH_SIZE), validation_data=val_dataset.batch(BATCH_SIZE), shuffle=True, epochs=2) # train for 2 epochs, evaluating on the validation set after each epoch


Epoch 1/2
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 109ms/step - loss: 0.4594 - masked_accuracy: 0.8952 - val_loss: 0.1393 - val_masked_accuracy: 0.9573
Epoch 2/2
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 107ms/step - loss: 0.1299 - masked_accuracy: 0.9612 - val_loss: 0.1359 - val_masked_accuracy: 0.9584


<keras.src.callbacks.history.History at 0x7946b015d090>

In [27]:
# Prepare the test data: vectorize the sentences and labels with the same
# vectorizer and tag map that were used for training.
test_sentences_id = sentence_vectorizer(test_sentences)
test_labels_id = label_vectorizer(test_labels, tag_map)
# y_true is an alias of the padded label ids (-1 marks padding / unknown tags).
y_true = test_labels_id
# Per-token tag scores produced by the trained model for every test sentence.
y_pred = model.predict(test_sentences_id)

[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step


In [28]:
# Report the masked accuracy on the test set (positions labelled -1 are ignored), formatted to 4 decimals.
print(f"The model's accuracy in test set is: {masked_accuracy(y_true,y_pred).numpy():.4f}")


The model's accuracy in test set is: 0.9576


In [30]:
# Predict the tag of every token in a single sentence.
def predict(sentence, model, sentence_vectorizer, tag_map):
    """Return the predicted tag for each token of `sentence`.

    Args:
        sentence: A single sentence string.
        model: Model that maps a batch of token ids to per-token tag scores.
        sentence_vectorizer: Callable that turns the sentence into token ids.
        tag_map: Dict from tag string to integer id, as used to encode the
            training labels.

    Returns:
        List of tag strings, one per token position produced by the vectorizer.
    """
    sentence_vectorized = sentence_vectorizer(sentence)
    # The model is called on a batch, so add a leading batch axis of size 1.
    sentence_vectorized = tf.expand_dims(sentence_vectorized, 0)
    output = model(sentence_vectorized)
    # Highest-scoring tag id per token; [0] drops the batch axis again.
    tag_ids = np.argmax(output, axis=-1)[0]
    # Invert the mapping explicitly instead of indexing list(tag_map.keys()):
    # that relied on the dict's insertion order matching the id values.
    index_to_tag = {index: tag for tag, index in tag_map.items()}
    return [index_to_tag[int(tag_id)] for tag_id in tag_ids]



In [33]:
# Try the model on a custom sentence and print only the tokens that were
# tagged as an entity (anything other than the 'O' tag).
sentence = "ahmed want to visit pyramids in egypt next Year"
predictions = predict(sentence, model, sentence_vectorizer, tag_map)
for x, y in zip(sentence.split(' '), predictions):
    if y == 'O':
        continue
    print(x, y)

ahmed B-per
egypt B-geo
Year B-tim
