<a href="https://colab.research.google.com/github/AbdelrahmanTamer11/Named-Entity-Recognition-NER-Corpus/blob/main/Named_Entity_Recognition_(NER)_Corpus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd

# Load the dataset
data_path = '/content/ner.csv'
data = pd.read_csv(data_path)

# No need for ast.literal_eval, directly use the 'Sentence' column and split the sentences into words
sentences = [sentence.split() for sentence in data['Sentence'].tolist()]
tags = [tag.strip('[]').replace("'", "").split(', ') for tag in data['Tag'].tolist()]

# Check the first few examples to ensure the processing is correct
print(sentences[:2], tags[:2])


[['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.'], ['Families', 'of', 'soldiers', 'killed', 'in', 'the', 'conflict', 'joined', 'the', 'protesters', 'who', 'carried', 'banners', 'with', 'such', 'slogans', 'as', '"', 'Bush', 'Number', 'One', 'Terrorist', '"', 'and', '"', 'Stop', 'the', 'Bombings', '.', '"']] [['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]


In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Create a vocabulary for words and tags
words = list(set([word for sentence in sentences for word in sentence]))
n_words = len(words)

tags_set = list(set([tag for tag_list in tags for tag in tag_list]))
n_tags = len(tags_set)

word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags_set)}

# Convert sentences and labels into indices
X = [[word2idx[w] for w in s] for s in sentences]
y = [[tag2idx[t] for t in t_list] for t_list in tags]

# Padding the sequences
max_len = 50
X = pad_sequences(X, maxlen=max_len, padding="post")
y = pad_sequences(y, maxlen=max_len, padding="post")


In [22]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

# Define the maximum sequence length
max_len = 50

# Pad the input sentences and tag sequences
X = pad_sequences([[word2idx.get(w, 0) for w in s] for s in sentences], maxlen=max_len, padding="post")
y = pad_sequences([[tag2idx.get(t, 0) for t in tag_seq] for tag_seq in tags], maxlen=max_len, padding="post")

# Convert target labels (y) to categorical format
y = np.array([to_categorical(i, num_classes=n_tags) for i in y])

# Build the NER model
model = Sequential([
    Embedding(input_dim=n_words, output_dim=50, input_length=max_len),
    Dropout(0.1),
    Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)),
    TimeDistributed(Dense(n_tags, activation="softmax"))
])

# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Train the model
history = model.fit(X, y, batch_size=32, epochs=5, validation_split=0.2, verbose=1)

Epoch 1/5
[1m1199/1199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 134ms/step - accuracy: 0.9220 - loss: 0.3219 - val_accuracy: 0.9824 - val_loss: 0.0590
Epoch 2/5
[1m1199/1199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 137ms/step - accuracy: 0.9850 - loss: 0.0505 - val_accuracy: 0.9851 - val_loss: 0.0487
Epoch 3/5
[1m1199/1199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 133ms/step - accuracy: 0.9888 - loss: 0.0365 - val_accuracy: 0.9857 - val_loss: 0.0471
Epoch 4/5
[1m1199/1199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 132ms/step - accuracy: 0.9906 - loss: 0.0297 - val_accuracy: 0.9860 - val_loss: 0.0465
Epoch 5/5
[1m1199/1199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 133ms/step - accuracy: 0.9919 - loss: 0.0253 - val_accuracy: 0.9858 - val_loss: 0.0478


In [23]:
from seqeval.metrics import classification_report

# Predict and evaluate
y_pred = model.predict(X)

# Convert indices back to entity labels
idx2tag = {i: w for w, i in tag2idx.items()}

def get_tags(y_pred, y_true):
    pred_tags = [[idx2tag[np.argmax(pred)] for pred in p] for p in y_pred]
    true_tags = [[idx2tag[np.argmax(true)] for true in t] for t in y_true]
    return pred_tags, true_tags

# Get predictions
pred_tags, true_tags = get_tags(y_pred, y)

# Print the evaluation report
print(classification_report(true_tags, pred_tags))


[1m1499/1499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 29ms/step
              precision    recall  f1-score   support

         art       0.40      0.09      0.15       401
         eve       0.43      0.39      0.41       307
         geo       0.91      0.92      0.91     37633
         gpe       0.97      0.95      0.96     15859
         nat       0.59      0.39      0.47       199
         org       0.77      0.83      0.80     20130
         per       0.86      0.83      0.84     16978
         tim       1.00      1.00      1.00   1370401

   micro avg       0.99      0.99      0.99   1461908
   macro avg       0.74      0.67      0.69   1461908
weighted avg       0.99      0.99      0.99   1461908



In [30]:
import numpy as np

# Function to get the predicted tags for a given sentence
def predict_entities(sentence, model, word2idx, idx2tag, max_len):
    # Tokenize and pad the sentence
    tokens = sentence.split()
    tokenized_sentence = [word2idx.get(w, 0) for w in tokens]  # Convert words to indices
    padded_sentence = pad_sequences([tokenized_sentence], maxlen=max_len, padding='post')

    # Get predictions
    pred = model.predict(padded_sentence)

    # Convert predictions back to tags
    pred_tags = [idx2tag[np.argmax(p)] for p in pred[0]]

    # Pair each word with its predicted tag
    result = list(zip(tokens, pred_tags[:len(tokens)]))  # Cut off padding
    return result

# Example usage
idx2tag = {i: t for t, i in tag2idx.items()}  # Reverse the tag dictionary

# Test the function with an example sentence
sentence = 'Google'
predicted_entities = predict_entities(sentence, model, word2idx, idx2tag, max_len)

# Print the result
for word, tag in predicted_entities:
    print(f"{word}: {tag}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Google: B-org
