In [1]:
import pandas as pd

# Load the dataset as a pandas DataFrame
df = pd.read_csv("/content/ner_dataset.csv", encoding="latin1")

# Filter out unnecessary columns
df = df.drop(columns=["POS"])

# Rename columns to match CoNLL-2003 format
df = df.rename(columns={"Sentence #": "Sentence", "Tag": "NE"})

# Replace NaN values with the string "O"
df = df.fillna("O")

# Group the dataset by sentence and concatenate the words and named entity tags
grouped = df.groupby("Sentence", sort=False).agg({"Word": " ".join, "NE": " ".join}).reset_index()

# Convert the named entity tags to the IOB format
grouped["NE"] = grouped["NE"].apply(lambda x: " ".join([f"B-{tag}" if i == 0 else f"I-{tag}" for i, tag in enumerate(x.split())]))

# Print the first five rows of the preprocessed dataset
print(grouped.head())

      Sentence                                               Word  \
0  Sentence: 1                                          Thousands   
1            O  of demonstrators have marched through London t...   
2  Sentence: 2                                           Families   
3  Sentence: 3                                               They   
4  Sentence: 4                                             Police   

                                                  NE  
0                                                B-O  
1  B-O I-O I-O I-O I-O I-B-geo I-O I-O I-O I-O I-...  
2                                                B-O  
3                                                B-O  
4                                                B-O  


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
# Split the dataset into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)


In [4]:
# Define the input and output dimensions
n_words = len(df["Word"].unique())
n_tags = len(df["NE"].unique())

In [5]:
# Convert the words and named entity tags to numerical values
word2idx = {w: i + 1 for i, w in enumerate(df["Word"].unique())}
tag2idx = {t: i for i, t in enumerate(df["NE"].unique())}


In [6]:
# Add a padding token to the word and named entity tag dictionaries
word2idx["PAD"] = 0
tag2idx["PAD"] = 0

In [7]:
# Convert the sentences and named entity tags to sequences of numerical values
X_train = [[word2idx[w] for w in sentence.split()] for sentence in train_data["Word"]]
X_train = tf.keras.preprocessing.sequence.pad_sequences(maxlen=10, sequences=X_train, padding="post", value=word2idx["PAD"])
y_train = [[tag2idx[w] for w in sentence.split()] for sentence in train_data["NE"]]
y_train = tf.keras.preprocessing.sequence.pad_sequences(maxlen=10, sequences=y_train, padding="post", value=tag2idx["PAD"])
y_train = [to_categorical(i, num_classes=n_tags) for i in y_train]

X_test = [[word2idx[w] for w in sentence.split()] for sentence in test_data["Word"]]
X_test = tf.keras.preprocessing.sequence.pad_sequences(maxlen=10, sequences=X_test, padding="post", value=word2idx["PAD"])
y_test = [[tag2idx[w] for w in sentence.split()] for sentence in test_data["NE"]]
y_test = tf.keras.preprocessing.sequence.pad_sequences(maxlen=10, sequences=y_test, padding="post", value=tag2idx["PAD"])
y_test = [to_categorical(i, num_classes=n_tags) for i in y_test]

In [12]:
# Define the model architecture
model = Sequential()
model.add(Embedding(input_dim=n_words+1, output_dim=50, input_length=10, mask_zero=True))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
model.add(TimeDistributed(Dense(units=n_tags, activation="softmax")))


In [13]:
# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])



In [14]:
# Train the model
history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=10, validation_split=0.1, verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
# Evaluate the model on the testing set
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test = np.argmax(y_test, axis=-1)
print(classification_report(y_test.ravel(), y_pred.ravel()))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   2065171
           1       0.78      0.85      0.81      7497
           2       0.94      0.92      0.93      3186
           3       0.62      0.67      0.64      3354
           4       0.70      0.59      0.64      1430
           5       0.37      0.55      0.45      4076
           6       0.66      0.51      0.58      3201
           7       0.87      0.76      0.81      4037
           8       0.30      0.04      0.07        77
           9       0.08      0.01      0.02        75
          10       0.72      0.69      0.71      3468
          11       0.64      0.64      0.64        42
          12       0.59      0.16      0.25      1374
          13       0.38      0.39      0.38        38
          14       0.34      0.32      0.33        60
          15       0.33      0.02      0.04        53
          16       0.00      0.00      0.00        11

    accuracy              

In [42]:
# Define a function to preprocess new sentences
def preprocess_sentence(sentence, word_to_int, max_len):
    sentence = [word_to_int.get(word, 0) for word in sentence.split()]
    sentence = sentence + [0] * (max_len - len(sentence))
    return np.array(sentence)

# Define some new sentences to test the model
sentences = [
    "Barack Obama was born in Hawaii.",
    "Steve Jobs co-founded Apple Inc.",
    "The Eiffel Tower is located in Paris, France."
]

# Convert the words to numerical values
word2idx = {w: i + 1 for i, w in enumerate(df["Word"].unique())}
word2idx["PAD"] = 0
word2idx["UNK"] = n_words + 1
tag2idx = {t: i for i, t in enumerate(df["NE"].unique())}
idx2word = {i: w for w, i in word2idx.items()}
idx2tag = {i: t for t, i in tag2idx.items()}

# Preprocess the new sentences
X_test = np.array([preprocess_sentence(sentence, word2idx, 10) for sentence in sentences])

# Make predictions on the new sentences
y_pred = model.predict(X_test)

# Convert the predicted tags to named entities
int_to_tag = {i: t for t, i in tag2idx.items()}
y_pred = np.argmax(y_pred, axis=-1)
y_pred = [[int_to_tag[i] for i in sentence] for sentence in y_pred]

# Print the predicted named entities for each sentence
for i, sentence in enumerate(sentences):
    print(f"Sentence {i+1}:")
    print(sentence)
    print("Predicted named entities:")
    print(y_pred[i][:len(sentence.split())])
    print()



Sentence 1:
Barack Obama was born in Hawaii.
Predicted named entities:
['I-per', 'I-per', 'O', 'O', 'O', 'O']

Sentence 2:
Steve Jobs co-founded Apple Inc.
Predicted named entities:
['B-per', 'O', 'O', 'B-org', 'I-org']

Sentence 3:
The Eiffel Tower is located in Paris, France.
Predicted named entities:
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



In [43]:
def predict_entities(sentences):
  # Convert the words to numerical values
  word2idx = {w: i + 1 for i, w in enumerate(df["Word"].unique())}
  word2idx["PAD"] = 0
  word2idx["UNK"] = n_words + 1
  tag2idx = {t: i for i, t in enumerate(df["NE"].unique())}
  idx2word = {i: w for w, i in word2idx.items()}
  idx2tag = {i: t for t, i in tag2idx.items()}

  # Preprocess the new sentences
  X_test = np.array([preprocess_sentence(sentence, word2idx, 10) for sentence in sentences])

  # Make predictions on the new sentences
  y_pred = model.predict(X_test)

  # Convert the predicted tags to named entities
  int_to_tag = {i: t for t, i in tag2idx.items()}
  y_pred = np.argmax(y_pred, axis=-1)
  y_pred = [[int_to_tag[i] for i in sentence] for sentence in y_pred]

  # Print the predicted named entities for each sentence
  for i, sentence in enumerate(sentences):
      print(f"Sentence {i+1}:")
      print(sentence)
      print("Predicted named entities:")
      print(y_pred[i][:len(sentence.split())])
      print()

In [54]:
predict_entities(['Steve Jobs lives in Hawaii'])

Sentence 1:
Steve Jobs lives in Hawaii
Predicted named entities:
['B-per', 'O', 'O', 'O', 'B-geo']



In [55]:
# Save the model as an HDF5 file
model.save("ner_model.h5")