In [1]:
!pip install -U pandas keras tensorflow spacy
!python -m spacy download en_core_web_sm

Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting keras
  Downloading keras-3.6.0-py3-none-any.whl.metadata (5.8 kB)
Collecting spacy
  Downloading spacy-3.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Downloading thinc-8.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting blis<1.1.0,>=1.0.0 (from thinc<8.4.0,>=8.3.0->spacy)
  Downloading blis-1.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
INFO: pip is looking at multiple versions of thinc to determine which version is compatible with other requirements. This could take a while.
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Downloading thinc-8.3.1-cp310-cp310-manylinux_2_17_x86_64.many

In [2]:
import pandas as pd
import numpy as np
from itertools import chain
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from sklearn.model_selection import train_test_split
import tensorflow
from tensorflow.keras import Input
import spacy
from spacy import displacy


from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [5]:
data = pd.read_csv('/content/NER dataset.csv', encoding='unicode_escape')
print(data.head())


    Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


In [7]:
import pandas as pd

data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)


data_fillna = data.ffill(axis=0)


data_group = data_fillna.groupby(['Sentence #'], as_index=False)[['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']].agg(lambda x: list(x))


In [8]:
def get_pad_train_test_val(data_group, data):
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))

    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=n_token - 1)

    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]

    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_, tags_, test_size=0.25, train_size=0.75, random_state=2020)

    print('train_tokens length:', len(train_tokens),
          '\ntest_tokens length:', len(test_tokens),
          '\nval_tokens length:', len(val_tokens))

    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(data_group, data)


train_tokens length: 32372 
test_tokens length: 4796 
val_tokens length: 10791


In [9]:
def get_bilstm_lstm_model():
    model = Sequential()
    input_dim = len(list(set(data['Word'].to_list()))) + 1
    output_dim = 64
    input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
    n_tags = len(tag2idx)

    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat'))
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
    model.add(TimeDistributed(Dense(n_tags, activation="relu")))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

def train_model(X, y, model):
    loss = []
    for i in range(25):
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

model_bilstm_lstm = get_bilstm_lstm_model()
results = pd.DataFrame()
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)




[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 4s/step - accuracy: 0.8257 - loss: 1.7563 - val_accuracy: 0.9681 - val_loss: 0.3513
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 4s/step - accuracy: 0.9674 - loss: 0.3977 - val_accuracy: 0.9681 - val_loss: 0.3292
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 4s/step - accuracy: 0.9677 - loss: 0.3553 - val_accuracy: 0.9681 - val_loss: 0.3140
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 4s/step - accuracy: 0.9677 - loss: 0.3300 - val_accuracy: 0.9682 - val_loss: 0.2880
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 5s/step - accuracy: 0.9677 - loss: 0.3060 - val_accuracy: 0.9682 - val_loss: 0.2353
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 4s/step - accuracy: 0.9677 - loss: 0.2733 - val_accuracy: 0.9682 - val_loss: 0.2396
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 5s/step - accuracy: 0.9678 - loss:

In [18]:
text = "My favorite movie is The Shawshank Redemption."
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
displacy.render(doc, style='ent', jupyter=True)


In [None]:
"Samantha is a software engineer living in San Francisco."
"My brother, Raj, graduated from Stanford University in 2022."


"The Eiffel Tower is located in Paris, France."
"I visited Tokyo during my summer vacation."


"Apple Inc. is launching a new product next month."
"The United Nations held a conference on climate change."

"The event is scheduled for December 15, 2024."
"I have a meeting at 3 PM on Thursday."


"My favorite movie is The Shawshank Redemption."
"The COVID-19 pandemic has affected many countries around the world."