# Introduction

This is my first NLP project, the goal is to named entity in a sentence.

In [7]:
import tensorflow as tf
import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [8]:
data = pd.read_csv("ner_dataset.csv", encoding="unicode-escape")
data = data.fillna(method="ffill")

print("Number of sentences: ", len(data.groupby(['Sentence #'])))

words = list(set(data["Word"].values))
n_words = len(words)
print("Number of words in the dataset: ", n_words)

tags = list(set(data["Tag"].values))
print("Tags:", tags)
n_tags = len(tags)
print("Number of Labels: ", n_tags)

print("What the dataset looks like:")
# Show the first 10 rows
data.head(n=10) 

  data = data.fillna(method="ffill")


Number of sentences:  47959
Number of words in the dataset:  35177
Tags: ['O', 'I-geo', 'I-per', 'B-gpe', 'B-geo', 'I-org', 'I-tim', 'B-art', 'I-gpe', 'B-eve', 'B-per', 'B-tim', 'I-eve', 'B-nat', 'B-org', 'I-art', 'I-nat']
Number of Labels:  17
What the dataset looks like:


Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [9]:
from itertools import chain

def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

In [10]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)
data_fillna = data.fillna(method='ffill', axis=0)

  data_fillna = data.fillna(method='ffill', axis=0)


In [11]:
data_group = data_fillna.groupby(
    ['Sentence #'], as_index=False
)[['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']].agg(lambda x: list(x))


In [13]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def get_pad_train_test_val(data_group, data):

    #get max token and tag length
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))

    #Pad tokens (X var)    
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    
    #Split train, test and validation set
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_,tags_,test_size = 0.25,train_size =0.75, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )
    
    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(data_group, data)

train_tokens length: 32372 
train_tokens length: 32372 
test_tokens length: 4796 
test_tags: 4796 
val_tokens: 10791 
val_tags: 10791


In [87]:
train_tokens[0]

array([34295, 28818,  5754, 14728, 15933, 26728, 11928,  1213, 16406,
       29188, 19486, 32051, 21818, 29065, 15325, 11928, 32123, 34785,
        7032, 24194,  1577, 17857,  2211, 17986,  6258, 31324, 29065,
       17712, 29012, 28419, 10853, 35176, 35176, 35176, 35176, 35176,
       35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176,
       35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176,
       35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176,
       35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176,
       35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176,
       35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176,
       35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176,
       35176, 35176, 35176, 35176, 35176])

In [14]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [15]:
input_dim = len(list(set(data['Word'].to_list())))+1
print(input_dim)
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
print(input_length)
n_tags = len(tag2idx)
print(n_tags)

35178
104
17


In [70]:
model = Sequential()

# Add Embedding layer
model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

# Add bidirectional LSTM
model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

model.add(Dropout(0.5))

# Add timeDistributed Layer
model.add(TimeDistributed(Dense(n_tags, activation="softmax")))

model.build(input_shape=(None, input_length))

# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()



In [71]:
hist = model.fit(train_tokens, np.array(train_tags), batch_size=1000, verbose=1, epochs=10, validation_split=0.2)

Epoch 1/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 2s/step - accuracy: 0.8282 - loss: 1.0132 - val_accuracy: 0.9681 - val_loss: 0.1835
Epoch 2/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 2s/step - accuracy: 0.9676 - loss: 0.1734 - val_accuracy: 0.9681 - val_loss: 0.1487
Epoch 3/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 2s/step - accuracy: 0.9676 - loss: 0.1481 - val_accuracy: 0.9681 - val_loss: 0.1349
Epoch 4/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 2s/step - accuracy: 0.9676 - loss: 0.1345 - val_accuracy: 0.9681 - val_loss: 0.1217
Epoch 5/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 2s/step - accuracy: 0.9677 - loss: 0.1212 - val_accuracy: 0.9696 - val_loss: 0.1081
Epoch 6/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 2s/step - accuracy: 0.9697 - loss: 0.1078 - val_accuracy: 0.9726 - val_loss: 0.0955
Epoch 7/10
[1m26/26[0m [32m━━━━━━━━━━

In [72]:
# Évaluer le modèle sur les données de test
loss, accuracy = model.evaluate(test_tokens, np.array(test_tags))

# Afficher les résultats
print("Perte sur les données de test :", loss)
print("Précision sur les données de test :", accuracy)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 38ms/step - accuracy: 0.9845 - loss: 0.0586
Perte sur les données de test : 0.059653569012880325
Précision sur les données de test : 0.9842416048049927


In [73]:
i = np.random.randint(0, test_tokens.shape[0])

In [75]:
p = model.predict(np.array([test_tokens[i]]))
p = np.argmax(p, axis=-1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


In [102]:
np.array([test_tokens[i]])

array([[31392,   546,  1764, 29065,   656,  9594,  9084,  2327, 34465,
        16543, 29173, 25401, 24845, 29111,  8370, 29041,  6663,  5675,
         9594,  7772, 10853, 35176, 35176, 35176, 35176, 35176, 35176,
        35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176,
        35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176,
        35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176,
        35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176,
        35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176,
        35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176,
        35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176,
        35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176, 35176,
        35176, 35176, 35176, 35176, 35176]])

In [77]:
y_true = np.argmax(np.array(test_tags), axis=-1)[i]
print("{:15}{:5}\t {}\n".format("Word","True","Pred"))
print("-"*30)
for w,true,pred in zip(test_tokens[i], y_true, p[0]):
  print("{:15}{:5}\t{}".format(words[w-1], tags[true],tags[pred]))

Word           True 	 Pred

------------------------------
Wikipedia      B-gpe	B-gpe
              O    	O
Hasyim         O    	O
my             O    	O
Spratly        O    	O
1.55           O    	O
Salehi         O    	O
prevalent      B-per	O
Ram            I-per	I-per
urgently       O    	O
Skipper        O    	O
avenge         O    	O
depicting      O    	O
Necdet         O    	O
goods          O    	O
Coffee         O    	O
224            O    	O
Moor           O    	O
1.55           O    	O
sparse         O    	O
Ash            O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times          O    	O
Times        

In [81]:
# Enregistrer l'architecture du modèle
model.save('entity_recognition_model.keras')

# Enregistrer les paramètres de l'optimiseur
model.save_weights('entity_recognition.weights.h5')

In [82]:
import pickle

# Sauvegarder les index de jetons
with open('tokens_to_index.pickle', 'wb') as handle:
    pickle.dump(token2idx, handle, protocol=pickle.HIGHEST_PROTOCOL)
 
# Sauvegarder les index d'étiquettes
with open('tag_to_index.pickle', 'wb') as handle:
    pickle.dump(tag2idx, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [83]:
import pickle

# Sauvegarder les index inverses
with open('index_to_tag.pickle', 'wb') as handle:
    pickle.dump(idx2tag, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('index_to_token.pickle', 'wb') as handle:
    pickle.dump(idx2token, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [136]:
# Fonction pour prétraiter la phrase de l'utilisateur
def preprocess_sentence(sentence):
    # Tokeniser la phrase
    tokens = sentence.split()
    # Convertir les jetons en index
    token_ids = [token2idx.get(token, 0) for token in tokens]
    # Remplir jusqu'à la longueur maximale
    token_ids_padded = np.pad(token_ids, (0, 104 - len(token_ids)), 'constant')
    return tokens, np.array([token_ids_padded])

In [138]:
tokens, processed_sentence = preprocess_sentence("London is my favorite city for christmas")

In [151]:
tokens

['London', 'is', 'my', 'favorite', 'city', 'for', 'christmas']

In [140]:
processed_sentence

array([[32700, 12015, 29064, 26738, 31990,  6057,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]])

In [155]:
predicted_tags = model.predict(processed_sentence)
# Convertir les indices en étiquettes
predicted_tags = np.argmax(predicted_tags, axis=-1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step


In [163]:
# Fonction pour étiqueter la phrase
def label_sentence(sentence):
    # Prétraiter la phrase
    tokens, processed_sentence = preprocess_sentence(sentence)
    # Prédire les étiquettes
    predicted_tags = model.predict(processed_sentence)
    # Convertir les indices en étiquettes
    predicted_tags = np.argmax(predicted_tags, axis=-1)

    sentence_length = len(tokens)

    predicted_tags = predicted_tags[0][:sentence_length]

    tags = [idx2tag[idx] for idx in predicted_tags]

    print("{:15}{:5}\n".format("Word","Pred"))
    print("-"*30)

    for w,pred in zip(tokens, tags):
        print("{}\t{}".format(w, pred))
        
    return list(zip(sentence.split(), predicted_tags))

# Demander à l'utilisateur de saisir une phrase
user_input = input("Entrez une phrase : ")
# Étiqueter la phrase
result = label_sentence(user_input)
# Afficher le résultat
print("Résultat de l'étiquetage : ", result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Word           Pred 

------------------------------
The	O
name	O
Daniel	O
is	O
very	O
common	O
in	O
Spain	B-geo
Résultat de l'étiquetage :  [('The', 0), ('name', 0), ('Daniel', 0), ('is', 0), ('very', 0), ('common', 0), ('in', 0), ('Spain', 4)]
