In [86]:
import nltk
from hazm import *
from nltk.corpus import treebank, conll2000, brown
import pickle
import pandas as pd
from tqdm import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, TimeDistributed, Dense, LSTM, GRU
import itertools

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

from sklearn.model_selection import train_test_split

In [2]:
with open('../data.pkl', 'rb') as ff:
    tagged_sentences = pickle.load(ff)

In [3]:
X = [] # store input sequence
Y = [] # store output sequencefor sentence in tagged_sentences:

for sentence in tqdm(tagged_sentences):
    X_sentence = []
    Y_sentence = []
    for entity in sentence: 
        X_sentence.append(entity[0]) # entity[0] contains the word
        Y_sentence.append(entity[1]) # entity[1] contains corresponding tag
 
    X.append(X_sentence)
    Y.append(Y_sentence)

num_words = len(set([word.lower() for sentence in X for word in sentence]))
num_tags   = len(set([word.lower() for sentence in Y for word in sentence]))
print("Total number of tagged sentences: {}".format(len(X)))
print("Vocabulary size: {}".format(num_words))
print("Total number of tags: {}".format(num_tags))

100%|██████████| 344741/344741 [00:05<00:00, 58738.92it/s] 


Total number of tagged sentences: 344741
Vocabulary size: 147114
Total number of tags: 20


In [4]:
# let’s look at first data point
# this is one data point that will be fed to the RNN
print('sample X: ', X[0], '\n')
print('sample Y: ', Y[0], '\n')

sample X:  ['منبع', ':', ')', 'مجلة', 'سروش', 'هفتگی', '،', 'مصاحبه', 'با', 'رئیس', 'دفتر', 'الجزیره', 'در', 'تهران', '،', '۱۳۸۰', '('] 

sample Y:  ['NOUN', 'PUNCT', 'PUNCT', 'NOUN,EZ', 'NOUN,EZ', 'ADJ', 'PUNCT', 'NOUN', 'ADP', 'NOUN,EZ', 'NOUN,EZ', 'NOUN', 'ADP', 'NOUN', 'PUNCT', 'NUM', 'PUNCT'] 



In [5]:
# In this many-to-many problem, the length of each input and output sequence must be the same.
# Since each word is tagged, it’s important to make sure that the length of input sequence equals the output sequenceprint(“Length of first input sequence : {}”.format(len(X[0])))
print('Length of first output sequence : {}'.format(len(Y[0])))

Length of first output sequence : 17


In [6]:
we = WordEmbedding(model_path='/home/roshan/ebi/word_embedding/resources/cc.fa.300.bin',
                   model_type='fasttext')


In [7]:
vocab_to_index = we.get_vocab_to_index()
vocabs = we.get_vocabs()
vectors = we.get_vectors()


In [8]:
arr = list(vectors)
arr.insert(0, np.zeros(300, ))
vectors = np.array(arr)

In [9]:
# encode X
X_encoded = []
for sent in tqdm(X):
    tmp_list = []
    for word in sent:
        tmp_list.append(vocab_to_index.get(word, 0))
    X_encoded.append(tmp_list)


100%|██████████| 344741/344741 [00:07<00:00, 43243.68it/s]


In [10]:
# encode Y
def create_dict(Y):
    labels = []
    for sent in tqdm(Y):
        for label in sent:
            labels.append(label)
    unique_labels = np.unique(labels).tolist()
    unique_labels.insert(0, 'PAD')
    label_dict = {}
    for i in range(len(unique_labels)):
        label_dict[unique_labels[i]] = i
    return label_dict

label_dict = create_dict(Y)
Y_encoded = []
for sent in tqdm(Y):
    tmp_list = []
    for label in sent:
        tmp_list.append(label_dict[label])
    Y_encoded.append(tmp_list)

id2label = {}
for label, id in label_dict.items():
    id2label[id] = label


100%|██████████| 344741/344741 [00:00<00:00, 370893.18it/s]
100%|██████████| 344741/344741 [00:03<00:00, 89249.03it/s] 


In [11]:
# look at first encoded data point
print("** Raw data point **", "\n", "-"*100, "\n")
print('X: ', X[0], '\n')
print('Y: ', Y[0], '\n')
print()
print("** Encoded data point **", "\n", "-"*100, "\n")
print('X: ', X_encoded[0], '\n')
print('Y: ', Y_encoded[0], '\n')

** Raw data point ** 
 ---------------------------------------------------------------------------------------------------- 

X:  ['منبع', ':', ')', 'مجلة', 'سروش', 'هفتگی', '،', 'مصاحبه', 'با', 'رئیس', 'دفتر', 'الجزیره', 'در', 'تهران', '،', '۱۳۸۰', '('] 

Y:  ['NOUN', 'PUNCT', 'PUNCT', 'NOUN,EZ', 'NOUN,EZ', 'ADJ', 'PUNCT', 'NOUN', 'ADP', 'NOUN,EZ', 'NOUN,EZ', 'NOUN', 'ADP', 'NOUN', 'PUNCT', 'NUM', 'PUNCT'] 


** Encoded data point ** 
 ---------------------------------------------------------------------------------------------------- 

X:  [533, 8, 13, 94932, 5776, 6863, 3, 1697, 11, 395, 1061, 16493, 2, 87, 3, 7444, 12] 

Y:  [12, 18, 18, 13, 13, 1, 18, 12, 3, 13, 13, 12, 3, 12, 18, 14, 18] 



In [12]:
# Pad each sequence to MAX_SEQ_LENGTH using KERAS’ pad_sequences() function. 
# Sentences longer than MAX_SEQ_LENGTH are truncated.
# Sentences shorter than MAX_SEQ_LENGTH are padded with zeroes.# Truncation and padding can either be ‘pre’ or ‘post’. 
# For padding we are using ‘pre’ padding type, that is, add zeroes on the left side.
# For truncation, we are using ‘post’, that is, truncate a sentence from right side.# sequences greater than 100 in length will be truncated
MAX_SEQ_LENGTH = 60
X_padded = pad_sequences(X_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
Y_padded = pad_sequences(Y_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")

# print the first sequence
print(X_padded[0], "\n"*3)
print(Y_padded[0])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0   533     8    13 94932  5776
  6863     3  1697    11   395  1061 16493     2    87     3  7444    12] 



[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 12 18 18 13 13
  1 18 12  3 13 13 12  3 12 18 14 18]


In [13]:
EMBEDDING_SIZE  = 300
VOCABULARY_SIZE = len(vectors)
embedding_weights = vectors
word2id = we.get_vocab_to_index()

In [14]:
Y = to_categorical(Y_padded)
X = X_padded
NUM_CLASSES = len(Y[0][0])

In [18]:
# create architecture
rnn_model = Sequential()
# create embedding layer — usually the first layer in text problems
# vocabulary size — number of unique words in data
rnn_model.add(Embedding(input_dim = VOCABULARY_SIZE, 
# length of vector with which each word is represented
 output_dim = EMBEDDING_SIZE, 
# length of input sequence
 input_length = MAX_SEQ_LENGTH, 
# False — don’t update the embeddings
 trainable = False 
))
# add an RNN layer which contains 64 RNN cells
# True — return whole sequence; False — return single output of the end of the sequence
rnn_model.add(SimpleRNN(64, 
 return_sequences=True
))
# add time distributed (output at each sequence) layer
rnn_model.add(TimeDistributed(Dense(NUM_CLASSES, activation='softmax')))
#compile model
rnn_model.compile(loss      =  'categorical_crossentropy',
                  optimizer =  'adam',
                  metrics   =  ['acc'])
# check summary of the model
rnn_model.summary()

2023-07-20 07:15:56.932395: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-07-20 07:15:56.932461: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: dev5
2023-07-20 07:15:56.932490: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: dev5
2023-07-20 07:15:56.932694: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 525.105.17
2023-07-20 07:15:56.932724: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:203] kernel reported version is: 525.105.17
2023-07-20 07:15:56.932732: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:309] kernel version seems to match DSO: 525.105.17
2023-07-20 07:15:56.999253: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2400001200 exceeds 10% of free syste

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 60, 300)           600000300 
                                                                 
 simple_rnn (SimpleRNN)      (None, 60, 64)            23360     
                                                                 
 time_distributed (TimeDist  (None, 60, 21)            1365      
 ributed)                                                        
                                                                 
Total params: 600025025 (2.24 GB)
Trainable params: 24725 (96.58 KB)
Non-trainable params: 600000300 (2.24 GB)
_________________________________________________________________


In [79]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=10)

In [22]:
#fit model
rnn_training = rnn_model.fit(X_train, Y_train, batch_size=128,
                            epochs=10, validation_data=(X_test, Y_test))

Epoch 1/10


2023-07-20 07:18:39.214152: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1563740640 exceeds 10% of free system memory.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [80]:
Y_test = Y_test.argmax(axis=-1)
Y_test_flatten_tmp = Y_test.flatten()
Y_test_flatten = Y_test_flatten_tmp[Y_test_flatten_tmp > 0]
Y_test_flatten = [id2label[id] for id in Y_test_flatten]

In [64]:
Y_pred = rnn_model.predict(X_test)
Y_pred = Y_pred.argmax(axis=-1)



In [82]:
Y_pred_flatten = Y_pred.flatten()
Y_pred_flatten = Y_pred_flatten[Y_test_flatten_tmp > 0]
Y_pred_flatten = [id2label[id] for id in Y_pred_flatten]

In [83]:
len(Y_pred[0]), len(Y_test[0])

(60, 60)

In [84]:
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
y_test = Y_test_flatten
y_pred = Y_pred_flatten


print(classification_report(y_test, y_pred))


print('Precision                                   : %.4f'%precision_score(y_test, y_pred, average='weighted'))
print('Recall                                      : %.4f'%recall_score(y_test, y_pred, average='weighted'))
print('F1-Score                                    : %.4f'%f1_score(y_test, y_pred, average='weighted'))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         ADJ       0.60      0.63      0.62     64435
      ADJ,EZ       0.58      0.25      0.35     22023
         ADP       0.99      1.00      0.99    108518
      ADP,EZ       0.91      0.94      0.93     10746
         ADV       0.80      0.68      0.74     17353
      ADV,EZ       0.93      0.85      0.88      1077
       CCONJ       0.81      0.95      0.88     55705
    CCONJ,EZ       0.99      0.69      0.81        97
         DET       0.88      0.92      0.90     21398
      DET,EZ       0.77      0.86      0.81      2216
        INTJ       0.83      0.97      0.89        59
        NOUN       0.65      0.64      0.65    202147
     NOUN,EZ       0.64      0.71      0.67    173016
         NUM       0.86      0.80      0.83     24995
      NUM,EZ       0.59      0.14      0.22      1814
         PAD       0.00      0.00      0.00         0
        PRON       0.90      0.88      0.89     24147
     PRON,EZ       0.57    

  _warn_prf(average, modifier, msg_start, len(result))


Recall                                      : 0.7783
F1-Score                                    : 0.7762


In [89]:
# create architecture
lstm_model = Sequential()
# vocabulary size — number of unique words in data
# length of vector with which each word is represented
lstm_model.add(Embedding(input_dim = VOCABULARY_SIZE, 
 output_dim = EMBEDDING_SIZE, 
# length of input sequence
input_length = MAX_SEQ_LENGTH, 
# word embedding matrix
weights = [embedding_weights],
# True — update embeddings_weight matrix
trainable = True 
))
# add an LSTM layer which contains 64 LSTM cells
# True — return whole sequence; False — return single output of the end of the sequence
lstm_model.add(LSTM(64, return_sequences=True))
lstm_model.add(TimeDistributed(Dense(NUM_CLASSES, activation='softmax')))
#compile model
lstm_model.compile(loss      =  'categorical_crossentropy',
                  optimizer =  'adam',
                  metrics   =  ['acc'])
# check summary of the model
lstm_model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 60, 300)           600000300 
                                                                 
 lstm_1 (LSTM)               (None, 60, 64)            93440     
                                                                 
 time_distributed_2 (TimeDi  (None, 60, 21)            1365      
 stributed)                                                      
                                                                 
Total params: 600095105 (2.24 GB)
Trainable params: 600095105 (2.24 GB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [91]:
lstm_training = lstm_model.fit(X_train, Y_train, batch_size=128,
                                epochs=10, validation_data=(X_test, Y_test))

Epoch 1/10
   2/2424 [..............................] - ETA: 5:11:36 - loss: 3.0337 - acc: 0.5661

KeyboardInterrupt: 

In [90]:
# create architecture
lstm_model = Sequential()
# vocabulary size — number of unique words in data
# length of vector with which each word is represented
lstm_model.add(Embedding(input_dim = VOCABULARY_SIZE, 
 output_dim = EMBEDDING_SIZE, 
# length of input sequence
input_length = MAX_SEQ_LENGTH, 
# word embedding matrix
weights = [embedding_weights],
# True — update embeddings_weight matrix
trainable = True 
))
# add an LSTM layer which contains 64 LSTM cells
# True — return whole sequence; False — return single output of the end of the sequence
lstm_model.add(GRU(64, return_sequences=True))
lstm_model.add(TimeDistributed(Dense(NUM_CLASSES, activation='softmax')))
#compile model
lstm_model.compile(loss      =  'categorical_crossentropy',
                  optimizer =  'adam',
                  metrics   =  ['acc'])
# check summary of the model
lstm_model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 60, 300)           600000300 
                                                                 
 gru (GRU)                   (None, 60, 64)            70272     
                                                                 
 time_distributed_3 (TimeDi  (None, 60, 21)            1365      
 stributed)                                                      
                                                                 
Total params: 600071937 (2.24 GB)
Trainable params: 600071937 (2.24 GB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
gru_training = lstm_model.fit(X_train, Y_train, batch_size=128,
                              epochs=10, validation_data=(X_test, Y_test))