# SET UP

In [1]:
!pip install tensorflow==1.15.0

You should consider upgrading via the '/home/andres/Documents/2021-2/NER-wikiner/venv/bin/python -m pip install --upgrade pip' command.[0m


In [1]:
import tensorflow.compat.v1 as tf
import pandas as pd
import numpy as np
#import spacy
#import spacy_spanish_lemmatizer

from evaluation import precision_recall_f1
from collections import defaultdict
from functools import reduce

#nlp = spacy.load("es_core_news_sm")

# Read Data

In [3]:
def read_data(file_name):
  """Read a tagged corpus with one sentence per line.

  Every line is split on whitespace; each resulting item is split on '|' and
  its first field is taken as the token and its third field as the tag.
  Lines that contain no items are skipped.

  Returns a pair (tok, tag) of parallel lists: one list of tokens and one
  list of tags per sentence.
  """
  sentence_tokens = []
  sentence_tags = []
  with open(file_name, 'r', encoding='utf-8') as file:
    for line in file:
      items = [item.split('|') for item in line.split()]
      if not items:
        # Blank line: nothing to record.
        continue
      sentence_tokens.append([fields[0] for fields in items])
      sentence_tags.append([fields[2] for fields in items])
  return sentence_tokens, sentence_tags

In [4]:
# Parse the corpus into two parallel lists: tokens per sentence and tags per sentence.
tok, tag = read_data('wiki_ner_utf.bio')

In [36]:
# One row per sentence: 'tokens' and 'tags' hold the parallel lists returned by read_data.
data = pd.DataFrame({'tokens': tok, 'tags':tag})
data.head()

Unnamed: 0,tokens,tags
0,"[El, Principado, de, Andorra, es, un, pequeño,...","[O, I-LOC, I-LOC, I-LOC, O, O, O, O, O, O, O, ..."
1,"[Su, territorio, ,, con, capital, en, Andorra,...","[O, O, O, O, O, O, I-LOC, I-LOC, I-LOC, O, O, ..."
2,"[No, tiene, fuerzas, armadas, propias, y, su, ...","[O, O, O, O, O, O, O, O, O, O, O, I-LOC, O, I-..."
3,"[Durante, mucho, tiempo, pobre, y, aislado, ,,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, I-M..."
4,"[Debido, a, la, fertilidad, de, las, tierras, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [37]:
def get_lemma(s, pipeline=None):
    """Return the lemmas of the words in ``s`` as a new list.

    s: list of token strings (one sentence). It is left untouched, so the raw
       tokens stay available and re-running the calling cell is safe.
    pipeline: callable that turns a string into an iterable of objects exposing
       ``lemma_``. When omitted, the notebook-level ``nlp`` object is used; it
       must have been loaded beforehand.

    If the pipeline splits a word into several pieces, the lemma of the last
    piece is kept; if it yields nothing, the word itself is kept.
    """
    if pipeline is None:
        pipeline = nlp
    lemmas = []
    for word in s:
        lemma = word
        for token in pipeline(word):
            lemma = token.lemma_
        lemmas.append(lemma)
    return lemmas

# Replace every token by its lemma. get_lemma relies on the spaCy object `nlp`,
# whose loading is commented out in the import cell.
data['tokens'] = data['tokens'].apply(get_lemma)
data.head()

Unnamed: 0,tokens,tags
0,"[el, Principado, de, Andorra, ser, uno, pequeñ...","[O, I-LOC, I-LOC, I-LOC, O, O, O, O, O, O, O, ..."
1,"[su, territorio, ,, con, capital, en, Andorra,...","[O, O, O, O, O, O, I-LOC, I-LOC, I-LOC, O, O, ..."
2,"[no, tener, fuerza, armada, propio, y, su, def...","[O, O, O, O, O, O, O, O, O, O, O, I-LOC, O, I-..."
3,"[durante, mucho, tiempo, pobre, y, aislado, ,,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, I-M..."
4,"[debido, a, el, fertilidad, de, el, tierra, ,,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [3]:
#data.to_csv('train_test.tsv', sep='\t', index=False)

def str2list(s):
    """Turn the text form of a list of strings back into a Python list.

    s: a string such as "['El', 'Principado']".

    The text is first parsed as a Python literal, which keeps apostrophes,
    quotes and commas that are part of a token. If it is not a valid literal
    list of strings (for example the items are unquoted), the function falls
    back to the plain text split: brackets and quote characters are removed
    and the rest is split on ', '.
    """
    import ast  # local import: only this helper needs it

    try:
        parsed = ast.literal_eval(s.strip())
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
        return parsed

    # Fallback for text that is not a well-formed list literal.
    s = s.strip('][')
    s = s.strip()
    s = s.replace("'", '')
    s = s.replace('"', '')
    return s.split(', ')

# Reload the cached frame and turn the stringified list columns back into lists.
# NOTE(review): 'train_test.tsv' is only written by the commented-out to_csv call above;
# the recorded run of this cell failed with FileNotFoundError.
data = pd.read_csv('train_test.tsv', sep='\t')
data['tokens'] = data['tokens'].apply(str2list)
data['tags'] = data['tags'].apply(str2list)
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'train_test.tsv'

In [2]:
import pickle
# Restore the vocabularies and the data frame from a single pickle file.
# The objects are read back in the order shown here.
# pickle.load can execute arbitrary code: only open files you created yourself.
with open("data_dics.pkl", "rb") as f:
    token2idx = pickle.load(f)
    idx2token = pickle.load(f)
    tag2idx = pickle.load(f)
    idx2tag = pickle.load(f)
    data = pickle.load(f)

# Statistical Analysis

In [None]:
# Flatten the per-sentence tag lists in one linear pass. Concatenating the lists
# pairwise (x + y) copies the growing accumulator at every step, which is
# quadratic in the number of tags.
tags = [t for sentence_tags in data['tags'].to_list() for t in sentence_tags]
# One row per tag occurrence; the constant 'count' column is summed up by the later group-by.
tags = pd.DataFrame({'tags':tags, 'count': [1]*len(tags)})
tags.head()

In [None]:
def remove_pre(s):
  """Drop the two-character prefix (e.g. 'I-') from a tag; the tag 'O' is returned unchanged."""
  return s if s == 'O' else s[2:]

# Strip the prefix so that tags are grouped by entity type only,
# then count the occurrences of each type.
tags['tags'] = tags['tags'].apply(remove_pre)
tags = tags.groupby('tags', as_index=False).count()
tags

In [None]:
import matplotlib
# Use the STIX fonts for both math text and regular text in all figures.
matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'

In [None]:
import matplotlib.pyplot as plt

# Entity types to plot: every tag except the non-entity label 'O'.
# Sorting fixes the slice order (and thus the colours) between runs; a bare set has no stable order.
labels = sorted(set(tags['tags'].unique()) - {'O'})
# ax.pie expects a 1-D sequence of numbers, so reduce each selection to a scalar.
sizes = [int(tags.loc[tags['tags'] == label, 'count'].sum()) for label in labels]

fig1, ax1 = plt.subplots()
# Four colours: the evaluation reports four entity types (LOC, MISC, ORG, PER).
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        colors=['steelblue', 'royalblue', 'cornflowerblue', 'lightsteelblue'],
        startangle=90)
ax1.axis('equal')
plt.title(r'Entities Proportion')
plt.savefig('Img/Labels.pdf', format='pdf')

# Build Dictionaries

In [17]:
def build_dict(tokens_or_tags, special_tokens):
    """
        Build the index mappings for a vocabulary of tokens or tags.

        tokens_or_tags: a list of lists of tokens or tags
        special_tokens: some special tokens; they are indexed first, so the
            first special token always gets index 0

        Returns a pair (tok2idx, idx2tok):
          tok2idx: plain dict mapping every distinct token/tag to its index
          idx2tok: list such that idx2tok[tok2idx[t]] == t
        Indices follow the order of first appearance. Items that were never
        seen are not mapped: looking them up in tok2idx raises KeyError.
    """
    tok2idx = {}
    idx2tok = []

    def register(item):
        # The dict doubles as the "already seen" set, so this check is O(1)
        # instead of a scan over the whole vocabulary list.
        if item not in tok2idx:
            tok2idx[item] = len(idx2tok)
            idx2tok.append(item)

    # Special tokens first, so that they occupy the lowest indices.
    for token in special_tokens:
        register(token)

    # Then every token/tag of the data that is not already indexed.
    for token_or_tag_list in tokens_or_tags:
        for token_or_tag in token_or_tag_list:
            register(token_or_tag)

    return tok2idx, idx2tok

In [39]:
# build_dict indexes the special entries first, so '<UNK>' gets index 0,
# '<PAD>' index 1 and the tag 'O' index 0.
special_tokens = ['<UNK>', '<PAD>']
special_tags = ['O']

# Create dictionaries (token <-> index and tag <-> index)
token2idx, idx2token = build_dict(data['tokens'].to_list(), special_tokens)
tag2idx, idx2tag = build_dict(data['tags'], special_tags)

In [44]:
# Save dicts
import pickle
# Write each mapping to its own pickle file in the working directory.
with open('token2idx.pkl', 'wb') as f:
    pickle.dump(token2idx, f)
    
with open('idx2token.pkl', 'wb') as f:
    pickle.dump(idx2token, f)

with open('tag2idx.pkl', 'wb') as f:
    pickle.dump(tag2idx, f)
    
with open('idx2tag.pkl', 'wb') as f:
    pickle.dump(idx2tag, f)

In [None]:
import pickle

# Read the four mappings back from their individual pickle files.
# pickle.load can execute arbitrary code: only open files you created yourself.
with open('token2idx.pkl', 'rb') as f:
    token2idx = pickle.load(f)
    
with open('idx2token.pkl', 'rb') as f:
    idx2token = pickle.load(f)

with open('tag2idx.pkl', 'rb') as f:
    tag2idx = pickle.load(f)
    
with open('idx2tag.pkl', 'rb') as f:
    idx2tag = pickle.load(f)

In [28]:
import pickle

# Alternative to the per-file loads: restore all mappings and the data frame
# from one pickle file, in the order shown here.
# pickle.load can execute arbitrary code: only open files you created yourself.
with open("data_dics.pkl", "rb") as f:
    token2idx = pickle.load(f)
    idx2token = pickle.load(f)
    tag2idx = pickle.load(f)
    idx2tag = pickle.load(f)
    data = pickle.load(f)

In [18]:
def words2idxs(tokens_list):
    """Map tokens to vocabulary indices.

    Tokens missing from ``token2idx`` are mapped to the index of '<UNK>'
    instead of raising KeyError, so text outside the training vocabulary can
    be encoded as well.
    """
    return [token2idx[word] if word in token2idx else token2idx['<UNK>'] for word in tokens_list]

def tags2idxs(tags_list):
    """Translate tag strings into their indices using the notebook-level ``tag2idx`` mapping."""
    indices = []
    for tag_name in tags_list:
        indices.append(tag2idx[tag_name])
    return indices

def idxs2words(idxs):
    """Translate token indices back into tokens using the notebook-level ``idx2token`` list."""
    words = []
    for position in idxs:
        words.append(idx2token[position])
    return words

def idxs2tags(idxs):
    """Translate tag indices back into tag strings using the notebook-level ``idx2tag`` list."""
    tag_names = []
    for position in idxs:
        tag_names.append(idx2tag[position])
    return tag_names

# Build Model

In [3]:
def batches_generator(batch_size, tokens, tags,
                      shuffle=True, allow_smaller_last_batch=True):
    """Yield padded batches of token indices and tag indices.

    batch_size: number of sentences per batch
    tokens, tags: parallel sequences of sentences (lists of tokens / tags)
    shuffle: visit the sentences in a random order when True
    allow_smaller_last_batch: also yield the final, incomplete batch

    Each yielded item is a triple (x, y, lengths) of int32 arrays: x and y
    have one row per sentence, filled up with the '<PAD>' token index and the
    'O' tag index respectively; lengths holds the number of tokens per row.
    """
    total = len(tokens)
    order = np.random.permutation(total) if shuffle else np.arange(total)

    n_batches = total // batch_size
    if allow_smaller_last_batch and total % batch_size:
        n_batches = n_batches + 1

    for k in range(n_batches):
        start = k * batch_size
        stop = min(start + batch_size, total)
        rows = stop - start

        token_rows = []
        tag_rows = []
        widest = 0
        for idx in order[start:stop]:
            token_rows.append(words2idxs(tokens[idx]))
            tag_rows.append(tags2idxs(tags[idx]))
            # The batch width is driven by the longest tag sequence.
            widest = max(widest, len(tags[idx]))

        # Start from arrays that contain only padding, then copy each sentence in.
        x = np.ones([rows, widest], dtype=np.int32) * token2idx['<PAD>']
        y = np.ones([rows, widest], dtype=np.int32) * tag2idx['O']
        lengths = np.zeros(rows, dtype=np.int32)
        for n in range(rows):
            token_row = token_rows[n]
            utt_len = len(token_row)
            x[n, :utt_len] = token_row
            lengths[n] = utt_len
            y[n, :utt_len] = tag_rows[n]
        yield x, y, lengths

In [4]:
class BiLSTMModel():
    """Empty shell for the tagger; its methods are attached to the class in the following cells."""
    pass

In [5]:
def declare_placeholders(self):
    """Specifies placeholders for the model.

    The function is attached below as a classmethod, so ``self`` is the
    BiLSTMModel class and every placeholder is stored as a class attribute.
    """

    # Placeholders for input and ground truth output.
    # Both are int32 matrices whose two dimensions are left unspecified.
    self.input_batch = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_batch') 
    self.ground_truth_tags = tf.placeholder(dtype=tf.int32,shape=[None,None], name='ground_truth_tags')
  
    # Placeholder for lengths of the sequences (int32 vector).
    self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths') 
    
    # Placeholder for a dropout keep probability. If we don't feed
    # a value for this placeholder, it will be equal to 1.0.
    self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])
    
    # Placeholder for a learning rate (tf.float32); 0.1 when no value is fed.
    self.learning_rate_ph = tf.placeholder_with_default(tf.cast(0.1, tf.float32), shape=[])

# Attach to the class under the literal attribute name '__declare_placeholders'.
BiLSTMModel.__declare_placeholders = classmethod(declare_placeholders)

In [6]:
def build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, n_tags):
    """Specifies bi-LSTM architecture and computes logits for inputs.

    vocabulary_size: number of rows of the embedding matrix
    embedding_dim: size of each embedding vector
    n_hidden_rnn: number of units of each LSTM cell
    n_tags: size of the last dimension of the logits

    Stores the result in ``self.logits``. Attached below as a classmethod,
    so ``self`` is the BiLSTMModel class.
    """
    
    # Create embedding variable (tf.Variable) with dtype tf.float32.
    # Initial values are standard normal samples scaled by 1 / sqrt(embedding_dim).
    initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)
    embedding_matrix_variable = tf.Variable(initial_embedding_matrix,dtype=tf.float32)
    
    # Create RNN cells (for example, tf.nn.rnn_cell.BasicLSTMCell) with n_hidden_rnn number of units 
    # and dropout (tf.nn.rnn_cell.DropoutWrapper), initializing all *_keep_prob with dropout placeholder.
    forward_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(num_units=n_hidden_rnn),
                                                 input_keep_prob=self.dropout_ph,output_keep_prob=self.dropout_ph,state_keep_prob=self.dropout_ph)
    backward_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(num_units=n_hidden_rnn),
                                                 input_keep_prob=self.dropout_ph,output_keep_prob=self.dropout_ph,state_keep_prob=self.dropout_ph)

    # Look up embeddings for self.input_batch (tf.nn.embedding_lookup).
    # Shape: [batch_size, sequence_len, embedding_dim].
    embeddings = tf.nn.embedding_lookup(embedding_matrix_variable, self.input_batch)
    
    # Pass them through Bidirectional Dynamic RNN (tf.nn.bidirectional_dynamic_rnn).
    # Shape: [batch_size, sequence_len, 2 * n_hidden_rnn]. 
    # sequence_length is self.lengths and dtype is tf.float32; the final states are discarded.
    (rnn_output_fw, rnn_output_bw), _ =  tf.nn.bidirectional_dynamic_rnn(cell_fw=forward_cell, cell_bw= backward_cell,inputs=embeddings,sequence_length=self.lengths, dtype=tf.float32)
    # Join the forward and backward outputs along the feature axis.
    rnn_output = tf.concat([rnn_output_fw, rnn_output_bw], axis=2)

    # Dense layer on top, without activation.
    # Shape: [batch_size, sequence_len, n_tags].
    self.logits = tf.layers.dense(rnn_output, n_tags, activation=None)
  
# Attach to the class under the literal attribute name '__build_layers'.
BiLSTMModel.__build_layers = classmethod(build_layers)

In [7]:
def compute_predictions(self):
    """Turn the logits into probabilities and pick the most probable tag index.

    Reads ``self.logits`` and stores the result in ``self.predictions``.
    Attached below as a classmethod, so ``self`` is the BiLSTMModel class.
    """
    tag_probabilities = tf.nn.softmax(self.logits)

    # The tags live on the last axis, so the argmax is taken along axis=-1.
    self.predictions = tf.argmax(tag_probabilities, axis=-1)

BiLSTMModel.__compute_predictions = classmethod(compute_predictions)

In [8]:
def compute_loss(self, n_tags, PAD_index):
    """Computes masked cross-entropy loss with logits.

    n_tags: depth of the one-hot encoding of the ground truth tags
    PAD_index: token index that marks padding in ``self.input_batch``

    Stores the result in ``self.loss``. Attached below as a classmethod,
    so ``self`` is the BiLSTMModel class.
    """
    
    # Cross entropy between the one-hot ground truth and the logits
    # (tf.nn.softmax_cross_entropy_with_logits_v2).
    ground_truth_tags_one_hot = tf.one_hot(self.ground_truth_tags, n_tags)
    loss_tensor = tf.nn.softmax_cross_entropy_with_logits_v2(ground_truth_tags_one_hot,self.logits)
    
    # 1.0 where the input token differs from PAD_index, 0.0 where it is padding.
    mask = tf.cast(tf.not_equal(self.input_batch, PAD_index), tf.float32)
    # Loss which doesn't operate with <PAD> tokens: the masked loss is summed
    # along the last axis and divided by the number of unmasked positions,
    # then tf.reduce_mean averages these values.
    self.loss = tf.reduce_mean(tf.reduce_sum(tf.multiply(loss_tensor, mask),axis=-1) / tf.reduce_sum(mask,axis=-1))

# Attach to the class under the literal attribute name '__compute_loss'.
BiLSTMModel.__compute_loss = classmethod(compute_loss)

In [9]:
def perform_optimization(self):
    """Create the Adam optimizer and the training operation.

    Reads ``self.loss`` and ``self.learning_rate_ph``; stores ``self.optimizer``,
    ``self.grads_and_vars`` (with clipped gradients) and ``self.train_op``.
    Attached below as a classmethod, so ``self`` is the BiLSTMModel class.
    """
    self.optimizer = tf.train.AdamOptimizer(self.learning_rate_ph)
    self.grads_and_vars = self.optimizer.compute_gradients(self.loss)

    # Clip only the gradients (tf.clip_by_norm); the variables are passed through untouched.
    max_norm = tf.cast(1.0, tf.float32)
    clipped_pairs = []
    for grad, var in self.grads_and_vars:
        clipped_pairs.append([tf.clip_by_norm(grad, max_norm), var])
    self.grads_and_vars = clipped_pairs

    self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)

BiLSTMModel.__perform_optimization = classmethod(perform_optimization)

In [10]:
def init_model(self, vocabulary_size, n_tags, embedding_dim, n_hidden_rnn, PAD_index):
    """Build the whole graph: placeholders, layers, predictions, loss and training op.

    The steps run in this order because each one reads attributes stored by
    the previous ones (e.g. the loss needs the logits).
    """
    self.__declare_placeholders()
    self.__build_layers(vocabulary_size, embedding_dim, n_hidden_rnn, n_tags)
    self.__compute_predictions()
    self.__compute_loss(n_tags, PAD_index)
    self.__perform_optimization()

# Installed as a classmethod: constructing BiLSTMModel(...) runs init_model with the class as `self`.
BiLSTMModel.__init__ = classmethod(init_model)

In [11]:
def train_on_batch(self, session, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability):
    """Run one training step on a batch.

    Feeds the batch, the learning rate and the dropout keep probability into
    the model's placeholders and runs ``self.train_op`` in ``session``.
    Nothing is returned.
    """
    feed_dict = dict([
        (self.input_batch, x_batch),
        (self.ground_truth_tags, y_batch),
        (self.learning_rate_ph, learning_rate),
        (self.dropout_ph, dropout_keep_probability),
        (self.lengths, lengths),
    ])
    session.run(self.train_op, feed_dict=feed_dict)

BiLSTMModel.train_on_batch = classmethod(train_on_batch)

In [12]:
def predict_for_batch(self, session, x_batch, lengths):
    """Evaluate ``self.predictions`` for a batch and return the result.

    Only the inputs and their lengths are fed, so the dropout placeholder
    keeps its default value.
    """
    feed_dict = {}
    feed_dict[self.input_batch] = x_batch
    feed_dict[self.lengths] = lengths
    return session.run(self.predictions, feed_dict=feed_dict)

BiLSTMModel.predict_for_batch = classmethod(predict_for_batch)

# Train

In [13]:
def predict_tags(model, session, token_idxs_batch, lengths):
    """Predict tag indices for a batch and translate indices back to strings.

    Returns (tags_batch, tokens_batch): for every row of the batch, the list
    of predicted tag strings and the list of token strings.
    """
    predicted_idxs = model.predict_for_batch(session, token_idxs_batch, lengths)

    tags_batch = []
    tokens_batch = []
    for tag_row, token_row in zip(predicted_idxs, token_idxs_batch):
        # zip stops at the shorter row, exactly like a position-by-position walk.
        pairs = list(zip(tag_row, token_row))
        tags_batch.append([idx2tag[tag_idx] for tag_idx, _ in pairs])
        tokens_batch.append([idx2token[token_idx] for _, token_idx in pairs])
    return tags_batch, tokens_batch
    
    
def eval_conll(model, session, tokens, tags, short_report=True, batch_size=1):
    """Computes NER quality measures using CONLL shared task script.

    model, session: the tagger and the session used for prediction
    tokens, tags: parallel lists of sentences (lists of tokens / tags)
    short_report: forwarded to precision_recall_f1
    batch_size: number of sentences predicted per session call; the default
        of 1 keeps the original behaviour, larger values mean fewer calls

    Returns whatever precision_recall_f1 returns (it also prints the report).
    """
    
    y_true, y_pred = [], []
    # No shuffling: the sentence order has no role in evaluation.
    for x_batch, y_batch, lengths in batches_generator(batch_size, tokens, tags, shuffle=False):
        tags_batch, tokens_batch = predict_tags(model, session, x_batch, lengths)
        for row in range(len(lengths)):
            if len(x_batch[row]) != len(tags_batch[row]):
                raise Exception("Incorrect length of prediction for the input, "
                                "expected length: %i, got: %i" % (len(x_batch[row]), len(tags_batch[row])))
            predicted_tags = []
            ground_truth_tags = []
            # Padding positions carry the '<PAD>' token and are left out.
            for gt_tag_idx, pred_tag, token in zip(y_batch[row], tags_batch[row], tokens_batch[row]):
                if token != '<PAD>':
                    ground_truth_tags.append(idx2tag[gt_tag_idx])
                    predicted_tags.append(pred_tag)

            # We extend every prediction and ground truth sequence with 'O' tag
            # to indicate a possible end of entity.
            y_true.extend(ground_truth_tags + ['O'])
            y_pred.extend(predicted_tags + ['O'])
    results = precision_recall_f1(y_true, y_pred, print_results=True, short_report=short_report)
    return results

In [14]:
# Start from an empty default graph and use graph mode (placeholders and sessions).
tf.reset_default_graph()
tf.disable_eager_execution()

# Building the model creates the whole graph; sizes come from the dictionaries.
model = BiLSTMModel(vocabulary_size=len(token2idx),
                    n_tags=len(tag2idx),
                    embedding_dim=200,
                    n_hidden_rnn=200,
                    PAD_index=token2idx['<PAD>'])

# Training hyper-parameters.
batch_size = 256
n_epochs = 20
learning_rate = 0.05
# The learning rate is divided by this factor after every epoch.
learning_rate_decay = np.sqrt(2)
dropout_keep_probability = 0.9

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [15]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the sentences as the test set (fixed random_state for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(data['tokens'].to_list(), data['tags'].to_list(), test_size=0.1, random_state=42)
# The last tenth of the remaining sentences is the validation set, the rest is used for training.
train_tokens = X_train[:int(len(X_train) - len(X_train)/10)]
validation_tokens = X_train[int(len(X_train) - len(X_train)/10):]
train_tags = y_train[:int(len(y_train) - len(y_train)/10)]
validation_tags = y_train[int(len(y_train) - len(y_train)/10):]

In [19]:
# Open a session and give every variable of the graph its initial value.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

#saver = tf.train.Saver()

print('Start training... \n')

for epoch in range(n_epochs):
    # For each epoch evaluate the model on train and validation data.
    # The evaluation happens before the training step, so the first report
    # describes the untrained model.
    print('-' * 20 + ' Epoch {} '.format(epoch+1) + 'of {} '.format(n_epochs) + '-' * 20)
    print('Train data evaluation:')
    eval_conll(model, sess, train_tokens, train_tags, short_report=True)
    print('Validation data evaluation:')
    eval_conll(model, sess, validation_tokens, validation_tags, short_report=True)
    
    # Train the model: one pass over the training data in batches
    for x_batch, y_batch, lengths in batches_generator(batch_size, train_tokens, train_tags):
        model.train_on_batch(sess, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability)
        
    # Decaying the learning rate
    learning_rate = learning_rate / learning_rate_decay
    
print('...training finished.')

Start training... 

-------------------- Epoch 1 of 20 --------------------
Train data evaluation:
processed 2939957 tokens with 198775 phrases; found: 1637730 phrases; correct: 13014.

precision:  0.79%; recall:  6.55%; F1:  1.42

Validation data evaluation:
processed 325556 tokens with 21963 phrases; found: 181140 phrases; correct: 1490.

precision:  0.82%; recall:  6.78%; F1:  1.47



2022-10-29 22:52:40.168892: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 77824000 exceeds 10% of system memory.
2022-10-29 22:52:40.281488: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 77824000 exceeds 10% of system memory.
2022-10-29 22:54:38.769563: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 72089600 exceeds 10% of system memory.
2022-10-29 22:54:38.843696: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 72089600 exceeds 10% of system memory.
2022-10-29 23:02:19.380357: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 87244800 exceeds 10% of system memory.


-------------------- Epoch 2 of 20 --------------------
Train data evaluation:
processed 2939957 tokens with 198775 phrases; found: 207281 phrases; correct: 161290.

precision:  77.81%; recall:  81.14%; F1:  79.44

Validation data evaluation:
processed 325556 tokens with 21963 phrases; found: 22735 phrases; correct: 16965.

precision:  74.62%; recall:  77.24%; F1:  75.91

-------------------- Epoch 3 of 20 --------------------
Train data evaluation:
processed 2939957 tokens with 198775 phrases; found: 205566 phrases; correct: 170190.

precision:  82.79%; recall:  85.62%; F1:  84.18

Validation data evaluation:
processed 325556 tokens with 21963 phrases; found: 22739 phrases; correct: 17561.

precision:  77.23%; recall:  79.96%; F1:  78.57

-------------------- Epoch 4 of 20 --------------------
Train data evaluation:
processed 2939957 tokens with 198775 phrases; found: 203052 phrases; correct: 176173.

precision:  86.76%; recall:  88.63%; F1:  87.69

Validation data evaluation:
process

In [18]:
# Create the Saver here: the only other `saver = tf.train.Saver()` line is commented out,
# so on a fresh kernel the name `saver` would otherwise be undefined.
saver = tf.train.Saver()
saver.save(sess, 'out/coso')

'out/coso'

In [23]:
# Export the graph definition of the session in text form to out/saved_model.pbtxt.
tf.train.write_graph(sess.graph.as_graph_def(), 'out/',
                     'saved_model.pbtxt', as_text=True)

'out/saved_model.pbtxt'

In [None]:
from tensorflow.python.tools import freeze_graph
# Freeze the exported text graph together with the checkpoint written by saver.save.
# The output node is taken from the model itself, because the graph built in this
# notebook contains no node called "output/softmax".
freeze_graph.freeze_graph('out/saved_model.pbtxt', "", False,   # input graph, no saver file, text format
                          'out/coso', model.predictions.op.name,  # checkpoint prefix, output node
                           "save/restore_all", "save/Const:0",  # restore op and filename tensor
                           'frozentensorflowModel.pb', True, ""   # output file, clear devices, no initializer nodes
                         )

In [24]:
# Export the same graph definition in binary form to out/saved_model.pb.
tf.train.write_graph(sess.graph.as_graph_def(), 'out/',
                     'saved_model.pb', as_text=False)

'out/saved_model.pb'

In [18]:
# Full (per entity type) reports for the three splits; the test split is only used here.
print('-' * 20 + ' Train set quality: ' + '-' * 20)
train_results = eval_conll(model, sess, train_tokens, train_tags, short_report=False)
print('-' * 20 + ' Validation set quality: ' + '-' * 20)
validation_results = eval_conll(model, sess, validation_tokens, validation_tags, short_report=False)
print('-' * 20 + ' Test set quality: ' + '-' * 20)
test_results = eval_conll(model, sess, X_test, y_test, short_report=False)

-------------------- Train set quality: --------------------
processed 2939957 tokens with 198775 phrases; found: 200052 phrases; correct: 190788.

precision:  95.37%; recall:  95.98%; F1:  95.67

	         LOC: precision:   94.73%; recall:   95.76%; F1:   95.24; predicted:  98175

	        MISC: precision:   92.77%; recall:   93.05%; F1:   92.91; predicted:  26790

	         ORG: precision:   93.88%; recall:   93.21%; F1:   93.55; predicted:  17332

	         PER: precision:   98.11%; recall:   98.56%; F1:   98.34; predicted:  57755

-------------------- Validation set quality: --------------------
processed 325556 tokens with 21963 phrases; found: 22684 phrases; correct: 18623.

precision:  82.10%; recall:  84.79%; F1:  83.42

	         LOC: precision:   82.53%; recall:   86.76%; F1:   84.59; predicted:  11420

	        MISC: precision:   68.71%; recall:   69.91%; F1:   69.31; predicted:  2969

	         ORG: precision:   74.53%; recall:   75.83%; F1:   75.18; predicted:  1869

	    

0

In [21]:
# NOTE(review): every attribute of BiLSTMModel is assigned inside classmethods, i.e. on the
# class, so the `model` instance carries no state of its own. This pickle therefore most
# likely does not contain the trained weights - confirm, and use a tf.train.Saver
# checkpoint to persist the model.
with open('model_session.pkl', 'wb') as f:
    pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

# Predict

In [20]:
def predict(model, session, tokens):
    """Predict the NER tags of tokenized sentences.

    model, session: the tagger and the session used for prediction
    tokens: list of sentences, each one a list of token strings

    Returns one flat list with the predicted tag strings of all sentences,
    concatenated in input order. An empty input gives an empty list.
    """
    
    # batches_generator needs one tag sequence per sentence; the values are
    # never used here, so every sentence gets all-'O' placeholders.
    tags = [['O'] * len(sentence) for sentence in tokens]
    y_pred = []
    # shuffle=False keeps the predictions aligned with the input order.
    for x_batch, y_batch, lengths in batches_generator(1, tokens, tags, shuffle=False):
        tags_batch, tokens_batch = predict_tags(model, session, x_batch, lengths)
        if len(x_batch[0]) != len(tags_batch[0]):
            raise Exception("Incorrect length of prediction for the input, "
                            "expected length: %i, got: %i" % (len(x_batch[0]), len(tags_batch[0])))
        # Positions holding the '<PAD>' token are left out.
        y_pred.extend(pred_tag
                      for pred_tag, token in zip(tags_batch[0], tokens_batch[0])
                      if token != '<PAD>')

    return y_pred

In [21]:
def print_prediction(toks, tags):
  """Print every token next to its tag, one pair per line, separated by a tab."""
  for token_text, tag_text in zip(toks, tags):
    print(token_text, '\t', tag_text)

In [22]:
# Tag one sentence of the test split and print it token by token.
idx = 3
pred = predict(model, sess, [X_test[idx]])
print_prediction(X_test[idx], pred)

ser 	 O
Thomas 	 I-PER
Alva 	 I-PER
Edison 	 I-PER
el 	 O
creador 	 O
, 	 O
además 	 O
, 	 O
del 	 O
formato 	 O
cinematográfico 	 O
por 	 O
excelencia 	 O
, 	 O
el 	 O
35 	 O
mm 	 O
, 	 O
sobre 	 O
uno 	 O
soportir 	 O
de 	 O
nitrato 	 O
de 	 O
celulós 	 O
. 	 O


# Tweets

In [23]:
def words2idxs(tokens_list):
    """Map tokens to vocabulary indices; tokens missing from ``token2idx`` get the '<UNK>' index."""
    indices = []
    for word in tokens_list:
        key = word if word in token2idx else '<UNK>'
        indices.append(token2idx[key])
    return indices

In [24]:
# Load the tweets to tag.
# pickle.load can execute arbitrary code: only open files you created yourself.
with open('test.pkl', 'rb') as f:
    tweets = pickle.load(f)
    
tweets.head()

Unnamed: 0,tokens,lemmas
0,"[Paro, Nacional, ay, está, lindo, el, paro, No...","[Paro, Nacional, ay, estar, lindar, el, parir,..."
1,"[La, gente, sabe, !, !, La, gente, entiende, !...","[La, gente, saber, !, !, La, gente, entender, ..."
2,"[¡, Que, belleza, !, Deja, ordenando, que, le,...","[¡, Que, belleza, !, Deja, ordenar, que, le, a..."
3,"[Guillermo, Lasso, Si, ,, pero, vaya, a, la, A...","[Guillermo, Lasso, Si, ,, pero, ir, a, lo, Asa..."
4,"[Tan, falsa, esta, nota, ,, porque, el, pasado...","[Tan, falso, este, noto, ,, porque, el, pasar,..."


In [25]:
def predict_from_list(l):
    """Tag a single sentence (a list of tokens) with the notebook-level model and session."""
    single_sentence_batch = [l]
    return predict(model, sess, single_sentence_batch)

In [39]:
import tracemalloc
import time

# Measure wall-clock time and traced Python memory of tagging every tweet.
tracemalloc.start()
start_time = time.time()
tweets['predictions'] = tweets['lemmas'].apply(predict_from_list)
predict_time = time.time() - start_time
print(f'{predict_time}s')
# get_traced_memory returns the pair (current, peak) in bytes.
print(tracemalloc.get_traced_memory())
tracemalloc.stop()
tweets.head()

64.26866436004639s
(2448898, 2861973)


Unnamed: 0,tokens,lemmas,predictions
0,"[Paro, Nacional, ay, está, lindo, el, paro, No...","[Paro, Nacional, ay, estar, lindar, el, parir,...","[I-ORG, I-LOC, I-LOC, O, O, O, O, O, O, O, O, O]"
1,"[La, gente, sabe, !, !, La, gente, entiende, !...","[La, gente, saber, !, !, La, gente, entender, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[¡, Que, belleza, !, Deja, ordenando, que, le,...","[¡, Que, belleza, !, Deja, ordenar, que, le, a...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[Guillermo, Lasso, Si, ,, pero, vaya, a, la, A...","[Guillermo, Lasso, Si, ,, pero, ir, a, lo, Asa...","[I-PER, I-PER, I-PER, O, O, O, O, O, I-ORG, I-..."
4,"[Tan, falsa, esta, nota, ,, porque, el, pasado...","[Tan, falso, este, noto, ,, porque, el, pasar,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [40]:
# Same measurement on twice the data (the tweets concatenated with themselves).
tweets_2 = pd.concat([tweets,tweets])
tracemalloc.start()
start_time = time.time()
tweets_2['predictions'] = tweets_2['lemmas'].apply(predict_from_list)
predict_time = time.time() - start_time
print(f'{predict_time}s')
# get_traced_memory returns the pair (current, peak) in bytes.
print(tracemalloc.get_traced_memory())
tracemalloc.stop()
# Show the frame that was just tagged, not the original `tweets`.
tweets_2.head()

156.01088213920593s
(4885159, 5708880)


Unnamed: 0,tokens,lemmas,predictions
0,"[Paro, Nacional, ay, está, lindo, el, paro, No...","[Paro, Nacional, ay, estar, lindar, el, parir,...","[I-ORG, I-LOC, I-LOC, O, O, O, O, O, O, O, O, O]"
1,"[La, gente, sabe, !, !, La, gente, entiende, !...","[La, gente, saber, !, !, La, gente, entender, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[¡, Que, belleza, !, Deja, ordenando, que, le,...","[¡, Que, belleza, !, Deja, ordenar, que, le, a...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[Guillermo, Lasso, Si, ,, pero, vaya, a, la, A...","[Guillermo, Lasso, Si, ,, pero, ir, a, lo, Asa...","[I-PER, I-PER, I-PER, O, O, O, O, O, I-ORG, I-..."
4,"[Tan, falsa, esta, nota, ,, porque, el, pasado...","[Tan, falso, este, noto, ,, porque, el, pasar,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [41]:
# Same measurement on three times the data.
tweets_3 = pd.concat([tweets,tweets,tweets])
tracemalloc.start()
start_time = time.time()
tweets_3['predictions'] = tweets_3['lemmas'].apply(predict_from_list)
predict_time = time.time() - start_time
print(f'{predict_time}s')
# get_traced_memory returns the pair (current, peak) in bytes.
print(tracemalloc.get_traced_memory())
tracemalloc.stop()
# Show the frame that was just tagged, not the original `tweets`.
tweets_3.head()

212.3076093196869s
(7331155, 8564586)


Unnamed: 0,tokens,lemmas,predictions
0,"[Paro, Nacional, ay, está, lindo, el, paro, No...","[Paro, Nacional, ay, estar, lindar, el, parir,...","[I-ORG, I-LOC, I-LOC, O, O, O, O, O, O, O, O, O]"
1,"[La, gente, sabe, !, !, La, gente, entiende, !...","[La, gente, saber, !, !, La, gente, entender, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[¡, Que, belleza, !, Deja, ordenando, que, le,...","[¡, Que, belleza, !, Deja, ordenar, que, le, a...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[Guillermo, Lasso, Si, ,, pero, vaya, a, la, A...","[Guillermo, Lasso, Si, ,, pero, ir, a, lo, Asa...","[I-PER, I-PER, I-PER, O, O, O, O, O, I-ORG, I-..."
4,"[Tan, falsa, esta, nota, ,, porque, el, pasado...","[Tan, falso, este, noto, ,, porque, el, pasar,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [29]:
# Tag every tweet: one predicted tag list per row of lemmas.
tweets['predictions'] = tweets['lemmas'].apply(predict_from_list) 

145.80478239059448s


In [30]:
import psutil

7077

In [25]:
# Persist the tweets frame, including the 'predictions' column, for later use.
with open('results.pkl', 'wb') as f:
    pickle.dump(tweets, f)