In [1]:
import os 
import re
import sys
import numpy as np
import pandas as pd
# import nltk
import tensorflow as tf

In [2]:
# Path of the tagged corpus that prepare_tag_data() parses.
new_filename = "28_01_2020_NewData.txt"

#read eng sentences

        

In [17]:
def prepare_tag_data(filename):
    """Read a tagged corpus and return the cleaned first field of every line.

    For each line the ``<s>`` and ``</t>`` markers are removed, the line is
    split on the ``</s> <t>`` separator, and only the part before the
    separator is kept, with runs of whitespace collapsed to one space.

    Args:
        filename: path of the UTF-8 text file to read.

    Returns:
        list of str, one cleaned sentence per input line.
    """
    eng_lines = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().replace('<s>', '').replace('</t>', '').split('</s> <t>')
            sentence = fields[0].strip()
            # Raw string: '\s' inside a plain literal is an invalid escape sequence.
            sentence = re.sub(r'\s+', ' ', sentence)
            eng_lines.append(sentence)
    return eng_lines

# Base sentences used later by generate_data().
lns = prepare_tag_data(filename=new_filename)

In [18]:
len(lns),lns[:2]

(800640,
 ['rendered Poll library raise MLSDFIELD void ferrari bits dan rangers of periodic explore register load MLSDREG shortcuts quiet until mason attachment cleared relations',
  'Poll MLSDFIELD bits of register MLSDREG until cleared'])

In [5]:
#build params (Hyperparameters)

In [6]:
# Hyperparameters and file locations for the tagging model.
config = {}  # NOTE(review): never populated or read in the visible cells
EPOCHS = 15  # NOTE(review): the visible train(...) call passes epochs=25 instead
BATCH_SIZE = 32  # NOTE(review): not referenced by the visible cells
LR = 0.001  # learning rate; train() decays this global after every epoch
DECAY_LR = 0.9
OPTM = "Adam"  # NOTE(review): not referenced by the visible cells
DROPOUT = 0.3  # value run_epoch() feeds to the `dropout` placeholder
EPOCHS_NO_IMPROVEMENT = 4  # default early-stopping patience of train()
HIDDEN_DIM = 256  # LSTM units per direction (passed to logits_op)
DATA_FILEPATH = "data_folder/data.txt"  # NOTE(review): not referenced by the visible cells
GLOVE_FILEPATH = "glove/glove.6B.100d.txt"
VOCAB_FILENAME = "data_folder/vocabs.txt"
TAG_FILENAME = "data_folder/tags.txt"



In [7]:
#build files from data

In [8]:
def data_batches(data,batch_size=32):
    """Yield the data in batches of at most ``batch_size`` examples.

    Args:
        data: iterable of ``(sentence, tags)`` pairs.
        batch_size: maximum number of examples per batch.

    Yields:
        tuple ``(x_batch, y_batch)`` of two parallel lists; the final batch
        may hold fewer than ``batch_size`` examples.
    """
    x_batch, y_batch = [], []
    for (x, y) in data:
        if len(x_batch) == batch_size:
            yield x_batch, y_batch
            x_batch, y_batch = [], []
        # Sentences whose items are tuples are transposed into parallel sequences.
        if type(x[0]) == tuple:
            x = zip(*x)
        # Append for every example, not only for tuple-valued sentences.
        x_batch += [x]
        y_batch += [y]
    if len(x_batch) != 0:
        yield x_batch, y_batch

In [9]:
# Special tokens shared by the vocabulary and evaluation helpers.
UNK = "$UNK$"  # added to the word vocabulary by the final_vocab(...) call
NUM = "$NUM$"  # replaces all-digit words in get_processing_word()
NONE = "O"  # tag that get_chunks() treats as "outside any chunk"

In [10]:
def get_vocabs(datasets):
    """Collect the distinct words and tags occurring in several datasets.

    Args:
        datasets: iterable of datasets, each an iterable of (words, tags) pairs.

    Returns:
        tuple (vocab_words, vocab_tags) of two sets.
    """
    print("Building Vocab ....")
    word_set, tag_set = set(), set()
    for corpus in datasets:
        for sentence_words, sentence_tags in corpus:
            word_set |= set(sentence_words)
            tag_set |= set(sentence_tags)
    print("-done vocab words {s}--- vocab_tags {d}".format(s=len(word_set),d=len(tag_set)))
    return word_set, tag_set

In [11]:
def get_glove_vocab(path):
    """Return the set of tokens listed in a GloVe text file.

    The token is the first space-separated field of every line.

    Args:
        path: path of the UTF-8 GloVe file.

    Returns:
        set of str.
    """
    print("Building Vocab..glove...")
    with open(path, 'r', encoding='utf-8') as glove_file:
        tokens = {row.strip().split(' ')[0] for row in glove_file}
    print("done. {} tokens in glove".format(len(tokens)))
    return tokens

In [12]:
def final_vocab(vocab_words,vocab_glove,*args):
    """Intersect the dataset and GloVe vocabularies and add extra tokens.

    Args:
        vocab_words: set of words found in the dataset.
        vocab_glove: set of words found in GloVe.
        *args: additional tokens to add to the result.

    Returns:
        a new set; the input sets are left untouched.
    """
    shared = vocab_words & vocab_glove
    shared.update(args)
    return shared
    

In [13]:
def build_vocab(vocab,filename):
    """Write the vocabulary to ``filename``, one token per line.

    No newline is written after the last token.

    Args:
        vocab: sized iterable of tokens.
        filename: path of the UTF-8 file to (over)write.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        last_index = len(vocab) - 1
        for position, token in enumerate(vocab):
            out.write(token if position == last_index else "{}\n".format(token))
    print("written {s} tokens in {d}".format(s=len(vocab), d=filename))


In [14]:
def load_vocab(filename):
    """Map every token of a vocabulary file to its line index.

    Args:
        filename: path of a UTF-8 file with one token per line.

    Returns:
        dict {token: index}; if a token repeats, its last line index wins.
    """
    with open(filename, encoding='utf-8') as vocab_file:
        return {entry.strip(): position for position, entry in enumerate(vocab_file)}

In [15]:
def export_glove_vectors(vocab, glove_filename, filename, dim):
    """Save the GloVe vectors of the vocabulary words as a compressed archive.

    Words of ``vocab`` that do not occur in the GloVe file keep an all-zero row.

    Args:
        vocab: dict {word: row index}.
        glove_filename: path of the UTF-8 GloVe text file.
        filename: output path handed to ``np.savez_compressed``.
        dim: length of each embedding vector.
    """
    matrix = np.zeros([len(vocab), dim], dtype='float32')
    with open(glove_filename, 'r', encoding='utf-8') as glove_file:
        for row in glove_file:
            token, *values = row.strip().split(' ')
            if token not in vocab:
                continue
            vector = np.asarray([float(value) for value in values])
            matrix[vocab[token]] = vector
    np.savez_compressed(filename, embeddings=matrix)

def get_glove_vectors(filename):
    """Load the ``embeddings`` array from a saved ``.npz`` archive.

    Args:
        filename: path of the archive.

    Returns:
        the array stored under the ``embeddings`` key.
    """
    archive = np.load(filename)
    try:
        return archive['embeddings']
    finally:
        archive.close()

In [16]:
#read eng sentences
# Load raw English sentences, one per line (trailing newlines are kept).
# NOTE(review): this rebinds `lns`, which another cell assigns from
# prepare_tag_data(); whichever cell ran last decides what generate_data() sees.
with open('english_sentences_tatoeba.txt','r',encoding='utf-8') as f:
    lns = f.readlines()
print(len(lns),lns[:6])

1275343 ['Hi\n', 'So?\n', 'Ow!\n', 'Ok.\n', 'Oi!\n', 'OK.\n']


In [19]:
def generate_data(lines, max_sents_per_example=6, n_examples=1000):
    """
        Build sentence-segmentation training examples from a list of sentences.

        Every example joins 1..max_sents_per_example randomly chosen sentences,
        each passed through bad_sentence_generator, into one token list.

        Parameters:
        lines (list): Base sentences for data generation.
        max_sents_per_example (int): Maximum number of sentences combined into one example.
        n_examples (int): Number of training examples to be generated.

        Returns:
        list, list: Token lists and matching label lists; the first word of
        every sentence is labelled 'B-sent', every other word 'O'.
    """
    x, y = [], []

    for _ in tqdm(range(n_examples)):
        n_sents = random.randint(1, max_sents_per_example)
        picked = [random.choice(lines) for _ in range(n_sents)]
        # All sentences are picked first, then corrupted, so the random draws
        # happen in the same order as in a plain loop-then-map formulation.
        corrupted = [bad_sentence_generator(sentence, remove_punctuation=random.randint(0, 3))
                     for sentence in picked]

        tokens, labels = [], []
        for sentence in corrupted:
            words = sentence.strip().split()
            tokens.extend(words)
            labels.extend('B-sent' if position == 0 else 'O'
                          for position in range(len(words)))
        x.append(tokens)
        y.append(labels)

    return x, y


def bad_sentence_generator(sent, remove_punctuation = None):
    """
        Returns sentence with completely/ partially removed punctuation.

        Independently of the punctuation mode, the sentence is lower-cased
        with probability 2/3.

        Parameters:
        sent (str): Sentence on which the punctuation removal operation is performed.

        remove_punctuation (int): 0 or 1 removes all punctuation; 2 removes it
        either before or after a randomly selected point; any other value keeps
        the punctuation. When None, a value is drawn uniformly from 0..3.

        Returns:
        str: Sentence with modified punctuation
    """

    # `is None`, not truthiness: an explicit 0 is a valid mode and must not be re-drawn.
    if remove_punctuation is None:
        remove_punctuation = random.randint(0, 3)

    # max(1, ...) keeps the randint range valid for sentences shorter than 3 characters.
    break_point = random.randint(1, max(1, len(sent) - 2))
    lower_case = random.randint(0, 2)

    # Escape the punctuation so that '\', ']', '^' and '-' are literal members of the
    # character class; unescaped, the backslash only escapes ']' and is never matched.
    punctuation_class = '[' + re.escape(string.punctuation) + ']'

    if remove_punctuation <= 1:
        # removing punctuation completely
        sent = re.sub(punctuation_class, '', sent)

    elif remove_punctuation == 2:
        if random.randint(0, 1) == 0:
            # removing punctuation up to the randomly selected point
            sent = re.sub(punctuation_class, '', sent[:break_point]) + sent[break_point:]
        else:
            # removing punctuation after the randomly selected point
            sent = sent[:break_point] + re.sub(punctuation_class, '', sent[break_point:])

    if lower_case <= 1:
        # lower casing sentence
        sent = sent.lower()

    return sent


In [20]:
from tqdm import tqdm
import random
import string
string.punctuation
lns[:10]

['rendered Poll library raise MLSDFIELD void ferrari bits dan rangers of periodic explore register load MLSDREG shortcuts quiet until mason attachment cleared relations',
 'Poll MLSDFIELD bits of register MLSDREG until cleared',
 'domestic Poll MLSDFIELD bits of register MLSDREG until cleared eternal',
 'domestic translator Poll MLSDFIELD bits of register MLSDREG until cleared tremendous',
 'domestic Poll MLSDFIELD bits of register MLSDREG until cleared translator tremendous',
 'eternal tremendous Poll MLSDFIELD bits of register MLSDREG until cleared jul galaxy',
 'cameroon Poll donors amsterdam MLSDFIELD decision stem bits broad contest of princeton bread register denver indirect MLSDREG security commissions until zus var cleared asp',
 'Poll MLSDFIELD bits of register MLSDREG until cleared',
 'arctic Poll MLSDFIELD bits of register MLSDREG until cleared etc',
 'arctic employers Poll MLSDFIELD bits of register MLSDREG until cleared turkey']

In [21]:
# Two example sets built from disjoint slices of `lns`:
# (x, y) from lns[100000:] and (x_, y_) from the first 100000 sentences.
x, y = generate_data(lines=lns[100000:], max_sents_per_example=6, n_examples=500000)
x_, y_ = generate_data(lines=lns[:100000], max_sents_per_example=6, n_examples=100000)

100%|████████████████████████████████████████████████████████████████████████| 500000/500000 [01:12<00:00, 6866.55it/s]
100%|████████████████████████████████████████████████████████████████████████| 100000/100000 [00:12<00:00, 7934.37it/s]


In [22]:
os.listdir('glove')[0]

'glove.6B.100d.txt'

In [23]:
# Pair every token list with its label list.
dataset = list(zip(x, y))

In [24]:
# dataset[1]

In [25]:
#building vocab from glove and dataset 
#combine it 
#write it
# Collect dataset words/tags and persist the tag vocabulary.
vocab_words, vocab_tags = get_vocabs([dataset])
build_vocab(vocab_tags, TAG_FILENAME)

Building Vocab ....
-done vocab words 9536--- vocab_tags 2
written 2 tokens in data_folder/tags.txt


In [26]:
# Tokens for which a GloVe vector exists.
vocab_glove = get_glove_vocab(GLOVE_FILEPATH)

Building Vocab..glove...
done. 400000 tokens in glove


In [27]:
# Keep only dataset words that GloVe knows, plus the NUM and UNK tokens.
vocab = final_vocab(vocab_words, vocab_glove, NUM,UNK)

In [28]:
# Persist the word vocabulary, one token per line.
build_vocab(vocab, VOCAB_FILENAME)

written 9412 tokens in data_folder/vocabs.txt


In [29]:
#giving ids to all vocab
# Ids are the line indices of the files written by build_vocab().
word_2_id = load_vocab(VOCAB_FILENAME)
tag_2_id  = load_vocab(TAG_FILENAME)


In [30]:
#saving npz with embedding of vocab
# np.savez_compressed appends ".npz" to these file names.
# NOTE(review): the second call looks the tag strings up in GloVe; tags that
# are absent from GloVe get all-zero rows. `tag_embeddings` is not used by the
# visible graph-building cells — confirm it is needed.
export_glove_vectors(vocab=word_2_id, glove_filename=GLOVE_FILEPATH, filename='vocab_embeddings', dim=100)
export_glove_vectors(vocab=tag_2_id, glove_filename=GLOVE_FILEPATH, filename='tags_embeddings', dim=100)

In [31]:
#loading the glove vectors from saved npz
# Reload the matrices written by export_glove_vectors().
word_embeddings = get_glove_vectors(filename='vocab_embeddings.npz')
tag_embeddings = get_glove_vectors(filename='tags_embeddings.npz')

In [32]:
#converting them into list of ids

In [33]:
def get_processing_word(vocab_words=None, vocab_chars=None,
                    lowercase=False, chars=False, allow_unk=True):
    """Build a function that converts a word (string) into ids.

    Args:
        vocab_words: dict[word] = idx, or None to leave the word as a string.
        vocab_chars: dict[char] = idx, or None.
        lowercase: lower-case the word before the word lookup.
        chars: when True and vocab_chars is given, also return char ids.
        allow_unk: map out-of-vocabulary words to the "$UNK$" id instead of raising.

    Returns:
        f such that f("cat") = 12345 (word id), or
        f("cat") = ([12, 4, 32], 12345) = (list of char ids, word id)
        when character ids are requested.

    """
    def f(word):
        UNK = "$UNK$"
        wants_chars = vocab_chars is not None and chars == True

        # Character ids come from the word as given, before any lower-casing;
        # characters missing from the character vocabulary are skipped.
        char_ids = None
        if wants_chars:
            char_ids = [vocab_chars[char] for char in word if char in vocab_chars]

        # Normalise the word itself.
        if lowercase:
            word = word.lower()
        if word.isdigit():
            word = NUM

        # Replace the word by its id when a word vocabulary is available.
        if vocab_words is not None:
            if word in vocab_words:
                word = vocab_words[word]
            elif allow_unk:
                word = vocab_words[UNK]
            else:
                raise Exception("Unknow key is not allowed. Check that "\
                                "your vocab (tags?) is correct")

        return (char_ids, word) if wants_chars else word

    return f


In [34]:
# process_words("saff")

In [35]:
# Converters from raw word / raw tag to integer id.
# NOTE(review): allow_unk defaults to True, so an unknown tag is looked up as
# "$UNK$" in tag_2_id — confirm that key exists or pass allow_unk=False.
process_words = get_processing_word(vocab_words=word_2_id)
process_tags = get_processing_word(vocab_words=tag_2_id)

In [36]:
# class CoNLLDataset(object):
#     """Class that iterates over CoNLL Dataset

#     __iter__ method yields a tuple (words, tags)
#         words: list of raw words
#         tags: list of raw tags

#     If processing_word and processing_tag are not None,
#     optional preprocessing is appplied

#     Example:
#         ```python
#         data = CoNLLDataset(filename)
#         for sentence, tags in data:
#             pass
#         ```

#     """
#     def __init__(self, filename, processing_word=None, processing_tag=None,
#                  max_iter=None):
#         """
#         Args:
#             filename: path to the file
#             processing_words: (optional) function that takes a word as input
#             processing_tags: (optional) function that takes a tag as input
#             max_iter: (optional) max number of sentences to yield

#         """
#         self.filename = filename
#         self.processing_word = processing_word
#         self.processing_tag = processing_tag
#         self.max_iter = max_iter
#         self.length = None


#     def __iter__(self):
#         niter = 0
#         with open(self.filename) as f:
#             words, tags = [], []
#             for line in f:
#                 line = line.strip()
#                 if (len(line) == 0 or line.startswith("-DOCSTART-")):
#                     if len(words) != 0:
#                         niter += 1
#                         if self.max_iter is not None and niter > self.max_iter:
#                             break
#                         yield words, tags
#                         words, tags = [], []
#                 else:
#                     print(line)
#                     ls = line.split()
#                     print(ls)
#                     word, tag = ls[0],ls[-1]
#                     print(word),print(tag)
#                     if self.processing_word is not None:
#                         word = self.processing_word(word)
#                     if self.processing_tag is not None:
#                         tag = self.processing_tag(tag)
#                     words += [word]
#                     tags += [tag]


#     def __len__(self):
#         """Iterates once over the corpus to set and store length"""
#         if self.length is None:
#             self.length = 0
#             for _ in self:
#                 self.length += 1

#         return self.length

In [37]:
class CoNLLDataset(object):
    """Iterable over in-memory (words, tags) examples.

    __iter__ yields a tuple (words, tags) per example:
        words: list of words (processed when processing_word is given)
        tags: list of tags (processed when processing_tag is given)

    Example:
        ```python
        data = CoNLLDataset(examples)
        for sentence, tags in data:
            pass
        ```

    """
    def __init__(self, data, processing_word=None, processing_tag=None,
                 max_iter=32):
        """
        Args:
            data: iterable of examples; the first item of an example holds the
                words and the last item holds the tags
            processing_word: (optional) function that takes a word as input
            processing_tag: (optional) function that takes a tag as input
            max_iter: accepted but ignored; the attribute is always set to 32
                and iteration never consults it

        """
        self.data = data
        self.processing_word = processing_word
        self.processing_tag = processing_tag
        # NOTE(review): the max_iter argument is not stored — confirm intent.
        self.max_iter = 32
        self.length = None


    def __iter__(self):
        for example in self.data:
            words, tags = [], []
            raw_words, raw_tags = example[0], example[-1]
            for raw_word, raw_tag in zip(raw_words, raw_tags):
                if self.processing_word is None:
                    word = raw_word
                else:
                    word = self.processing_word(raw_word)
                if self.processing_tag is None:
                    tag = raw_tag
                else:
                    tag = self.processing_tag(raw_tag)
                words.append(word)
                tags.append(tag)
            yield words, tags


    def __len__(self):
        """Count the examples by iterating once; the count is then cached."""
        if self.length is not None:
            return self.length

        self.length = 0
        for _ in self:
            self.length += 1
        return self.length

In [38]:
# with open('train.txt','w',encoding='utf-8') as f:
#     for i,item in enumerate(zip(x,y)):
#         if i != len(x)-1:
#             f.write('{} {}\n'.format(item[0],item[1]))
#         else:
#             f.write('{} {}'.format(item[0],item[1]))
# with open('dev.txt','w',encoding='utf-8') as f:
#     for i,item in enumerate(zip(x_,y_)):
#         if i != len(x)-1:
#             f.write('{} {}\n'.format(item[0],item[1]))
#         else:
#             f.write('{} {}'.format(item[0],item[1]))       

In [39]:
# Pair token lists with label lists for both generated sets.
train_ = list(zip(x, y))
test_ = list(zip(x_, y_))

In [40]:
# Train on `train_` and evaluate on `test_`; the two were previously swapped,
# so the model was fitted on the evaluation split.
train_data = CoNLLDataset(data=train_, processing_word = process_words,
                     processing_tag = process_tags, max_iter=None)
dev_data   = CoNLLDataset(data=test_, processing_word = process_words,
                     processing_tag = process_tags, max_iter=None)

In [41]:
tf.reset_default_graph()

In [42]:
# Initiate the graph input placeholders. get_feed_dict() addresses them by
# tensor name ("<name>:0"), so the name strings below must not change.
word_ids = tf.placeholder(tf.int32, shape=[None,None], name="word_id_placeholder")
sequence_length = tf.placeholder(tf.int32, shape=[None,], name="sequence_length_placholder")
label = tf.placeholder(tf.int32,shape=[None,None], name="labels")
dropout = tf.placeholder(dtype=tf.float32, shape=[], name="dropout")
lr = tf.placeholder(dtype=tf.float32, shape=[], name="lr")

In [43]:
# Number of rows of the word embedding table.
NWORDS = len(word_2_id.keys())
print(NWORDS)

9412


In [44]:
#embedding layer

In [45]:
def word_embedding_fn(nwords,embedding_dict=None,embedding_size=100,word_ids=word_ids):
    """Create the word embedding table and look up the given word ids.

    Args:
        nwords: number of rows of the embedding table.
        embedding_dict: initial values for the table; when None a fresh
            variable is created with tf.get_variable instead.
        embedding_size: number of columns of the embedding table.
        word_ids: tensor of word ids to look up (defaults to the module-level
            placeholder captured when the function is defined).

    Returns:
        the result of tf.nn.embedding_lookup on the table.
    """
    table_shape = [nwords, embedding_size]
    with tf.variable_scope("word_embedding"):
        if embedding_dict is not None:
            table = tf.Variable(embedding_dict,
                                name="word_embeddings_",
                                dtype=tf.float32,
                                shape=table_shape,
                                trainable=True)
        else:
            table = tf.get_variable(name="word_embeddings_",
                                    dtype=tf.float32,
                                    shape=table_shape)

        return tf.nn.embedding_lookup(table, word_ids, name="word_embeddings")

In [46]:
# Embedding lookup initialised from the exported GloVe matrix.
word_embedding = word_embedding_fn(nwords=NWORDS, embedding_dict=word_embeddings,embedding_size=100)

In [47]:
def logits_op(hidden_dim,word_embeddings,sequence_length,ntags=len(vocab_tags)):
    """Build a bidirectional LSTM followed by a per-step linear projection.

    Args:
        hidden_dim: number of units of each LSTM cell.
        word_embeddings: embedded input sequences.
        sequence_length: true length of every sequence in the batch.
        ntags: size of the projection output (number of tags).

    Returns:
        tensor reshaped to [-1, nsteps, ntags] holding the tag scores.
    """
    with tf.variable_scope("bidirectional_lstm"):
        forward_cell = tf.contrib.rnn.LSTMCell(hidden_dim)
        backward_cell = tf.contrib.rnn.LSTMCell(hidden_dim)
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell, backward_cell,
                                                     word_embeddings,
                                                     sequence_length=sequence_length,
                                                     dtype=tf.float32)
        forward_out, backward_out = outputs
        # Both directions concatenated on the last axis: 2 * hidden_dim features.
        rnn_output = tf.concat([forward_out, backward_out], axis=-1)

    feature_dim = 2*hidden_dim
    with tf.variable_scope("w_b"):
        W = tf.get_variable(name="W", dtype=tf.float32,
                            shape=[feature_dim, ntags])
        b = tf.get_variable(name="b", dtype=tf.float32,
                            shape=[ntags],
                            initializer=tf.zeros_initializer())

        # Flatten the steps so a single matmul scores every position, then
        # restore the per-sequence layout.
        nsteps = tf.shape(rnn_output)[1]
        flat_output = tf.reshape(rnn_output, [-1, feature_dim])
        scores = tf.matmul(flat_output, W) + b
        return tf.reshape(scores, [-1, nsteps, ntags])

In [48]:
# Tag scores for every position of every sequence in the batch.
logits = logits_op(hidden_dim=HIDDEN_DIM,word_embeddings=word_embedding,sequence_length=sequence_length,ntags=len(vocab_tags))

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [49]:
def pred_op(logits):
    """Return the index of the highest score along the last axis, as int32."""
    best_tag = tf.argmax(logits, axis=-1)
    return tf.cast(best_tag, tf.int32)
label_pred = pred_op(logits)

In [50]:
def loss_op(logits,labels,sequence_length,lr):
    """Build the masked cross-entropy loss and its Adam training op.

    Args:
        logits: tag scores per position.
        labels: integer tag ids per position.
        sequence_length: true length of every sequence; positions beyond it
            are excluded from the loss.
        lr: learning rate handed to the Adam optimizer.

    Returns:
        tuple (training_op, loss).
    """
    token_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,labels=labels)
    valid_positions = tf.sequence_mask(sequence_length)
    loss = tf.reduce_mean(tf.boolean_mask(token_losses, valid_positions))
    training_op = tf.train.AdamOptimizer(lr).minimize(loss)
    return training_op, loss

# Pass the `lr` placeholder rather than the Python constant LR: run_epoch()
# feeds the current LR into "lr:0" on every step, and with a constant baked
# into the optimizer the decay applied by train() never took effect.
training_op, loss = loss_op(logits=logits, labels=label, sequence_length=sequence_length,lr=lr)

In [51]:
def initialize_session():
    """Create and return a new tf.Session."""
    sess = tf.Session()
    return sess

In [52]:
# Open the session, initialise all graph variables and create the Saver
# that save_session() uses.
sess = initialize_session()
init = tf.global_variables_initializer()
sess.run(init)
saver = tf.train.Saver()

In [53]:
def save_session():
    """Save the variables of the global session inside ``model_folder/``."""
    model_dir = 'model_folder'
    os.makedirs(model_dir, exist_ok=True)
    # Saver.save takes a checkpoint *prefix*, not a directory: passing the bare
    # directory name wrote "model_folder.index"/".meta"/... next to the
    # directory and left the directory itself empty.
    saver.save(sess, os.path.join(model_dir, 'model'))

def close_sess():
    """Close the global session."""
    sess.close()

In [54]:
#minibatches of data
def minibatches(data,batch_size):
    """Yield (x_batch, y_batch) lists holding at most ``batch_size`` examples.

    Args:
        data: iterable of (sentence, tags) pairs.
        batch_size: maximum number of examples per batch.

    Yields:
        tuple of two parallel lists; the last batch may be smaller.
    """
    batch_inputs, batch_targets = [], []
    for sentence, tags in data:
        if len(batch_inputs) == batch_size:
            yield batch_inputs, batch_targets
            batch_inputs, batch_targets = [], []
        # Sentences whose items are tuples are transposed into parallel sequences.
        if type(sentence[0]) == tuple:
            sentence = zip(*sentence)
        batch_inputs.append(sentence)
        batch_targets.append(tags)

    if batch_inputs:
        yield batch_inputs, batch_targets

In [55]:
def get_chunk_type(tok, idx_to_tag):
    """Split the tag name of a token id into its class and type.

    Args:
        tok: id of token, ex 4
        idx_to_tag: dictionary {4: "B-PER", ...}
    Returns:
        tuple: the parts before the first and after the last '-', ex ("B", "PER")
    """
    pieces = idx_to_tag[tok].split('-')
    return pieces[0], pieces[-1]


def get_chunks(seq, tags):
    """Group a sequence of tag ids into typed chunks with their positions.

    Args:
        seq: [4, 4, 0, 0, ...] sequence of labels
        tags: dict["O"] = 4
    Returns:
        list of (chunk_type, chunk_start, chunk_end), chunk_end exclusive
    Example:
        seq = [4, 5, 0, 3]
        tags = {"B-PER": 4, "I-PER": 5, "B-LOC": 3}
        result = [("PER", 0, 2), ("LOC", 3, 4)]
    """
    default = tags[NONE] if NONE in tags else None
    idx_to_tag = {idx: tag for tag, idx in tags.items()}

    chunks = []
    open_type, open_start = None, None
    for position, tok in enumerate(seq):
        if tok == default:
            # An "outside" token closes whatever chunk is currently open.
            if open_type is not None:
                chunks.append((open_type, open_start, position))
                open_type, open_start = None, None
            continue

        tok_class, tok_type = get_chunk_type(tok, idx_to_tag)
        if open_type is None:
            open_type, open_start = tok_type, position
        elif tok_type != open_type or tok_class == "B":
            # A different type or an explicit "B" ends the open chunk and
            # immediately starts a new one at this position.
            chunks.append((open_type, open_start, position))
            open_type, open_start = tok_type, position

    # A chunk still open at the end runs to the end of the sequence.
    if open_type is not None:
        chunks.append((open_type, open_start, len(seq)))

    return chunks

In [56]:
def predict_batch(words):
    """Run the prediction op on a batch of sentences.

    Args:
        words: list of sentences (lists of word ids).

    Returns:
        tuple (labels_pred, sequence_lengths) where sequence_lengths are the
        lengths reported by get_feed_dict.
    """
    feed, lengths = get_feed_dict(words)
    return sess.run(label_pred, feed_dict=feed), lengths

In [57]:
def _pad_sequences(sequences, pad_tok, max_length):
    """Pad or truncate every sequence to exactly ``max_length`` items.

    Args:
        sequences: an iterable of lists or tuples
        pad_tok: the value to pad with
        max_length: target length of every returned sequence
    Returns:
        (padded, lengths): the padded lists and, for each one, the number of
        original items kept (``min(len(seq), max_length)``)
    """
    sequence_padded, sequence_length = [], []

    for seq in sequences:
        seq = list(seq)
        sequence_padded.append(seq[:max_length] + [pad_tok] * max(max_length - len(seq), 0))
        sequence_length.append(min(len(seq), max_length))

    return sequence_padded, sequence_length


def pad_sequences(sequences, pad_tok, nlevels=1):
    """Pad a batch of sequences to a common length.

    Args:
        sequences: an iterable (list or generator) of lists or tuples
        pad_tok: the value to pad with
        nlevels: "depth" of padding; 1 pads sentences of ids, 2 pads
            sentences of words that are themselves lists of ids
    Returns:
        (padded, lengths): lists in which every sublist has the same length,
        and the original lengths (per sentence for nlevels=1, per word for
        nlevels=2). An empty batch gives ``([], [])``.
    Raises:
        ValueError: if nlevels is neither 1 nor 2
    """
    # Materialise once: the batch is scanned several times below, which would
    # silently exhaust a generator or zip object after the first pass.
    sequences = [list(seq) for seq in sequences]

    if nlevels == 1:
        max_length = max((len(seq) for seq in sequences), default=0)
        return _pad_sequences(sequences, pad_tok, max_length)

    if nlevels == 2:
        sequences = [[list(word) for word in seq] for seq in sequences]
        max_length_word = max((len(word) for seq in sequences for word in seq),
                              default=0)
        sequence_padded, sequence_length = [], []
        for seq in sequences:
            # all words are same length now
            sp, sl = _pad_sequences(seq, pad_tok, max_length_word)
            sequence_padded.append(sp)
            sequence_length.append(sl)

        max_length_sentence = max((len(seq) for seq in sequences), default=0)
        sequence_padded, _ = _pad_sequences(sequence_padded,
                [pad_tok]*max_length_word, max_length_sentence)
        sequence_length, _ = _pad_sequences(sequence_length, 0,
                max_length_sentence)
        return sequence_padded, sequence_length

    raise ValueError("nlevels must be 1 or 2, got {!r}".format(nlevels))

In [58]:
def get_feed_dict(words, labels=None, lr=None, dropout=None):
    """Pad a batch and build the feed dictionary for the graph.

    Placeholders are addressed by tensor name rather than by Python object.

    Args:
        words: list of sentences, each a list of word ids
        labels: (optional) list of tag-id lists, padded with 0
        lr: (optional) value for the "lr:0" placeholder
        dropout: (optional) value for the "dropout:0" placeholder
    Returns:
        tuple (feed, sequence_lengths): dict {placeholder name: value} and
        the unpadded length of every sentence
    """
    padded_words, sequence_lengths = pad_sequences(words, 0)

    feed = {
        "word_id_placeholder:0": padded_words,
        "sequence_length_placholder:0": sequence_lengths
    }

    if labels is not None:
        feed["labels:0"] = pad_sequences(labels, 0)[0]

    # Scalar inputs are only fed when the caller supplies them.
    for tensor_name, value in (("lr:0", lr), ("dropout:0", dropout)):
        if value is not None:
            feed[tensor_name] = value

    return feed, sequence_lengths

In [59]:
from tqdm import tqdm

In [60]:
def run_epoch(train, dev, epoch,batch_size=128):
    """Train for one pass over ``train`` and evaluate on ``dev``.

    Args:
        train: iterable of (words, tags) training examples.
        dev: iterable of (words, tags) evaluation examples.
        epoch: index of the current epoch (not used inside the function).
        batch_size: number of examples per training step.

    Returns:
        the "f1" entry of the metrics returned by evaluation(dev).
    """
    batches = minibatches(train, batch_size)
    for step, (words, labels) in tqdm(enumerate(batches)):
        feed, _ = get_feed_dict(words, labels, LR, DROPOUT)
        _, train_los = sess.run([training_op, loss], feed_dict=feed)
        print("training_loss::::",train_los)

    metrics = evaluation(dev)
    summary = " - ".join("{} {:04.2f}".format(name, value)
                         for name, value in metrics.items())
    print(summary)
    f1 = metrics['f1']
    print(f1)
    return f1
        

def evaluation(dev):
    """Compute token accuracy and chunk-level F1 on ``dev``.

    Args:
        dev: iterable of (words, tags) examples with integer ids.

    Returns:
        dict {"acc": ..., "f1": ...}, both scaled by 100.
    """
    token_hits = []
    n_correct, n_gold, n_pred = 0., 0., 0.
    for words, labels in minibatches(dev, batch_size=32):
        label_preds, lengths = predict_batch(words)

        for gold, pred, length in zip(labels, label_preds, lengths):
            # Drop the padded positions before comparing.
            gold, pred = gold[:length], pred[:length]
            token_hits.extend(a == b for (a, b) in zip(gold, pred))

            gold_chunks = set(get_chunks(gold, tag_2_id))
            pred_chunks = set(get_chunks(pred, tag_2_id))

            n_correct += len(gold_chunks & pred_chunks)
            n_pred += len(pred_chunks)
            n_gold += len(gold_chunks)

    if n_correct > 0:
        p = n_correct / n_pred
        r = n_correct / n_gold
        f1 = 2 * p * r / (p + r)
    else:
        p = r = f1 = 0
    acc = np.mean(token_hits)

    return {"acc": 100*acc, "f1": 100*f1}

In [61]:
def train(train, dev, epochs, epoch_no_improvement=EPOCHS_NO_IMPROVEMENT,
          lr_decay=DECAY_LR):
    """Train for up to ``epochs`` epochs with learning-rate decay and early stopping.

    After every epoch the module-level ``LR`` is multiplied by ``lr_decay``.
    Whenever the dev score is at least as good as the best seen so far, the
    session is saved via ``save_session()``. Training stops early once
    ``epoch_no_improvement`` consecutive epochs have scored below the best.

    Args:
        train: training examples, forwarded to ``run_epoch``. Inside this
            function the name refers to the data, not to this function.
        dev: held-out examples, forwarded to ``run_epoch``.
        epochs: maximum number of epochs to run.
        epoch_no_improvement: number of consecutive non-improving epochs
            after which training stops.
        lr_decay: multiplicative learning-rate decay applied after each epoch
            (defaults to the ``DECAY_LR`` config constant).

    Note:
        ``LR`` is mutated in place, so calling this function again continues
        from the already-decayed learning rate unless ``LR`` is reset first.
    """
    global LR
    best_score = 0
    # Initialise the counter up front: it used to be bound only inside the
    # improvement branch, so hitting the else-branch first (e.g. a NaN score)
    # raised UnboundLocalError.
    epo_no_improvement = 0
    for epoch in range(epochs):
        print(f"epoch {epoch} out of {epochs}")
        score = run_epoch(train, dev, epoch)
        LR *= lr_decay  # decay learning rate
        if score >= best_score:
            epo_no_improvement = 0
            save_session()
            best_score = score
            print("Best Score", best_score)
        else:
            epo_no_improvement += 1
            # ">=" so that training stops after exactly `epoch_no_improvement`
            # stale epochs; ">" waited one epoch longer than configured.
            if epo_no_improvement >= epoch_no_improvement:
                print(f"Early Stoping {epo_no_improvement} with no Improvement")
                break
                
        

In [62]:
# train_data = [(i,j) for i, j in zip(x,y)]
# dev_data = [(i,j) for i, j in zip(x_,y_)]

In [63]:

# Launch training: at most 25 epochs, with 4 as the early-stopping threshold
# passed to train().
train(
    train=train_data,
    dev=dev_data,
    epochs=25,
    epoch_no_improvement=4,
)

epoch 0 out of 25


0it [00:00, ?it/s]

training_loss:::: 0.8266405


1it [00:04,  4.79s/it]

training_loss:::: 0.3799069


2it [00:06,  3.77s/it]

training_loss:::: 0.26527265


3it [00:08,  3.47s/it]

training_loss:::: 0.28883535


4it [00:11,  3.27s/it]

training_loss:::: 0.2922727


5it [00:16,  3.79s/it]

training_loss:::: 0.2782803


6it [00:20,  3.68s/it]

training_loss:::: 0.30413368


7it [00:22,  3.24s/it]

training_loss:::: 0.25548184


8it [00:25,  3.25s/it]

training_loss:::: 0.25180426


9it [00:29,  3.38s/it]

training_loss:::: 0.24902418


10it [00:34,  3.78s/it]

training_loss:::: 0.23722611


11it [00:37,  3.63s/it]

training_loss:::: 0.24271657


12it [00:42,  4.11s/it]

training_loss:::: 0.2417763


13it [00:46,  4.17s/it]

training_loss:::: 0.24376106


14it [00:50,  3.89s/it]

training_loss:::: 0.2453289


15it [00:53,  3.69s/it]

training_loss:::: 0.24001886


16it [00:58,  4.07s/it]

training_loss:::: 0.24083959


17it [01:01,  3.77s/it]

training_loss:::: 0.22821684


18it [01:04,  3.58s/it]

training_loss:::: 0.2128182


19it [01:10,  4.34s/it]

training_loss:::: 0.2240638


20it [01:13,  3.92s/it]

training_loss:::: 0.21441503


21it [01:16,  3.77s/it]

training_loss:::: 0.2087376


22it [01:22,  4.40s/it]

training_loss:::: 0.22449188


23it [01:26,  4.28s/it]

training_loss:::: 0.2133041


24it [01:31,  4.42s/it]

training_loss:::: 0.21445984


25it [01:34,  4.08s/it]

training_loss:::: 0.21077232


26it [01:38,  3.97s/it]

training_loss:::: 0.2031924


27it [01:43,  4.28s/it]

training_loss:::: 0.20884986


28it [01:47,  4.24s/it]

training_loss:::: 0.20940213


29it [01:50,  3.85s/it]

training_loss:::: 0.1900136


30it [01:55,  4.12s/it]

training_loss:::: 0.21701925


31it [01:58,  3.80s/it]

training_loss:::: 0.21117775


32it [02:01,  3.67s/it]

training_loss:::: 0.19417085


33it [02:06,  3.97s/it]

training_loss:::: 0.20403723


34it [02:09,  3.72s/it]

training_loss:::: 0.18369284


35it [02:15,  4.30s/it]

training_loss:::: 0.20068184


36it [02:19,  4.15s/it]

training_loss:::: 0.19387619


37it [02:24,  4.53s/it]

training_loss:::: 0.1934441


38it [02:29,  4.80s/it]

training_loss:::: 0.18325096


39it [02:34,  4.76s/it]

training_loss:::: 0.18731828


40it [02:38,  4.40s/it]

training_loss:::: 0.18564698


41it [02:42,  4.24s/it]

training_loss:::: 0.17922609


42it [02:46,  4.15s/it]

training_loss:::: 0.17911375


43it [02:49,  3.96s/it]

training_loss:::: 0.17208949


44it [02:53,  3.95s/it]

training_loss:::: 0.16944377


45it [02:56,  3.72s/it]

training_loss:::: 0.16045636


46it [02:59,  3.57s/it]

training_loss:::: 0.16262324


47it [03:03,  3.60s/it]

training_loss:::: 0.16221125


48it [03:07,  3.86s/it]

training_loss:::: 0.1589578


49it [03:12,  3.95s/it]

training_loss:::: 0.15630522


50it [03:17,  4.24s/it]

training_loss:::: 0.1644649


51it [03:20,  3.94s/it]

training_loss:::: 0.15359733


52it [03:23,  3.78s/it]

training_loss:::: 0.16298941


53it [03:27,  3.67s/it]

training_loss:::: 0.15009134


54it [03:30,  3.67s/it]

training_loss:::: 0.16445456


55it [03:33,  3.51s/it]

training_loss:::: 0.13990684


56it [03:40,  4.32s/it]

training_loss:::: 0.14592616


57it [03:45,  4.58s/it]

training_loss:::: 0.14685826


58it [03:49,  4.36s/it]

training_loss:::: 0.14319351


59it [03:53,  4.24s/it]

training_loss:::: 0.15410842


60it [03:56,  4.13s/it]

training_loss:::: 0.14849481


61it [04:02,  4.42s/it]

training_loss:::: 0.15605523


62it [04:07,  4.69s/it]

training_loss:::: 0.13946047


63it [04:12,  4.84s/it]

training_loss:::: 0.13080925


64it [04:16,  4.54s/it]

training_loss:::: 0.14864035


65it [04:20,  4.27s/it]

training_loss:::: 0.14027612


66it [04:24,  4.30s/it]

training_loss:::: 0.1495158


67it [04:28,  4.14s/it]

training_loss:::: 0.12579437


68it [04:32,  4.03s/it]

training_loss:::: 0.14188404


69it [04:34,  3.72s/it]

training_loss:::: 0.14267527


70it [04:38,  3.57s/it]

training_loss:::: 0.12185495


71it [04:44,  4.35s/it]

training_loss:::: 0.14722717


72it [04:48,  4.14s/it]

training_loss:::: 0.13259785


73it [04:52,  4.09s/it]

training_loss:::: 0.1289154


74it [04:55,  4.00s/it]

training_loss:::: 0.14072192


75it [05:00,  4.20s/it]

training_loss:::: 0.12459213


76it [05:04,  4.11s/it]

training_loss:::: 0.14324653


77it [05:09,  4.44s/it]

training_loss:::: 0.13854332


78it [05:13,  4.15s/it]

training_loss:::: 0.13162513


79it [05:19,  4.87s/it]

training_loss:::: 0.1446771


80it [05:22,  4.32s/it]

training_loss:::: 0.13782007


81it [05:26,  4.20s/it]

training_loss:::: 0.13316694


82it [05:29,  3.83s/it]

training_loss:::: 0.12333598


83it [05:33,  3.96s/it]

training_loss:::: 0.1287031


84it [05:39,  4.52s/it]

training_loss:::: 0.12759595


85it [05:44,  4.59s/it]

training_loss:::: 0.13639449


86it [05:48,  4.34s/it]

training_loss:::: 0.12068241


87it [05:52,  4.22s/it]

training_loss:::: 0.121027924


88it [05:57,  4.63s/it]

training_loss:::: 0.12418418


89it [06:00,  4.20s/it]

training_loss:::: 0.11944467


90it [06:06,  4.73s/it]

training_loss:::: 0.12845181


91it [06:12,  4.93s/it]

training_loss:::: 0.13071655


92it [06:17,  5.05s/it]

training_loss:::: 0.13088118


93it [06:22,  4.93s/it]

training_loss:::: 0.11876982


94it [06:26,  4.76s/it]

training_loss:::: 0.13687566


95it [06:30,  4.60s/it]

training_loss:::: 0.117648214


96it [06:34,  4.28s/it]

training_loss:::: 0.12111801


97it [06:38,  4.29s/it]

training_loss:::: 0.13756071


98it [06:42,  4.11s/it]

training_loss:::: 0.12538694


99it [06:47,  4.55s/it]

training_loss:::: 0.10487822


100it [06:52,  4.45s/it]

training_loss:::: 0.11203896


101it [06:58,  5.01s/it]

training_loss:::: 0.13074014


102it [07:02,  4.68s/it]

training_loss:::: 0.11933294


103it [07:06,  4.55s/it]

training_loss:::: 0.12565964


104it [07:10,  4.42s/it]

training_loss:::: 0.111034214


105it [07:15,  4.62s/it]

training_loss:::: 0.110109165


106it [07:19,  4.30s/it]

training_loss:::: 0.11193261


107it [07:24,  4.47s/it]

training_loss:::: 0.124520786


108it [07:28,  4.39s/it]

training_loss:::: 0.13345617


109it [07:31,  3.95s/it]

training_loss:::: 0.12082366


110it [07:34,  3.84s/it]

training_loss:::: 0.1216182


111it [07:37,  3.47s/it]

training_loss:::: 0.10537548


112it [07:43,  4.18s/it]

training_loss:::: 0.097164124


113it [07:49,  4.84s/it]

training_loss:::: 0.112674795


114it [07:53,  4.56s/it]

training_loss:::: 0.12159979


115it [07:56,  4.06s/it]

training_loss:::: 0.09678111


116it [08:02,  4.64s/it]

training_loss:::: 0.105374075


117it [08:06,  4.47s/it]

training_loss:::: 0.10583087


118it [08:10,  4.40s/it]

training_loss:::: 0.11628434


119it [08:14,  4.26s/it]

training_loss:::: 0.09681765


120it [08:22,  5.29s/it]

training_loss:::: 0.111641236


121it [08:26,  5.03s/it]

training_loss:::: 0.11211935


122it [08:29,  4.39s/it]

training_loss:::: 0.110794015


123it [08:33,  4.23s/it]

training_loss:::: 0.11052561


124it [08:40,  4.86s/it]

training_loss:::: 0.10503438


125it [08:45,  5.12s/it]

training_loss:::: 0.10660774


126it [08:50,  4.93s/it]

training_loss:::: 0.10027997


127it [08:55,  5.04s/it]

training_loss:::: 0.10413214


128it [09:00,  5.02s/it]

training_loss:::: 0.11036432


129it [09:04,  4.81s/it]

training_loss:::: 0.10992204


130it [09:09,  4.72s/it]

training_loss:::: 0.09902782


131it [09:14,  4.71s/it]

training_loss:::: 0.11041445


132it [09:17,  4.33s/it]

training_loss:::: 0.1008405


133it [09:21,  4.26s/it]

training_loss:::: 0.089668125


134it [09:25,  4.19s/it]

training_loss:::: 0.09751425


135it [09:29,  4.07s/it]

training_loss:::: 0.11303692


136it [09:33,  4.12s/it]

training_loss:::: 0.098915316


137it [09:37,  4.01s/it]

training_loss:::: 0.096628025


138it [09:40,  3.76s/it]

training_loss:::: 0.10356743


139it [09:43,  3.64s/it]

training_loss:::: 0.10835076


140it [09:47,  3.51s/it]

training_loss:::: 0.09151139


141it [09:53,  4.44s/it]

training_loss:::: 0.10244044


142it [09:58,  4.47s/it]

training_loss:::: 0.106892675


143it [10:03,  4.58s/it]

training_loss:::: 0.10174314


144it [10:05,  4.07s/it]

training_loss:::: 0.10383126


145it [10:09,  3.79s/it]

training_loss:::: 0.095104955


146it [10:13,  4.08s/it]

training_loss:::: 0.099995196


147it [10:21,  5.18s/it]

training_loss:::: 0.095676124


148it [10:25,  4.90s/it]

training_loss:::: 0.08694908


149it [10:31,  4.98s/it]

training_loss:::: 0.10154069


150it [10:34,  4.48s/it]

training_loss:::: 0.08518928


151it [10:40,  4.84s/it]

training_loss:::: 0.10732474


152it [10:44,  4.81s/it]

training_loss:::: 0.0798525


153it [10:53,  5.98s/it]

training_loss:::: 0.09067154


154it [10:59,  5.92s/it]

training_loss:::: 0.08348956


155it [11:03,  5.49s/it]

training_loss:::: 0.08782603


156it [11:08,  5.21s/it]

training_loss:::: 0.08638301


157it [11:11,  4.74s/it]

training_loss:::: 0.08508109


158it [11:19,  5.53s/it]

training_loss:::: 0.0968971


159it [11:26,  5.98s/it]

training_loss:::: 0.08905997


160it [11:31,  5.66s/it]

training_loss:::: 0.08481736


161it [11:35,  5.37s/it]

training_loss:::: 0.09935156


162it [11:39,  4.86s/it]

training_loss:::: 0.09397199


163it [11:43,  4.59s/it]

training_loss:::: 0.08246724


164it [11:48,  4.58s/it]

training_loss:::: 0.0905513


165it [11:53,  4.88s/it]

training_loss:::: 0.08618289


166it [11:57,  4.52s/it]

training_loss:::: 0.09107454


167it [12:01,  4.40s/it]

training_loss:::: 0.08054177


168it [12:06,  4.61s/it]

training_loss:::: 0.08024736


169it [12:11,  4.73s/it]

training_loss:::: 0.09160755


170it [12:15,  4.46s/it]

training_loss:::: 0.088445


171it [12:20,  4.64s/it]

training_loss:::: 0.08062063


172it [12:28,  5.57s/it]

training_loss:::: 0.083799005


173it [12:33,  5.40s/it]

training_loss:::: 0.08361755


174it [12:36,  4.86s/it]

training_loss:::: 0.08388777


175it [12:41,  4.71s/it]

training_loss:::: 0.07723581


176it [12:45,  4.46s/it]

training_loss:::: 0.087523796


177it [12:50,  4.66s/it]

training_loss:::: 0.071278445


178it [12:56,  5.06s/it]

training_loss:::: 0.071425095


179it [13:00,  4.83s/it]

training_loss:::: 0.07857325


180it [13:06,  5.14s/it]

training_loss:::: 0.090310335


181it [13:09,  4.64s/it]

training_loss:::: 0.08038614


182it [13:13,  4.31s/it]

training_loss:::: 0.08569504


183it [13:16,  3.97s/it]

training_loss:::: 0.08213168


184it [13:20,  3.88s/it]

training_loss:::: 0.081797086


185it [13:24,  3.88s/it]

training_loss:::: 0.089257896


186it [13:28,  4.13s/it]

training_loss:::: 0.08074775


187it [13:35,  4.99s/it]

training_loss:::: 0.08446925


188it [13:40,  4.82s/it]

training_loss:::: 0.08719401


189it [13:43,  4.47s/it]

training_loss:::: 0.072139435


190it [13:49,  4.90s/it]

training_loss:::: 0.07912409


191it [13:56,  5.45s/it]

training_loss:::: 0.0833254


192it [14:01,  5.20s/it]

training_loss:::: 0.07729721


193it [14:06,  5.28s/it]

training_loss:::: 0.08291386


194it [14:11,  5.02s/it]

training_loss:::: 0.07916406


195it [14:15,  4.94s/it]

training_loss:::: 0.073395334


196it [14:21,  5.26s/it]

training_loss:::: 0.070960976


197it [14:26,  5.18s/it]

training_loss:::: 0.080218345


198it [14:30,  4.84s/it]

training_loss:::: 0.08213039


199it [14:37,  5.40s/it]

training_loss:::: 0.0706373


200it [14:42,  5.22s/it]

training_loss:::: 0.07122781


201it [14:50,  6.11s/it]

training_loss:::: 0.07158441


202it [14:55,  5.89s/it]

training_loss:::: 0.07528636


203it [15:02,  6.00s/it]

training_loss:::: 0.084014036


204it [15:07,  5.74s/it]

training_loss:::: 0.08259159


205it [15:14,  6.24s/it]

training_loss:::: 0.06924632


206it [15:18,  5.43s/it]

training_loss:::: 0.07117372


207it [15:22,  5.18s/it]

training_loss:::: 0.06859688


208it [15:29,  5.62s/it]

training_loss:::: 0.07909555


209it [15:33,  5.11s/it]

training_loss:::: 0.07553397


210it [15:38,  5.22s/it]

training_loss:::: 0.074388094


211it [15:46,  5.87s/it]

training_loss:::: 0.066017516


212it [15:50,  5.25s/it]

training_loss:::: 0.06934269


213it [15:56,  5.69s/it]

training_loss:::: 0.07463631


214it [16:01,  5.40s/it]

training_loss:::: 0.07559244


215it [16:05,  4.93s/it]

training_loss:::: 0.0632283


216it [16:10,  5.05s/it]

training_loss:::: 0.06608549


217it [16:15,  5.06s/it]

training_loss:::: 0.078086264


218it [16:20,  4.85s/it]

training_loss:::: 0.067816585


219it [16:24,  4.81s/it]

training_loss:::: 0.06795436


220it [16:32,  5.53s/it]

training_loss:::: 0.07711044


221it [16:37,  5.60s/it]

training_loss:::: 0.07027486


222it [16:44,  5.88s/it]

training_loss:::: 0.0706685


223it [16:51,  6.14s/it]

training_loss:::: 0.062166337


224it [16:57,  6.36s/it]

training_loss:::: 0.06529802


225it [17:06,  6.97s/it]

training_loss:::: 0.06513663


226it [17:11,  6.49s/it]

training_loss:::: 0.067175254


227it [17:19,  6.98s/it]

training_loss:::: 0.06919697


228it [17:23,  6.03s/it]

training_loss:::: 0.06278828


229it [17:32,  6.87s/it]

training_loss:::: 0.0736369


230it [17:39,  6.99s/it]

training_loss:::: 0.059853956


231it [17:43,  6.01s/it]

training_loss:::: 0.062982686


232it [17:48,  5.72s/it]

training_loss:::: 0.07479093


233it [17:53,  5.63s/it]

training_loss:::: 0.062289152


234it [17:59,  5.66s/it]

training_loss:::: 0.06586082


235it [18:05,  5.64s/it]

training_loss:::: 0.059676487


236it [18:10,  5.46s/it]

training_loss:::: 0.06309323


237it [18:15,  5.43s/it]

training_loss:::: 0.061859973


238it [18:21,  5.68s/it]

training_loss:::: 0.065653354


239it [18:26,  5.37s/it]

training_loss:::: 0.06820053


240it [18:30,  4.98s/it]

training_loss:::: 0.051053002


241it [18:34,  4.59s/it]

training_loss:::: 0.06623834


242it [18:39,  4.77s/it]

training_loss:::: 0.062693045


243it [18:43,  4.42s/it]

training_loss:::: 0.06199985


244it [18:46,  4.11s/it]

training_loss:::: 0.057811357


245it [18:51,  4.34s/it]

training_loss:::: 0.066460304


246it [18:55,  4.25s/it]

training_loss:::: 0.06482241


247it [19:00,  4.38s/it]

training_loss:::: 0.055680297


248it [19:04,  4.28s/it]

training_loss:::: 0.057249084


249it [19:10,  4.75s/it]

training_loss:::: 0.062462267


250it [19:15,  4.89s/it]

training_loss:::: 0.049622703


251it [19:24,  6.15s/it]

training_loss:::: 0.06994251


252it [19:28,  5.43s/it]

training_loss:::: 0.05568083


253it [19:34,  5.71s/it]

training_loss:::: 0.06893371


254it [19:38,  5.26s/it]

training_loss:::: 0.061483


255it [19:43,  5.24s/it]

training_loss:::: 0.06557935


256it [19:50,  5.57s/it]

training_loss:::: 0.062483925


257it [19:53,  4.91s/it]

training_loss:::: 0.05466199


258it [19:59,  5.37s/it]

training_loss:::: 0.06139006


259it [20:04,  5.13s/it]

training_loss:::: 0.058298375


260it [20:11,  5.58s/it]

training_loss:::: 0.06419645


261it [20:16,  5.62s/it]

training_loss:::: 0.059230007


262it [20:22,  5.73s/it]

training_loss:::: 0.05600961


263it [20:27,  5.28s/it]

training_loss:::: 0.062068105


264it [20:31,  5.15s/it]

training_loss:::: 0.054394394


265it [20:38,  5.42s/it]

training_loss:::: 0.06463378


266it [20:43,  5.45s/it]

training_loss:::: 0.06525297


267it [20:48,  5.28s/it]

training_loss:::: 0.067326516


268it [20:54,  5.55s/it]

training_loss:::: 0.0556518


269it [21:03,  6.53s/it]

training_loss:::: 0.062473476


270it [21:08,  6.04s/it]

training_loss:::: 0.067410424


271it [21:14,  6.12s/it]

training_loss:::: 0.05847353


272it [21:19,  5.69s/it]

training_loss:::: 0.05998272


273it [21:22,  5.08s/it]

training_loss:::: 0.064713255


274it [21:26,  4.63s/it]

training_loss:::: 0.06162773


275it [21:29,  4.16s/it]

training_loss:::: 0.05629829


276it [21:34,  4.48s/it]

training_loss:::: 0.06095677


277it [21:40,  4.73s/it]

training_loss:::: 0.0566367


278it [21:45,  4.93s/it]

training_loss:::: 0.05988604


279it [21:50,  4.94s/it]

training_loss:::: 0.054796517


280it [21:55,  4.85s/it]

training_loss:::: 0.044139538


281it [21:59,  4.80s/it]

training_loss:::: 0.05063233


282it [22:06,  5.40s/it]

training_loss:::: 0.063694075


283it [22:11,  5.18s/it]

training_loss:::: 0.05871913


284it [22:14,  4.49s/it]

training_loss:::: 0.058062084


285it [22:22,  5.76s/it]

training_loss:::: 0.05389985


286it [22:26,  5.04s/it]

training_loss:::: 0.06265606


287it [22:32,  5.48s/it]

training_loss:::: 0.053881694


288it [22:36,  4.96s/it]

training_loss:::: 0.05123748


289it [22:44,  5.74s/it]

training_loss:::: 0.05792405


290it [22:49,  5.71s/it]

training_loss:::: 0.04721048


291it [22:58,  6.60s/it]

training_loss:::: 0.060132727


292it [23:04,  6.58s/it]

training_loss:::: 0.046625


293it [23:10,  6.41s/it]

training_loss:::: 0.051994197


294it [23:17,  6.56s/it]

training_loss:::: 0.049524914


295it [23:22,  5.99s/it]

training_loss:::: 0.043574512


296it [23:30,  6.65s/it]

training_loss:::: 0.05660237


297it [23:35,  5.95s/it]

training_loss:::: 0.047797814


298it [23:41,  6.07s/it]

training_loss:::: 0.0466864


299it [23:45,  5.39s/it]

training_loss:::: 0.053743552


300it [23:50,  5.26s/it]

training_loss:::: 0.045413963


301it [23:53,  4.81s/it]

training_loss:::: 0.046609238


302it [23:58,  4.71s/it]

training_loss:::: 0.039668925


303it [24:04,  5.02s/it]

training_loss:::: 0.048835352


304it [24:09,  5.21s/it]

training_loss:::: 0.054291073


305it [24:14,  4.98s/it]

training_loss:::: 0.0496894


306it [24:20,  5.37s/it]

training_loss:::: 0.04374855


307it [24:26,  5.66s/it]

training_loss:::: 0.049306795


308it [24:30,  5.11s/it]

training_loss:::: 0.041419744


309it [24:42,  7.01s/it]

training_loss:::: 0.044287946


310it [24:48,  6.77s/it]

training_loss:::: 0.04497134


311it [24:57,  7.39s/it]

training_loss:::: 0.04354089


312it [25:05,  7.66s/it]

training_loss:::: 0.05115774


313it [25:10,  6.94s/it]

training_loss:::: 0.0504363


314it [25:18,  7.19s/it]

training_loss:::: 0.05665457


315it [25:22,  6.30s/it]

training_loss:::: 0.05154121


316it [25:27,  5.71s/it]

training_loss:::: 0.041002575


317it [25:34,  6.16s/it]

training_loss:::: 0.05329083


318it [25:39,  5.77s/it]

training_loss:::: 0.043661777


319it [25:44,  5.77s/it]

training_loss:::: 0.042690963


320it [25:52,  6.26s/it]

training_loss:::: 0.045753036


321it [25:57,  5.93s/it]

training_loss:::: 0.0420638


322it [26:00,  5.20s/it]

training_loss:::: 0.048656516


323it [26:06,  5.19s/it]

training_loss:::: 0.050010975


324it [26:10,  4.88s/it]

training_loss:::: 0.044021532


325it [26:14,  4.63s/it]

training_loss:::: 0.041231528


326it [26:19,  4.78s/it]

training_loss:::: 0.03903802


327it [26:26,  5.42s/it]

training_loss:::: 0.04917041


328it [26:31,  5.35s/it]

training_loss:::: 0.042973116


329it [26:38,  5.89s/it]

training_loss:::: 0.040237945


330it [26:46,  6.41s/it]

training_loss:::: 0.046260204


331it [26:54,  6.82s/it]

training_loss:::: 0.054922886


332it [27:02,  7.23s/it]

training_loss:::: 0.043476433


333it [27:10,  7.44s/it]

training_loss:::: 0.054827232


334it [27:15,  6.80s/it]

training_loss:::: 0.047519214


335it [27:20,  6.40s/it]

training_loss:::: 0.047166344


336it [27:29,  6.91s/it]

training_loss:::: 0.04613062


337it [27:37,  7.46s/it]

training_loss:::: 0.044233084


338it [27:42,  6.70s/it]

training_loss:::: 0.039761018


339it [27:47,  6.18s/it]

training_loss:::: 0.038696334


340it [27:53,  6.21s/it]

training_loss:::: 0.048258316


341it [27:59,  6.09s/it]

training_loss:::: 0.050393607


342it [28:05,  5.94s/it]

training_loss:::: 0.048051823


343it [28:11,  6.00s/it]

training_loss:::: 0.041167896


344it [28:16,  5.68s/it]

training_loss:::: 0.042281713


345it [28:21,  5.35s/it]

training_loss:::: 0.040988762


346it [28:26,  5.37s/it]

training_loss:::: 0.04919522


347it [28:30,  4.97s/it]

training_loss:::: 0.040861335


348it [28:35,  4.98s/it]

training_loss:::: 0.04462876


349it [28:40,  4.92s/it]

training_loss:::: 0.047418285


350it [28:46,  5.38s/it]

training_loss:::: 0.04238379


351it [28:52,  5.55s/it]

training_loss:::: 0.02884182


352it [28:58,  5.78s/it]

training_loss:::: 0.045741513


353it [29:03,  5.43s/it]

training_loss:::: 0.042937975


354it [29:08,  5.17s/it]

training_loss:::: 0.04359825


355it [29:12,  4.91s/it]

training_loss:::: 0.040974766


356it [29:19,  5.69s/it]

training_loss:::: 0.03936219


357it [29:25,  5.62s/it]

training_loss:::: 0.04004211


358it [29:35,  6.84s/it]

training_loss:::: 0.044729035


359it [29:38,  5.88s/it]

training_loss:::: 0.038927387


360it [29:43,  5.44s/it]

training_loss:::: 0.03696718


361it [29:47,  5.24s/it]

training_loss:::: 0.043200564


362it [29:53,  5.32s/it]

training_loss:::: 0.05015618


363it [29:57,  4.98s/it]

training_loss:::: 0.039559063


364it [30:04,  5.65s/it]

training_loss:::: 0.039951995


365it [30:10,  5.60s/it]

training_loss:::: 0.04186668


366it [30:13,  4.99s/it]

training_loss:::: 0.03963095


367it [30:19,  5.16s/it]

training_loss:::: 0.039008945


368it [30:23,  4.83s/it]

training_loss:::: 0.046879783


369it [30:32,  6.05s/it]

training_loss:::: 0.037163004


370it [30:36,  5.51s/it]

training_loss:::: 0.041428465


371it [30:42,  5.50s/it]

training_loss:::: 0.04338099


372it [30:45,  4.85s/it]

training_loss:::: 0.040862184


373it [30:51,  5.21s/it]

training_loss:::: 0.042924777


374it [30:55,  4.84s/it]

training_loss:::: 0.045805622


375it [30:59,  4.73s/it]

training_loss:::: 0.040999044


376it [31:05,  5.03s/it]

training_loss:::: 0.03964119


377it [31:10,  4.95s/it]

training_loss:::: 0.03495878


378it [31:15,  4.99s/it]

training_loss:::: 0.043801274


379it [31:20,  4.85s/it]

training_loss:::: 0.03922228


380it [31:25,  4.98s/it]

training_loss:::: 0.047769826


381it [31:34,  6.10s/it]

training_loss:::: 0.037906546


382it [31:40,  6.20s/it]

training_loss:::: 0.038304646


383it [31:43,  5.24s/it]

training_loss:::: 0.051773094


384it [31:47,  4.93s/it]

training_loss:::: 0.03846136


385it [31:51,  4.73s/it]

training_loss:::: 0.04352511


386it [31:55,  4.50s/it]

training_loss:::: 0.043597184


387it [32:00,  4.64s/it]

training_loss:::: 0.03917105


388it [32:04,  4.43s/it]

training_loss:::: 0.040433224


389it [32:08,  4.29s/it]

training_loss:::: 0.041725773


390it [32:12,  4.17s/it]

training_loss:::: 0.037180178


391it [32:18,  4.57s/it]

training_loss:::: 0.04214229


392it [32:22,  4.42s/it]

training_loss:::: 0.0319807


393it [32:25,  4.21s/it]

training_loss:::: 0.047893204


394it [32:31,  4.72s/it]

training_loss:::: 0.04344992


395it [32:35,  4.47s/it]

training_loss:::: 0.039932158


396it [32:40,  4.58s/it]

training_loss:::: 0.044406585


397it [32:46,  4.84s/it]

training_loss:::: 0.03475061


398it [32:50,  4.81s/it]

training_loss:::: 0.041479383


399it [32:55,  4.82s/it]

training_loss:::: 0.042794406


400it [32:59,  4.57s/it]

training_loss:::: 0.039604407


401it [33:03,  4.43s/it]

training_loss:::: 0.036397886


402it [33:13,  5.91s/it]

training_loss:::: 0.047388416


403it [33:17,  5.58s/it]

training_loss:::: 0.041562412


404it [33:25,  6.19s/it]

training_loss:::: 0.036445245


405it [33:29,  5.68s/it]

training_loss:::: 0.039192345


406it [33:37,  6.30s/it]

training_loss:::: 0.039388333


407it [33:49,  7.82s/it]

training_loss:::: 0.036140587


408it [33:55,  7.50s/it]

training_loss:::: 0.037515488


409it [34:01,  7.06s/it]

training_loss:::: 0.038580302


410it [34:11,  7.83s/it]

training_loss:::: 0.031277616


411it [34:16,  7.09s/it]

training_loss:::: 0.045726053


412it [34:21,  6.32s/it]

training_loss:::: 0.044153582


413it [34:27,  6.20s/it]

training_loss:::: 0.03837261


414it [34:32,  5.96s/it]

training_loss:::: 0.033730853


415it [34:37,  5.61s/it]

training_loss:::: 0.03495702


416it [34:46,  6.62s/it]

training_loss:::: 0.02701109


417it [34:52,  6.49s/it]

training_loss:::: 0.031676296


418it [35:03,  7.82s/it]

training_loss:::: 0.037081026


419it [35:09,  7.28s/it]

training_loss:::: 0.032736402


420it [35:16,  7.25s/it]

training_loss:::: 0.032462593


421it [35:22,  6.72s/it]

training_loss:::: 0.031163333


422it [35:28,  6.70s/it]

training_loss:::: 0.041524995


423it [35:36,  7.02s/it]

training_loss:::: 0.033183206


424it [35:40,  6.20s/it]

training_loss:::: 0.031227278


425it [35:45,  5.83s/it]

training_loss:::: 0.039622575


426it [35:50,  5.48s/it]

training_loss:::: 0.041547935


427it [35:58,  6.28s/it]

training_loss:::: 0.03705884


428it [36:02,  5.58s/it]

training_loss:::: 0.03593991


429it [36:06,  5.10s/it]

training_loss:::: 0.04537742


430it [36:11,  5.08s/it]

training_loss:::: 0.03565651


431it [36:18,  5.67s/it]

training_loss:::: 0.035057094


432it [36:24,  5.78s/it]

training_loss:::: 0.040020984


433it [36:29,  5.41s/it]

training_loss:::: 0.042195115


434it [36:35,  5.51s/it]

training_loss:::: 0.031109687


435it [36:41,  5.67s/it]

training_loss:::: 0.03733186


436it [36:47,  6.01s/it]

training_loss:::: 0.03951982


437it [36:53,  5.85s/it]

training_loss:::: 0.039486744


438it [36:58,  5.54s/it]

training_loss:::: 0.03058301


439it [37:06,  6.42s/it]

training_loss:::: 0.035297602


440it [37:13,  6.51s/it]

training_loss:::: 0.040760767


441it [37:22,  7.15s/it]

training_loss:::: 0.032251064


442it [37:31,  7.86s/it]

training_loss:::: 0.03623546


443it [37:35,  6.79s/it]

training_loss:::: 0.034922853


444it [37:41,  6.54s/it]

training_loss:::: 0.03676573


445it [37:46,  6.03s/it]

training_loss:::: 0.033898074


446it [37:51,  5.70s/it]

training_loss:::: 0.036516245


447it [37:56,  5.50s/it]

training_loss:::: 0.038397282


448it [38:02,  5.57s/it]

training_loss:::: 0.032395568


449it [38:08,  5.77s/it]

training_loss:::: 0.036834136


450it [38:13,  5.47s/it]

training_loss:::: 0.032628424


451it [38:17,  5.05s/it]

training_loss:::: 0.028834108


452it [38:28,  6.71s/it]

training_loss:::: 0.037556306


453it [38:33,  6.47s/it]

training_loss:::: 0.036652595


454it [38:37,  5.50s/it]

training_loss:::: 0.03239606


455it [38:45,  6.47s/it]

training_loss:::: 0.026420178


456it [38:53,  6.93s/it]

training_loss:::: 0.03489196


457it [39:00,  6.88s/it]

training_loss:::: 0.034470674


458it [39:06,  6.62s/it]

training_loss:::: 0.029681591


459it [39:13,  6.74s/it]

training_loss:::: 0.027892582


460it [39:22,  7.34s/it]

training_loss:::: 0.03139674


461it [39:30,  7.63s/it]

training_loss:::: 0.030786406


462it [39:36,  7.16s/it]

training_loss:::: 0.036010403


463it [39:46,  7.92s/it]

training_loss:::: 0.03534089


464it [39:51,  6.90s/it]

training_loss:::: 0.037559256


465it [39:55,  6.30s/it]

training_loss:::: 0.02998662


466it [40:01,  6.05s/it]

training_loss:::: 0.032299146


467it [40:05,  5.54s/it]

training_loss:::: 0.025667666


468it [40:09,  5.06s/it]

training_loss:::: 0.038364474


469it [40:16,  5.51s/it]

training_loss:::: 0.035692967


470it [40:23,  5.93s/it]

training_loss:::: 0.03230291


471it [40:29,  5.97s/it]

training_loss:::: 0.03556034


472it [40:33,  5.39s/it]

training_loss:::: 0.032230955


473it [40:37,  5.05s/it]

training_loss:::: 0.025813485


474it [40:43,  5.30s/it]

training_loss:::: 0.02879911


475it [40:54,  7.11s/it]

training_loss:::: 0.036456645


476it [41:01,  7.04s/it]

training_loss:::: 0.024653874


477it [41:06,  6.34s/it]

training_loss:::: 0.036066264


478it [41:11,  6.09s/it]

training_loss:::: 0.039421473


479it [41:16,  5.81s/it]

training_loss:::: 0.028174987


480it [41:23,  5.92s/it]

training_loss:::: 0.028267099


481it [41:30,  6.49s/it]

training_loss:::: 0.028637601


482it [41:35,  5.84s/it]

training_loss:::: 0.030068405


483it [41:39,  5.36s/it]

training_loss:::: 0.034104053


484it [41:44,  5.33s/it]

training_loss:::: 0.029267322


485it [41:52,  6.01s/it]

training_loss:::: 0.0415707


486it [42:03,  7.41s/it]

training_loss:::: 0.030451646


487it [42:08,  6.68s/it]

training_loss:::: 0.02532238


488it [42:14,  6.68s/it]

training_loss:::: 0.028846383


489it [42:20,  6.33s/it]

training_loss:::: 0.026000401


490it [42:25,  6.06s/it]

training_loss:::: 0.030247767


491it [42:32,  6.39s/it]

training_loss:::: 0.02422347


492it [42:39,  6.54s/it]

training_loss:::: 0.032116927


493it [42:48,  7.15s/it]

training_loss:::: 0.03128478


494it [42:55,  7.12s/it]

training_loss:::: 0.03053812


495it [43:02,  7.19s/it]

training_loss:::: 0.025511019


496it [43:08,  6.73s/it]

training_loss:::: 0.025181592


497it [43:15,  6.99s/it]

training_loss:::: 0.03415945


498it [43:23,  7.01s/it]

training_loss:::: 0.029798577


499it [43:32,  7.79s/it]

training_loss:::: 0.03559604


500it [43:38,  7.13s/it]

training_loss:::: 0.0312456


501it [43:48,  8.12s/it]

training_loss:::: 0.028405374


502it [43:56,  8.06s/it]

training_loss:::: 0.025934858


503it [44:01,  7.26s/it]

training_loss:::: 0.03676154


504it [44:10,  7.68s/it]

training_loss:::: 0.03270119


505it [44:16,  7.02s/it]

training_loss:::: 0.031989373


506it [44:21,  6.54s/it]

training_loss:::: 0.03312436


507it [44:25,  5.89s/it]

training_loss:::: 0.02674733


508it [44:30,  5.48s/it]

training_loss:::: 0.03433808


509it [44:35,  5.40s/it]

training_loss:::: 0.03057128


510it [44:40,  5.23s/it]

training_loss:::: 0.031940814


511it [44:44,  4.92s/it]

training_loss:::: 0.025909552


512it [44:47,  4.35s/it]

training_loss:::: 0.030166073


513it [44:52,  4.57s/it]

training_loss:::: 0.03563037


514it [44:56,  4.46s/it]

training_loss:::: 0.025049375


515it [45:05,  5.79s/it]

training_loss:::: 0.031408984


516it [45:11,  5.60s/it]

training_loss:::: 0.029266192


517it [45:20,  6.82s/it]

training_loss:::: 0.041759808


518it [45:31,  7.95s/it]

training_loss:::: 0.033786032


519it [45:35,  6.88s/it]

training_loss:::: 0.024256857


520it [45:39,  6.03s/it]

training_loss:::: 0.030399416


521it [45:44,  5.69s/it]

training_loss:::: 0.045695443


522it [45:49,  5.40s/it]

training_loss:::: 0.03034343


523it [45:54,  5.29s/it]

training_loss:::: 0.028092435


524it [46:00,  5.41s/it]

training_loss:::: 0.030992951


525it [46:02,  4.38s/it]

training_loss:::: 0.03267669


526it [46:04,  3.69s/it]

training_loss:::: 0.035823673


527it [46:06,  3.32s/it]

training_loss:::: 0.029436417


528it [46:08,  2.82s/it]

training_loss:::: 0.04081473


529it [46:10,  2.73s/it]

training_loss:::: 0.0347004


530it [46:14,  2.92s/it]

training_loss:::: 0.029880844


531it [46:16,  2.65s/it]

training_loss:::: 0.027745128


532it [46:18,  2.50s/it]

training_loss:::: 0.035677668


533it [46:20,  2.40s/it]

training_loss:::: 0.03648689


534it [46:22,  2.37s/it]

training_loss:::: 0.032574743


535it [46:25,  2.36s/it]

training_loss:::: 0.031846553


536it [46:27,  2.45s/it]

training_loss:::: 0.025253383


537it [46:29,  2.37s/it]

training_loss:::: 0.027712284


538it [46:32,  2.36s/it]

training_loss:::: 0.033509128


539it [46:34,  2.19s/it]

training_loss:::: 0.035353445


540it [46:35,  2.05s/it]

training_loss:::: 0.030048175


541it [46:37,  2.00s/it]

training_loss:::: 0.030910341


542it [46:39,  2.04s/it]

training_loss:::: 0.027555265


543it [46:42,  2.16s/it]

training_loss:::: 0.03233009


544it [46:46,  2.86s/it]

training_loss:::: 0.036172077


545it [46:49,  2.79s/it]

training_loss:::: 0.028004538


546it [46:51,  2.47s/it]

training_loss:::: 0.029148566


547it [46:53,  2.49s/it]

training_loss:::: 0.031351034


548it [46:55,  2.35s/it]

training_loss:::: 0.025589485


549it [46:57,  2.33s/it]

training_loss:::: 0.026004432


550it [46:59,  2.14s/it]

training_loss:::: 0.030245677


551it [47:01,  2.00s/it]

training_loss:::: 0.033486042


552it [47:03,  2.05s/it]

training_loss:::: 0.026762923


553it [47:06,  2.29s/it]

KeyboardInterrupt: 

In [64]:
#test

def predict(words_raw):
    """Tag a single raw sentence.

    Args:
        words_raw: one sentence as a single string (no batch). It is split on
            single spaces with ``split(" ")``, so consecutive spaces produce
            empty-string tokens.

    Returns:
        preds: list of tags, one per id in the first (and only) row returned
            by ``predict_batch``, mapped through ``idx_to_tag``.

    """
    tokens = words_raw.split(" ")
    words = list(map(process_words, tokens))
    if type(words[0]) is tuple:
        # process_words returned one tuple per token: transpose them into
        # parallel sequences before handing the sentence to predict_batch.
        words = zip(*words)

    batch_pred_ids, _ = predict_batch([words])
    return [idx_to_tag[tag_id] for tag_id in batch_pred_ids[0]]

In [65]:
# Reverse lookup built from tag_2_id: maps each value back to its key.
idx_to_tag = dict(zip(tag_2_id.values(), tag_2_id.keys()))

In [79]:
def _split_on_sentence_starts(tags, words, start_tag="B-sent"):
    """Group ``words`` into sentences, opening a new one at every ``start_tag``.

    Args:
        tags: predicted tag per word, aligned with ``words``.
        words: the tokens that were tagged.
        start_tag: tag that marks the first word of a sentence.

    Returns:
        list of sentences, each a single space-joined string. Words that
        appear before the first ``start_tag`` form a leading sentence of
        their own; any other tag continues the current sentence.
    """
    sentences, current = [], []
    for tag, word in zip(tags, words):
        # A start tag closes the sentence collected so far (if any) and the
        # tagged word itself becomes the first word of the next sentence.
        if tag == start_tag and current:
            sentences.append(" ".join(current))
            current = []
        current.append(str(word))
    if current:
        sentences.append(" ".join(current))
    return sentences


text = "set the mlsdreg now clear the field mlsdfield of mlsdreg now if mlsdreg is set to mlsdval then read the mlsdfield OF mlsdreg and assert if mlsdreg is set so clear the mlsdreg"
preds = predict(words_raw=text)
# The previous loop reset its buffer on every iteration and dropped the word
# carrying the boundary tag, so `sente` never held whole sentences.
sente = _split_on_sentence_starts(preds, text.split(" "))

In [80]:
# Show each predicted tag next to the word it belongs to.
for p, w in zip(preds, text.split(" ")):
    print(" ".join((str(p), "____", str(w))))

B-sent ____ set
O ____ the
O ____ mlsdreg
O ____ now
O ____ clear
O ____ the
O ____ field
O ____ mlsdfield
O ____ of
O ____ mlsdreg
O ____ now
O ____ if
O ____ mlsdreg
O ____ is
O ____ set
O ____ to
O ____ mlsdval
O ____ then
O ____ read
O ____ the
O ____ mlsdfield
O ____ OF
O ____ mlsdreg
O ____ and
B-sent ____ assert
O ____ if
O ____ mlsdreg
O ____ is
O ____ set
B-sent ____ so
O ____ clear
O ____ the
O ____ mlsdreg
