In [44]:
import os 
import re
import sys
import numpy as np
import pandas as pd
# import nltk
import tensorflow as tf
from tqdm import tqdm
import random
import string

In [2]:
new_filename = "data_folder/10_02_2020_OnlySyntax.txt"
#read eng sentences    

In [3]:
def prepare_tag_data(filename):
    eng_lines = []
    with open(filename,'r',encoding='utf-8') as f:
        for line in f:
            line = line.strip().replace('<s>','').replace('</t>','').split('</s> <t>')
            line = line[0].strip()
            line = re.sub('\s+',' ',line)
            eng_lines.append(line)
    return eng_lines

lns = prepare_tag_data(filename=new_filename)

In [4]:
len(lns),lns[:2]

(43805,
 ['poll MLSDFIELD bits of register MLSDREG until cleared',
  'poll MLSDFIELD bits of register MLSDREG until cleared'])

In [5]:
#build params (Hyperparameters)

In [8]:
config = {}
EPOCHS = 15
BATCH_SIZE = 32
LR = 0.001
DECAY_LR = 0.9
OPTM = "Adam"
DROPOUT = 0.3
EPOCHS_NO_IMPROVEMENT = 4
HIDDEN_DIM = 256
CHAR_HIDDEN_DIM = 50
DATA_FILEPATH = "data_folder/data.txt"
GLOVE_FILEPATH = "glove/glove.6B.100d.txt"
VOCAB_FILENAME = "data_folder/vocabs.txt"
TAG_FILENAME = "data_folder/tags.txt"
EMBEDDING_VOCAB = "data_folder/vocab_embeddings.npz"
EMBEDDING_TAG = "data_folder/tag_embeddings.npz"
CHAR_FILENAME = "data_folder.char.txt"

UNK = "$UNK$"
NUM = "$NUM$"
NONE = "O"

In [9]:
#build files from data

In [10]:
def data_batches(data,batch_size=32):
    """Args >> data=tuple(sentence,tag)
    returns data in batches"""
    steps = len(data)//batch_size
    x_batch, y_batch = [], []
    for (x,y) in data:
        if len(x_batch) == batch_size:
            yield x_batch, y_batch
            x_batch, y_batch = [], []
        if type(x[0])==tuple:
            x = zip(*x)
            x_batch += [x]
            y_batch += [y]
    if len(x_batch) != 0:
        yield x_batch, y_batch

In [11]:
def get_vocabs(datasets):
    """Building vocabs present in datasets"""
    print("Building Vocab ....")
    vocab_words = set()
    vocab_tags = set()
    for dataset in datasets:
        for words, tags in dataset:
            vocab_words.update(words)
            vocab_tags.update(tags)
    print("-done vocab words {s}--- vocab_tags {d}".format(s=len(vocab_words),d=len(vocab_tags)))
    return vocab_words, vocab_tags

In [12]:
def get_char_vocab(dataset):
    vocab_char = set()
    for words, _ in dataset:
        for word in words:
            vocab_char.update(word)
    return vocab_char

In [13]:
def get_glove_vocab(path):
    """Building vocabs present in glove"""
    print("Building Vocab..glove...")
    vocab = set()
    with open(path,'r',encoding='utf-8') as f:
        for line in f:
            word = line.strip().split(' ')[0]
            vocab.add(word)
    print("done. {} tokens in glove".format(len(vocab)))
    return vocab

In [14]:
def final_vocab(vocab_words,vocab_glove,*args):
    """Combine word vocabs >> glove_vocabs+dataset_vocabs"""
    vocab = vocab_words & vocab_glove
    for word in args:
        vocab.add(word)
    return vocab
    

In [15]:
def build_vocab(vocab,filename):
    "Write all vocabs line by line in a file"
    with open(filename,'w',encoding='utf-8') as f:
        for i, word in enumerate(vocab):
            if i != len(vocab)-1:
                f.write("{}\n".format(word))
            else:
                f.write(word)
    print("written {s} tokens in {d}".format(s=len(vocab), d=filename))


In [16]:
def load_vocab(filename):
    """Assign id to each word in vocab
        returns dictionary"""
    d = {}
    with open(filename,encoding='utf-8') as f:
        for idx, word in enumerate(f):
            word = word.strip()
            d[word] = idx
    return d

In [17]:
def export_glove_vectors(vocab, glove_filename, filename, dim):
    """Bulding compressed file of vectors of words
    that are present in dataset"""
    embeddings = np.zeros([len(vocab),dim],dtype='float32')
    with open(glove_filename,'r',encoding='utf-8') as f:
        for line in f:
            line = line.strip().split(' ')
            word = line[0]
            if word in vocab:
                embedding = [float(x) for x in line[1:]]
                word_idx = vocab[word]
                embeddings[word_idx] = np.asarray(embedding)
    np.savez_compressed(filename, embeddings=embeddings)

def get_glove_vectors(filename):
    """Loads the saved numpy file (Embeddings)"""
    with np.load(filename) as data:
        return data['embeddings']

In [18]:
# #read eng sentences
# with open('english_sentences_tatoeba.txt','r',encoding='utf-8') as f:
#     lns = f.readlines()
# print(len(lns),lns[:6])

In [19]:
def generate_data(lines, max_sents_per_example=6, n_examples=1000):
    """
        Generates training data for deepsegment from list of sentences.
        Parameters:
        lines (list): Base sentences for data generation.
        max_sents_per_example (int): Maximum number of sentences to be combined to form a single paragraph.
        
        n_examples (int): Number of training examples to be generated.
        
        Returns:
        list, list: Training data and corresponding labels in BIOU format.
    """
    x, y = [], []
    
    for current_i in tqdm(range(n_examples)):
        x.append([])
        y.append([])

        chosen_lines = []
        for _ in range(random.randint(1, max_sents_per_example)):
            chosen_lines.append(random.choice(lines))
        
        chosen_lines = [bad_sentence_generator(line, remove_punctuation=random.randint(0, 3)) for line in chosen_lines]
        
        for line in chosen_lines:
            words = line.strip().split()
            for word_i, word in enumerate(words):
                x[-1].append(word)
                label = 'O'
                if word_i == 0:
                    label = 'B-sent'
                y[-1].append(label)
    
    return x, y


def bad_sentence_generator(sent, remove_punctuation = None):
    """
        Returns sentence with completely/ partially removed punctuation.
        Parameters:
        sent (str): Sentence on which the punctuation removal operation is performed.
        
        remove_punctuation (int): removing punctuation completely if remove_punctuation ==0 or ==1, removing punctuation till a randomly selected point if remove_punctuation ==2
        Returns:
        str: Sentence with modified punctuation
    """

    if not remove_punctuation:
        remove_punctuation = random.randint(0, 3)

    break_point = random.randint(1, len(sent)-2)
    lower_case = random.randint(0, 2)

    if remove_punctuation <= 1:
        # removing punctuation completely if remove_punctuation ==0 or ==1
        sent = re.sub('['+string.punctuation+']', '', sent)
    
    elif remove_punctuation == 2:
        # removing punctuation till a randomly selected point if remove_punctuation ==2
        if random.randint(0,1) == 0:
            sent = re.sub('['+string.punctuation+']', '', sent[:break_point]) + sent[break_point:]
        # removing punctuation after a randomly selected point if remove_punctuation ==2        
        else:
            sent = sent[:break_point] + re.sub('['+string.punctuation+']', '', sent[break_point:])    
    
    if lower_case <= 1:
        # lower casing sentence 
        sent = sent.lower()
    
    return sent


In [20]:
def generate_data(lines, max_sents_per_example=6, n_examples=1000,punct=None):
    """
        Generates training data for deepsegment from list of sentences.
        Parameters:
        lines (list): Base sentences for data generation.
        max_sents_per_example (int): Maximum number of sentences to be combined to form a single paragraph.
        
        n_examples (int): Number of training examples to be generated.
        
        Returns:
        list, list: Training data and corresponding labels in BIOU format.
    """
    x, y = [], []
    
    for current_i in tqdm(range(n_examples)):
        x.append([])
        y.append([])

        chosen_lines = []
        for _ in range(random.randint(1, max_sents_per_example)):
            chosen_lines.append(random.choice(lines))
        
        chosen_lines = [bad_sentence_generator(line, remove_punctuation=random.randint(0, 3),use_punct=punct) for line in chosen_lines]
        
        for line in chosen_lines:
            words = line.strip().split()
            for word_i, word in enumerate(words):
                x[-1].append(word)
                label = 'O'
                if word_i == 0:
                    label = 'B-sent'
                y[-1].append(label)
    
    return x, y

punct = ['And','Or','So','After','Once','Since','So','After that','Though',',',':',';','now','now then',""]

def bad_sentence_generator(sent, remove_punctuation = None,use_punct=None):
    """
        Returns sentence with completely/ partially removed punctuation.
        Parameters:
        sent (str): Sentence on which the punctuation removal operation is performed.
        
        remove_punctuation (int): removing punctuation completely if remove_punctuation ==0 or ==1, removing punctuation till a randomly selected point if remove_punctuation ==2
        Returns:
        str: Sentence with modified punctuation
    """

    if not remove_punctuation:
        remove_punctuation = random.randint(0, 3)

    break_point = random.randint(1, len(sent)-2)
    lower_case = random.randint(0, 2)

    if remove_punctuation <= 1:
        # removing punctuation completely if remove_punctuation ==0 or ==1
        sent = re.sub('['+string.punctuation+']', '', sent)
        if punct and remove_punctuation == 0:
            sent = f"{random.choice(punct)}"+" "+sent
    
    elif remove_punctuation == 2:
        # removing punctuation till a randomly selected point if remove_punctuation ==2
        if random.randint(0,1) == 0:
            sent = re.sub('['+string.punctuation+']', '', sent[:break_point]) + sent[break_point:]
        # removing punctuation after a randomly selected point if remove_punctuation ==2        
        else:
            sent = sent[:break_point] + re.sub('['+string.punctuation+']', '', sent[break_point:])    
    
    if lower_case <= 1:
        # lower casing sentence 
        sent = sent.lower()
    
    return sent


In [21]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [22]:
x, y = generate_data(lines=lns[8000:], max_sents_per_example=6, n_examples=34000,punct=punct)
x_, y_ = generate_data(lines=lns[:8000], max_sents_per_example=6, n_examples=7000,punct=punct)

100%|█████████████████████████████████████████████████████████████████████████| 34000/34000 [00:03<00:00, 10563.36it/s]
100%|███████████████████████████████████████████████████████████████████████████| 7000/7000 [00:00<00:00, 12043.92it/s]


In [23]:
os.listdir('glove')[0]

'glove.6B.100d.txt'

In [24]:
dataset = [(i,j) for i, j in zip(x,y)]

In [25]:
#building vocab from glove and dataset 
#combine it 
#write it
vocab_words, vocab_tags = get_vocabs([dataset])
vocab_chars = get_char_vocab(dataset)
build_vocab(vocab_tags, TAG_FILENAME)
build_vocab(vocab_chars, CHAR_FILENAME)

Building Vocab ....
-done vocab words 153--- vocab_tags 2
written 2 tokens in data_folder/tags.txt
written 50 tokens in data_folder.char.txt


In [26]:
vocab_glove = get_glove_vocab(GLOVE_FILEPATH)

Building Vocab..glove...
done. 400000 tokens in glove


In [27]:
vocab = final_vocab(vocab_words, vocab_glove, NUM,UNK)

In [28]:
build_vocab(vocab, VOCAB_FILENAME)

written 96 tokens in data_folder/vocabs.txt


In [29]:
#giving ids to all vocab
word_2_id = load_vocab(VOCAB_FILENAME)
tag_2_id  = load_vocab(TAG_FILENAME)
char_2_id = load_vocab(CHAR_FILENAME)

In [30]:
NWORDS     = len(vocab_words)
NCHARS     = len(vocab_chars)
NTAGS      = len(vocab_tags)

In [31]:
#saving npz with embedding of vocab
export_glove_vectors(vocab=word_2_id, glove_filename=GLOVE_FILEPATH, filename=EMBEDDING_VOCAB, dim=100)
export_glove_vectors(vocab=tag_2_id, glove_filename=GLOVE_FILEPATH, filename=EMBEDDING_TAG, dim=100)

In [32]:
#loading the glove vectors from saved npz
word_embeddings = get_glove_vectors(filename=EMBEDDING_VOCAB)
tag_embeddings = get_glove_vectors(filename=EMBEDDING_TAG)

In [33]:
#converting them into list of ids

In [34]:
def get_processing_word(vocab_words=None, vocab_chars=None,
                    lowercase=False, chars=False, allow_unk=True):
    """Return lambda function that transform a word (string) into list,
    or tuple of (list, id) of int corresponding to the ids of the word and
    its corresponding characters.

    Args:
        vocab: dict[word] = idx

    Returns:
        f("cat") = ([12, 4, 32], 12345)
                 = (list of char ids, word id)

    """
    def f(word):
        # 0. get chars of words
        UNK = "$UNK$"
        if vocab_chars is not None and chars == True:
            char_ids = []
            for char in word:
                # ignore chars out of vocabulary
                if char in vocab_chars:
                    char_ids += [vocab_chars[char]]

        # 1. preprocess word
        if lowercase:
            word = word.lower()
        if word.isdigit():
            word = NUM

        # 2. get id of word
        if vocab_words is not None:
            if word in vocab_words:
                word = vocab_words[word]
            else:
                if allow_unk:
                    word = vocab_words[UNK]
                else:
                    raise Exception("Unknow key is not allowed. Check that "\
                                    "your vocab (tags?) is correct")

        # 3. return tuple char ids, word id
        if vocab_chars is not None and chars == True:
            return char_ids, word
        else:
            return word

    return f


In [35]:
# process_words("gvjvgvj")

In [36]:
process_words = get_processing_word(vocab_words=word_2_id,vocab_chars=char_2_id,chars=True)
process_tags = get_processing_word(vocab_words=tag_2_id,vocab_chars=char_2_id,chars=False)

In [37]:
class CoNLLDataset(object):
    """Class that iterates over CoNLL Dataset

    __iter__ method yields a tuple (words, tags)
        words: list of raw words
        tags: list of raw tags

    If processing_word and processing_tag are not None,
    optional preprocessing is appplied

    Example:
        ```python
        data = CoNLLDataset(filename)
        for sentence, tags in data:
            pass
        ```

    """
    def __init__(self, data, processing_word=None, processing_tag=None,
                 max_iter=32):
        """
        Args:
            filename: path to the file
            processing_words: (optional) function that takes a word as input
            processing_tags: (optional) function that takes a tag as input
            max_iter: (optional) max number of sentences to yield

        """
        self.data = data
        self.processing_word = processing_word
        self.processing_tag = processing_tag
        self.max_iter = 32
        self.length = None


    def __iter__(self):
        niter = 0
#         words, tags = [], []
        for d in self.data:
            words, tags = [], []
            
#             if self.max_iter == niter:
#                 yield words, tags
#                 words, tags = [], []
#             else:
#                 niter+=1
            word, tag = d[0],d[-1]
#             print(word),print(tag)
            for word_, tag_ in zip(word,tag):
                if self.processing_word is not None:
                    word_ = self.processing_word(word_)
                if self.processing_tag is not None:
                    tag_ = self.processing_tag(tag_)
                words += [word_]
                tags += [tag_]
            yield words, tags
                


    def __len__(self):
        """Iterates once over the corpus to set and store length"""
        if self.length is None:
            self.length = 0
            for _ in self:
                self.length += 1

        return self.length

In [38]:
# with open('train.txt','w',encoding='utf-8') as f:
#     for i,item in enumerate(zip(x,y)):
#         if i != len(x)-1:
#             f.write('{} {}\n'.format(item[0],item[1]))
#         else:
#             f.write('{} {}'.format(item[0],item[1]))
# with open('dev.txt','w',encoding='utf-8') as f:
#     for i,item in enumerate(zip(x_,y_)):
#         if i != len(x)-1:
#             f.write('{} {}\n'.format(item[0],item[1]))
#         else:
#             f.write('{} {}'.format(item[0],item[1]))       

In [39]:
train_ = [(i,j) for i, j in zip(x,y)]
test_ = [(i,j) for i, j in zip(x_,y_)]

In [40]:
dev_data   = CoNLLDataset(data=train_, processing_word = process_words,
                     processing_tag = process_tags, max_iter=None)
train_data = CoNLLDataset(data=test_, processing_word = process_words,
                     processing_tag = process_tags, max_iter=None)

In [41]:
!pip show tensorflow

Name: tensorflow
Version: 1.15.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: c:\users\abhis\anaconda3\envs\tensor2_0\lib\site-packages
Requires: tensorflow-estimator, gast, wrapt, grpcio, numpy, wheel, protobuf, keras-preprocessing, opt-einsum, absl-py, keras-applications, termcolor, google-pasta, astor, tensorboard, six
Required-by: 


In [45]:
tf.reset_default_graph()

AttributeError: module 'tensorflow' has no attribute 'reset_default_graph'

In [43]:
#intitate placeholders
word_ids = tf.placeholder(tf.int32, shape=[None,None], name="word_id_placeholder")
sequence_length = tf.placeholder(tf.int32, shape=[None,], name="sequence_length_placholder")
label = tf.placeholder(tf.int32,shape=[None,None], name="labels")
dropout = tf.placeholder(dtype=tf.float32, shape=[], name="dropout")
lr = tf.placeholder(dtype=tf.float32, shape=[], name="lr")
char_id = tf.placeholder(dtype=tf.int32, shape=[None,None,None], name="char_id")
word_lengths = tf.placeholder(dtype=tf.int32, shape=[None,None], name="word_length_placeholder")

AttributeError: module 'tensorflow' has no attribute 'placeholder'

In [70]:
NWORDS = len(word_2_id.keys())
print(NWORDS)

96


In [71]:
CHAR_HIDDEN_DIM

256

In [72]:
#embedding layer

In [73]:
def word_embedding_fn(nwords,nchars=None,char_hidden_size=None,char_id=None,word_lengths=None,dropout=False,embedding_dict=None,embedding_size=100,word_ids=word_ids,use_char_embed=False,):
    with tf.variable_scope("word_embedding"):
        if embedding_dict is None:
            word_embeddings_ = tf.get_variable(name="word_embeddings_",
                                              dtype=tf.float32,
                                              shape=[nwords,embedding_size])
        else:
            word_embeddings_ = tf.Variable(embedding_dict,
                                           name="word_embeddings_",
                                           dtype=tf.float32,
                                           shape=[nwords,embedding_size],
                                           trainable=True
                                           )
            
        word_embeddings = tf.nn.embedding_lookup(word_embeddings_,word_ids,name="word_embeddings")
    
    with tf.variable_scope("char"):
        print('using char embedding')
        if use_char_embed:
            char_embeddings_ = tf.Variable(tf.random_uniform([nchars,50],-1.0,1.0),
                        name="_char_embeddings",
                        dtype=tf.float32)
                        #shape=[nchars, char_hidden_size])
            char_embeddings = tf.nn.embedding_lookup(char_embeddings_,
                                                    char_id,
                                                     name="char_embeddings")
            
            #including time dimesion
            s = tf.shape(char_embeddings)
            char_embeddings = tf.reshape(char_embeddings,
                                        shape=[s[0]*s[1],s[-2],50])
            word_length = tf.reshape(word_lengths,shape=[s[0]*s[1]])
            
            #lstm bidir over chars
            fw_cell = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True)
            bw_cell = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True)
            
            output_ = tf.nn.bidirectional_dynamic_rnn(fw_cell,bw_cell, char_embeddings,
                                                     sequence_length = word_length, dtype=tf.float32)
            
            #read tthe output_
            _, ((_,fw_output),(_,bw_output)) = output_
            output = tf.concat([fw_output,bw_output],axis=-1)
            #shape [bs, max_sequnce_length, char_hidden_size]
            
            output = tf.reshape(output,shape=[s[0],s[1],2*char_hidden_size])
            word_embeddings = tf.concat([word_embeddings, output],axis=-1)
    if dropout:
        word_embeddings = tf.nn.dropout(word_embeddings, dropout)

    return word_embeddings

In [74]:
# word_embedding = word_embedding_fn(nwords=NWORDS,nchars=NCHARS, embedding_dict=word_embeddings,embedding_size=100)
word_embedding = word_embedding_fn(nwords=NWORDS,nchars=NCHARS,char_hidden_size=CHAR_HIDDEN_DIM,char_id=char_id,word_lengths=word_lengths, embedding_dict=word_embeddings,embedding_size=100,use_char_embed=True)

using char embedding


In [75]:
def logits_op(hidden_dim,word_embeddings,sequence_length,ntags=len(vocab_tags)):
    with tf.variable_scope("bidirectional_lstm"):
        for_cell = tf.contrib.rnn.LSTMCell(hidden_dim)
        bac_cell = tf.contrib.rnn.LSTMCell(hidden_dim)
        (for_out, bac_out), _ = tf.nn.bidirectional_dynamic_rnn(for_cell, bac_cell,
                                                                word_embeddings,
                                                                sequence_length=sequence_length,
                                                                dtype=tf.float32)
        #shape >> [bs, sequence_length, 2EMD]
        output = tf.concat([for_out, bac_out], axis=-1)

    #         output = tf.nn.dropout(output,)

    with tf.variable_scope("w_b"):
        W = tf.get_variable(name="W",dtype=tf.float32,
                            shape = [2*hidden_dim,ntags])
        b = tf.get_variable(name="b", dtype=tf.float32,
                            shape = [ntags],
                            initializer = tf.zeros_initializer())


        nsteps = tf.shape(output)[1]
        output = tf.reshape(output, [-1,2*hidden_dim])

        pred = tf.matmul(output,W) + b
        logits = tf.reshape(pred,[-1,nsteps,ntags])
    return logits

In [76]:
logits = logits_op(hidden_dim=HIDDEN_DIM,word_embeddings=word_embedding,sequence_length=sequence_length,ntags=len(vocab_tags))

In [77]:
def pred_op(logits):
    label_pred = tf.cast(tf.argmax(logits, axis=-1),tf.int32)
    return label_pred
label_pred = pred_op(logits)

In [78]:
def loss_op(logits,labels,sequence_length,lr):
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,labels=labels)
    mask = tf.sequence_mask(sequence_length)
    losses = tf.boolean_mask(losses, mask)
    loss = tf.reduce_mean(losses)
    optimizer = tf.train.AdamOptimizer(lr)
    training_op = optimizer.minimize(loss)
    return training_op, loss

training_op, loss = loss_op(logits=logits, labels=label, sequence_length=sequence_length,lr=LR)

In [79]:
def initialize_session():
    sess = tf.Session()
    return sess

In [80]:
sess = initialize_session()
init = tf.global_variables_initializer()
sess.run(init)
saver = tf.train.Saver()

In [81]:
def save_session():
    if not os.path.exists('model_folder'):
        os.makedirs('model_folder')
    saver.save(sess, 'model_folder')

def close_sess():
    sess.close()

In [82]:
#minibatches of data
def minibatches(data,batch_size):
    x_batch, y_batch = [], []
    for (x,y) in data:
        if len(x_batch) == batch_size:
            yield x_batch, y_batch
            x_batch, y_batch = [], []
        if type(x[0]) == tuple:
            x = zip(*x)
        x_batch += [x]
        y_batch += [y]
        
    if len(x_batch) != 0:
        yield x_batch, y_batch

In [83]:
def get_chunk_type(tok, idx_to_tag):
    """
    Args:
        tok: id of token, ex 4
        idx_to_tag: dictionary {4: "B-PER", ...}
    Returns:
        tuple: "B", "PER"
    """
    tag_name = idx_to_tag[tok]
    tag_class = tag_name.split('-')[0]
    tag_type = tag_name.split('-')[-1]
    return tag_class, tag_type


def get_chunks(seq, tags):
    """Given a sequence of tags, group entities and their position
    Args:
        seq: [4, 4, 0, 0, ...] sequence of labels
        tags: dict["O"] = 4
    Returns:
        list of (chunk_type, chunk_start, chunk_end)
    Example:
        seq = [4, 5, 0, 3]
        tags = {"B-PER": 4, "I-PER": 5, "B-LOC": 3}
        result = [("PER", 0, 2), ("LOC", 3, 4)]
    """
    if NONE in tags:
        default = tags[NONE]
    else:
        default = None
        
    idx_to_tag = {idx: tag for tag, idx in tags.items()}
    chunks = []
    chunk_type, chunk_start = None, None
    for i, tok in enumerate(seq):
        # End of a chunk 1
#         print(tok)
        if tok == default and chunk_type is not None:
            # Add a chunk.
            chunk = (chunk_type, chunk_start, i)
            chunks.append(chunk)
            chunk_type, chunk_start = None, None

        # End of a chunk + start of a chunk!
        elif tok != default:
            tok_chunk_class, tok_chunk_type = get_chunk_type(tok, idx_to_tag)
            if chunk_type is None:
                chunk_type, chunk_start = tok_chunk_type, i
            elif tok_chunk_type != chunk_type or tok_chunk_class == "B":
                chunk = (chunk_type, chunk_start, i)
                chunks.append(chunk)
                chunk_type, chunk_start = tok_chunk_type, i
        else:
            pass

    # end condition
    if chunk_type is not None:
        chunk = (chunk_type, chunk_start, len(seq))
        chunks.append(chunk)

    return chunks

In [84]:
def predict_batch(words):
    fd, sequence_length = get_feed_dict(words,use_chars=True) #dropout=1.0)
    labels_pred = sess.run(label_pred, feed_dict=fd)
    return labels_pred, sequence_length

In [85]:
def _pad_sequences(sequences, pad_tok, max_length):
    """
    Args:
        sequences: a generator of list or tuple
        pad_tok: the char to pad with
    Returns:
        a list of list where each sublist has same length
    """
    sequence_padded, sequence_length = [], []

    for seq in sequences:
        seq = list(seq)
        seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0)
        sequence_padded +=  [seq_]
        sequence_length += [min(len(seq), max_length)]

    return sequence_padded, sequence_length


def pad_sequences(sequences, pad_tok, nlevels=1):
    """
    Args:
        sequences: a generator of list or tuple
        pad_tok: the char to pad with
        nlevels: "depth" of padding, for the case where we have characters ids
    Returns:
        a list of list where each sublist has same length
    """
    if nlevels == 1:
        max_length = max(map(lambda x : len(x), sequences))
        sequence_padded, sequence_length = _pad_sequences(sequences,
                                            pad_tok, max_length)

    elif nlevels == 2:
        max_length_word = max([max(map(lambda x: len(x), seq))
                               for seq in sequences])
        sequence_padded, sequence_length = [], []
        for seq in sequences:
            # all words are same length now
            sp, sl = _pad_sequences(seq, pad_tok, max_length_word)
            sequence_padded += [sp]
            sequence_length += [sl]

        max_length_sentence = max(map(lambda x : len(x), sequences))
        sequence_padded, _ = _pad_sequences(sequence_padded,
                [pad_tok]*max_length_word, max_length_sentence)
        sequence_length, _ = _pad_sequences(sequence_length, 0,
                max_length_sentence)

    return sequence_padded, sequence_length

In [86]:
def get_feed_dict(words, labels=None, lr=None, dropout=None,use_chars=False):
        """Given some data, pad it and build a feed dictionary
        Args:
            words: list of sentences. A sentence is a list of ids of a list of
                words. A word is a list of ids
            labels: list of ids
            lr: (float) learning rate
            dropout: (float) keep prob
        Returns:
            dict {placeholder: value}
        """
        #perform padding of the given data
        if use_chars:
            char_ids, word_ids = zip(*words)
            word_ids, sequence_lengths = pad_sequences(word_ids, 0)
            char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0,
                nlevels=2)
        else:
            word_ids, sequence_lengths = pad_sequences(words, 0)

        # build feed dictionary
        feed = {
            "word_id_placeholder:0": word_ids,
            "sequence_length_placholder:0": sequence_lengths
        }

        if use_chars:
            feed["char_id:0"] = char_ids
            feed["word_length_placeholder:0"] = word_lengths
        if labels is not None:
            labels, _ = pad_sequences(labels, 0)
            feed["labels:0"] = labels

        if lr is not None:
            feed["lr:0"] = lr

        if dropout is not None:
            feed["dropout:0"] = dropout

        return feed, sequence_lengths

In [87]:
from tqdm import tqdm

In [88]:
def run_epoch(train, dev, epoch,batch_size=128):
    for i ,(words, labels) in tqdm(enumerate(minibatches(train,batch_size))):
        fd, _= get_feed_dict(words, labels, LR, DROPOUT,use_chars=True)
#         print("......................................")
        
        _, train_los = sess.run([training_op, loss], feed_dict = fd)
        print("training_loss::::",train_los)
        
    metrics = evaluation(dev)
    msg = " - ".join(["{} {:04.2f}".format(k, v)
            for k, v in metrics.items()])
    print(msg)
    print(metrics['f1'])
    return metrics['f1']
        

def evaluation(dev):
    accs = []
    correct_preds, total_correct, total_preds = 0., 0., 0.
    for words, labels in minibatches(dev, batch_size=32):
        label_preds, sequence_length = predict_batch(words)
        
        for lab, lab_pred, length in zip(labels, label_preds, sequence_length):
            lab = lab[:length]
            lab_pred = lab_pred[:length]
            
            accs += [a==b for (a,b) in zip(lab,lab_pred)]
#             print("lab::",lab,"fininsh")
            lab_chunks = set(get_chunks(lab, tag_2_id))
#             print("lab_pred",lab_pred)
            lab_pred_chunks = set(get_chunks(lab_pred, tag_2_id))
            
            correct_preds += len(lab_chunks & lab_pred_chunks)
            total_preds   += len(lab_pred_chunks)
            total_correct += len(lab_chunks)
            
    p   = correct_preds / total_preds if correct_preds > 0 else 0
    r   = correct_preds / total_correct if correct_preds > 0 else 0
    f1  = 2 * p * r / (p + r) if correct_preds > 0 else 0
    acc = np.mean(accs)

    return {"acc": 100*acc, "f1": 100*f1}

In [89]:
def train(train, dev, epochs, epoch_no_improvement=EPOCHS_NO_IMPROVEMENT):
    global LR
    best_score = 0
    for epoch in range(epochs):
        print(f"epoch {epoch} out of {epochs}")
        score = run_epoch(train, dev, epoch)
        LR *= 0.9#lr_decay # decay learning rate
        if score >= best_score:
            epo_no_improvement = 0
            save_session()
            best_score = score
            print("Best Score", best_score)
        else:
            epo_no_improvement += 1
            if epo_no_improvement > epoch_no_improvement:
                print(f"Early Stoping {epo_no_improvement} with no Improvement")
                break
                
        

In [90]:
# train_data = [(i,j) for i, j in zip(x,y)]
# dev_data = [(i,j) for i, j in zip(x_,y_)]

In [91]:

train(train=train_data, dev=dev_data, epochs = 25, epoch_no_improvement=4)

epoch 0 out of 25


0it [00:00, ?it/s]

training_loss:::: 0.7699002


1it [00:08,  8.68s/it]

training_loss:::: 0.394743


2it [00:16,  8.38s/it]

training_loss:::: 0.42662278


3it [00:25,  8.52s/it]

training_loss:::: 0.42495412


4it [00:35,  8.93s/it]

training_loss:::: 0.3922895


5it [00:42,  8.62s/it]

training_loss:::: 0.3415032


6it [00:51,  8.61s/it]

training_loss:::: 0.31880867


7it [01:00,  8.57s/it]

training_loss:::: 0.3211897


8it [01:07,  8.25s/it]

training_loss:::: 0.3284933


9it [01:15,  8.02s/it]

training_loss:::: 0.2953337


10it [01:24,  8.57s/it]

training_loss:::: 0.26696548


11it [01:34,  8.80s/it]

training_loss:::: 0.2628289


12it [01:41,  8.49s/it]

training_loss:::: 0.24101132


13it [01:50,  8.38s/it]

training_loss:::: 0.22767223


14it [01:58,  8.24s/it]

training_loss:::: 0.22477312


15it [02:05,  7.88s/it]

training_loss:::: 0.1966457


16it [02:13,  8.18s/it]

training_loss:::: 0.18856484


17it [02:21,  8.14s/it]

training_loss:::: 0.16653448


18it [02:30,  8.15s/it]

training_loss:::: 0.14644077


19it [02:39,  8.37s/it]

training_loss:::: 0.13401395


20it [02:47,  8.43s/it]

training_loss:::: 0.13097543


21it [02:55,  8.32s/it]

training_loss:::: 0.11357886


22it [03:03,  8.11s/it]

training_loss:::: 0.10166957


23it [03:11,  8.05s/it]

training_loss:::: 0.08997118


24it [03:18,  7.89s/it]

training_loss:::: 0.07657217


25it [03:27,  8.21s/it]

training_loss:::: 0.0729999


26it [03:36,  8.26s/it]

training_loss:::: 0.06843629


27it [03:44,  8.29s/it]

training_loss:::: 0.065625794


28it [03:52,  8.20s/it]

training_loss:::: 0.07079851


29it [04:00,  8.15s/it]

training_loss:::: 0.045037024


30it [04:08,  8.19s/it]

training_loss:::: 0.05936083


31it [04:18,  8.55s/it]

training_loss:::: 0.051698744


32it [04:25,  8.25s/it]

training_loss:::: 0.05726403


33it [04:33,  8.01s/it]

training_loss:::: 0.065165706


34it [04:41,  8.20s/it]

training_loss:::: 0.050791465


35it [04:50,  8.43s/it]

training_loss:::: 0.043726824


36it [05:01,  9.03s/it]

training_loss:::: 0.042911507


37it [05:10,  9.17s/it]

training_loss:::: 0.04643424


38it [05:19,  9.15s/it]

training_loss:::: 0.03658593


39it [05:27,  8.86s/it]

training_loss:::: 0.031063832


40it [05:35,  8.48s/it]

training_loss:::: 0.040431548


41it [05:43,  8.31s/it]

training_loss:::: 0.03884462


42it [05:51,  8.33s/it]

training_loss:::: 0.025326194


43it [05:59,  8.03s/it]

training_loss:::: 0.02970109


44it [06:07,  8.15s/it]

training_loss:::: 0.020138854


45it [06:15,  8.01s/it]

training_loss:::: 0.027404524


46it [06:23,  8.03s/it]

training_loss:::: 0.021304144


47it [06:31,  8.04s/it]

training_loss:::: 0.020645374


48it [06:39,  8.04s/it]

training_loss:::: 0.023499599


49it [06:48,  8.26s/it]

training_loss:::: 0.017877243


50it [06:56,  8.14s/it]

training_loss:::: 0.014773175


51it [07:03,  7.93s/it]

training_loss:::: 0.014899133


52it [07:11,  7.99s/it]

training_loss:::: 0.014327517


53it [07:20,  8.35s/it]

training_loss:::: 0.015901856


54it [07:28,  8.17s/it]

training_loss:::: 0.015156472


55it [07:34,  8.27s/it]


acc 97.00 - f1 84.42
84.42410597605542
Best Score 84.42410597605542
epoch 1 out of 25


0it [00:00, ?it/s]

training_loss:::: 0.01609655


1it [00:07,  7.69s/it]

training_loss:::: 0.010040144


2it [00:15,  7.74s/it]

training_loss:::: 0.014927541


3it [00:24,  8.11s/it]

training_loss:::: 0.009113354


4it [00:34,  8.78s/it]

training_loss:::: 0.010658251


5it [00:42,  8.54s/it]

training_loss:::: 0.011765984


6it [00:51,  8.70s/it]

training_loss:::: 0.009668988


7it [01:00,  8.57s/it]

training_loss:::: 0.014498273


8it [01:07,  8.29s/it]

training_loss:::: 0.00768126


9it [01:15,  7.97s/it]

training_loss:::: 0.0054243403


10it [01:24,  8.50s/it]

training_loss:::: 0.009596468


11it [01:34,  8.83s/it]

training_loss:::: 0.011641596


12it [01:42,  8.68s/it]

training_loss:::: 0.013211692


13it [01:50,  8.48s/it]

training_loss:::: 0.012169876


14it [01:58,  8.36s/it]

training_loss:::: 0.0085504735


15it [02:06,  8.03s/it]

training_loss:::: 0.01030514


16it [02:14,  8.26s/it]

training_loss:::: 0.0111147575


17it [02:22,  8.19s/it]

training_loss:::: 0.007297908


18it [02:30,  8.13s/it]

training_loss:::: 0.008322038


19it [02:39,  8.39s/it]

training_loss:::: 0.009820469


20it [02:48,  8.48s/it]

training_loss:::: 0.0068513653


21it [02:56,  8.44s/it]

training_loss:::: 0.009070322


22it [03:04,  8.29s/it]

training_loss:::: 0.006254731


23it [03:12,  8.24s/it]

training_loss:::: 0.004964836


24it [03:20,  7.99s/it]

training_loss:::: 0.007617996


25it [03:28,  8.10s/it]

training_loss:::: 0.005760604


26it [03:36,  8.03s/it]

training_loss:::: 0.003920912


27it [03:44,  7.91s/it]

training_loss:::: 0.006649924


28it [03:51,  7.75s/it]

training_loss:::: 0.0067363842


29it [03:59,  7.65s/it]

training_loss:::: 0.0038302764


30it [04:06,  7.58s/it]

training_loss:::: 0.008460445


31it [04:15,  7.97s/it]

training_loss:::: 0.006883854


32it [04:22,  7.84s/it]

training_loss:::: 0.005080871


33it [04:30,  7.71s/it]

training_loss:::: 0.008342123


34it [04:38,  7.80s/it]

training_loss:::: 0.007878918


35it [04:45,  7.75s/it]

training_loss:::: 0.003497989


36it [04:54,  8.01s/it]

training_loss:::: 0.005790295


37it [05:02,  7.97s/it]

training_loss:::: 0.006032504


38it [05:10,  8.03s/it]

training_loss:::: 0.003888753


39it [05:18,  7.89s/it]

training_loss:::: 0.0048209485


40it [05:25,  7.73s/it]

training_loss:::: 0.0055636507


41it [05:33,  7.82s/it]

training_loss:::: 0.004892945


42it [05:41,  7.94s/it]

training_loss:::: 0.0042543705


43it [05:48,  7.69s/it]

training_loss:::: 0.003600629


44it [05:57,  7.90s/it]

training_loss:::: 0.0033453254


45it [06:04,  7.78s/it]

training_loss:::: 0.0047990084


46it [06:12,  7.81s/it]

training_loss:::: 0.0028151658


47it [06:19,  7.61s/it]

training_loss:::: 0.0035910215


48it [06:27,  7.64s/it]

training_loss:::: 0.0045265686


49it [06:36,  7.96s/it]

training_loss:::: 0.0030891832


50it [06:44,  7.97s/it]

training_loss:::: 0.0039800713


51it [06:51,  7.77s/it]

training_loss:::: 0.002715759


52it [06:59,  7.88s/it]

training_loss:::: 0.0024431024


53it [07:08,  8.30s/it]

training_loss:::: 0.0032585573


54it [07:16,  8.14s/it]

training_loss:::: 0.0033750576


55it [07:23,  8.07s/it]


acc 97.57 - f1 87.73
87.7321330591379
Best Score 87.7321330591379
epoch 2 out of 25


0it [00:08, ?it/s]


KeyboardInterrupt: 

In [92]:
#test

def predict(words_raw):
    """Returns list of tags

    Args:
        words_raw: list of words (string), just one sentence (no batch)

    Returns:
        preds: list of tags (string), one for each word in the sentence

    """
    words = [process_words(w) for w in words_raw.split(" ")]
    if type(words[0]) == tuple:
        words = zip(*words)
    pred_ids, _ = predict_batch([words])
    preds = [idx_to_tag[idx] for idx in list(pred_ids[0])]

    return preds

In [93]:
idx_to_tag = {v:k for k,v in tag_2_id.items()}

In [98]:
text = "hey if MLSDREG is set then read the bit MLSDBITVAL of MLSDREG and clear the MLSDREG then set the MLSDREG to MLSDVAL else if MLSDREG is set to MLSDVAL then Write MLSDVAL to bit MLSDBITVAL of MLSDREG"#"i want to check if MLSDREG is initialized to MLSDVAL and then clear the register MLSDREG's MLSDFIELD and then set MLSDREG"
preds = predict(words_raw=text)
start = 0
sente=[]
for p,w in zip(preds,text.split(" ")):
#     print(w)
#     print(type(w))
    sent = ""
    if start == 1 and p == 'B-sent':
        sente.append(sent)
        sent = ""
        start = 0
        continue
    if p == 'B-sent' and start == 0:
        start = 1
        sent += str(w) + " "
        continue
    if start == 1 and p == "O":
        sent += str(w) + " "
        continue
    if start ==0 and p == "O":
        sent += str(w) + " "
        continue
sente.append(sent)

In [99]:
for p,w in zip(preds,text.split(" ")):
    print(p,"____",w)

B-sent ____ hey
O ____ if
O ____ MLSDREG
O ____ is
O ____ set
O ____ then
O ____ read
O ____ the
O ____ bit
O ____ MLSDBITVAL
O ____ of
O ____ MLSDREG
B-sent ____ and
O ____ clear
O ____ the
O ____ MLSDREG
O ____ then
O ____ set
O ____ the
O ____ MLSDREG
O ____ to
O ____ MLSDVAL
B-sent ____ else
O ____ if
O ____ MLSDREG
O ____ is
O ____ set
O ____ to
O ____ MLSDVAL
O ____ then
O ____ Write
O ____ MLSDVAL
O ____ to
O ____ bit
O ____ MLSDBITVAL
O ____ of
O ____ MLSDREG
