In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np

In [3]:
 # Run tf.function-decorated code eagerly. TagVectorization in this notebook
 # calls `.numpy()` on tensors, which is only available in eager execution.
 tf.config.run_functions_eagerly(True)

In [4]:
def prep_ds(file):
    """Parse a token-per-line corpus into sentences and per-token tag sequences.

    A line that splits on single spaces into exactly three fields is treated
    as a token: the first field is the word, the second its tag, and the third
    is ignored. Any other line (normally the blank line between sentences)
    ends the sentence collected so far.

    Empty sentences are never emitted, so consecutive separator lines or a
    trailing separator do not create empty samples, and the final sentence is
    emitted even when the input does not end with a separator line.

    Args:
        file: Iterable of text lines, e.g. an open text file.

    Returns:
        A tuple ``(inputs, targets)``. ``inputs`` is a 1-D string tensor with
        one entry per sentence (words joined by single spaces, followed by a
        trailing space). ``targets`` is a list with one NumPy array of tag
        strings per sentence, in the same order as ``inputs``.
    """
    sentences = []
    targets = []
    words = []
    tags = []

    def flush():
        # Emit the pending sentence, if any, and reset the accumulators.
        if words:
            sentences.append(' '.join(words) + ' ')
            targets.append(np.asarray(tags))
            words.clear()
            tags.clear()

    for line in file:
        fields = line.split(' ')
        if len(fields) == 3:
            word, tag, _ = fields
            words.append(word)
            tags.append(tag)
        else:
            flush()
    # The input may end without a separator line.
    flush()

    # Explicit dtype keeps the result a string tensor even for empty input.
    return tf.convert_to_tensor(sentences, dtype=tf.string), targets


In [5]:
# Parse the training and test splits into (sentence tensor, tag sequences).
with open("train.txt", "r", encoding="utf-8") as train_file:
    X_train, y_train = prep_ds(train_file)

with open("test.txt", "r", encoding="utf-8") as test_file:
    X_test, y_test = prep_ds(test_file)

In [6]:
# Pack the variable-length training tag sequences into one ragged tensor so
# they can be sliced and batched alongside X_train.
y = tf.ragged.constant(y_train)

In [6]:
# Longest training sentence, counted in whitespace-separated tokens.
max(len(sentence.numpy().split()) for sentence in X_train)

78

In [7]:
# Preview the first two training sentences and their tag sequences.
print(X_train[:2])
print(y_train[:2])

tf.Tensor(
[b"Confidence in the pound is widely expected to take another sharp dive if trade figures for September , due for release tomorrow , fail to show a substantial improvement from July and August 's near-record deficits . "
 b"Chancellor of the Exchequer Nigel Lawson 's restated commitment to a firm monetary policy has helped to prevent a freefall in sterling over the past week . "], shape=(2,), dtype=string)
[['NN', 'IN', 'DT', 'NN', 'VBZ', 'RB', 'VBN', 'TO', 'VB', 'DT', 'JJ', 'NN', 'IN', 'NN', 'NNS', 'IN', 'NNP', ',', 'JJ', 'IN', 'NN', 'NN', ',', 'VB', 'TO', 'VB', 'DT', 'JJ', 'NN', 'IN', 'NNP', 'CC', 'NNP', 'POS', 'JJ', 'NNS', '.'], ['NNP', 'IN', 'DT', 'NNP', 'NNP', 'NNP', 'POS', 'VBN', 'NN', 'TO', 'DT', 'NN', 'JJ', 'NN', 'VBZ', 'VBN', 'TO', 'VB', 'DT', 'NN', 'IN', 'NN', 'IN', 'DT', 'JJ', 'NN', '.']]


In [7]:
class TagVectorization:
    """Maps tag strings to integer ids and back, with fixed-length padding.

    Ids start at 1; id 0 is used for padding. The vocabulary is sorted so the
    tag -> id assignment is the same on every run (iterating a plain ``set``
    of strings does not guarantee a stable order between interpreter runs).

    Attributes are kept as plain Python containers:
        tags: sorted list of distinct tag strings.
        sparse_vect: the ids ``1..len(tags)``, aligned with ``tags``.
        tag_to_vec / vec_to_tag: lookup dictionaries in both directions.
        seq_length: length every id sequence is padded to by ``tag2vec``.
    """
    def __init__(self,tag_corpus,seq_length):
        """Build the vocabulary from an iterable of tag sequences."""
        self.tags = sorted({str(tag) for tags in tag_corpus for tag in tags})
        self.sparse_vect = list(range(1, len(self.tags) + 1))
        self.tag_to_vec = dict(zip(self.tags, self.sparse_vect))
        self.vec_to_tag = dict(zip(self.sparse_vect, self.tags))
        self.seq_length = seq_length

    def tag2vec(self, tags):
        """Convert a batch of tag sequences into a padded int32 id tensor.

        Args:
            tags: Object whose ``.numpy()`` yields one sequence of UTF-8
                encoded tag bytes per sample (e.g. a ragged string tensor).

        Returns:
            An int32 tensor with one row of ``seq_length`` ids per sample. Each
            row is ``0, <tag ids...>, 0`` followed by zeros up to ``seq_length``.

        Raises:
            KeyError: If a tag is not in the vocabulary.
            ValueError: If a sequence plus its two boundary slots does not
                fit into ``seq_length``.
        """
        vectors = []
        for sample in tags.numpy():
            ids = [self.tag_to_vec[tag.decode('utf-8')] for tag in sample]
            # One zero slot before and after the tags; vec2tag strips the same
            # two positions again. Presumably these line up with the encoder's
            # start/end tokens -- TODO confirm.
            row = [0] + ids + [0]
            if len(row) > self.seq_length:
                raise ValueError(
                    "tag sequence of length %d does not fit into seq_length=%d"
                    % (len(ids), self.seq_length))
            row.extend([0] * (self.seq_length - len(row)))
            vectors.append(row)
        # Building rows as Python ints and converting once keeps the dtype
        # int32 even for samples without any tags.
        return tf.convert_to_tensor(vectors, dtype=tf.int32)

    def vec2tag(self, vecs, mask):
        """Convert predicted ids back into tag strings.

        Positions where ``mask`` is false are dropped, then the first and last
        remaining position of every row are removed before the lookup.

        Args:
            vecs: Batch of id sequences.
            mask: Same-shaped mask; it is cast to bool.

        Returns:
            A list (one entry per sample) of lists of tag strings. Ids without
            a vocabulary entry, including the padding id 0, become ``None``.
        """
        mask = tf.cast(mask,dtype=tf.bool)
        no_pad = tf.ragged.boolean_mask(vecs, mask)[:,1:-1].numpy().tolist()
        return [list(map(self.vec_to_tag.get,sentence)) for sentence in no_pad]

In [8]:
# Standalone tag vectorizer built from the training tags, padding to length 100.
# NOTE(review): `tag` is not referenced again in the visible cells (PosTagger
# builds its own TagVectorization) -- confirm this cell is still needed.
tag = TagVectorization(y_train,100)

In [9]:
# TF Hub handles for the text preprocessing model and the BERT encoder.
# Order matters: PosTagger passes bert_urls[0] as the preprocessor and
# bert_urls[1] as the encoder.
preproc_url =  'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
bert_url = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1'
bert_urls = [preproc_url,bert_url]

In [10]:
class BertEmbeddings(tf.keras.layers.Layer):
    """Turns raw sentences into BERT sequence embeddings plus the input mask.

    The preprocessing model is loaded once and its ``tokenize`` and
    ``bert_pack_inputs`` functions are wrapped as Keras layers in the
    constructor, so no layers are created during the forward pass.
    """
    def __init__(self,bert_preproc, bert_model, seq_length):
        """Load the preprocessing model and the trainable encoder.

        Args:
            bert_preproc: Handle of the preprocessing model (passed to ``hub.load``).
            bert_model: Handle of the encoder (wrapped in a trainable ``hub.KerasLayer``).
            seq_length: Sequence length the packed encoder inputs are padded/truncated to.
        """
        super(BertEmbeddings, self).__init__()
        self.preproc =  hub.load(bert_preproc)
        self.bert = hub.KerasLayer(bert_model,trainable= True)
        self.seq_length = seq_length
        # Build the preprocessing layers once instead of on every call.
        self._tokenize = hub.KerasLayer(self.preproc.tokenize)
        self._pack_inputs = hub.KerasLayer(
            self.preproc.bert_pack_inputs,
            arguments=dict(seq_length=seq_length))

    def call(self, text):
        """Embed a batch of sentences.

        Args:
            text: Batch of raw sentence strings. (The parameter name shadows
                the module-level ``text`` import inside this method only.)

        Returns:
            A tuple ``(outputs, mask_idx)``: the encoder's ``sequence_output``
            and the ``input_mask`` produced by input packing.
        """
        tokens = self._tokenize(text)
        encoder_inputs = self._pack_inputs([tokens])
        mask_idx = encoder_inputs['input_mask']

        outputs = self.bert(encoder_inputs)['sequence_output']
        return outputs, mask_idx

In [11]:
class Tagger(tf.keras.layers.Layer):
    """Two stacked LSTMs producing one score vector of size ``n_tags`` per position.

    The first LSTM returns its full output sequence together with its two
    final states. Both states are projected to ``n_tags`` dimensions by the
    same dense layer and used as the initial state of the second LSTM.

    NOTE(review): the second LSTM is built with its default activations, so
    the values returned by ``call`` are presumably bounded rather than free
    logits, while the loss in this notebook is configured with
    ``from_logits=True`` -- confirm this is intended.
    """
    def __init__(self, units,n_tags):
        super().__init__()
        self.lstm_1 = tf.keras.layers.LSTM(
            units, return_sequences=True, return_state=True)
        self.lstm_2 = tf.keras.layers.LSTM(n_tags, return_sequences=True)
        # Shared projection that resizes lstm_1's final states for lstm_2.
        self.W1 = tf.keras.layers.Dense(n_tags)

    def call(self,inputs):
        """Return the second LSTM's output sequence for ``inputs``."""
        sequence, *final_states = self.lstm_1(inputs)
        # Project the states in the order lstm_1 returned them.
        projected_states = [self.W1(state) for state in final_states]
        return self.lstm_2(sequence, initial_state=projected_states)

In [12]:
class PosTagger(tf.keras.Model):
    """POS tagger combining BERT embeddings with an LSTM tagging head.

    Training goes through a custom ``train_step`` that consumes
    ``(text, tags)`` batches. Calling the model on text returns decoded tag
    strings rather than tensors.
    """
    def __init__(self, tagger_units,n_tags,seq_length, bert_urls, tag_corpus):
        """Build the tag vocabulary, the embedding layer and the tagging head.

        Args:
            tagger_units: Number of units of the tagging head's first LSTM.
            n_tags: Size of the score vector produced per position.
            seq_length: Length sentences and tag sequences are padded to.
            bert_urls: Sequence whose first item is the preprocessing model
                handle and whose second item is the encoder handle.
            tag_corpus: Iterable of tag sequences used to build the vocabulary.

        Raises:
            ValueError: If ``n_tags`` is too small to hold every tag id plus
                the padding id.
        """
        super(PosTagger, self).__init__()
        self.seq_length = seq_length
        self.tagger_units = tagger_units
        self.n_tags = n_tags
        self.bert_urls = bert_urls

        self.tagvec = TagVectorization(tag_corpus,seq_length)
        # Tag ids run from 1 to len(tags) and 0 is the padding id, so the
        # head needs len(tags) + 1 outputs; otherwise the largest tag id is an
        # out-of-range label for the loss. Checked before loading BERT so a
        # misconfiguration fails fast.
        n_required = len(self.tagvec.tags) + 1
        if n_tags < n_required:
            raise ValueError(
                "n_tags=%d is too small: the tag corpus has %d distinct tags "
                "plus the padding id, so at least %d outputs are required"
                % (n_tags, len(self.tagvec.tags), n_required))
        self.bert = BertEmbeddings(bert_urls[0],bert_urls[1],seq_length)
        self.tagger = Tagger(tagger_units,n_tags)

    def train_step(self,inputs):
        """Run one optimisation step on a ``(text, tags)`` batch.

        Returns:
            A dict with the batch loss under the key ``'loss'``.
        """
        text, tags = inputs

        # Target ids are built outside the tape; they need no gradient.
        targets = self.tagvec.tag2vec(tags)

        with tf.GradientTape() as tape:
            # The input mask is not used here; padded positions carry target
            # id 0 -- presumably the compiled loss ignores those (TODO confirm).
            embeddings, mask = self.bert(text)
            logits = self.tagger(embeddings)
            loss = self.loss(targets,logits)

        gradients = tape.gradient(loss, self.trainable_variables)

        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        return {'loss': loss}

    def call(self,inputs):
        """Predict tags for a batch of sentences.

        Returns:
            One list of tag strings per sentence, as produced by
            ``TagVectorization.vec2tag`` (which calls ``.numpy()`` and
            therefore needs eager execution).
        """
        embeddings, mask = self.bert(inputs)
        logits = self.tagger(embeddings)

        predictions = tf.argmax(logits,axis=-1)
        tags = self.tagvec.vec2tag(predictions,mask)
        return tags

In [13]:
# Build the tagger; keyword arguments make the positional constants explicit.
model = PosTagger(
    tagger_units=256,
    n_tags=45,
    seq_length=100,
    bert_urls=bert_urls,
    tag_corpus=y_train,
)

In [16]:
# NOTE(review): MaskedLoss is defined in a cell that appears further down in
# this notebook, so that cell has to be executed before this one on a fresh
# kernel.
model.compile(loss=MaskedLoss(), optimizer = 'adam')

In [17]:
# Train on the first 1000 training sentences and their ragged tag targets,
# in batches of 10 for 50 epochs.
model.fit(x=X_train[:1000],y=y[:1000],batch_size=10, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x28a5658edc0>

In [15]:
class MaskedLoss(tf.keras.losses.Loss):
    """Sparse categorical cross-entropy summed over non-padding positions.

    Positions whose target id is 0 contribute nothing to the loss.
    """
    def __init__(self):
        # Initialise the Keras base class so its standard attributes
        # (such as the loss name) exist on this object.
        super().__init__(name='masked_loss')
        self.loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction='none')

    def __call__(self, y_true, y_pred):
        """Return the summed masked loss.

        Args:
            y_true: Integer target ids; 0 marks positions to ignore.
            y_pred: Unnormalised scores with one vector per target position.

        Returns:
            A scalar: the sum of the per-position losses where ``y_true != 0``.
        """
        loss = self.loss(y_true, y_pred)
        # Match the mask to the loss dtype instead of assuming float32.
        mask = tf.cast(y_true != 0, loss.dtype)

        loss *= mask

        return tf.reduce_sum(loss)
