### Importing Libraries

In [1]:
import numpy as np
import os
import shutil
import tensorflow as tf
import nltk


### Dataset Download and Read Function
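This section defines a function that downloads the NLTK `treebank` corpus sample on first use, caches it as two line-aligned text files (sentences and their part-of-speech (POS) tag sequences), and reads those files back as two parallel lists.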

In [2]:
def download_and_read(dataset_dir, num_pairs=None):
    """Return parallel lists of sentences and POS-tag sequences.

    If the two cache files (``treebank-sents.txt`` and ``treebank-poss.txt``)
    are not both present in ``dataset_dir``, the NLTK ``treebank`` corpus is
    downloaded and its tagged sentences are written out, one sentence per
    line, with words (respectively tags) joined by single spaces. The cache
    files are then read back.

    Args:
        dataset_dir: Directory holding (or receiving) the two cache files.
            It is created when missing.
        num_pairs: Maximum number of sentence/tag pairs to return. ``None``
            returns every line; ``0`` or a negative value returns empty lists.

    Returns:
        A ``(sents, poss)`` tuple of lists of stripped lines, where line ``i``
        of ``poss`` holds the tags written for line ``i`` of ``sents``.
    """
    # Local import: keeps this cell runnable without touching the import cell.
    from itertools import islice

    sent_filename = os.path.join(dataset_dir, 'treebank-sents.txt')
    pos_filename = os.path.join(dataset_dir, 'treebank-poss.txt')

    if not (os.path.exists(sent_filename) and os.path.exists(pos_filename)):
        os.makedirs(dataset_dir, exist_ok=True)

        nltk.download('treebank')
        sentences = nltk.corpus.treebank.tagged_sents()

        # Explicit encoding so the cache does not depend on the platform locale.
        with open(sent_filename, 'w', encoding='utf-8') as fsents, \
                open(pos_filename, 'w', encoding='utf-8') as fposs:
            for sent in sentences:
                words, tags = zip(*sent)
                fsents.write(' '.join(words) + '\n')
                fposs.write(' '.join(tags) + '\n')

    # islice(..., None) reads everything; clamping at 0 makes a non-positive
    # limit return nothing instead of leaking the first line.
    limit = None if num_pairs is None else max(num_pairs, 0)

    def _read_lines(filename):
        """Read up to ``limit`` stripped lines from ``filename``."""
        with open(filename, 'r', encoding='utf-8') as fin:
            return [line.strip() for line in islice(fin, limit)]

    return _read_lines(sent_filename), _read_lines(pos_filename)

# Load every sentence/tag pair (num_pairs is left at its default of None).
sents, poss = download_and_read(dataset_dir='./datasets')
for label, items in (('# of sentences:', sents), ('# of POS sequences:', poss)):
    print(label, len(items))
# Both lists are read from line-aligned files, so their lengths must agree.
assert len(sents) == len(poss)



# of sentences: 3914
# of POS sequences: 3914



### Tokenization and Vocabulary Building
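This code segment defines a function that fits a Keras `Tokenizer` on a list of texts and returns the word-to-index mapping, its inverse, and the fitted tokenizer. When `vocab_size` is given, the tokenizer is limited to `vocab_size + 1` indices and uses `UNK` as its out-of-vocabulary token.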

In [3]:
def tokenizer_and_build_vocab(texts, vocab_size=None, lower=True):
    """Fit a Keras ``Tokenizer`` on ``texts`` and derive lookup tables from it.

    Args:
        texts: Iterable of strings to fit the tokenizer on.
        vocab_size: When given, the tokenizer is created with
            ``num_words=vocab_size + 1`` and ``oov_token='UNK'``, and its
            ``word_index`` is trimmed to entries whose index is at most
            ``vocab_size + 1``. When ``None``, no limit or OOV token is set.
        lower: Forwarded to the tokenizer's ``lower`` option.

    Returns:
        ``(word2indx, ind2word, tokenizer)``: the (possibly trimmed) word ->
        index dict, the inverse index -> word dict, and the fitted tokenizer.
    """
    limited = vocab_size is not None

    # Assemble constructor options once instead of branching on two calls.
    tokenizer_kwargs = {'lower': lower}
    if limited:
        tokenizer_kwargs.update(num_words=vocab_size + 1, oov_token='UNK')
    tokenizer = tf.keras.preprocessing.text.Tokenizer(**tokenizer_kwargs)

    tokenizer.fit_on_texts(texts)

    if limited:
        max_index = vocab_size + 1
        tokenizer.word_index = {
            word: index
            for word, index in tokenizer.word_index.items()
            if index <= max_index
        }

    word2indx = tokenizer.word_index
    ind2word = dict(zip(word2indx.values(), word2indx.keys()))
    return word2indx, ind2word, tokenizer

# Source side (sentences): vocab_size=9000, default lower-casing.
word2ind_s, idx2word_s, tokenizer_s = tokenizer_and_build_vocab(sents, vocab_size=9000)

# Target side (POS tags): vocab_size=38, lower=False keeps the tags' casing.
word2ind_t, idx2word_t, tokenizer_t = tokenizer_and_build_vocab(poss, vocab_size=38, lower=False)

# Vocabulary sizes are the number of entries kept in each word index.
source_vocab_size, target_vocab_size = len(word2ind_s), len(word2ind_t)

size_report = 'vocab sizes(source): {:d}, (target): {:d}'.format(source_vocab_size, target_vocab_size)
print(size_report)


vocab sizes(source): 9001, (target): 39



### Displaying Sample Sentences and Sequence Length Statistics
This code snippet shows a sample of sentences and calculates statistical percentiles for sequence lengths.

In [4]:
# Show the first ten sentences.
print(sents[:10])

# Number of whitespace-separated tokens in each sentence.
sequence_lenghts = np.array([len(s.split()) for s in sents])

# Report selected percentiles of the sentence-length distribution.
length_percentiles = []
for level in [75, 80, 90, 95, 99, 100]:
    length_percentiles.append((level, np.percentile(sequence_lenghts, level)))
print(length_percentiles)


['Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .', 'Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group .', 'Rudolph Agnew , 55 years old and former chairman of Consolidated Gold Fields PLC , was named *-1 a nonexecutive director of this British industrial conglomerate .', 'A form of asbestos once used * * to make Kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed * to it more than 30 years ago , researchers reported 0 *T*-1 .', 'The asbestos fiber , crocidolite , is unusually resilient once it enters the lungs , with even brief exposures to it causing symptoms that *T*-1 show up decades later , researchers said 0 *T*-2 .', 'Lorillard Inc. , the unit of New York-based Loews Corp. that *T*-2 makes Kent cigarettes , stopped using crocidolite in its Micronite cigarette filters in 1956 .', "Although preliminary findings were reported *-2 more than a year ago , the latest results app


### Preprocessing Text for Model Training
This section covers the conversion of sentences and POS tags to integer sequences, padding, and splitting the data into training, validation, and test sets.

In [5]:
max_seqlen = 271

# Convert sentences to sequences of integer word ids, post-padded with 0
# up to max_seqlen.
sents_as_ints = tokenizer_s.texts_to_sequences(sents)
sents_as_ints = tf.keras.preprocessing.sequence.pad_sequences(
    sents_as_ints, maxlen=max_seqlen, padding='post'
)

# Convert POS tags to sequences of integer tag ids, padded the same way.
poss_as_ints = tokenizer_t.texts_to_sequences(poss)
poss_as_ints = tf.keras.preprocessing.sequence.pad_sequences(
    poss_as_ints, maxlen=max_seqlen, padding='post'
)

# One-hot encode the tag ids in a single vectorised call. The input is
# already padded to max_seqlen, so no further padding is needed. The "+ 1"
# reserves a class for the padding id 0.
poss_as_catints = tf.keras.utils.to_categorical(
    poss_as_ints, num_classes=target_vocab_size + 1, dtype='int32'
)

dataset = tf.data.Dataset.from_tensor_slices(
    (sents_as_ints, poss_as_catints)
)

# Index 0 is only ever produced by padding; give it a readable name.
idx2word_s[0], idx2word_t[0] = 'PAD', 'PAD'

# Split into training, validation, and test datasets.
# reshuffle_each_iteration=False pins the shuffled order. With the default
# (True) the dataset is reshuffled on every pass, so take()/skip() would
# select different examples each epoch and the three splits would overlap.
shuffle_buffer_size = 10000
dataset = dataset.shuffle(shuffle_buffer_size, reshuffle_each_iteration=False)
test_size = len(sents) // 3
val_size = (len(sents) - test_size) // 10
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size + val_size)

# Create batches.
batch_size = 128

# Only the training split is reshuffled every epoch; this happens after the
# split, so it cannot pull in validation or test examples.
train_dataset = train_dataset.shuffle(shuffle_buffer_size).batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

    


### POS Tagging Model Definition and Training
This section includes the definition of a POS tagging model using TensorFlow, its compilation, and training process.

In [6]:
class POSTaggingModel(tf.keras.Model):
    """Sequence tagger: embedding -> spatial dropout -> bidirectional GRU ->
    per-timestep dense layer -> softmax.

    Args:
        source_vocab_size: Input dimension of the embedding layer.
        target_vocab_size: Number of units of the per-timestep dense layer.
        embedding_dim: Output dimension of the embedding layer.
        max_seqlen: Passed to the embedding layer as ``input_length``.
        rnn_output_dim: Units of the GRU in each direction.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, source_vocab_size, target_vocab_size,
                 embedding_dim, max_seqlen, rnn_output_dim, **kwargs):
        super().__init__(**kwargs)
        self.embed = tf.keras.layers.Embedding(
            input_dim=source_vocab_size,
            output_dim=embedding_dim,
            input_length=max_seqlen,
        )
        self.dropout = tf.keras.layers.SpatialDropout1D(rate=.2)
        # return_sequences=True yields one output per timestep, which the
        # per-timestep dense layer below consumes.
        recurrent = tf.keras.layers.GRU(units=rnn_output_dim, return_sequences=True)
        self.rnn = tf.keras.layers.Bidirectional(recurrent)
        projection = tf.keras.layers.Dense(units=target_vocab_size)
        self.dense = tf.keras.layers.TimeDistributed(projection)
        self.activation = tf.keras.layers.Activation('softmax')

    def call(self, x):
        """Run ``x`` through the layers in order and return the softmax output."""
        pipeline = (self.embed, self.dropout, self.rnn, self.dense, self.activation)
        for layer in pipeline:
            x = layer(x)
        return x

def masked_accuracy():
    """Build a metric function that measures accuracy on non-padding positions.

    Returns:
        A function ``masked_accuracy_fn(ytrue, ypred)`` for use in
        ``model.compile(metrics=[...])``. Both arguments are reduced with
        ``argmax`` over the last axis; positions whose true class is 0 are
        excluded, and the result is (correct predictions) / (number of kept
        positions), with the denominator floored at 1.
    """
    def masked_accuracy_fn(ytrue, ypred):
        ytrue = tf.keras.backend.argmax(ytrue, axis=-1)
        ypred = tf.keras.backend.argmax(ypred, axis=-1)
        # Mask on the true labels, not the predictions. Masking on ypred
        # would drop every real token the model wrongly tags as class 0
        # from the denominator, hiding those errors and inflating the score.
        mask = tf.keras.backend.cast(
            tf.keras.backend.not_equal(ytrue, 0), tf.int32
        )
        matches = tf.keras.backend.cast(
            tf.keras.backend.equal(ytrue, ypred), tf.int32
        ) * mask
        numer = tf.keras.backend.sum(matches)
        # Floor at 1 so an all-padding batch does not divide by zero.
        denom = tf.keras.backend.maximum(tf.keras.backend.sum(mask), 1)
        accuracy = numer / denom
        return accuracy
    return masked_accuracy_fn
# Hyperparameters.
num_epochs = 50
embedding_dim = 128
rnn_output_dim = 256

# Callbacks: keep only the best weights on disk and log for TensorBoard.
best_model_file = os.path.join('./datasets', 'best_model.h5')
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=best_model_file,
    save_best_only=True,
    save_weights_only=True,
)
tensorboard = tf.keras.callbacks.TensorBoard(log_dir='./log_dir')

# Model: the output layer gets target_vocab_size + 1 units, matching the
# number of classes used when one-hot encoding the tag ids.
model = POSTaggingModel(
    source_vocab_size=source_vocab_size,
    target_vocab_size=target_vocab_size + 1,
    embedding_dim=embedding_dim,
    max_seqlen=max_seqlen,
    rnn_output_dim=rnn_output_dim,
)
model.build(input_shape=(batch_size, max_seqlen))
model.summary()
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', masked_accuracy()],
)

# Train, validating after every epoch.
history = model.fit(
    train_dataset,
    epochs=num_epochs,
    validation_data=val_dataset,
    callbacks=[checkpoint, tensorboard],
)

Model: "pos_tagging_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  1152128   
                                                                 
 spatial_dropout1d (SpatialD  multiple                 0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  multiple                 592896    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  multiple                 20520     
 ibuted)                                                         
                                                                 
 activation (Activation)     multiple                  0         
                                                 