# Farewell Email Writer in Gluon

In [1]:
import warnings
warnings.filterwarnings('ignore')
import os

import glob
import time
import math
import itertools
import random
import nltk

import mxnet as mx
from mxnet import gluon, autograd
from mxnet.gluon.utils import download

import gluonnlp as nlp
from mxnet.gluon import nn, Block

PATH = 'C:\\Users\\pochetti\\WorkDocs\\Desktop\\Fra\\Francesco\\Farewell'

### Functions

In [2]:
class LMDecoder(object):
    """Adapter that lets a language model be called on one step of input.

    The wrapped model is stored as-is; every call adds a leading axis to the
    input, runs the model, and strips that axis from the output again.
    """

    def __init__(self, model):
        self._model = model

    def __call__(self, inputs, states):
        # Add a leading axis of length one before handing the input to the model
        step_inputs = mx.nd.expand_dims(inputs, axis=0)
        step_outputs, next_states = self._model(step_inputs, states)
        # Drop the leading axis again; the states are passed through untouched
        return step_outputs[0], next_states

    def state_info(self, *arg, **kwargs):
        """Forward the call to the wrapped model's ``state_info`` unchanged."""
        return self._model.state_info(*arg, **kwargs)

def detach(hidden):
    """Return ``hidden`` with ``.detach()`` applied to every leaf.

    A tuple or list is processed recursively and always comes back as a list;
    any other object is expected to provide a ``detach()`` method, whose
    result is returned.
    """
    if not isinstance(hidden, (tuple, list)):
        return hidden.detach()
    return [detach(state) for state in hidden]

def train(model, train_data, epochs, lr):
    """Train ``model`` on ``train_data`` for ``epochs`` epochs.

    The function relies on the notebook-level globals ``context``,
    ``batch_size``, ``loss``, ``trainer`` and ``grad_clip``; they must be
    defined before it is called.

    Parameters
    ----------
    model
        Network called as ``model(X, hidden)`` and returning
        ``(output, hidden)``; must also provide ``begin_state`` and
        ``collect_params``.
    train_data
        Sized iterable of ``(data, target)`` batches. Each batch is split
        along axis 1 across the devices in ``context``.
    epochs : int
        Number of passes over ``train_data``.
    lr : float
        Learning rate at the start of training. It is multiplied by 0.8 after
        every epoch and pushed to ``trainer``.
    """
    start_train_time = time.time()
    # Parameters with grad_req == 'null' have no gradient buffer, so asking
    # for .grad() on them fails; leave them out of the clipping step.
    parameters = [p for p in model.collect_params().values()
                  if p.grad_req != 'null']
    for epoch in range(epochs):
        total_L = 0.0
        # Counted explicitly so the epoch summary also works when train_data
        # yields no batch at all (the loop index would be unbound otherwise).
        num_batches = 0
        # One fresh hidden state per device at the start of every epoch
        hiddens = [model.begin_state(batch_size//len(context), func=mx.nd.zeros, ctx=ctx)
                   for ctx in context]
        for data, target in train_data:
            data_list = gluon.utils.split_and_load(data, context,
                                                   batch_axis=1, even_split=True)
            target_list = gluon.utils.split_and_load(target, context,
                                                     batch_axis=1, even_split=True)
            # Carry the hidden state over from the previous batch, but stop
            # gradients from flowing back into it.
            hiddens = detach(hiddens)
            L = 0
            Ls = []
            with autograd.record():
                for j, (X, y, h) in enumerate(zip(data_list, target_list, hiddens)):
                    output, h = model(X, h)
                    batch_L = loss(output.reshape(-3, -1), y.reshape(-1,))
                    # Accumulate on the first device, normalised by the
                    # number of elements in this slice of the batch.
                    L = L + batch_L.as_in_context(context[0]) / X.size
                    Ls.append(batch_L / X.size)
                    hiddens[j] = h
            L.backward()
            grads = [p.grad(x.context) for p in parameters for x in data_list]
            gluon.utils.clip_global_norm(grads, grad_clip)

            # The loss is already normalised above, hence a step size of 1
            trainer.step(1)

            total_L += sum(mx.nd.sum(l).asscalar() for l in Ls)
            num_batches += 1

        mx.nd.waitall()

        avg_L = total_L / num_batches if num_batches else float('nan')
        print('[Epoch %d] loss %.2f '%(epoch, avg_L))

        # Decay the learning rate after every epoch
        lr = lr*0.8
        trainer.set_learning_rate(lr)

    print('Total training throughput %.2f samples/s'%(
                            (batch_size * len(train_data) * epochs) /
                            (time.time() - start_train_time)))

### Excerpt of the corpus I used to fine-tune the pre-trained AWD-LSTM model (~40 emails)

In [3]:
# Show the first 1000 characters of the first line of the corpus file.
# The context manager closes the file even if reading it raises.
with open(f'{PATH}\\farewell_corpus.txt', encoding="utf8") as f:
    lines = f.readlines()
print(lines[0][:1000])

hi guys, i would like to thank you all for this amazing experience and for your help. it has been a pleasure to work with all of you. i wish you all the best for your career as well as for your personal life. let's keep in touch. <eos> hi all,   well, it’s my last day at amazon.  i’ve seen many of these “farewell” notes in my 8.5 year tenure at amazon, so am keenly aware that saying things like “i have truly enjoyed working with all of you” and “you are some of the smartest people i will likely ever work with” are at best overused, and at worst, down right trite.  but i don’t care – i’m going to say them anyway, because they are true.  though i’m moving on to a new challenge, in my years at amazon and across teams i have met some of the kindest, most fun, and smartest people i will likely ever have the pleasure of working with.  you have challenged me, amazed me, encouraged me, and shown me what hard work from smart people can accomplish on a daily basis.   i wish you all the best, as 

### Defining key parameters for training

In [4]:
# Devices the training batches are split across
context = [mx.cpu()]

# Batching: batch size and sequence length handed to the BPTT batchify helper
batch_size = 20
bptt = 35

# Optimisation: number of epochs, initial learning rate (decayed inside
# train()) and the threshold passed to the global-norm gradient clipping
epochs = 15
lr = 0.1
grad_clip = 0.25

### Loading the emails dataset and preparing it for Gluon

In [5]:
# Tokenizer applied to every sentence of the corpus
moses_tokenizer = nlp.data.SacreMosesTokenizer()

# Load the corpus file: nltk's sent_tokenize is used as the sample splitter,
# each sample is tokenized with the Moses tokenizer, and '<eos>' is passed as
# the end-of-sequence token. flatten=True is requested from CorpusDataset.
farewell_train = nlp.data.CorpusDataset(
    f'{PATH}\\farewell_corpus.txt',
    sample_splitter=nltk.tokenize.sent_tokenize,
    tokenizer=moses_tokenizer,
    flatten=True,
    eos='<eos>')

# Vocabulary built from the token counts of the corpus, with the padding and
# beginning-of-sequence tokens disabled
vocab = nlp.Vocab(nlp.data.Counter(farewell_train), padding_token=None, bos_token=None)

# Batchify helper configured with the vocabulary, `bptt` and `batch_size`
bptt_batchify = nlp.data.batchify.CorpusBPTTBatchify(vocab, bptt, batch_size, last_batch='discard')

# Batches of (data, target) consumed by train()
farewell_train_data = bptt_batchify(farewell_train)

# Display the vocabulary summary as the cell output
vocab

Vocab(size=1031, unk="<unk>", reserved="['<eos>']")

### Loading the pre-trained AWD-LSTM language model from the GluonNLP zoo

In [6]:
# Fetch the pre-trained 'awd_lstm_lm_1150' model for 'wikitext-2' from the
# GluonNLP model zoo, together with the vocabulary returned alongside it.
# NOTE(review): the `voc` printed below has 33278 entries while the custom
# `vocab` passed in has 1031, so the `vocab=` argument does not appear to take
# effect when `dataset_name` is also given -- confirm against the GluonNLP docs.
dataset_name = 'wikitext-2'
awd_model_name = 'awd_lstm_lm_1150'
awd_model, voc = nlp.model.get_model(
    awd_model_name,
    vocab=vocab,
    dataset_name=dataset_name,
    pretrained=True)
print(awd_model)
print(voc)

AWDRNN(
  (embedding): HybridSequential(
    (0): Embedding(33278 -> 400, float32)
    (1): Dropout(p = 0.65, axes=(0,))
  )
  (encoder): Sequential(
    (0): LSTM(400 -> 1150, TNC)
    (1): LSTM(1150 -> 1150, TNC)
    (2): LSTM(1150 -> 400, TNC)
  )
  (decoder): HybridSequential(
    (0): Dense(400 -> 33278, linear)
  )
)
Vocab(size=33278, unk="<unk>", reserved="['<eos>']")


### Editing the last Dense layer to reflect the new (smaller) vocabulary

In [7]:
# Swap the pre-trained output layer for a freshly initialised Dense layer.
# Its width is taken from the vocabulary itself rather than hard-coded, so it
# stays correct if the corpus (and therefore the vocabulary) changes.
# NOTE(review): only the output layer is resized. The embedding printed below
# still has 33278 rows, while the training batches are indexed with the
# 1031-token `vocab`; the token ids presumably do not line up with the
# pre-trained embedding rows -- confirm whether that is intended.
new_decoder = mx.gluon.nn.HybridSequential()
new_decoder.add(mx.gluon.nn.Dense(units=len(vocab), flatten=False))
new_decoder.initialize()
awd_model.decoder = new_decoder

# Display the modified architecture as the cell output
awd_model

AWDRNN(
  (embedding): HybridSequential(
    (0): Embedding(33278 -> 400, float32)
    (1): Dropout(p = 0.65, axes=(0,))
  )
  (encoder): Sequential(
    (0): LSTM(400 -> 1150, TNC)
    (1): LSTM(1150 -> 1150, TNC)
    (2): LSTM(1150 -> 400, TNC)
  )
  (decoder): HybridSequential(
    (0): Dense(None -> 1031, linear)
  )
)

### Printing model params to figure out which ones to optimize

In [8]:
# Display every parameter of the model (name, shape, dtype) as the cell output
awd_model.collect_params()

awdrnn0_ (
  WeightDropParameter awdrnn0_hybridsequential0_embedding0_weight (shape=(33278, 400), dtype=float32, rate=0.1, mode=training)
  Parameter awdrnn0_sequential0_lstm0_l0_i2h_weight (shape=(4600, 400), dtype=<class 'numpy.float32'>)
  WeightDropParameter awdrnn0_sequential0_lstm0_l0_h2h_weight (shape=(4600, 1150), dtype=<class 'numpy.float32'>, rate=0.5, mode=training)
  Parameter awdrnn0_sequential0_lstm0_l0_i2h_bias (shape=(4600,), dtype=<class 'numpy.float32'>)
  Parameter awdrnn0_sequential0_lstm0_l0_h2h_bias (shape=(4600,), dtype=<class 'numpy.float32'>)
  Parameter awdrnn0_sequential0_lstm1_l0_i2h_weight (shape=(4600, 1150), dtype=<class 'numpy.float32'>)
  WeightDropParameter awdrnn0_sequential0_lstm1_l0_h2h_weight (shape=(4600, 1150), dtype=<class 'numpy.float32'>, rate=0.5, mode=training)
  Parameter awdrnn0_sequential0_lstm1_l0_i2h_bias (shape=(4600,), dtype=<class 'numpy.float32'>)
  Parameter awdrnn0_sequential0_lstm1_l0_h2h_bias (shape=(4600,), dtype=<class 'numpy.

### Defining a Gluon trainer over the last dense layer only (i.e. the rest of the network is NOT optimized)

In [9]:
# Optimise only the parameters of the output layer. They are collected from
# the decoder block itself instead of being looked up under the
# auto-generated names 'dense0_weight' / 'dense0_bias', which presumably
# change when the layer is created again in the same session.
trainer = gluon.Trainer(awd_model.decoder.collect_params(), 'adam', {
    'learning_rate': lr,
    'wd': 0
})
# Loss applied by train() to the flattened model outputs and targets
loss = gluon.loss.SoftmaxCrossEntropyLoss()

### Fine tuning the AWD-LSTM model

In [10]:
# Run the training loop; one '[Epoch N] loss ...' line is printed per epoch
train(awd_model, farewell_train_data, epochs, lr)

[Epoch 0] loss 6.26 
[Epoch 1] loss 5.00 
[Epoch 2] loss 4.03 
[Epoch 3] loss 3.49 
[Epoch 4] loss 3.18 
[Epoch 5] loss 2.98 
[Epoch 6] loss 2.93 
[Epoch 7] loss 2.85 
[Epoch 8] loss 2.87 
[Epoch 9] loss 2.83 
[Epoch 10] loss 2.67 
[Epoch 11] loss 2.63 
[Epoch 12] loss 2.67 
[Epoch 13] loss 2.54 
[Epoch 14] loss 2.54 
Total training throughput 5.06 samples/s


### Defining the sequence sampler to generate samples of sentences (note: `SequenceSampler` draws random samples with a temperature; it does not perform beam search)

In [11]:
# Alias of the fine-tuned network used by the generation code
model = awd_model

# Single-step wrapper around the model, handed to the sampler as its decoder
decoder = LMDecoder(model)

# Sampler stopping a sequence at the '<eos>' id or after max_length steps.
# NOTE(review): beam_size is presumably the number of sequences drawn per
# input; the generation code indexes several samples per input -- confirm
# against the GluonNLP SequenceSampler documentation.
sampler = nlp.model.SequenceSampler(beam_size=15,
                                    decoder=decoder,
                                    eos_id=vocab['<eos>'],
                                    max_length=500,
                                    temperature=0.97)

### Generating a few email samples

In [12]:
ctx = context[0]

# Prompt the generation starts from, as tokens and as vocabulary ids
bos = 'hi , '.split()
bos_ids = [vocab[ele] for ele in bos]
begin_states = model.begin_state(batch_size=1, ctx=ctx)
if len(bos_ids) > 1:
    # Run the model over every prompt token except the last one to obtain the
    # initial state; the array is created on `ctx`, like `inputs` below.
    _, begin_states = model(mx.nd.expand_dims(mx.nd.array(bos_ids[:-1], ctx=ctx), axis=1), begin_states)
# The last prompt token is the first input given to the sampler
inputs = mx.nd.full(shape=(1,), ctx=ctx, val=bos_ids[-1])
samples, scores, valid_lengths = sampler(inputs, begin_states)
# Keep the results of the single input sequence and move them to NumPy
samples = samples[0].asnumpy()
scores = scores[0].asnumpy()
valid_lengths = valid_lengths[0].asnumpy()
print('Generation Result:')
# Print at most five samples, never more than the sampler returned
for i in range(min(5, len(samples))):
    sentence = bos[:-1] + [vocab.idx_to_token[ele] for ele in samples[i][:valid_lengths[i]]]
    print([' '.join(sentence), scores[i]])

Generation Result:
['hi , please . <eos>', -8.115891]
['hi , since marketplaces yes trying enjoyed unchanged be of you know <eos>', -42.86605]
['hi , move so i would much doing ’ ve like going to day so been a chance for a great personal horizons , and ’ on facebook last day you all for is and my but . <eos>', -128.69484]
['hi , because from wish to stay an opportunity for these wonderful 1,280 that together , career 11 last day and innovate and in luxembourg t of you for , is up on a monday morning morning looking our paths cross again . <eos>', -127.031044]
['hi , first today allowed day day at you working with lux and bbqs me around accomplish with end i start a &lt; cup games my experience here <eos>', -103.4374]
